# manga_translator.py
"""
Enhanced Manga Translation Pipeline with improved text visibility controls
Handles OCR, translation, and advanced text rendering for manga panels
Now with proper history management and full page context support
"""
import os
import json
import base64
import logging
import time
import traceback
import cv2
from typing import List, Dict, Tuple, Optional, Any
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from PIL import Image, ImageDraw, ImageEnhance, ImageFilter, ImageFont
import numpy as np
from bubble_detector import BubbleDetector
from TransateKRtoEN import send_with_interrupt

# Google Cloud Vision imports
try:
    from google.cloud import vision
    GOOGLE_CLOUD_VISION_AVAILABLE = True
except ImportError:
    GOOGLE_CLOUD_VISION_AVAILABLE = False
    print("Warning: Google Cloud Vision not installed. Install with: pip install google-cloud-vision")

# Import HistoryManager for proper context management
try:
    from history_manager import HistoryManager
except ImportError:
    HistoryManager = None
    print("Warning: HistoryManager not available. Context tracking will be limited.")

logger = logging.getLogger(__name__)


@dataclass
class TextRegion:
    """Represents a detected text region (speech bubble, narration box, etc.)"""
    text: str
    vertices: List[Tuple[int, int]]  # Polygon vertices from Cloud Vision
    bounding_box: Tuple[int, int, int, int]  # x, y, width, height
    confidence: float
    region_type: str  # 'text_block' from Cloud Vision
    translated_text: Optional[str] = None
    bubble_bounds: Optional[Tuple[int, int, int, int]] = None  # RT-DETR bubble bounds for rendering

    def to_dict(self):
        return {
            'text': self.text,
            'vertices': self.vertices,
            'bounding_box': self.bounding_box,
            'confidence': self.confidence,
            'region_type': self.region_type,
            'translated_text': self.translated_text
        }
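
# Example (sketch, hypothetical values): constructing a TextRegion from one OCR
# text block and serializing it for debugging. Note that to_dict() intentionally
# omits bubble_bounds, which is only used at render time.
#
#   region = TextRegion(
#       text="こんにけは",
#       vertices=[(10, 10), (110, 10), (110, 40), (10, 40)],
#       bounding_box=(10, 10, 100, 30),   # x, y, width, height
#       confidence=0.98,
#       region_type='text_block',
#   )
#   region.translated_text = "Hello"
#   print(json.dumps(region.to_dict(), ensure_ascii=False))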

class MangaTranslator:
    """Main class for manga translation pipeline using Google Cloud Vision + API Key"""

    # Global, process-wide registry to make local inpainting init safe across threads
    # Only dictionary operations are locked (microseconds); heavy work happens outside the lock.
    _inpaint_pool_lock = threading.Lock()
    _inpaint_pool = {}  # (method, model_path) -> {'inpainter': obj|None, 'loaded': bool, 'event': threading.Event()}

    # Detector preloading pool for non-singleton bubble detector instances
    _detector_pool_lock = threading.Lock()
    _detector_pool = {}  # (detector_type, model_id_or_path) -> {'spares': list[BubbleDetector]}

    # Bubble detector singleton loading coordination
    _singleton_bd_event = threading.Event()
    _singleton_bd_loading = False

    # SINGLETON PATTERN: Shared model instances across all translators
    _singleton_lock = threading.Lock()
    _singleton_bubble_detector = None
    _singleton_local_inpainter = None
    _singleton_refs = 0  # Reference counter for singleton instances

    # Class-level cancellation flag for all instances
    _global_cancelled = False
    _global_cancel_lock = threading.RLock()

    @classmethod
    def set_global_cancellation(cls, cancelled: bool):
        """Set global cancellation flag for all translator instances"""
        with cls._global_cancel_lock:
            cls._global_cancelled = cancelled

    @classmethod
    def is_globally_cancelled(cls) -> bool:
        """Check if globally cancelled"""
        with cls._global_cancel_lock:
            return cls._global_cancelled

    @classmethod
    def reset_global_flags(cls):
        """Reset global cancellation flags when starting new translation"""
        with cls._global_cancel_lock:
            cls._global_cancelled = False
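
    # Example (sketch): a GUI "Stop" handler can cancel every live translator at
    # once through the class-level flag, then clear it before the next run.
    #
    #   MangaTranslator.set_global_cancellation(True)   # all instances see this
    #   ...                                             # workers drain and exit
    #   MangaTranslator.reset_global_flags()            # arm for a new translation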

    def _return_inpainter_to_pool(self):
        """Return a checked-out inpainter instance back to the pool for reuse."""
        if not hasattr(self, '_checked_out_inpainter') or not hasattr(self, '_inpainter_pool_key'):
            return  # Nothing checked out
        try:
            with MangaTranslator._inpaint_pool_lock:
                key = self._inpainter_pool_key
                rec = MangaTranslator._inpaint_pool.get(key)
                if rec and 'checked_out' in rec:
                    checked_out = rec['checked_out']
                    if self._checked_out_inpainter in checked_out:
                        checked_out.remove(self._checked_out_inpainter)
                        self._log(f"πŸ”„ Returned inpainter to pool ({len(checked_out)}/{len(rec.get('spares', []))} still in use)", "info")
            # Clear the references
            self._checked_out_inpainter = None
            self._inpainter_pool_key = None
        except Exception as e:
            # Non-critical - just log
            try:
                self._log(f"⚠️ Failed to return inpainter to pool: {e}", "debug")
            except Exception:
                pass

    def _return_bubble_detector_to_pool(self):
        """Return a checked-out bubble detector instance back to the pool for reuse."""
        if not hasattr(self, '_checked_out_bubble_detector') or not hasattr(self, '_bubble_detector_pool_key'):
            return  # Nothing checked out
        try:
            with MangaTranslator._detector_pool_lock:
                key = self._bubble_detector_pool_key
                rec = MangaTranslator._detector_pool.get(key)
                if rec and 'checked_out' in rec:
                    checked_out = rec['checked_out']
                    if self._checked_out_bubble_detector in checked_out:
                        checked_out.remove(self._checked_out_bubble_detector)
                        self._log(f"πŸ”„ Returned bubble detector to pool ({len(checked_out)}/{len(rec.get('spares', []))} still in use)", "info")
            # Clear the references
            self._checked_out_bubble_detector = None
            self._bubble_detector_pool_key = None
        except Exception as e:
            # Non-critical - just log
            try:
                self._log(f"⚠️ Failed to return bubble detector to pool: {e}", "debug")
            except Exception:
                pass

    @classmethod
    def cleanup_singletons(cls, force=False):
        """Clean up singleton instances when no longer needed

        Args:
            force: If True, cleanup even if references exist (for app shutdown)
        """
        with cls._singleton_lock:
            if force or cls._singleton_refs == 0:
                # Cleanup singleton bubble detector
                if cls._singleton_bubble_detector is not None:
                    try:
                        if hasattr(cls._singleton_bubble_detector, 'unload'):
                            cls._singleton_bubble_detector.unload(release_shared=True)
                        cls._singleton_bubble_detector = None
                        print("πŸ€– Singleton bubble detector cleaned up")
                    except Exception as e:
                        print(f"Failed to cleanup singleton bubble detector: {e}")

                # Cleanup singleton local inpainter
                if cls._singleton_local_inpainter is not None:
                    try:
                        if hasattr(cls._singleton_local_inpainter, 'unload'):
                            cls._singleton_local_inpainter.unload()
                        cls._singleton_local_inpainter = None
                        print("🎨 Singleton local inpainter cleaned up")
                    except Exception as e:
                        print(f"Failed to cleanup singleton local inpainter: {e}")

                cls._singleton_refs = 0
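
    # Example (sketch): at application shutdown, force-release the shared models
    # even if some translator references are still alive. During normal operation,
    # call cleanup_singletons() without force; it only unloads once _singleton_refs
    # has dropped to zero.
    #
    #   MangaTranslator.cleanup_singletons(force=True)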

    def __init__(self, ocr_config: dict, unified_client, main_gui, log_callback=None):
        """Initialize with OCR configuration and API client from main GUI

        Args:
            ocr_config: Dictionary with OCR provider settings:
                {
                    'provider': 'google' or 'azure',
                    'google_credentials_path': str (if google),
                    'azure_key': str (if azure),
                    'azure_endpoint': str (if azure)
                }
        """
        # CRITICAL: Set thread limits FIRST before any heavy library operations
        # This must happen before cv2, torch, numpy operations
        try:
            parallel_enabled = main_gui.config.get('manga_settings', {}).get('advanced', {}).get('parallel_processing', False)
            if not parallel_enabled:
                # Force single-threaded mode for all computational libraries
                os.environ['OMP_NUM_THREADS'] = '1'
                os.environ['MKL_NUM_THREADS'] = '1'
                os.environ['OPENBLAS_NUM_THREADS'] = '1'
                os.environ['NUMEXPR_NUM_THREADS'] = '1'
                os.environ['VECLIB_MAXIMUM_THREADS'] = '1'
                os.environ['ONNXRUNTIME_NUM_THREADS'] = '1'

                # Set torch and cv2 thread limits if already imported
                try:
                    import torch
                    torch.set_num_threads(1)
                except (ImportError, RuntimeError):
                    pass
                try:
                    cv2.setNumThreads(1)
                except (AttributeError, NameError):
                    pass
        except Exception:
            pass  # Silently fail if config not available

        # Set up logging first
        self.log_callback = log_callback
        self.main_gui = main_gui

        # Set up stdout capture to redirect prints to GUI
        self._setup_stdout_capture()

        # Pass log callback to unified client
        self.client = unified_client
        if hasattr(self.client, 'log_callback'):
            self.client.log_callback = log_callback
        elif hasattr(self.client, 'set_log_callback'):
            self.client.set_log_callback(log_callback)

        self.ocr_config = ocr_config
        self.main_gui = main_gui
        self.log_callback = log_callback
        self.config = main_gui.config
        self.manga_settings = self.config.get('manga_settings', {})

        # Concise logging flag from Advanced settings
        try:
            self.concise_logs = bool(self.manga_settings.get('advanced', {}).get('concise_logs', True))
        except Exception:
            self.concise_logs = True

        # Ensure all GUI environment variables are set
        self._sync_environment_variables()

        # Initialize attributes
        self.current_image = None
        self.current_mask = None
        self.text_regions = []
        self.translated_regions = []
        self.final_image = None

        # Initialize inpainter attributes
        self.local_inpainter = None
        self.hybrid_inpainter = None
        self.inpainter = None

        # Initialize bubble detector (will check singleton mode later)
        self.bubble_detector = None

        # Default: do NOT use singleton models unless explicitly enabled
        self.use_singleton_models = self.manga_settings.get('advanced', {}).get('use_singleton_models', False)
        # For bubble detector specifically, prefer a singleton so it stays resident in RAM
        self.use_singleton_bubble_detector = self.manga_settings.get('advanced', {}).get('use_singleton_bubble_detector', True)

        # Processing flags
        self.is_processing = False
        self.cancel_requested = False
        self.stop_flag = None  # Initialize stop_flag attribute

        # Initialize batch mode attributes (API parallelism) from environment, not GUI local toggles
        # BATCH_TRANSLATION controls whether UnifiedClient allows concurrent API calls across threads.
        try:
            self.batch_mode = os.getenv('BATCH_TRANSLATION', '0') == '1'
        except Exception:
            self.batch_mode = False

        # OCR ROI cache - PER IMAGE ONLY (cleared aggressively to prevent text leakage)
        # CRITICAL: This cache MUST be cleared before every new image to prevent text contamination
        # THREAD-SAFE: Each translator instance has its own cache (safe for parallel panel translation)
        self.ocr_roi_cache = {}
        self._current_image_hash = None  # Track current image to force cache invalidation
        # Thread-safe lock for cache operations (critical for parallel panel translation)
        import threading
        self._cache_lock = threading.Lock()

        try:
            self.batch_size = int(os.getenv('BATCH_SIZE', '1'))
        except Exception:
            # Fallback to GUI entry if present; otherwise default to 1
            try:
                self.batch_size = int(main_gui.batch_size_var.get()) if hasattr(main_gui, 'batch_size_var') else 1
            except Exception:
                self.batch_size = 1

        self.batch_current = 1
        if self.batch_mode:
            self._log(f"πŸ“¦ BATCH MODE: Processing {self.batch_size} images")
            self._log(f"⏱️ Keeping API delay for rate limit protection")
            # NOTE: We NO LONGER preload models here!
            # Models should only be loaded when actually needed
            # This was causing unnecessary RAM usage
            ocr_settings = self.manga_settings.get('ocr', {})
            bubble_detection_enabled = ocr_settings.get('bubble_detection_enabled', False)
            if bubble_detection_enabled:
                self._log("πŸ“¦ BATCH MODE: Bubble detection will be loaded on first use")
            else:
                self._log("πŸ“¦ BATCH MODE: Bubble detection is disabled")
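
        # Example (sketch): batch mode is driven by environment variables rather
        # than GUI state, so a caller can opt in before constructing the translator.
        #
        #   os.environ['BATCH_TRANSLATION'] = '1'
        #   os.environ['BATCH_SIZE'] = '4'
        #   translator = MangaTranslator(ocr_config, client, main_gui)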

        # Cache for processed images - DEPRECATED/UNUSED (kept for backward compatibility)
        # DO NOT USE THIS FOR TEXT DATA - IT CAN LEAK BETWEEN IMAGES
        self.cache = {}

        # Determine OCR provider
        self.ocr_provider = ocr_config.get('provider', 'google')

        if self.ocr_provider == 'google':
            if not GOOGLE_CLOUD_VISION_AVAILABLE:
                raise ImportError("Google Cloud Vision required. Install with: pip install google-cloud-vision")
            google_path = ocr_config.get('google_credentials_path')
            if not google_path:
                raise ValueError("Google credentials path required")
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = google_path
            self.vision_client = vision.ImageAnnotatorClient()
        elif self.ocr_provider == 'azure':
            # Import Azure libraries
            try:
                from azure.cognitiveservices.vision.computervision import ComputerVisionClient
                from msrest.authentication import CognitiveServicesCredentials
                self.azure_cv = ComputerVisionClient
                self.azure_creds = CognitiveServicesCredentials
            except ImportError:
                raise ImportError("Azure Computer Vision required. Install with: pip install azure-cognitiveservices-vision-computervision")
            azure_key = ocr_config.get('azure_key')
            azure_endpoint = ocr_config.get('azure_endpoint')
            if not azure_key or not azure_endpoint:
                raise ValueError("Azure key and endpoint required")
            self.vision_client = self.azure_cv(azure_endpoint, self.azure_creds(azure_key))
        else:
            # New OCR providers handled by OCR manager
            try:
                from ocr_manager import OCRManager
                self.ocr_manager = OCRManager(log_callback=log_callback)
                print(f"Initialized OCR Manager for {self.ocr_provider}")
                # Initialize OCR manager with stop flag awareness
                if hasattr(self.ocr_manager, 'reset_stop_flags'):
                    self.ocr_manager.reset_stop_flags()
            except Exception as _e:
                self.ocr_manager = None
                self._log(f"Failed to initialize OCRManager: {str(_e)}", "error")

        self.client = unified_client
        self.main_gui = main_gui
        self.log_callback = log_callback

        # Prefer allocator that can return memory to OS (effective before torch loads)
        try:
            os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
            os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
        except Exception:
            pass

        # Get all settings from GUI
        self.api_delay = float(self.main_gui.delay_entry.get() if hasattr(main_gui, 'delay_entry') else 2.0)
        # Propagate API delay to unified_api_client via env var so its internal pacing/logging matches GUI
        try:
            os.environ["SEND_INTERVAL_SECONDS"] = str(self.api_delay)
        except Exception:
            pass

        self.temperature = float(main_gui.trans_temp.get() if hasattr(main_gui, 'trans_temp') else 0.3)
        self.max_tokens = int(main_gui.max_output_tokens if hasattr(main_gui, 'max_output_tokens') else 4000)

        if hasattr(main_gui, 'token_limit_disabled') and main_gui.token_limit_disabled:
            self.input_token_limit = None  # None means no limit
            self._log("πŸ“Š Input token limit: DISABLED (unlimited)")
        else:
            token_limit_value = main_gui.token_limit_entry.get() if hasattr(main_gui, 'token_limit_entry') else '120000'
            if token_limit_value and token_limit_value.strip().isdigit():
                self.input_token_limit = int(token_limit_value.strip())
            else:
                self.input_token_limit = 120000  # Default
            self._log(f"πŸ“Š Input token limit: {self.input_token_limit} tokens")

        # Get contextual settings from GUI
        self.contextual_enabled = main_gui.contextual_var.get() if hasattr(main_gui, 'contextual_var') else False
        self.translation_history_limit = int(main_gui.trans_history.get() if hasattr(main_gui, 'trans_history') else 3)
        self.rolling_history_enabled = main_gui.translation_history_rolling_var.get() if hasattr(main_gui, 'translation_history_rolling_var') else False

        # Initialize HistoryManager placeholder
        self.history_manager = None
        self.history_manager_initialized = False
        self.history_output_dir = None

        # Full page context translation settings
        self.full_page_context_enabled = True

        # Default prompt for full page context mode
        self.full_page_context_prompt = (
            "You will receive multiple text segments from a manga page, each prefixed with an index like [0], [1], etc. "
            "Translate each segment considering the context of all segments together. "
            "Maintain consistency in character names, tone, and style across all translations.\n\n"
            "CRITICAL: Return your response as a valid JSON object where each key includes BOTH the index prefix "
            "AND the original text EXACTLY as provided (e.g., '[0] こんにけは'), and each value is the translation.\n"
            "This is essential for correct mapping - do not modify or omit the index prefixes!\n\n"
            "Make sure to properly escape any special characters in the JSON:\n"
            "- Use \\n for newlines\n"
            "- Use \\\" for quotes\n"
            "- Use \\\\ for backslashes\n\n"
            "Example:\n"
            '{\n'
            '  "[0] こんにけは": "Hello",\n'
            '  "[1] γ‚γ‚ŠγŒγ¨γ†": "Thank you",\n'
            '  "[2] γ•γ‚ˆγ†γͺら": "Goodbye"\n'
            '}\n\n'
            'REMEMBER: Keep the [index] prefix in each JSON key exactly as shown in the input!'
        )
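
        # Example (sketch): parsing the response this prompt asks for. The model is
        # instructed to echo each '[i] original' key verbatim, so the index can be
        # recovered from the key prefix; `response_text` and `regions` are
        # hypothetical names here.
        #
        #   mapping = json.loads(response_text)
        #   for key, translation in mapping.items():
        #       idx = int(key.split(']')[0].lstrip('['))   # "[0] こんにけは" -> 0
        #       regions[idx].translated_text = translation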
" "Maintain consistency in character names, tone, and style across all translations.\n\n" "CRITICAL: Return your response as a valid JSON object where each key includes BOTH the index prefix " "AND the original text EXACTLY as provided (e.g., '[0] こんにけは'), and each value is the translation.\n" "This is essential for correct mapping - do not modify or omit the index prefixes!\n\n" "Make sure to properly escape any special characters in the JSON:\n" "- Use \\n for newlines\n" "- Use \\\" for quotes\n" "- Use \\\\ for backslashes\n\n" "Example:\n" '{\n' ' "[0] こんにけは": "Hello",\n' ' "[1] γ‚γ‚ŠγŒγ¨γ†": "Thank you",\n' ' "[2] γ•γ‚ˆγ†γͺら": "Goodbye"\n' '}\n\n' 'REMEMBER: Keep the [index] prefix in each JSON key exactly as shown in the input!' ) # Visual context setting (for non-vision model support) self.visual_context_enabled = main_gui.config.get('manga_visual_context_enabled', True) # Store context for contextual translation (backwards compatibility) self.translation_context = [] # Font settings for text rendering self.font_path = self._find_font() self.min_font_size = 10 self.max_font_size = 60 try: _ms = main_gui.config.get('manga_settings', {}) or {} _rend = _ms.get('rendering', {}) or {} _font = _ms.get('font_sizing', {}) or {} self.min_readable_size = int(_rend.get('auto_min_size', _font.get('min_size', 16))) except Exception: self.min_readable_size = int(main_gui.config.get('manga_min_readable_size', 16)) self.max_font_size_limit = main_gui.config.get('manga_max_font_size', 24) self.strict_text_wrapping = main_gui.config.get('manga_strict_text_wrapping', False) # Enhanced text rendering settings - Load from config if available config = main_gui.config if hasattr(main_gui, 'config') else {} self.text_bg_opacity = config.get('manga_bg_opacity', 255) # 0-255, default fully opaque self.text_bg_style = config.get('manga_bg_style', 'box') # 'box', 'circle', 'wrap' self.text_bg_reduction = config.get('manga_bg_reduction', 1.0) # Size reduction factor (0.5-1.0) self.constrain_to_bubble = config.get('manga_constrain_to_bubble', True) # Text color from config manga_text_color = config.get('manga_text_color', [0, 0, 0]) self.text_color = tuple(manga_text_color) # Convert list to tuple self.outline_color = (255, 255, 255) # White outline self.outline_width_factor = 15 # Divider for font_size to get outline width self.selected_font_style = config.get('manga_font_path', None) # Will store selected font path self.custom_font_size = config.get('manga_font_size', None) if config.get('manga_font_size', 0) > 0 else None # Text shadow settings from config self.shadow_enabled = config.get('manga_shadow_enabled', False) manga_shadow_color = config.get('manga_shadow_color', [128, 128, 128]) self.shadow_color = tuple(manga_shadow_color) # Convert list to tuple self.shadow_offset_x = config.get('manga_shadow_offset_x', 2) self.shadow_offset_y = config.get('manga_shadow_offset_y', 2) self.shadow_blur = config.get('manga_shadow_blur', 0) # 0 = sharp shadow, higher = more blur self.force_caps_lock = config.get('manga_force_caps_lock', False) self.skip_inpainting = config.get('manga_skip_inpainting', True) # Font size multiplier mode - Load from config self.font_size_mode = config.get('manga_font_size_mode', 'fixed') # 'fixed' or 'multiplier' self.font_size_multiplier = config.get('manga_font_size_multiplier', 1.0) # Default multiplierr #inpainting quality self.inpaint_quality = config.get('manga_inpaint_quality', 'high') # 'high' or 'fast' self._log("\nπŸ”§ MangaTranslator initialized with settings:") 
self._log(f" API Delay: {self.api_delay}s") self._log(f" Temperature: {self.temperature}") self._log(f" Max Output Tokens: {self.max_tokens}") self._log(f" Input Token Limit: {'DISABLED' if self.input_token_limit is None else self.input_token_limit}") self._log(f" Contextual Translation: {'ENABLED' if self.contextual_enabled else 'DISABLED'}") self._log(f" Translation History Limit: {self.translation_history_limit}") self._log(f" Rolling History: {'ENABLED' if self.rolling_history_enabled else 'DISABLED'}") self._log(f" Font Path: {self.font_path or 'Default'}") self._log(f" Text Rendering: BG {self.text_bg_style}, Opacity {int(self.text_bg_opacity/255*100)}%") self._log(f" Shadow: {'ENABLED' if self.shadow_enabled else 'DISABLED'}\n") self.manga_settings = config.get('manga_settings', {}) # Initialize local inpainter if configured (respects singleton mode) if self.manga_settings.get('inpainting', {}).get('method') == 'local': if self.use_singleton_models: self._initialize_singleton_local_inpainter() else: self._initialize_local_inpainter() # advanced settings self.debug_mode = self.manga_settings.get('advanced', {}).get('debug_mode', False) self.save_intermediate = self.manga_settings.get('advanced', {}).get('save_intermediate', False) self.parallel_processing = self.manga_settings.get('advanced', {}).get('parallel_processing', True) self.max_workers = self.manga_settings.get('advanced', {}).get('max_workers', 2) # Deep cleanup control: if True, release models after every image (aggressive) self.force_deep_cleanup_each_image = self.manga_settings.get('advanced', {}).get('force_deep_cleanup_each_image', False) # RAM cap adv = self.manga_settings.get('advanced', {}) self.ram_cap_enabled = bool(adv.get('ram_cap_enabled', False)) self.ram_cap_mb = int(adv.get('ram_cap_mb', 0) or 0) self.ram_cap_mode = str(adv.get('ram_cap_mode', 'soft')) self.ram_check_interval_sec = float(adv.get('ram_check_interval_sec', 1.0)) self.ram_recovery_margin_mb = int(adv.get('ram_recovery_margin_mb', 256)) self._mem_over_cap = False self._mem_stop_event = threading.Event() self._mem_thread = None # Advanced RAM gate tuning self.ram_gate_timeout_sec = float(adv.get('ram_gate_timeout_sec', 10.0)) self.ram_min_floor_over_baseline_mb = int(adv.get('ram_min_floor_over_baseline_mb', 128)) # Measure baseline at init try: self.ram_baseline_mb = self._get_process_rss_mb() or 0 except Exception: self.ram_baseline_mb = 0 if self.ram_cap_enabled and self.ram_cap_mb > 0: self._init_ram_cap() def set_stop_flag(self, stop_flag): """Set the stop flag for checking interruptions""" self.stop_flag = stop_flag self.cancel_requested = False def reset_stop_flags(self): """Reset all stop flags when starting new translation""" self.cancel_requested = False self.is_processing = False # Reset global flags self.reset_global_flags() self._log("πŸ”„ Stop flags reset for new translation", "debug") def _check_stop(self): """Check if stop has been requested using multiple sources""" # Check global cancellation first if self.is_globally_cancelled(): self.cancel_requested = True return True # Check local stop flag (only if it exists and is set) if hasattr(self, 'stop_flag') and self.stop_flag and self.stop_flag.is_set(): self.cancel_requested = True return True # Check processing flag if hasattr(self, 'cancel_requested') and self.cancel_requested: return True return False def _setup_stdout_capture(self): """Set up stdout capture to redirect print statements to GUI""" import sys import builtins # Store original print function self._original_print = 

    def _setup_stdout_capture(self):
        """Set up stdout capture to redirect print statements to GUI"""
        import sys
        import builtins

        # Store original print function
        self._original_print = builtins.print

        # Create custom print function
        def gui_print(*args, **kwargs):
            """Custom print that redirects to GUI"""
            # Convert args to string
            message = ' '.join(str(arg) for arg in args)

            # Check if this is one of the specific messages we want to capture
            if any(marker in message for marker in ['πŸ”', 'βœ…', '⏳', 'INFO:', 'ERROR:', 'WARNING:']):
                if self.log_callback:
                    # Clean up the message
                    message = message.strip()

                    # Determine level
                    level = 'info'
                    if 'ERROR:' in message or '❌' in message:
                        level = 'error'
                    elif 'WARNING:' in message or '⚠️' in message:
                        level = 'warning'

                    # Remove prefixes like "INFO:" if present
                    for prefix in ['INFO:', 'ERROR:', 'WARNING:', 'DEBUG:']:
                        message = message.replace(prefix, '').strip()

                    # Send to GUI
                    self.log_callback(message, level)
                    return  # Don't print to console

            # For other messages, use original print
            self._original_print(*args, **kwargs)

        # Replace the built-in print
        builtins.print = gui_print

    def __del__(self):
        """Restore original print when MangaTranslator is destroyed"""
        if hasattr(self, '_original_print'):
            import builtins
            builtins.print = self._original_print
        # Best-effort shutdown in case caller forgot to call shutdown()
        try:
            self.shutdown()
        except Exception:
            pass

    def _cleanup_thread_locals(self):
        """Aggressively release thread-local heavy objects (onnx sessions, detectors)."""
        try:
            if hasattr(self, '_thread_local'):
                tl = self._thread_local
                # Release thread-local inpainters
                if hasattr(tl, 'local_inpainters') and isinstance(tl.local_inpainters, dict):
                    try:
                        for inp in list(tl.local_inpainters.values()):
                            try:
                                if hasattr(inp, 'unload'):
                                    inp.unload()
                            except Exception:
                                pass
                    finally:
                        try:
                            tl.local_inpainters.clear()
                        except Exception:
                            pass
                # Return thread-local bubble detector to pool (DO NOT unload)
                if hasattr(tl, 'bubble_detector') and tl.bubble_detector is not None:
                    try:
                        # Instead of unloading, return to pool for reuse
                        self._return_bubble_detector_to_pool()
                        # Keep thread-local reference intact for reuse in next image
                        # Only clear if we're truly shutting down the thread
                    except Exception:
                        pass
        except Exception:
            # Best-effort cleanup only
            pass

    def shutdown(self):
        """Fully release resources for MangaTranslator (models, detectors, torch caches, threads)."""
        try:
            # Decrement singleton reference counter if using singleton mode
            if hasattr(self, 'use_singleton_models') and self.use_singleton_models:
                with MangaTranslator._singleton_lock:
                    MangaTranslator._singleton_refs = max(0, MangaTranslator._singleton_refs - 1)
                    self._log(f"Singleton refs: {MangaTranslator._singleton_refs}", "debug")
            # Stop memory watchdog thread if running
            if hasattr(self, '_mem_stop_event') and getattr(self, '_mem_stop_event', None) is not None:
                try:
                    self._mem_stop_event.set()
                except Exception:
                    pass
            # Perform deep cleanup, then try to teardown torch
            try:
                self._deep_cleanup_models()
            except Exception:
                pass
            try:
                self._force_torch_teardown()
            except Exception:
                pass
            try:
                self._huggingface_teardown()
            except Exception:
                pass
            try:
                self._trim_working_set()
            except Exception:
                pass
            # Null out heavy references
            for attr in [
                'client', 'vision_client', 'local_inpainter', 'hybrid_inpainter', 'inpainter',
                'bubble_detector', 'ocr_manager', 'history_manager',
                'current_image', 'current_mask', 'text_regions', 'translated_regions', 'final_image'
            ]:
                try:
                    if hasattr(self, attr):
                        setattr(self, attr, None)
                except Exception:
                    pass
        except Exception as e:
            try:
                self._log(f"⚠️ shutdown() encountered: {e}", "warning")
            except Exception:
                pass
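
    # Example (sketch): callers are expected to release resources explicitly;
    # __del__ only runs shutdown() as a best-effort fallback.
    #
    #   translator = MangaTranslator(ocr_config, client, main_gui)
    #   try:
    #       ...  # translate pages
    #   finally:
    #       translator.shutdown()   # unload models, stop watchdog, drop references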

    def _sync_environment_variables(self):
        """Sync all GUI environment variables to ensure manga translation respects GUI settings

        This ensures settings like RETRY_TRUNCATED, THINKING_BUDGET, etc. are properly set
        """
        try:
            # Get config from main_gui if available
            if not hasattr(self, 'main_gui') or not self.main_gui:
                return

            # Use the main_gui's set_all_environment_variables method if available
            if hasattr(self.main_gui, 'set_all_environment_variables'):
                self.main_gui.set_all_environment_variables()
            else:
                # Fallback: manually set key variables
                config = self.main_gui.config if hasattr(self.main_gui, 'config') else {}

                # Thinking settings (most important for speed)
                thinking_enabled = config.get('enable_gemini_thinking', True)
                thinking_budget = config.get('gemini_thinking_budget', -1)
                # CRITICAL FIX: If thinking is disabled, force budget to 0 regardless of config value
                if not thinking_enabled:
                    thinking_budget = 0
                os.environ['ENABLE_GEMINI_THINKING'] = '1' if thinking_enabled else '0'
                os.environ['GEMINI_THINKING_BUDGET'] = str(thinking_budget)
                os.environ['THINKING_BUDGET'] = str(thinking_budget)  # Also set for unified_api_client

                # Retry settings
                retry_truncated = config.get('retry_truncated', False)
                max_retry_tokens = config.get('max_retry_tokens', 16384)
                max_retries = config.get('max_retries', 7)
                os.environ['RETRY_TRUNCATED'] = '1' if retry_truncated else '0'
                os.environ['MAX_RETRY_TOKENS'] = str(max_retry_tokens)
                os.environ['MAX_RETRIES'] = str(max_retries)

                # Safety settings
                disable_gemini_safety = config.get('disable_gemini_safety', False)
                os.environ['DISABLE_GEMINI_SAFETY'] = '1' if disable_gemini_safety else '0'
        except Exception as e:
            self._log(f"⚠️ Failed to sync environment variables: {e}", "warning")

    def _force_torch_teardown(self):
        """Best-effort teardown of PyTorch CUDA context and caches to drop closer to baseline.

        Safe to call even if CUDA is not available.
        """
        try:
            import torch, os, gc
            # CPU: free cached tensors
            try:
                gc.collect()
            except Exception:
                pass
            # CUDA path
            if hasattr(torch, 'cuda') and torch.cuda.is_available():
                try:
                    torch.cuda.synchronize()
                except Exception:
                    pass
                try:
                    torch.cuda.empty_cache()
                except Exception:
                    pass
                try:
                    torch.cuda.ipc_collect()
                except Exception:
                    pass
                # Try to clear cuBLAS workspaces (not always available)
                try:
                    getattr(torch._C, "_cuda_clearCublasWorkspaces")()
                except Exception:
                    pass
                # Optional hard reset via CuPy if present
                reset_done = False
                try:
                    import cupy
                    try:
                        cupy.cuda.runtime.deviceReset()
                        reset_done = True
                        self._log("CUDA deviceReset via CuPy", "debug")
                    except Exception:
                        pass
                except Exception:
                    pass
                # Fallback: attempt to call cudaDeviceReset from cudart on Windows
                if os.name == 'nt' and not reset_done:
                    try:
                        import ctypes
                        candidates = [
                            "cudart64_12.dll", "cudart64_120.dll", "cudart64_110.dll",
                            "cudart64_102.dll", "cudart64_101.dll", "cudart64_100.dll",
                            "cudart64_90.dll"
                        ]
                        for name in candidates:
                            try:
                                dll = ctypes.CDLL(name)
                                dll.cudaDeviceReset.restype = ctypes.c_int
                                rc = dll.cudaDeviceReset()
                                self._log(f"cudaDeviceReset via {name} rc={rc}", "debug")
                                reset_done = True
                                break
                            except Exception:
                                continue
                    except Exception:
                        pass
        except Exception:
            pass

    def _huggingface_teardown(self):
        """Best-effort teardown of HuggingFace/transformers/tokenizers state.

        - Clears on-disk model cache for known repos (via _clear_hf_cache)
        - Optionally purges relevant modules from sys.modules (AGGRESSIVE_HF_UNLOAD=1)
        """
        try:
            import os, sys, gc
            # Clear disk cache for detectors (and any default repo) to avoid growth across runs
            try:
                self._clear_hf_cache()
            except Exception:
                pass
            # Optional aggressive purge of modules to free Python-level caches
            if os.getenv('AGGRESSIVE_HF_UNLOAD', '1') == '1':
                prefixes = (
                    'transformers', 'huggingface_hub', 'tokenizers', 'safetensors', 'accelerate',
                )
                to_purge = [m for m in list(sys.modules.keys()) if m.startswith(prefixes)]
                for m in to_purge:
                    try:
                        del sys.modules[m]
                    except Exception:
                        pass
                gc.collect()
        except Exception:
            pass

    def _deep_cleanup_models(self):
        """Release ALL model references and caches to reduce RAM after translation.

        This is the COMPREHENSIVE cleanup that ensures all models are unloaded from RAM.
        """
        self._log("🧹 Starting comprehensive model cleanup to free RAM...", "info")
        try:
            # ========== 1. CLEANUP OCR MODELS ==========
            try:
                if hasattr(self, 'ocr_manager'):
                    ocr_manager = getattr(self, 'ocr_manager', None)
                    if ocr_manager:
                        self._log("   Cleaning up OCR models...", "debug")
                        # Clear all loaded OCR providers
                        if hasattr(ocr_manager, 'providers'):
                            for provider_name, provider in ocr_manager.providers.items():
                                try:
                                    # Unload the model
                                    if hasattr(provider, 'model'):
                                        provider.model = None
                                    if hasattr(provider, 'processor'):
                                        provider.processor = None
                                    if hasattr(provider, 'tokenizer'):
                                        provider.tokenizer = None
                                    if hasattr(provider, 'reader'):
                                        provider.reader = None
                                    if hasattr(provider, 'is_loaded'):
                                        provider.is_loaded = False
                                    self._log(f"   βœ“ Unloaded {provider_name} OCR provider", "debug")
                                except Exception as e:
                                    self._log(f"   Warning: Failed to unload {provider_name}: {e}", "debug")
                        # Clear the entire OCR manager
                        self.ocr_manager = None
                        self._log("   βœ“ OCR models cleaned up", "debug")
            except Exception as e:
                self._log(f"   Warning: OCR cleanup failed: {e}", "debug")

            # ========== 2. CLEANUP BUBBLE DETECTOR (YOLO/RT-DETR) ==========
            try:
                # Instance-level bubble detector
                if hasattr(self, 'bubble_detector') and self.bubble_detector is not None:
                    # Check if using singleton mode - don't unload shared instance
                    if (getattr(self, 'use_singleton_bubble_detector', False)) or (hasattr(self, 'use_singleton_models') and self.use_singleton_models):
                        self._log("   Skipping bubble detector cleanup (singleton mode)", "debug")
                        # Just clear our reference, don't unload the shared instance
                        self.bubble_detector = None
                    else:
                        self._log("   Cleaning up bubble detector (YOLO/RT-DETR)...", "debug")
                        bd = self.bubble_detector
                        try:
                            if hasattr(bd, 'unload'):
                                bd.unload(release_shared=True)  # This unloads YOLO and RT-DETR models
                                self._log("   βœ“ Called bubble detector unload", "debug")
                        except Exception as e:
                            self._log(f"   Warning: Bubble detector unload failed: {e}", "debug")
                        self.bubble_detector = None
                        self._log("   βœ“ Bubble detector cleaned up", "debug")

                # Also clean class-level shared RT-DETR models unless keeping singleton warm
                if not getattr(self, 'use_singleton_bubble_detector', False):
                    try:
                        from bubble_detector import BubbleDetector
                        if hasattr(BubbleDetector, '_rtdetr_shared_model'):
                            BubbleDetector._rtdetr_shared_model = None
                        if hasattr(BubbleDetector, '_rtdetr_shared_processor'):
                            BubbleDetector._rtdetr_shared_processor = None
                        if hasattr(BubbleDetector, '_rtdetr_loaded'):
                            BubbleDetector._rtdetr_loaded = False
                        self._log("   βœ“ Cleared shared RT-DETR cache", "debug")
                    except Exception:
                        pass

                # Clear preloaded detector spares
                try:
                    with MangaTranslator._detector_pool_lock:
                        for rec in MangaTranslator._detector_pool.values():
                            try:
                                rec['spares'] = []
                            except Exception:
                                pass
                except Exception:
                    pass
            except Exception as e:
                self._log(f"   Warning: Bubble detector cleanup failed: {e}", "debug")

            # ========== 3. CLEANUP INPAINTERS ==========
            try:
                self._log("   Cleaning up inpainter models...", "debug")

                # Instance-level inpainter
                if hasattr(self, 'local_inpainter') and self.local_inpainter is not None:
                    # Check if using singleton mode - don't unload shared instance
                    if hasattr(self, 'use_singleton_models') and self.use_singleton_models:
                        self._log("   Skipping local inpainter cleanup (singleton mode)", "debug")
                        # Just clear our reference, don't unload the shared instance
                        self.local_inpainter = None
                    else:
                        try:
                            if hasattr(self.local_inpainter, 'unload'):
                                self.local_inpainter.unload()
                                self._log("   βœ“ Unloaded local inpainter", "debug")
                        except Exception:
                            pass
                        self.local_inpainter = None

                # Hybrid inpainter
                if hasattr(self, 'hybrid_inpainter') and self.hybrid_inpainter is not None:
                    try:
                        if hasattr(self.hybrid_inpainter, 'unload'):
                            self.hybrid_inpainter.unload()
                            self._log("   βœ“ Unloaded hybrid inpainter", "debug")
                    except Exception:
                        pass
                    self.hybrid_inpainter = None

                # Generic inpainter reference
                if hasattr(self, 'inpainter') and self.inpainter is not None:
                    try:
                        if hasattr(self.inpainter, 'unload'):
                            self.inpainter.unload()
                            self._log("   βœ“ Unloaded inpainter", "debug")
                    except Exception:
                        pass
                    self.inpainter = None

                # Release any shared inpainters in the global pool
                with MangaTranslator._inpaint_pool_lock:
                    for key, rec in list(MangaTranslator._inpaint_pool.items()):
                        try:
                            inp = rec.get('inpainter') if isinstance(rec, dict) else None
                            if inp is not None:
                                try:
                                    if hasattr(inp, 'unload'):
                                        inp.unload()
                                    self._log(f"   βœ“ Unloaded pooled inpainter: {key}", "debug")
                                except Exception:
                                    pass
                            # Drop any spare instances as well
                            try:
                                for spare in rec.get('spares') or []:
                                    try:
                                        if hasattr(spare, 'unload'):
                                            spare.unload()
                                    except Exception:
                                        pass
                                rec['spares'] = []
                            except Exception:
                                pass
                        except Exception:
                            pass
                    MangaTranslator._inpaint_pool.clear()
                    self._log("   βœ“ Cleared inpainter pool", "debug")

                # Release process-wide shared inpainter
                if hasattr(MangaTranslator, '_shared_local_inpainter'):
                    shared = getattr(MangaTranslator, '_shared_local_inpainter', None)
                    if shared is not None:
                        try:
                            if hasattr(shared, 'unload'):
                                shared.unload()
                            self._log("   βœ“ Unloaded shared inpainter", "debug")
                        except Exception:
                            pass
                    setattr(MangaTranslator, '_shared_local_inpainter', None)

                self._log("   βœ“ Inpainter models cleaned up", "debug")
            except Exception as e:
                self._log(f"   Warning: Inpainter cleanup failed: {e}", "debug")

            # ========== 4. CLEANUP THREAD-LOCAL MODELS ==========
            try:
                if hasattr(self, '_thread_local') and self._thread_local is not None:
                    self._log("   Cleaning up thread-local models...", "debug")
                    tl = self._thread_local
                    # Thread-local inpainters
                    if hasattr(tl, 'local_inpainters') and isinstance(tl.local_inpainters, dict):
                        for key, inp in list(tl.local_inpainters.items()):
                            try:
                                if hasattr(inp, 'unload'):
                                    inp.unload()
                                    self._log(f"   βœ“ Unloaded thread-local inpainter: {key}", "debug")
                            except Exception:
                                pass
                        tl.local_inpainters.clear()
                    # Thread-local bubble detector
                    if hasattr(tl, 'bubble_detector') and tl.bubble_detector is not None:
                        try:
                            if hasattr(tl.bubble_detector, 'unload'):
                                tl.bubble_detector.unload(release_shared=False)
                                self._log("   βœ“ Unloaded thread-local bubble detector", "debug")
                        except Exception:
                            pass
                        tl.bubble_detector = None
                    self._log("   βœ“ Thread-local models cleaned up", "debug")
            except Exception as e:
                self._log(f"   Warning: Thread-local cleanup failed: {e}", "debug")

            # ========== 5. CLEAR PYTORCH/CUDA CACHE ==========
            try:
                import torch
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize()
                    self._log("   βœ“ Cleared CUDA cache", "debug")
            except Exception:
                pass

            # ========== 6. FORCE GARBAGE COLLECTION ==========
            try:
                import gc
                gc.collect()
                # Multiple passes for stubborn references
                gc.collect()
                gc.collect()
                self._log("   βœ“ Forced garbage collection", "debug")
            except Exception:
                pass

            self._log("βœ… Model cleanup complete - RAM should be freed", "info")
        except Exception as e:
            # Never raise from deep cleanup
            self._log(f"⚠️ Model cleanup encountered error: {e}", "warning")
            pass

    def _clear_hf_cache(self, repo_id: str = None):
        """Best-effort: clear Hugging Face cache for a specific repo (RT-DETR by default).

        This targets disk cache; it won’t directly reduce RAM but helps avoid growth across runs.
        """
        try:
            # Determine repo_id from BubbleDetector if not provided
            if repo_id is None:
                try:
                    import bubble_detector as _bdmod
                    BD = getattr(_bdmod, 'BubbleDetector', None)
                    if BD is not None and hasattr(BD, '_rtdetr_repo_id'):
                        repo_id = getattr(BD, '_rtdetr_repo_id') or 'ogkalu/comic-text-and-bubble-detector'
                    else:
                        repo_id = 'ogkalu/comic-text-and-bubble-detector'
                except Exception:
                    repo_id = 'ogkalu/comic-text-and-bubble-detector'

            # Try to use huggingface_hub to delete just the matching repo cache
            try:
                from huggingface_hub import scan_cache_dir
                info = scan_cache_dir()
                repos = getattr(info, 'repos', [])
                to_delete = []
                for repo in repos:
                    rid = getattr(repo, 'repo_id', None) or getattr(repo, 'id', None)
                    if rid == repo_id:
                        to_delete.append(repo)
                if to_delete:
                    # Prefer the high-level deletion API if present
                    if hasattr(info, 'delete_repos'):
                        info.delete_repos(to_delete)
                    else:
                        import shutil
                        for repo in to_delete:
                            repo_dir = getattr(repo, 'repo_path', None) or getattr(repo, 'repo_dir', None)
                            if repo_dir and os.path.exists(repo_dir):
                                shutil.rmtree(repo_dir, ignore_errors=True)
            except Exception:
                # Fallback: try removing default HF cache dir for this repo pattern
                try:
                    from pathlib import Path
                    hf_home = os.environ.get('HF_HOME')
                    if hf_home:
                        base = Path(hf_home)
                    else:
                        base = Path.home() / '.cache' / 'huggingface' / 'hub'
                    # Repo cache dirs are named like models--{org}--{name}
                    safe_name = repo_id.replace('/', '--')
                    candidates = list(base.glob(f'models--{safe_name}*'))
                    import shutil
                    for c in candidates:
                        shutil.rmtree(str(c), ignore_errors=True)
                except Exception:
                    pass
        except Exception:
            # Best-effort only
            pass

    def _trim_working_set(self):
        """Release freed memory back to the OS where possible.

        - On Windows: use EmptyWorkingSet on current process
        - On Linux: attempt malloc_trim(0)
        - On macOS: no direct API; rely on GC
        """
        import sys
        import platform
        try:
            system = platform.system()
            if system == 'Windows':
                import ctypes
                psapi = ctypes.windll.psapi
                kernel32 = ctypes.windll.kernel32
                h_process = kernel32.GetCurrentProcess()
                psapi.EmptyWorkingSet(h_process)
            elif system == 'Linux':
                import ctypes
                libc = ctypes.CDLL('libc.so.6')
                try:
                    libc.malloc_trim(0)
                except Exception:
                    pass
        except Exception:
            pass

    def _get_process_rss_mb(self) -> int:
        """Return current RSS in MB (cross-platform best-effort)."""
        try:
            import psutil, os as _os
            return int(psutil.Process(_os.getpid()).memory_info().rss / (1024*1024))
        except Exception:
            # Windows fallback
            try:
                import ctypes, os as _os

                class PROCESS_MEMORY_COUNTERS(ctypes.Structure):
                    _fields_ = [
                        ("cb", ctypes.c_uint),
                        ("PageFaultCount", ctypes.c_uint),
                        ("PeakWorkingSetSize", ctypes.c_size_t),
                        ("WorkingSetSize", ctypes.c_size_t),
                        ("QuotaPeakPagedPoolUsage", ctypes.c_size_t),
                        ("QuotaPagedPoolUsage", ctypes.c_size_t),
                        ("QuotaPeakNonPagedPoolUsage", ctypes.c_size_t),
                        ("QuotaNonPagedPoolUsage", ctypes.c_size_t),
                        ("PagefileUsage", ctypes.c_size_t),
                        ("PeakPagefileUsage", ctypes.c_size_t),
                    ]

                GetCurrentProcess = ctypes.windll.kernel32.GetCurrentProcess
                GetProcessMemoryInfo = ctypes.windll.psapi.GetProcessMemoryInfo
                counters = PROCESS_MEMORY_COUNTERS()
                counters.cb = ctypes.sizeof(PROCESS_MEMORY_COUNTERS)
                GetProcessMemoryInfo(GetCurrentProcess(), ctypes.byref(counters), counters.cb)
                return int(counters.WorkingSetSize / (1024*1024))
            except Exception:
                return 0

    def _apply_windows_job_memory_limit(self, cap_mb: int) -> bool:
        """Apply a hard memory cap using Windows Job Objects. Returns True on success."""
        try:
            import ctypes
            from ctypes import wintypes

            JOB_OBJECT_LIMIT_JOB_MEMORY = 0x00000200
            JobObjectExtendedLimitInformation = 9

            class JOBOBJECT_BASIC_LIMIT_INFORMATION(ctypes.Structure):
                _fields_ = [
                    ("PerProcessUserTimeLimit", ctypes.c_longlong),
                    ("PerJobUserTimeLimit", ctypes.c_longlong),
                    ("LimitFlags", wintypes.DWORD),
                    ("MinimumWorkingSetSize", ctypes.c_size_t),
                    ("MaximumWorkingSetSize", ctypes.c_size_t),
                    ("ActiveProcessLimit", wintypes.DWORD),
                    ("Affinity", ctypes.c_void_p),
                    ("PriorityClass", wintypes.DWORD),
                    ("SchedulingClass", wintypes.DWORD),
                ]

            class IO_COUNTERS(ctypes.Structure):
                _fields_ = [
                    ("ReadOperationCount", ctypes.c_ulonglong),
                    ("WriteOperationCount", ctypes.c_ulonglong),
                    ("OtherOperationCount", ctypes.c_ulonglong),
                    ("ReadTransferCount", ctypes.c_ulonglong),
                    ("WriteTransferCount", ctypes.c_ulonglong),
                    ("OtherTransferCount", ctypes.c_ulonglong),
                ]

            class JOBOBJECT_EXTENDED_LIMIT_INFORMATION(ctypes.Structure):
                _fields_ = [
                    ("BasicLimitInformation", JOBOBJECT_BASIC_LIMIT_INFORMATION),
                    ("IoInfo", IO_COUNTERS),
                    ("ProcessMemoryLimit", ctypes.c_size_t),
                    ("JobMemoryLimit", ctypes.c_size_t),
                    ("PeakProcessMemoryUsed", ctypes.c_size_t),
                    ("PeakJobMemoryUsed", ctypes.c_size_t),
                ]

            kernel32 = ctypes.WinDLL('kernel32', use_last_error=True)
            CreateJobObject = kernel32.CreateJobObjectW
            CreateJobObject.argtypes = [ctypes.c_void_p, wintypes.LPCWSTR]
            CreateJobObject.restype = wintypes.HANDLE
            SetInformationJobObject = kernel32.SetInformationJobObject
            SetInformationJobObject.argtypes = [wintypes.HANDLE, wintypes.INT, ctypes.c_void_p, wintypes.DWORD]
            SetInformationJobObject.restype = wintypes.BOOL
            AssignProcessToJobObject = kernel32.AssignProcessToJobObject
            AssignProcessToJobObject.argtypes = [wintypes.HANDLE, wintypes.HANDLE]
            AssignProcessToJobObject.restype = wintypes.BOOL
            GetCurrentProcess = kernel32.GetCurrentProcess
            GetCurrentProcess.restype = wintypes.HANDLE

            hJob = CreateJobObject(None, None)
            if not hJob:
                return False
            info = JOBOBJECT_EXTENDED_LIMIT_INFORMATION()
            info.BasicLimitInformation.LimitFlags = JOB_OBJECT_LIMIT_JOB_MEMORY
            info.JobMemoryLimit = ctypes.c_size_t(int(cap_mb) * 1024 * 1024)
            ok = SetInformationJobObject(hJob, JobObjectExtendedLimitInformation, ctypes.byref(info), ctypes.sizeof(info))
            if not ok:
                return False
            ok = AssignProcessToJobObject(hJob, GetCurrentProcess())
            if not ok:
                return False
            return True
        except Exception:
            return False

    def _memory_watchdog(self):
        try:
            import time
            while not self._mem_stop_event.is_set():
                if not self.ram_cap_enabled or self.ram_cap_mb <= 0:
                    break
                rss = self._get_process_rss_mb()
                if rss and rss > self.ram_cap_mb:
                    self._mem_over_cap = True
                    # Aggressive attempt to reduce memory
                    try:
                        self._deep_cleanup_models()
                    except Exception:
                        pass
                    try:
                        self._trim_working_set()
                    except Exception:
                        pass
                    # Wait a bit before re-checking
                    time.sleep(max(0.2, self.ram_check_interval_sec / 2))
                    time.sleep(0.1)  # Brief pause for stability
                    self._log("πŸ’€ Memory watchdog pausing briefly for stability", "debug")
                else:
                    # Below cap or couldn't read RSS
                    self._mem_over_cap = False
                    time.sleep(self.ram_check_interval_sec)
        except Exception:
            pass

    def _init_ram_cap(self):
        # Hard cap via Windows Job Object if selected and on Windows
        try:
            import platform
            if self.ram_cap_mode.startswith('hard') or self.ram_cap_mode == 'hard':
                if platform.system() == 'Windows':
                    if not self._apply_windows_job_memory_limit(self.ram_cap_mb):
                        self._log("⚠️ Failed to apply hard RAM cap; falling back to soft mode", "warning")
                        self.ram_cap_mode = 'soft'
                else:
                    self._log("⚠️ Hard RAM cap only supported on Windows; using soft mode", "warning")
                    self.ram_cap_mode = 'soft'
        except Exception:
            self.ram_cap_mode = 'soft'
        # Start watchdog regardless of mode to proactively stay under cap during operations
        try:
            self._mem_thread = threading.Thread(target=self._memory_watchdog, daemon=True)
            self._mem_thread.start()
        except Exception:
            pass

    def _block_if_over_cap(self, context_msg: str = ""):
        # If over cap, block until we drop under cap - margin
        if not self.ram_cap_enabled or self.ram_cap_mb <= 0:
            return
        import time
        # Never require target below baseline + floor margin
        baseline = max(0, getattr(self, 'ram_baseline_mb', 0))
        floor = baseline + max(0, self.ram_min_floor_over_baseline_mb)
        # Compute target below cap by recovery margin, but not below floor
        target = self.ram_cap_mb - max(64, min(self.ram_recovery_margin_mb, self.ram_cap_mb // 4))
        target = max(target, floor)
        start = time.time()
        waited = False
        last_log = 0
        while True:
            rss = self._get_process_rss_mb()
            now = time.time()
            if rss and rss <= target:
                break
            # Timeout to avoid deadlock when baseline can't go lower than target
            if now - start > max(2.0, self.ram_gate_timeout_sec):
                self._log(f"βŒ› RAM gate timeout for {context_msg}: RSS={rss} MB, target={target} MB; proceeding in low-memory mode", "warning")
                break
            waited = True
            # Periodic log to help diagnose
            if now - last_log > 3.0 and rss:
                self._log(f"⏳ Waiting for RAM drop: RSS={rss} MB, target={target} MB ({context_msg})", "info")
                last_log = now
            # Attempt cleanup while waiting
            try:
                self._deep_cleanup_models()
            except Exception:
                pass
            try:
                self._trim_working_set()
            except Exception:
                pass
            if self._check_stop():
                break
            time.sleep(0.1)  # Brief pause for stability
            self._log("πŸ’€ RAM gate pausing briefly for stability", "debug")
        if waited and context_msg:
            self._log(f"🧹 Proceeding with {context_msg} (RSS now {self._get_process_rss_mb()} MB; target {target} MB)", "info")

    def set_batch_mode(self, enabled: bool, batch_size: int = 1):
        """Enable or disable batch mode optimizations"""
        self.batch_mode = enabled
        self.batch_size = batch_size
        if enabled:
            # Check if bubble detection is actually enabled before considering preload
            ocr_settings = self.manga_settings.get('ocr', {}) if hasattr(self, 'manga_settings') else {}
            bubble_detection_enabled = ocr_settings.get('bubble_detection_enabled', False)
            # Only suggest preloading if bubble detection is actually going to be used
            if bubble_detection_enabled:
                self._log("πŸ“¦ BATCH MODE: Bubble detection models will load on first use")
                # NOTE: We don't actually preload anymore to save RAM
                # Models are loaded on-demand when first needed
            # Similarly for OCR models - they load on demand
            if hasattr(self, 'ocr_manager') and self.ocr_manager:
                self._log(f"πŸ“¦ BATCH MODE: {self.ocr_provider} will load on first use")
                # NOTE: We don't preload OCR models either
            self._log(f"πŸ“¦ BATCH MODE ENABLED: Processing {batch_size} images")
            self._log(f"⏱️ API delay: {self.api_delay}s (preserved for rate limiting)")
        else:
            self._log("πŸ“ BATCH MODE DISABLED")

    def _ensure_bubble_detector_ready(self, ocr_settings):
        """Ensure a usable BubbleDetector for current thread, auto-reloading models after cleanup."""
        try:
            bd = self._get_thread_bubble_detector()
            detector_type = ocr_settings.get('detector_type', 'rtdetr_onnx')
            if detector_type == 'rtdetr_onnx':
                if not getattr(bd, 'rtdetr_onnx_loaded', False):
                    model_id = ocr_settings.get('rtdetr_model_url') or ocr_settings.get('bubble_model_path')
                    if not bd.load_rtdetr_onnx_model(model_id=model_id):
                        return None
            elif detector_type == 'rtdetr':
                if not getattr(bd, 'rtdetr_loaded', False):
                    model_id = ocr_settings.get('rtdetr_model_url') or ocr_settings.get('bubble_model_path')
                    if not bd.load_rtdetr_model(model_id=model_id):
                        return None
            elif detector_type == 'yolo':
                model_path = ocr_settings.get('bubble_model_path')
                if model_path and not getattr(bd, 'model_loaded', False):
                    if not bd.load_model(model_path):
                        return None
            else:
                # auto: prefer RT-DETR if available, else YOLO if configured
                if not getattr(bd, 'rtdetr_loaded', False):
                    bd.load_rtdetr_model(model_id=ocr_settings.get('rtdetr_model_url') or ocr_settings.get('bubble_model_path'))
            return bd
        except Exception:
            return None
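
    # Example (sketch): callers pair the readiness check with a graceful fallback;
    # `regions` here is a hypothetical list of OCR TextRegion objects.
    #
    #   bd = self._ensure_bubble_detector_ready(ocr_settings)
    #   if bd is None:
    #       merged = self._merge_nearby_regions(regions, threshold=50)   # proximity fallback
    #   else:
    #       merged = self._merge_with_bubble_detection(regions, image_path)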

    def _merge_with_bubble_detection(self, regions: List[TextRegion], image_path: str) -> List[TextRegion]:
        """Merge text regions by bubble and filter based on RT-DETR class settings"""
        try:
            # Get detector settings from config
            ocr_settings = self.main_gui.config.get('manga_settings', {}).get('ocr', {})
            detector_type = ocr_settings.get('detector_type', 'rtdetr_onnx')

            # Ensure detector is ready (auto-reload after cleanup)
            bd = self._ensure_bubble_detector_ready(ocr_settings)
            if bd is None:
                self._log("⚠️ Bubble detector unavailable after cleanup; falling back to proximity merge", "warning")
                # Use more conservative threshold for Azure/Google to avoid cross-bubble merging
                threshold = 30 if getattr(self, 'ocr_provider', '').lower() in ('azure', 'google') else 50
                return self._merge_nearby_regions(regions, threshold=threshold)

            # Check if bubble detection is enabled
            if not ocr_settings.get('bubble_detection_enabled', False):
                self._log("πŸ“¦ Bubble detection is disabled in settings", "info")
                # Use more conservative threshold for Azure/Google to avoid cross-bubble merging
                threshold = 30 if getattr(self, 'ocr_provider', '').lower() in ('azure', 'google') else 50
                return self._merge_nearby_regions(regions, threshold=threshold)

            # Initialize thread-local detector
            bd = self._get_thread_bubble_detector()

            bubbles = None
            rtdetr_detections = None

            if detector_type == 'rtdetr_onnx':
                if not self.batch_mode:
                    self._log("πŸ€– Using RT-DETR ONNX for bubble detection", "info")
                if self.batch_mode and getattr(bd, 'rtdetr_onnx_loaded', False):
                    pass
                elif not getattr(bd, 'rtdetr_onnx_loaded', False):
                    self._log("πŸ“₯ Loading RT-DETR ONNX model...", "info")
                    if not bd.load_rtdetr_onnx_model():
                        self._log("⚠️ Failed to load RT-DETR ONNX, falling back to traditional merging", "warning")
                        return self._merge_nearby_regions(regions)
                    else:
                        # Model loaded successfully - mark in pool for reuse
                        try:
                            model_id = ocr_settings.get('rtdetr_model_url') or ocr_settings.get('bubble_model_path') or ''
                            key = ('rtdetr_onnx', model_id)
                            with MangaTranslator._detector_pool_lock:
                                if key not in MangaTranslator._detector_pool:
                                    MangaTranslator._detector_pool[key] = {'spares': []}
                                # Mark this detector type as loaded for next run
                                MangaTranslator._detector_pool[key]['loaded'] = True
                        except Exception:
                            pass

                rtdetr_confidence = ocr_settings.get('rtdetr_confidence', 0.3)
                detect_empty = ocr_settings.get('detect_empty_bubbles', True)
                detect_text_bubbles = ocr_settings.get('detect_text_bubbles', True)
                detect_free_text = ocr_settings.get('detect_free_text', True)

                if not self.batch_mode:
                    self._log(f"πŸ“‹ RT-DETR ONNX class filters:", "info")
                    self._log(f"   Empty bubbles: {'βœ“' if detect_empty else 'βœ—'}", "info")
                    self._log(f"   Text bubbles: {'βœ“' if detect_text_bubbles else 'βœ—'}", "info")
                    self._log(f"   Free text: {'βœ“' if detect_free_text else 'βœ—'}", "info")
                    self._log(f"🎯 RT-DETR ONNX confidence threshold: {rtdetr_confidence:.2f}", "info")

                rtdetr_detections = bd.detect_with_rtdetr_onnx(
                    image_path=image_path,
                    confidence=rtdetr_confidence,
                    return_all_bubbles=False
                )

                # Combine enabled bubble types for merging
                bubbles = []
                if detect_empty and 'bubbles' in rtdetr_detections:
                    bubbles.extend(rtdetr_detections['bubbles'])
                if detect_text_bubbles and 'text_bubbles' in rtdetr_detections:
                    bubbles.extend(rtdetr_detections['text_bubbles'])
                # Store free text locations for filtering later
                free_text_regions = rtdetr_detections.get('text_free', []) if detect_free_text else []

                self._log(f"βœ… RT-DETR ONNX detected:", "success")
                self._log(f"   {len(rtdetr_detections.get('bubbles', []))} empty bubbles", "info")
                self._log(f"   {len(rtdetr_detections.get('text_bubbles', []))} text bubbles", "info")
                self._log(f"   {len(rtdetr_detections.get('text_free', []))} free text regions", "info")

            elif detector_type == 'rtdetr':
                # BATCH OPTIMIZATION: Less verbose logging
                if not self.batch_mode:
                    self._log("πŸ€– Using RT-DETR for bubble detection", "info")
                # BATCH OPTIMIZATION: Don't reload if already loaded
                if self.batch_mode and bd.rtdetr_loaded:
                    # Model already loaded, skip the loading step entirely
                    pass
                elif not bd.rtdetr_loaded:
                    self._log("πŸ“₯ Loading RT-DETR model...", "info")
                    if not bd.load_rtdetr_model():
                        self._log("⚠️ Failed to load RT-DETR, falling back to traditional merging", "warning")
                        return self._merge_nearby_regions(regions)
                    else:
                        # Model loaded successfully - mark in pool for reuse
                        try:
                            model_id = ocr_settings.get('rtdetr_model_url') or ocr_settings.get('bubble_model_path') or ''
                            key = ('rtdetr', model_id)
                            with MangaTranslator._detector_pool_lock:
                                if key not in MangaTranslator._detector_pool:
                                    MangaTranslator._detector_pool[key] = {'spares': []}
                                # Mark this detector type as loaded for next run
                                MangaTranslator._detector_pool[key]['loaded'] = True
                        except Exception:
                            pass

                # Get settings
                rtdetr_confidence = ocr_settings.get('rtdetr_confidence', 0.3)
                detect_empty = ocr_settings.get('detect_empty_bubbles', True)
                detect_text_bubbles = ocr_settings.get('detect_text_bubbles', True)
                detect_free_text = ocr_settings.get('detect_free_text', True)

                # BATCH OPTIMIZATION: Reduce logging
                if not self.batch_mode:
                    self._log(f"πŸ“‹ RT-DETR class filters:", "info")
                    self._log(f"   Empty bubbles: {'βœ“' if detect_empty else 'βœ—'}", "info")
                    self._log(f"   Text bubbles: {'βœ“' if detect_text_bubbles else 'βœ—'}", "info")
                    self._log(f"   Free text: {'βœ“' if detect_free_text else 'βœ—'}", "info")
                    self._log(f"🎯 RT-DETR confidence threshold: {rtdetr_confidence:.2f}", "info")

                # Get FULL RT-DETR detections (not just bubbles)
                rtdetr_detections = bd.detect_with_rtdetr(
                    image_path=image_path,
                    confidence=rtdetr_confidence,
                    return_all_bubbles=False  # Get dict with all classes
                )

                # Combine enabled bubble types for merging
                bubbles = []
                if detect_empty and 'bubbles' in rtdetr_detections:
                    bubbles.extend(rtdetr_detections['bubbles'])
                if detect_text_bubbles and 'text_bubbles' in rtdetr_detections:
                    bubbles.extend(rtdetr_detections['text_bubbles'])
                # Store free text locations for filtering later
                free_text_regions = rtdetr_detections.get('text_free', []) if detect_free_text else []

                # Helper to test if a point lies in any bbox
                def _point_in_any_bbox(cx, cy, boxes):
                    try:
                        for (bx, by, bw, bh) in boxes or []:
                            if bx <= cx <= bx + bw and by <= cy <= by + bh:
                                return True
                    except Exception:
                        pass
                    return False

                self._log(f"βœ… RT-DETR detected:", "success")
                self._log(f"   {len(rtdetr_detections.get('bubbles', []))} empty bubbles", "info")
                self._log(f"   {len(rtdetr_detections.get('text_bubbles', []))} text bubbles", "info")
                self._log(f"   {len(rtdetr_detections.get('text_free', []))} free text regions", "info")

            elif detector_type == 'yolo':
                # Use YOLOv8 (existing code)
                self._log("πŸ€– Using YOLOv8 for bubble detection", "info")
                model_path = ocr_settings.get('bubble_model_path')
                if not model_path:
                    self._log("⚠️ No YOLO model configured, falling back to traditional merging", "warning")
                    return self._merge_nearby_regions(regions)
                if not bd.model_loaded:
                    self._log(f"πŸ“₯ Loading YOLO model: {os.path.basename(model_path)}")
                    if not bd.load_model(model_path):
                        self._log("⚠️ Failed to load YOLO model, falling back to traditional merging", "warning")
                        return self._merge_nearby_regions(regions)
                confidence = ocr_settings.get('bubble_confidence', 0.3)
                self._log(f"🎯 Detecting bubbles with YOLO (confidence >= {confidence:.2f})")
                bubbles = bd.detect_bubbles(image_path, confidence=confidence, use_rtdetr=False)
            else:
                # Unknown detector type
                self._log(f"❌ Unknown detector type: {detector_type}", "error")
                self._log("   Valid options: rtdetr_onnx, rtdetr, yolo", "error")
                return self._merge_nearby_regions(regions)

            if not bubbles:
                self._log("⚠️ No bubbles detected, using traditional merging", "warning")
                return self._merge_nearby_regions(regions)

            self._log(f"βœ… Found {len(bubbles)} bubbles for grouping", "success")

            # Merge regions within bubbles
            merged_regions = []
            used_indices = set()

            # Build lookup of free text regions for exclusion
            free_text_bboxes = free_text_regions if detector_type in ('rtdetr', 'rtdetr_onnx') else []

            # DEBUG: Log free text bboxes
            if free_text_bboxes:
                self._log(f"πŸ” Free text exclusion zones: {len(free_text_bboxes)} regions", "debug")
                for idx, (fx, fy, fw, fh) in enumerate(free_text_bboxes):
                    self._log(f"   Free text zone {idx + 1}: x={fx:.0f}, y={fy:.0f}, w={fw:.0f}, h={fh:.0f}", "debug")
self._log(f"⚠️ No free text exclusion zones detected by RT-DETR", "warning") # Helper to check if a point is in any free text region def _point_in_free_text(cx, cy, free_boxes): try: for idx, (fx, fy, fw, fh) in enumerate(free_boxes or []): if fx <= cx <= fx + fw and fy <= cy <= fy + fh: self._log(f" βœ“ Point ({cx:.0f}, {cy:.0f}) is in free text zone {idx + 1}", "debug") return True except Exception as e: self._log(f" ⚠️ Error checking free text: {e}", "debug") pass return False for bubble_idx, (bx, by, bw, bh) in enumerate(bubbles): bubble_regions = [] self._log(f"\n Processing bubble {bubble_idx + 1}: x={bx:.0f}, y={by:.0f}, w={bw:.0f}, h={bh:.0f}", "debug") for idx, region in enumerate(regions): if idx in used_indices: continue rx, ry, rw, rh = region.bounding_box region_center_x = rx + rw / 2 region_center_y = ry + rh / 2 # Check if center is inside this bubble if (bx <= region_center_x <= bx + bw and by <= region_center_y <= by + bh): self._log(f" Region '{region.text[:20]}...' center ({region_center_x:.0f}, {region_center_y:.0f}) is in bubble", "debug") # CRITICAL: Don't merge if this region is in a free text area # Free text should stay separate from bubbles if _point_in_free_text(region_center_x, region_center_y, free_text_bboxes): # This region is in a free text area, don't merge it into bubble self._log(f" ❌ SKIPPING: Region overlaps with free text area", "debug") continue self._log(f" βœ“ Adding region to bubble {bubble_idx + 1}", "debug") bubble_regions.append(region) used_indices.add(idx) if bubble_regions: # CRITICAL: Check if this "bubble" actually contains multiple separate bubbles # This happens when RT-DETR detects one large bubble over stacked speech bubbles split_groups = self._split_bubble_if_needed(bubble_regions) # Process each split group as a separate bubble for group_idx, group in enumerate(split_groups): merged_text = " ".join(r.text for r in group) min_x = min(r.bounding_box[0] for r in group) min_y = min(r.bounding_box[1] for r in group) max_x = max(r.bounding_box[0] + r.bounding_box[2] for r in group) max_y = max(r.bounding_box[1] + r.bounding_box[3] for r in group) all_vertices = [] for r in group: if hasattr(r, 'vertices') and r.vertices: all_vertices.extend(r.vertices) if not all_vertices: all_vertices = [ (min_x, min_y), (max_x, min_y), (max_x, max_y), (min_x, max_y) ] merged_region = TextRegion( text=merged_text, vertices=all_vertices, bounding_box=(min_x, min_y, max_x - min_x, max_y - min_y), confidence=0.95, region_type='bubble_detected', bubble_bounds=(bx, by, bw, bh) # Pass bubble_bounds in constructor ) # Store original regions for masking merged_region.original_regions = group # Classify as text bubble for downstream rendering/masking merged_region.bubble_type = 'text_bubble' # Mark that this should be inpainted merged_region.should_inpaint = True merged_regions.append(merged_region) # DEBUG: Verify bubble_bounds was set if not getattr(self, 'concise_logs', False): has_bb = hasattr(merged_region, 'bubble_bounds') and merged_region.bubble_bounds is not None self._log(f" πŸ” Merged region has bubble_bounds: {has_bb}", "debug") if has_bb: self._log(f" bubble_bounds = {merged_region.bubble_bounds}", "debug") if len(split_groups) > 1: self._log(f" Bubble {bubble_idx + 1}.{group_idx + 1}: Merged {len(group)} text regions (split from {len(bubble_regions)} total)", "info") else: self._log(f" Bubble {bubble_idx + 1}: Merged {len(group)} text regions", "info") # Handle text outside bubbles based on RT-DETR settings for idx, region in enumerate(regions): if 
idx not in used_indices: # This text is outside any bubble # For RT-DETR mode, check if we should include free text if detector_type in ('rtdetr', 'rtdetr_onnx'): # If "Free Text" checkbox is checked, include ALL text outside bubbles # Don't require RT-DETR to specifically detect it as free text if ocr_settings.get('detect_free_text', True): region.should_inpaint = True # If RT-DETR detected free text box covering this region's center, mark explicitly try: cx = region.bounding_box[0] + region.bounding_box[2] / 2 cy = region.bounding_box[1] + region.bounding_box[3] / 2 # Find which free text bbox this region belongs to (if any) found_free_text_box = False for fx, fy, fw, fh in free_text_bboxes: if fx <= cx <= fx + fw and fy <= cy <= fy + fh: region.bubble_type = 'free_text' # CRITICAL: Set bubble_bounds to the RT-DETR free text detection box # This ensures rendering uses the full RT-DETR bounds, not just OCR polygon if not hasattr(region, 'bubble_bounds') or region.bubble_bounds is None: region.bubble_bounds = (fx, fy, fw, fh) found_free_text_box = True self._log(f" Free text region INCLUDED: '{region.text[:30]}...'", "debug") break if not found_free_text_box: # Text outside bubbles but not in free text box - still mark as free text region.bubble_type = 'free_text' # Use region's own bbox if no RT-DETR free text box found if not hasattr(region, 'bubble_bounds') or region.bubble_bounds is None: region.bubble_bounds = region.bounding_box self._log(f" Text outside bubbles INCLUDED (as free text): '{region.text[:30]}...'", "debug") except Exception: # Default to free text if check fails region.bubble_type = 'free_text' if not hasattr(region, 'bubble_bounds') or region.bubble_bounds is None: region.bubble_bounds = region.bounding_box else: region.should_inpaint = False self._log(f" Text outside bubbles EXCLUDED (Free Text unchecked): '{region.text[:30]}...'", "info") else: # For YOLO/auto, include all text by default region.should_inpaint = True merged_regions.append(region) # Log summary regions_to_inpaint = sum(1 for r in merged_regions if getattr(r, 'should_inpaint', True)) regions_to_skip = len(merged_regions) - regions_to_inpaint self._log(f"πŸ“Š Bubble detection complete: {len(regions)} β†’ {len(merged_regions)} regions", "success") if detector_type == 'rtdetr': self._log(f" {regions_to_inpaint} regions will be inpainted", "info") if regions_to_skip > 0: self._log(f" {regions_to_skip} regions will be preserved (Free Text unchecked)", "info") return merged_regions except Exception as e: self._log(f"❌ Bubble detection error: {str(e)}", "error") self._log(" Falling back to traditional merging", "warning") return self._merge_nearby_regions(regions) def set_full_page_context(self, enabled: bool, custom_prompt: str = None): """Configure full page context translation mode Args: enabled: Whether to translate all text regions in a single contextual request custom_prompt: Optional custom prompt for full page context mode """ self.full_page_context_enabled = enabled if custom_prompt: self.full_page_context_prompt = custom_prompt self._log(f"πŸ“„ Full page context mode: {'ENABLED' if enabled else 'DISABLED'}") if enabled: self._log(" All text regions will be sent together for contextual translation") else: self._log(" Text regions will be translated individually") def update_text_rendering_settings(self, bg_opacity: int = None, bg_style: str = None, bg_reduction: float = None, font_style: str = None, font_size: int = None, text_color: tuple = None, shadow_enabled: bool = None, shadow_color: tuple = 
None, shadow_offset_x: int = None, shadow_offset_y: int = None, shadow_blur: int = None, force_caps_lock: bool = None): # ADD THIS PARAMETER """Update text rendering settings""" self._log("πŸ“ Updating text rendering settings:", "info") if bg_opacity is not None: self.text_bg_opacity = max(0, min(255, bg_opacity)) self._log(f" Background opacity: {int(self.text_bg_opacity/255*100)}%", "info") if bg_style is not None and bg_style in ['box', 'circle', 'wrap']: self.text_bg_style = bg_style self._log(f" Background style: {bg_style}", "info") if bg_reduction is not None: self.text_bg_reduction = max(0.5, min(2.0, bg_reduction)) self._log(f" Background size: {int(self.text_bg_reduction*100)}%", "info") if font_style is not None: self.selected_font_style = font_style font_name = os.path.basename(font_style) if font_style else 'Default' self._log(f" Font: {font_name}", "info") if font_size is not None: if font_size < 0: # Negative value indicates multiplier mode self.font_size_mode = 'multiplier' self.font_size_multiplier = abs(font_size) self.custom_font_size = None # Clear fixed size self._log(f" Font size mode: Dynamic multiplier ({self.font_size_multiplier:.1f}x)", "info") else: # Positive value or 0 indicates fixed mode self.font_size_mode = 'fixed' self.custom_font_size = font_size if font_size > 0 else None self._log(f" Font size mode: Fixed ({font_size if font_size > 0 else 'Auto'})", "info") if text_color is not None: self.text_color = text_color self._log(f" Text color: RGB{text_color}", "info") if shadow_enabled is not None: self.shadow_enabled = shadow_enabled self._log(f" Shadow: {'Enabled' if shadow_enabled else 'Disabled'}", "info") if shadow_color is not None: self.shadow_color = shadow_color self._log(f" Shadow color: RGB{shadow_color}", "info") if shadow_offset_x is not None: self.shadow_offset_x = shadow_offset_x if shadow_offset_y is not None: self.shadow_offset_y = shadow_offset_y if shadow_blur is not None: self.shadow_blur = max(0, shadow_blur) if force_caps_lock is not None: # ADD THIS BLOCK self.force_caps_lock = force_caps_lock self._log(f" Force Caps Lock: {'Enabled' if force_caps_lock else 'Disabled'}", "info") self._log("βœ… Rendering settings updated", "info") def _log(self, message: str, level: str = "info"): """Log message to GUI or console, and also to file logger. The file logger is configured in translator_gui._setup_file_logging(). Enhanced with comprehensive stop suppression. """ # Enhanced stop suppression - allow only essential stop confirmation messages if self._check_stop() or self.is_globally_cancelled(): # Only allow very specific stop confirmation messages - nothing else essential_stop_keywords = [ "⏹️ Translation stopped by user", "🧹 Cleaning up models to free RAM", "βœ… Model cleanup complete - RAM should be freed", "βœ… All models cleaned up - RAM freed!" 
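# --- Illustrative sketch (not called by the pipeline) ---
# update_text_rendering_settings() above packs two font-size modes into
# one integer channel: negative values select the dynamic-multiplier mode
# (multiplier = abs(value)), zero or positive values select fixed mode
# (0 = auto). A hypothetical decoder for that convention:
def _sketch_decode_font_size(font_size: int):
    """Return (mode, value) per the sign convention described above."""
    if font_size < 0:
        return ('multiplier', abs(font_size))   # e.g. -2 -> 2x dynamic sizing
    return ('fixed', font_size if font_size > 0 else None)  # 0 -> auto
# _sketch_decode_font_size(-2) -> ('multiplier', 2)
# _sketch_decode_font_size(0)  -> ('fixed', None)   # auto-size
# _sketch_decode_font_size(24) -> ('fixed', 24)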
] # Suppress ALL other messages when stopped - be very restrictive if not any(keyword in message for keyword in essential_stop_keywords): return # Concise pipeline logs: keep only high-level messages and errors/warnings if getattr(self, 'concise_logs', False): if level in ("error", "warning"): pass else: keep_prefixes = ( # Pipeline boundaries and IO "πŸ“· STARTING", "πŸ“ Input", "πŸ“ Output", # Step markers "πŸ“ [STEP", # Step 1 essentials "πŸ” Detecting text regions", # start of detection on file "πŸ“„ Detected", # format detected "Using OCR provider:", # provider line "Using Azure Read API", # azure-specific run mode "⚠️ Converting image to PNG", # azure PNG compatibility "πŸ€– Using AI bubble detection", # BD merge mode "πŸ€– Using RTEDR_onnx", # selected BD "βœ… Detected", # detected N regions after merging # Detectors/inpainter readiness "πŸ€– Using bubble detector", "🎨 Using local inpainter", # Step 2: key actions "πŸ”€ Running", # Running translation and inpainting concurrently "πŸ“„ Using FULL PAGE CONTEXT", # Explicit mode notice "πŸ“„ Full page context mode", # Alternate phrasing "πŸ“„ Full page context translation", # Start/summary "🎭 Creating text mask", "πŸ“Š Mask breakdown", "πŸ“ Applying", "🎨 Inpainting", "🧽 Using local inpainting", # Detection and summary "πŸ“Š Bubble detection complete", "βœ… Detection complete", # Mapping/translation summary "πŸ“Š Mapping", "πŸ“Š Full page context translation complete", # Rendering "✍️ Rendering", "βœ… ENHANCED text rendering complete", # Output and final summary "πŸ’Ύ Saved output", "βœ… TRANSLATION PIPELINE COMPLETE", "πŸ“Š Translation Summary", "βœ… Successful", "❌ Failed", # Cleanup "πŸ”‘ Auto cleanup", "πŸ”‘ Translator instance preserved" ) _msg = message.lstrip() if isinstance(message, str) else message if not any(_msg.startswith(p) for p in keep_prefixes): return # In batch mode, only log important messages if self.batch_mode: # Skip verbose/debug messages in batch mode if level == "debug" or "DEBUG:" in message: return # Skip repetitive messages if any(skip in message for skip in [ "Using vertex-based", "Using", "Applying", "Font size", "Region", "Found text", "Style:" ]): return # Send to GUI if available if self.log_callback: try: self.log_callback(message, level) except Exception: # Fall back to print if GUI callback fails print(message) else: print(message) # Always record to the Python logger (file) try: _logger = logging.getLogger(__name__) if level == "error": _logger.error(message) elif level == "warning": _logger.warning(message) elif level == "debug": _logger.debug(message) else: # Map custom levels like 'success' to INFO _logger.info(message) except Exception: pass def _is_primarily_english(self, text: str) -> bool: """Heuristic: treat text as English if it has no CJK and a high ASCII ratio. Conservative by default to avoid dropping legitimate content. 
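# --- Illustrative sketch (not called by the pipeline) ---
# The concise-log gate in _log() above reduces to: errors and warnings
# always pass; any other message must start (after lstrip) with one of
# the whitelisted prefixes. The prefix list here is abbreviated for
# illustration; the full list lives in _log() above.
def _sketch_keep_in_concise_mode(message: str, level: str) -> bool:
    keep_prefixes = ("πŸ“· STARTING", "πŸ“ [STEP", "βœ… Detected", "πŸ’Ύ Saved output")
    if level in ("error", "warning"):
        return True
    return message.lstrip().startswith(keep_prefixes)  # startswith accepts a tuple
# _sketch_keep_in_concise_mode("   βœ… Detected 12 regions", "info") -> True
# _sketch_keep_in_concise_mode("Region 3 uses font X", "debug")    -> False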
Tunable via manga_settings.ocr: - english_exclude_threshold (float, default 0.70) - english_exclude_min_chars (int, default 4) - english_exclude_short_tokens (bool, default False) """ if not text: return False # Pull tuning knobs from settings (with safe defaults) ocr_settings = {} try: ocr_settings = self.main_gui.config.get('manga_settings', {}).get('ocr', {}) except Exception: pass threshold = float(ocr_settings.get('english_exclude_threshold', 0.70)) min_chars = int(ocr_settings.get('english_exclude_min_chars', 4)) exclude_short = bool(ocr_settings.get('english_exclude_short_tokens', False)) # 1) If text contains any CJK or full-width characters, do NOT treat as English has_cjk = any( '\u4e00' <= char <= '\u9fff' or # Chinese '\u3040' <= char <= '\u309f' or # Hiragana '\u30a0' <= char <= '\u30ff' or # Katakana '\uac00' <= char <= '\ud7af' or # Korean '\uff00' <= char <= '\uffef' # Full-width characters for char in text ) if has_cjk: return False text_stripped = text.strip() non_space_len = sum(1 for c in text_stripped if not c.isspace()) # 2) By default, do not exclude very short tokens to avoid losing interjections like "Ah", "Eh?", etc. if not exclude_short and non_space_len < max(1, min_chars): return False # Optional legacy behavior: aggressively drop very short pure-ASCII tokens if exclude_short: if len(text_stripped) == 1 and text_stripped.isalpha() and ord(text_stripped) < 128: self._log(f" Excluding single English letter: '{text_stripped}'", "debug") return True if len(text_stripped) <= 3: ascii_letters = sum(1 for char in text_stripped if char.isalpha() and ord(char) < 128) if ascii_letters >= len(text_stripped) * 0.5: self._log(f" Excluding short English text: '{text_stripped}'", "debug") return True # 3) Compute ASCII ratio (exclude spaces) ascii_chars = sum(1 for char in text if 33 <= ord(char) <= 126) total_chars = sum(1 for char in text if not char.isspace()) if total_chars == 0: return False ratio = ascii_chars / total_chars if ratio > threshold: self._log(f" Excluding English text ({ratio:.0%} ASCII, threshold {threshold:.0%}, len={non_space_len}): '{text[:30]}...'", "debug") return True return False def _load_bubble_detector(self, ocr_settings, image_path): """Load bubble detector with appropriate model based on settings Returns: dict: Detection results or None if failed """ detector_type = ocr_settings.get('detector_type', 'rtdetr_onnx') model_path = ocr_settings.get('bubble_model_path', '') confidence = ocr_settings.get('bubble_confidence', 0.3) bd = self._get_thread_bubble_detector() if detector_type == 'rtdetr_onnx' or 'RTEDR_onnx' in str(detector_type): # Load RT-DETR ONNX model if bd.load_rtdetr_onnx_model(model_id=ocr_settings.get('rtdetr_model_url') or model_path): return bd.detect_with_rtdetr_onnx( image_path=image_path, confidence=ocr_settings.get('rtdetr_confidence', confidence), return_all_bubbles=False ) elif detector_type == 'rtdetr' or 'RT-DETR' in str(detector_type): # Load RT-DETR (PyTorch) model if bd.load_rtdetr_model(model_id=ocr_settings.get('rtdetr_model_url') or model_path): return bd.detect_with_rtdetr( image_path=image_path, confidence=ocr_settings.get('rtdetr_confidence', confidence), return_all_bubbles=False ) elif detector_type == 'custom': # Custom model - try to determine type from path custom_path = ocr_settings.get('custom_model_path', model_path) if 'rtdetr' in custom_path.lower(): # Custom RT-DETR model if bd.load_rtdetr_model(model_id=custom_path): return bd.detect_with_rtdetr( image_path=image_path, confidence=confidence, 
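# --- Illustrative sketch (not called by the pipeline) ---
# The English-exclusion heuristic above, distilled: any CJK or full-width
# character vetoes exclusion outright; otherwise the text is treated as
# English when its printable-ASCII ratio (spaces excluded) exceeds the
# threshold (default 0.70) and it is long enough (default 4 non-space chars).
def _sketch_is_primarily_english(text: str, threshold=0.70, min_chars=4) -> bool:
    cjk_ranges = ((0x4E00, 0x9FFF),   # Chinese
                  (0x3040, 0x309F),   # Hiragana
                  (0x30A0, 0x30FF),   # Katakana
                  (0xAC00, 0xD7AF),   # Korean
                  (0xFF00, 0xFFEF))   # Full-width forms
    if any(lo <= ord(c) <= hi for c in text for lo, hi in cjk_ranges):
        return False
    non_space = [c for c in text if not c.isspace()]
    if len(non_space) < min_chars:
        return False  # keep short interjections like "Ah", "Eh?"
    ascii_printable = sum(1 for c in non_space if 33 <= ord(c) <= 126)
    return ascii_printable / len(non_space) > threshold
# "Hello!!" -> True (7/7 ASCII); "γ“γ‚“γ«γ‘γ―" -> False (CJK veto); "Ah" -> False (too short)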
return_all_bubbles=False ) else: # Assume YOLO format for other custom models if custom_path and bd.load_model(custom_path): detections = bd.detect_bubbles( image_path, confidence=confidence ) return { 'text_bubbles': detections if detections else [], 'text_free': [], 'bubbles': [] } else: # Standard YOLO model if model_path and bd.load_model(model_path): detections = bd.detect_bubbles( image_path, confidence=confidence ) return { 'text_bubbles': detections if detections else [], 'text_free': [], 'bubbles': [] } return None def _ensure_google_client(self): try: if getattr(self, 'vision_client', None) is None: from google.cloud import vision google_path = self.ocr_config.get('google_credentials_path') if hasattr(self, 'ocr_config') else None if google_path: os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = google_path self.vision_client = vision.ImageAnnotatorClient() self._log("βœ… Reinitialized Google Vision client", "debug") except Exception as e: self._log(f"❌ Failed to initialize Google Vision client: {e}", "error") def _ensure_azure_client(self): try: if getattr(self, 'vision_client', None) is None: from azure.cognitiveservices.vision.computervision import ComputerVisionClient from msrest.authentication import CognitiveServicesCredentials key = None endpoint = None try: key = (self.ocr_config or {}).get('azure_key') endpoint = (self.ocr_config or {}).get('azure_endpoint') except Exception: pass if not key: key = self.main_gui.config.get('azure_vision_key', '') if hasattr(self, 'main_gui') else None if not endpoint: endpoint = self.main_gui.config.get('azure_vision_endpoint', '') if hasattr(self, 'main_gui') else None if not key or not endpoint: raise ValueError("Azure credentials missing for client init") self.vision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(key)) self._log("βœ… Reinitialized Azure Computer Vision client", "debug") except Exception as e: self._log(f"❌ Failed to initialize Azure CV client: {e}", "error") def detect_text_regions(self, image_path: str) -> List[TextRegion]: """Detect text regions using configured OCR provider""" # Reduce logging in batch mode if not self.batch_mode: self._log(f"πŸ” Detecting text regions in: {os.path.basename(image_path)}") self._log(f" Using OCR provider: {self.ocr_provider.upper()}") else: # Only show batch progress if batch_current is set properly if hasattr(self, 'batch_current') and hasattr(self, 'batch_size'): self._log(f"πŸ” [{self.batch_current}/{self.batch_size}] {os.path.basename(image_path)}") else: self._log(f"πŸ” Detecting text: {os.path.basename(image_path)}") try: # ============================================================ # CRITICAL: FORCE CLEAR ALL TEXT-RELATED CACHES # This MUST happen for EVERY image to prevent text contamination # NO EXCEPTIONS - batch mode or not, ALL caches get cleared # ============================================================ # 1. Clear OCR ROI cache (prevents text from previous images leaking) # THREAD-SAFE: Use lock to prevent race conditions in parallel panel translation if hasattr(self, 'ocr_roi_cache'): with self._cache_lock: self.ocr_roi_cache.clear() self._log("🧹 Cleared OCR ROI cache", "debug") # 2. 
Clear OCR manager caches (multiple potential cache locations) if hasattr(self, 'ocr_manager') and self.ocr_manager: # Clear last_results (can contain text from previous image) if hasattr(self.ocr_manager, 'last_results'): self.ocr_manager.last_results = None # Clear generic cache if hasattr(self.ocr_manager, 'cache'): self.ocr_manager.cache.clear() # Clear provider-level caches if hasattr(self.ocr_manager, 'providers'): for provider_name, provider in self.ocr_manager.providers.items(): if hasattr(provider, 'last_results'): provider.last_results = None if hasattr(provider, 'cache'): provider.cache.clear() self._log("🧹 Cleared OCR manager caches", "debug") # 3. Clear bubble detector cache (can contain text region info) if hasattr(self, 'bubble_detector') and self.bubble_detector: if hasattr(self.bubble_detector, 'last_detections'): self.bubble_detector.last_detections = None if hasattr(self.bubble_detector, 'cache'): self.bubble_detector.cache.clear() self._log("🧹 Cleared bubble detector cache", "debug") # Get manga settings from main_gui config manga_settings = self.main_gui.config.get('manga_settings', {}) preprocessing = manga_settings.get('preprocessing', {}) ocr_settings = manga_settings.get('ocr', {}) # Get text filtering settings min_text_length = ocr_settings.get('min_text_length', 2) exclude_english = ocr_settings.get('exclude_english_text', True) confidence_threshold = ocr_settings.get('confidence_threshold', 0.1) # Load and preprocess image if enabled if preprocessing.get('enabled', False): self._log("πŸ“ Preprocessing enabled - enhancing image quality") processed_image_data = self._preprocess_image(image_path, preprocessing) else: # Read image with optional compression (separate from preprocessing) try: comp_cfg = (self.main_gui.config.get('manga_settings', {}) or {}).get('compression', {}) if comp_cfg.get('enabled', False): processed_image_data = self._load_image_with_compression_only(image_path, comp_cfg) else: with open(image_path, 'rb') as image_file: processed_image_data = image_file.read() except Exception: with open(image_path, 'rb') as image_file: processed_image_data = image_file.read() # Compute per-image hash for caching (based on uploaded bytes) # CRITICAL FIX #1: Never allow None page_hash to prevent cache key collisions try: import hashlib page_hash = hashlib.sha1(processed_image_data).hexdigest() # CRITICAL: Never allow None page_hash if page_hash is None: # Fallback: use image path + timestamp for uniqueness import time import uuid page_hash = hashlib.sha1( f"{image_path}_{time.time()}_{uuid.uuid4()}".encode() ).hexdigest() self._log("⚠️ Using fallback page hash for cache isolation", "warning") # CRITICAL: If image hash changed, force clear ROI cache # THREAD-SAFE: Use lock for parallel panel translation if hasattr(self, '_current_image_hash') and self._current_image_hash != page_hash: if hasattr(self, 'ocr_roi_cache'): with self._cache_lock: self.ocr_roi_cache.clear() self._log("🧹 Image changed - cleared ROI cache", "debug") self._current_image_hash = page_hash except Exception as e: # Emergency fallback - never let page_hash be None import uuid page_hash = str(uuid.uuid4()) self._current_image_hash = page_hash self._log(f"⚠️ Page hash generation failed: {e}, using UUID fallback", "error") regions = [] # Route to appropriate provider if self.ocr_provider == 'google': # === GOOGLE CLOUD VISION === # Ensure client exists (it might have been cleaned up between runs) try: self._ensure_google_client() except Exception: pass # Check if we should use RT-DETR for text 
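# --- Illustrative sketch (not called by the pipeline) ---
# The cache-isolation scheme above keys every ROI-cache entry by a SHA-1
# of the exact bytes sent to OCR, with a UUID fallback so the key is
# never None (a None key would let two different pages collide).
import hashlib
import uuid

def _sketch_page_hash(image_bytes: bytes) -> str:
    try:
        return hashlib.sha1(image_bytes).hexdigest()
    except Exception:
        return str(uuid.uuid4())  # unique fallback: prefer isolation over reuse
# Different bytes always yield different keys, so a stale ROI result from
# the previous page can never be served for the current one.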
region detection (NEW FEATURE) # IMPORTANT: bubble_detection_enabled should default to True for optimal detection if ocr_settings.get('bubble_detection_enabled', True) and ocr_settings.get('use_rtdetr_for_ocr_regions', True): self._log("🎯 Using RT-DETR to guide Google Cloud Vision OCR") # Run RT-DETR to detect text regions first _ = self._get_thread_bubble_detector() rtdetr_detections = self._load_bubble_detector(ocr_settings, image_path) if rtdetr_detections: # Collect all text-containing regions WITH TYPE TRACKING all_regions = [] # Track region type to assign bubble_type later region_types = {} idx = 0 if 'text_bubbles' in rtdetr_detections: for bbox in rtdetr_detections.get('text_bubbles', []): all_regions.append(bbox) region_types[idx] = 'text_bubble' idx += 1 if 'text_free' in rtdetr_detections: for bbox in rtdetr_detections.get('text_free', []): all_regions.append(bbox) region_types[idx] = 'free_text' idx += 1 if all_regions: self._log(f"πŸ“Š RT-DETR detected {len(all_regions)} text regions, OCR-ing each with Google Vision") # Load image for cropping import cv2 cv_image = cv2.imread(image_path) if cv_image is None: self._log("⚠️ Failed to load image, falling back to full-page OCR", "warning") else: # Define worker function for concurrent OCR def ocr_region_google(region_data): i, region_idx, x, y, w, h = region_data try: # RATE LIMITING: Add small delay to avoid potential rate limits # Google has high limits (1,800/min paid tier) but being conservative import time import random time.sleep(0.1 + random.random() * 0.2) # 0.1-0.3s random delay # Crop region cropped = self._safe_crop_region(cv_image, x, y, w, h) if cropped is None: return None # Validate and resize crop if needed (Google Vision requires minimum dimensions) h_crop, w_crop = cropped.shape[:2] MIN_SIZE = 50 # Minimum dimension (increased from 10 for better OCR) MIN_AREA = 2500 # Minimum area (50x50) if h_crop < MIN_SIZE or w_crop < MIN_SIZE or h_crop * w_crop < MIN_AREA: # Region too small - try to resize it scale_w = MIN_SIZE / w_crop if w_crop < MIN_SIZE else 1.0 scale_h = MIN_SIZE / h_crop if h_crop < MIN_SIZE else 1.0 scale = max(scale_w, scale_h) if scale > 1.0: new_w = int(w_crop * scale) new_h = int(h_crop * scale) cropped = cv2.resize(cropped, (new_w, new_h), interpolation=cv2.INTER_CUBIC) self._log(f"πŸ” Region {i} resized from {w_crop}x{h_crop}px to {new_w}x{new_h}px for OCR", "debug") h_crop, w_crop = new_h, new_w # Final validation if h_crop < 10 or w_crop < 10: self._log(f"⚠️ Region {i} too small even after resize ({w_crop}x{h_crop}px), skipping", "debug") return None # Encode cropped image _, encoded = cv2.imencode('.jpg', cropped, [cv2.IMWRITE_JPEG_QUALITY, 95]) region_image_data = encoded.tobytes() # Create Vision API image object vision_image = vision.Image(content=region_image_data) image_context = vision.ImageContext( language_hints=ocr_settings.get('language_hints', ['ja', 'ko', 'zh']) ) # Detect text in this region detection_mode = ocr_settings.get('text_detection_mode', 'document') if detection_mode == 'document': response = self.vision_client.document_text_detection( image=vision_image, image_context=image_context ) else: response = self.vision_client.text_detection( image=vision_image, image_context=image_context ) if response.error.message: self._log(f"⚠️ Region {i} error: {response.error.message}", "warning") return None # Extract text from this region region_text = response.full_text_annotation.text if response.full_text_annotation else "" if region_text.strip(): # Clean the text region_text = 
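# --- Illustrative sketch (not called by the pipeline) ---
# The ROI upscaling rule in the OCR worker above: crops below 50 px on a
# side are scaled by a single factor that brings the smaller side up to
# 50 px (the max of the per-axis deficits), preserving aspect ratio.
def _sketch_min_size_scale(w: int, h: int, min_size: int = 50) -> float:
    scale_w = min_size / w if w < min_size else 1.0
    scale_h = min_size / h if h < min_size else 1.0
    return max(scale_w, scale_h)
# A 40x20 crop: scale = max(50/40, 50/20) = 2.5 -> resized to 100x50,
# which clears both the 50 px minimum and the 2500 px^2 area floor.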
self._fix_encoding_issues(region_text) region_text = self._sanitize_unicode_characters(region_text) region_text = region_text.strip() # Create TextRegion with original image coordinates region = TextRegion( text=region_text, vertices=[(x, y), (x+w, y), (x+w, y+h), (x, y+h)], bounding_box=(x, y, w, h), confidence=0.9, # RT-DETR confidence region_type='text_block' ) # Assign bubble_type from RT-DETR detection region.bubble_type = region_types.get(region_idx, 'text_bubble') if not getattr(self, 'concise_logs', False): self._log(f"βœ… Region {i}/{len(all_regions)} ({region.bubble_type}): {region_text[:50]}...") return region return None except Exception as e: # Provide more detailed error info for debugging error_msg = str(e) if 'Bad Request' in error_msg or 'invalid' in error_msg.lower(): self._log(f"⏭️ Skipping region {i}: Too small or invalid for Google Vision (dimensions < 10x10px or area < 100pxΒ²)", "debug") else: self._log(f"⚠️ Error OCR-ing region {i}: {e}", "warning") return None # Process regions concurrently with RT-DETR concurrency control from concurrent.futures import ThreadPoolExecutor, as_completed # Use rtdetr_max_concurrency setting (default 12) to control parallel OCR calls max_workers = min(ocr_settings.get('rtdetr_max_concurrency', 12), len(all_regions)) region_data_list = [(i+1, i, x, y, w, h) for i, (x, y, w, h) in enumerate(all_regions)] with ThreadPoolExecutor(max_workers=max_workers) as executor: futures = {executor.submit(ocr_region_google, rd): rd for rd in region_data_list} for future in as_completed(futures): try: result = future.result() if result: regions.append(result) finally: # Clean up future to free memory del future # If we got results, sort and post-process if regions: # CRITICAL: Sort regions by position (top-to-bottom, left-to-right) # Concurrent processing returns them in completion order, not detection order regions.sort(key=lambda r: (r.bounding_box[1], r.bounding_box[0])) self._log(f"βœ… RT-DETR + Google Vision: {len(regions)} text regions detected (sorted by position)") # POST-PROCESS: Check for text_bubbles that overlap with free_text regions # If a text_bubble's center is within a free_text bbox, reclassify it as free_text free_text_bboxes = rtdetr_detections.get('text_free', []) if free_text_bboxes: reclassified_count = 0 for region in regions: if getattr(region, 'bubble_type', None) == 'text_bubble': # Get region center x, y, w, h = region.bounding_box cx = x + w / 2 cy = y + h / 2 self._log(f" Checking text_bubble '{region.text[:30]}...' at center ({cx:.0f}, {cy:.0f})", "debug") # Check if center is in any free_text bbox for bbox_idx, (fx, fy, fw, fh) in enumerate(free_text_bboxes): in_x = fx <= cx <= fx + fw in_y = fy <= cy <= fy + fh self._log(f" vs free_text bbox {bbox_idx+1}: in_x={in_x}, in_y={in_y}", "debug") if in_x and in_y: # Reclassify as free text old_type = region.bubble_type region.bubble_type = 'free_text' reclassified_count += 1 self._log(f" βœ… RECLASSIFIED '{region.text[:30]}...' 
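# --- Illustrative sketch (not called by the pipeline) ---
# The post-OCR ordering fix above: as_completed() yields regions in
# completion order, so they are re-sorted into reading order by
# (top, left) before any downstream reclassification or merging.
def _sketch_reading_order(boxes):
    """boxes: iterable of (x, y, w, h); returns top-to-bottom,
    left-to-right order, matching the sort key used above."""
    return sorted(boxes, key=lambda b: (b[1], b[0]))
# _sketch_reading_order([(200, 10, 50, 30), (10, 10, 50, 30), (10, 300, 50, 30)])
# -> [(10, 10, ...), (200, 10, ...), (10, 300, ...)]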
from {old_type} to free_text", "info") break if reclassified_count > 0: self._log(f"πŸ”„ Reclassified {reclassified_count} overlapping regions as free_text", "info") # MERGE: Combine free_text regions that are within the same free_text bbox # Group free_text regions by which free_text bbox they belong to free_text_groups = {} other_regions = [] for region in regions: if getattr(region, 'bubble_type', None) == 'free_text': # Find which free_text bbox this region belongs to x, y, w, h = region.bounding_box cx = x + w / 2 cy = y + h / 2 for bbox_idx, (fx, fy, fw, fh) in enumerate(free_text_bboxes): if fx <= cx <= fx + fw and fy <= cy <= fy + fh: if bbox_idx not in free_text_groups: free_text_groups[bbox_idx] = [] free_text_groups[bbox_idx].append(region) break else: # Free text region not in any bbox (shouldn't happen, but handle it) other_regions.append(region) else: other_regions.append(region) # Merge each group of free_text regions merged_free_text = [] for bbox_idx, group in free_text_groups.items(): if len(group) > 1: # Merge multiple free text regions in same bbox merged_text = " ".join(r.text for r in group) min_x = min(r.bounding_box[0] for r in group) min_y = min(r.bounding_box[1] for r in group) max_x = max(r.bounding_box[0] + r.bounding_box[2] for r in group) max_y = max(r.bounding_box[1] + r.bounding_box[3] for r in group) all_vertices = [] for r in group: if hasattr(r, 'vertices') and r.vertices: all_vertices.extend(r.vertices) if not all_vertices: all_vertices = [ (min_x, min_y), (max_x, min_y), (max_x, max_y), (min_x, max_y) ] merged_region = TextRegion( text=merged_text, vertices=all_vertices, bounding_box=(min_x, min_y, max_x - min_x, max_y - min_y), confidence=0.95, region_type='text_block' ) merged_region.bubble_type = 'free_text' merged_region.should_inpaint = True merged_free_text.append(merged_region) self._log(f"πŸ”€ Merged {len(group)} free_text regions into one: '{merged_text[:50]}...'", "debug") else: # Single region, keep as-is merged_free_text.extend(group) # Combine all regions regions = other_regions + merged_free_text self._log(f"βœ… Final: {len(regions)} regions after reclassification and merging", "info") # Skip merging section and return directly return regions else: self._log("⚠️ No text found in RT-DETR regions, falling back to full-page OCR", "warning") # If bubble detection is enabled and batch variables suggest batching, do ROI-based batched OCR try: use_roi_locality = ocr_settings.get('bubble_detection_enabled', False) and ocr_settings.get('roi_locality_enabled', False) # Determine OCR batching enable if 'ocr_batch_enabled' in ocr_settings: ocr_batch_enabled = bool(ocr_settings.get('ocr_batch_enabled')) else: ocr_batch_enabled = (os.getenv('BATCH_OCR', '0') == '1') or (os.getenv('BATCH_TRANSLATION', '0') == '1') or getattr(self, 'batch_mode', False) # Determine OCR batch size bs = int(ocr_settings.get('ocr_batch_size') or 0) if bs <= 0: bs = int(os.getenv('OCR_BATCH_SIZE', '0') or 0) if bs <= 0: bs = int(os.getenv('BATCH_SIZE', str(getattr(self, 'batch_size', 1))) or 1) ocr_batch_size = max(1, bs) except Exception: use_roi_locality = False ocr_batch_enabled = False ocr_batch_size = 1 if use_roi_locality and (ocr_batch_enabled or ocr_batch_size > 1): rois = self._prepare_ocr_rois_from_bubbles(image_path, ocr_settings, preprocessing, page_hash) if rois: # Determine concurrency for Google: OCR_MAX_CONCURRENCY env or min(BATCH_SIZE,2) try: max_cc = int(ocr_settings.get('ocr_max_concurrency') or 0) if max_cc <= 0: max_cc = 
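# --- Illustrative sketch (not called by the pipeline) ---
# The free-text grouping above, distilled: regions classified as
# free_text are bucketed by the index of the RT-DETR free-text box that
# contains their center, so each bucket can be merged into one region.
# A for/else keeps unmatched regions separate, as in the code above.
def _sketch_group_by_container(region_boxes, container_boxes):
    groups, loose = {}, []
    for (rx, ry, rw, rh) in region_boxes:
        cx, cy = rx + rw / 2, ry + rh / 2
        for i, (fx, fy, fw, fh) in enumerate(container_boxes):
            if fx <= cx <= fx + fw and fy <= cy <= fy + fh:
                groups.setdefault(i, []).append((rx, ry, rw, rh))
                break
        else:  # no containing box found
            loose.append((rx, ry, rw, rh))
    return groups, loose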
int(os.getenv('OCR_MAX_CONCURRENCY', '0') or 0) if max_cc <= 0: max_cc = min(max(1, ocr_batch_size), 2) except Exception: max_cc = min(max(1, ocr_batch_size), 2) regions = self._google_ocr_rois_batched(rois, ocr_settings, max(1, ocr_batch_size), max_cc, page_hash) self._log(f"βœ… Google OCR batched over {len(rois)} ROIs β†’ {len(regions)} regions (cc={max_cc})", "info") # Force garbage collection after concurrent OCR to reduce memory spikes try: import gc gc.collect() except Exception: pass return regions # Start local inpainter preload while Google OCR runs (background; multiple if panel-parallel) try: if not getattr(self, 'skip_inpainting', False) and not getattr(self, 'use_cloud_inpainting', False): already_loaded, _lm = self._is_local_inpainter_loaded() if not already_loaded: import threading as _threading local_method = (self.manga_settings.get('inpainting', {}) or {}).get('local_method', 'anime') model_path = self.main_gui.config.get(f'manga_{local_method}_model_path', '') if hasattr(self, 'main_gui') else '' adv = self.main_gui.config.get('manga_settings', {}).get('advanced', {}) if hasattr(self, 'main_gui') else {} # Determine desired instances from panel-parallel settings desired = 1 if adv.get('parallel_panel_translation', False): try: desired = max(1, int(adv.get('panel_max_workers', 2))) except Exception: desired = 2 # Honor advanced toggle for panel-local preload; for non-panel (desired==1) always allow allow = True if desired == 1 else bool(adv.get('preload_local_inpainting_for_panels', True)) if allow: self._inpaint_preload_event = _threading.Event() def _preload_inp_many(): try: self.preload_local_inpainters_concurrent(local_method, model_path, desired) finally: try: self._inpaint_preload_event.set() except Exception: pass _threading.Thread(target=_preload_inp_many, name="InpaintPreload@GoogleOCR", daemon=True).start() except Exception: pass # Create Vision API image object (full-page fallback) image = vision.Image(content=processed_image_data) # Build image context with all parameters image_context = vision.ImageContext( language_hints=ocr_settings.get('language_hints', ['ja', 'ko', 'zh']) ) # Add text detection params if available in your API version if hasattr(vision, 'TextDetectionParams'): image_context.text_detection_params = vision.TextDetectionParams( enable_text_detection_confidence_score=True ) # Configure text detection based on settings detection_mode = ocr_settings.get('text_detection_mode', 'document') if detection_mode == 'document': response = self.vision_client.document_text_detection( image=image, image_context=image_context ) else: response = self.vision_client.text_detection( image=image, image_context=image_context ) if response.error.message: raise Exception(f"Cloud Vision API error: {response.error.message}") # Process each page (usually just one for manga) for page in response.full_text_annotation.pages: for block in page.blocks: # Extract text first to check if it's worth processing block_text = "" total_confidence = 0.0 word_count = 0 for paragraph in block.paragraphs: for word in paragraph.words: # Get word-level confidence (more reliable than block level) word_confidence = getattr(word, 'confidence', 0.0) # Default to 0 if not available word_text = ''.join([symbol.text for symbol in word.symbols]) # Only include words above threshold if word_confidence >= confidence_threshold: block_text += word_text + " " total_confidence += word_confidence word_count += 1 else: if not getattr(self, 'concise_logs', False): self._log(f" Skipping low confidence 
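# --- Illustrative sketch (not called by the pipeline) ---
# The concurrency resolution used for the Google ROI path above: an
# explicit setting wins, then the OCR_MAX_CONCURRENCY env var, then a
# conservative default of min(batch_size, 2) parallel calls.
import os

def _sketch_resolve_max_concurrency(ocr_settings: dict, batch_size: int) -> int:
    cc = int(ocr_settings.get('ocr_max_concurrency') or 0)
    if cc <= 0:
        cc = int(os.getenv('OCR_MAX_CONCURRENCY', '0') or 0)
    if cc <= 0:
        cc = min(max(1, batch_size), 2)  # conservative cap
    return cc
# _sketch_resolve_max_concurrency({}, 8) -> 2 (capped default)
# With OCR_MAX_CONCURRENCY=4 -> 4; with {'ocr_max_concurrency': 6} -> 6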
word ({word_confidence:.2f}): {word_text}") block_text = block_text.strip() # CLEAN ORIGINAL OCR TEXT - Fix cube characters and encoding issues original_text = block_text block_text = self._fix_encoding_issues(block_text) block_text = self._sanitize_unicode_characters(block_text) # Log cleaning if changes were made if block_text != original_text: self._log(f"🧹 Cleaned OCR text: '{original_text[:30]}...' β†’ '{block_text[:30]}...'", "debug") # TEXT FILTERING SECTION # Skip if text is too short (after cleaning) if len(block_text.strip()) < min_text_length: if not getattr(self, 'concise_logs', False): self._log(f" Skipping short text ({len(block_text)} chars): {block_text}") continue # Skip if primarily English and exclude_english is enabled if exclude_english and self._is_primarily_english(block_text): if not getattr(self, 'concise_logs', False): self._log(f" Skipping English text: {block_text[:50]}...") continue # Skip if no confident words found if word_count == 0 or not block_text: if not getattr(self, 'concise_logs', False): self._log(f" Skipping block - no words above threshold {confidence_threshold}") continue # Calculate average confidence for the block avg_confidence = total_confidence / word_count if word_count > 0 else 0.0 # Extract vertices and create region vertices = [(v.x, v.y) for v in block.bounding_box.vertices] # Calculate bounding box xs = [v[0] for v in vertices] ys = [v[1] for v in vertices] x_min, x_max = min(xs), max(xs) y_min, y_max = min(ys), max(ys) region = TextRegion( text=block_text, vertices=vertices, bounding_box=(x_min, y_min, x_max - x_min, y_max - y_min), confidence=avg_confidence, # Use average confidence region_type='text_block' ) regions.append(region) if not getattr(self, 'concise_logs', False): self._log(f" Found text region ({avg_confidence:.2f}): {block_text[:50]}...") elif self.ocr_provider == 'azure': # === AZURE COMPUTER VISION === # Ensure client exists (it might have been cleaned up between runs) try: self._ensure_azure_client() except Exception: pass import io import time from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes # Check if we should use RT-DETR for text region detection (NEW FEATURE) if ocr_settings.get('bubble_detection_enabled', False) and ocr_settings.get('use_rtdetr_for_ocr_regions', True): self._log("🎯 Using RT-DETR to guide Azure Computer Vision OCR") # Run RT-DETR to detect text regions first _ = self._get_thread_bubble_detector() rtdetr_detections = self._load_bubble_detector(ocr_settings, image_path) if rtdetr_detections: # Collect all text-containing regions WITH TYPE TRACKING all_regions = [] # Track region type to assign bubble_type later region_types = {} idx = 0 if 'text_bubbles' in rtdetr_detections: for bbox in rtdetr_detections.get('text_bubbles', []): all_regions.append(bbox) region_types[idx] = 'text_bubble' idx += 1 if 'text_free' in rtdetr_detections: for bbox in rtdetr_detections.get('text_free', []): all_regions.append(bbox) region_types[idx] = 'free_text' idx += 1 if all_regions: self._log(f"πŸ“Š RT-DETR detected {len(all_regions)} text regions, OCR-ing each with Azure Vision") # Load image for cropping import cv2 cv_image = cv2.imread(image_path) if cv_image is None: self._log("⚠️ Failed to load image, falling back to full-page OCR", "warning") else: ocr_results = [] # Get Azure settings azure_reading_order = ocr_settings.get('azure_reading_order', 'natural') azure_model_version = ocr_settings.get('azure_model_version', 'latest') azure_max_wait = 
ocr_settings.get('azure_max_wait', 60) azure_poll_interval = ocr_settings.get('azure_poll_interval', 1.0) # Define worker function for concurrent OCR def ocr_region_azure(region_data): i, region_idx, x, y, w, h = region_data try: # Crop region cropped = self._safe_crop_region(cv_image, x, y, w, h) if cropped is None: return None # Validate and resize crop if needed (Azure Vision requires minimum dimensions) h_crop, w_crop = cropped.shape[:2] MIN_SIZE = 50 # Minimum dimension (Azure requirement) MIN_AREA = 2500 # Minimum area (50x50) if h_crop < MIN_SIZE or w_crop < MIN_SIZE or h_crop * w_crop < MIN_AREA: # Region too small - try to resize it scale_w = MIN_SIZE / w_crop if w_crop < MIN_SIZE else 1.0 scale_h = MIN_SIZE / h_crop if h_crop < MIN_SIZE else 1.0 scale = max(scale_w, scale_h) if scale > 1.0: new_w = int(w_crop * scale) new_h = int(h_crop * scale) cropped = cv2.resize(cropped, (new_w, new_h), interpolation=cv2.INTER_CUBIC) self._log(f"πŸ” Region {i} resized from {w_crop}x{h_crop}px to {new_w}x{new_h}px for Azure OCR", "debug") h_crop, w_crop = new_h, new_w # Final validation if h_crop < 10 or w_crop < 10: self._log(f"⚠️ Region {i} too small even after resize ({w_crop}x{h_crop}px), skipping", "debug") return None # RATE LIMITING: Add delay between Azure API calls to avoid "Too Many Requests" # Azure Free tier: 20 calls/minute = 1 call per 3 seconds # Azure Standard tier: Higher limits but still needs throttling import time import random # Stagger requests with randomized delay (0.1-0.3 seconds) time.sleep(0.1 + random.random() * 0.2) # 0.1-0.3s random delay # Encode cropped image _, encoded = cv2.imencode('.jpg', cropped, [cv2.IMWRITE_JPEG_QUALITY, 95]) region_image_bytes = encoded.tobytes() # Call Azure Read API read_response = self.vision_client.read_in_stream( io.BytesIO(region_image_bytes), language=ocr_settings.get('language_hints', ['ja'])[0] if ocr_settings.get('language_hints') else 'ja', model_version=azure_model_version, reading_order=azure_reading_order, raw=True ) # Get operation location operation_location = read_response.headers['Operation-Location'] operation_id = operation_location.split('/')[-1] # Poll for result start_time = time.time() while True: result = self.vision_client.get_read_result(operation_id) if result.status not in [OperationStatusCodes.not_started, OperationStatusCodes.running]: break if time.time() - start_time > azure_max_wait: self._log(f"⚠️ Azure timeout for region {i}", "warning") break time.sleep(azure_poll_interval) if result.status == OperationStatusCodes.succeeded: # Extract text from result region_text = "" for text_result in result.analyze_result.read_results: for line in text_result.lines: region_text += line.text + "\n" region_text = region_text.strip() if region_text: # Clean the text region_text = self._fix_encoding_issues(region_text) region_text = self._sanitize_unicode_characters(region_text) # Create TextRegion with original image coordinates region = TextRegion( text=region_text, vertices=[(x, y), (x+w, y), (x+w, y+h), (x, y+h)], bounding_box=(x, y, w, h), confidence=0.9, # RT-DETR confidence region_type='text_block' ) # Assign bubble_type from RT-DETR detection region.bubble_type = region_types.get(region_idx, 'text_bubble') if not getattr(self, 'concise_logs', False): self._log(f"βœ… Region {i}/{len(all_regions)} ({region.bubble_type}): {region_text[:50]}...") return region return None except Exception as e: # Provide more detailed error info for debugging error_msg = str(e) if 'Bad Request' in error_msg or 'invalid' in 
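# --- Illustrative sketch (not called by the pipeline) ---
# The Azure Read API round-trip used in the worker above: submit the
# image, take the operation id from the Operation-Location header, then
# poll get_read_result() until the status leaves not_started/running or
# a wall-clock budget is exhausted. `client` is a ComputerVisionClient
# as initialized elsewhere in this module.
import io
import time
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes

def _sketch_azure_read(client, image_bytes: bytes, max_wait: float = 60.0,
                       poll_interval: float = 1.0):
    resp = client.read_in_stream(io.BytesIO(image_bytes), raw=True)
    op_id = resp.headers['Operation-Location'].split('/')[-1]
    deadline = time.time() + max_wait
    while True:
        result = client.get_read_result(op_id)
        if result.status not in (OperationStatusCodes.not_started,
                                 OperationStatusCodes.running):
            return result
        if time.time() > deadline:
            return None  # caller treats this as a timeout
        time.sleep(poll_interval)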
error_msg.lower() or 'Too Many Requests' in error_msg: if 'Too Many Requests' in error_msg: self._log(f"⏸️ Region {i}: Azure rate limit hit, consider increasing delays", "warning") else: self._log(f"⏭️ Skipping region {i}: Too small or invalid for Azure Vision", "debug") else: self._log(f"⚠️ Error OCR-ing region {i}: {e}", "warning") return None # Process regions concurrently with RT-DETR concurrency control from concurrent.futures import ThreadPoolExecutor, as_completed # Use rtdetr_max_concurrency setting (default 12) # Note: Rate limiting is handled via 0.1-0.3s delays per request max_workers = min(ocr_settings.get('rtdetr_max_concurrency', 12), len(all_regions)) region_data_list = [(i+1, i, x, y, w, h) for i, (x, y, w, h) in enumerate(all_regions)] with ThreadPoolExecutor(max_workers=max_workers) as executor: futures = {executor.submit(ocr_region_azure, rd): rd for rd in region_data_list} for future in as_completed(futures): try: result = future.result() if result: regions.append(result) finally: # Clean up future to free memory del future # If we got results, sort and post-process if regions: # CRITICAL: Sort regions by position (top-to-bottom, left-to-right) # Concurrent processing returns them in completion order, not detection order regions.sort(key=lambda r: (r.bounding_box[1], r.bounding_box[0])) self._log(f"βœ… RT-DETR + Azure Vision: {len(regions)} text regions detected (sorted by position)") # POST-PROCESS: Check for text_bubbles that overlap with free_text regions # If a text_bubble's center is within a free_text bbox, reclassify it as free_text free_text_bboxes = rtdetr_detections.get('text_free', []) # DEBUG: Log what we have self._log(f"πŸ” POST-PROCESS: Found {len(free_text_bboxes)} free_text bboxes from RT-DETR", "debug") for idx, (fx, fy, fw, fh) in enumerate(free_text_bboxes): self._log(f" Free text bbox {idx+1}: x={fx:.0f}, y={fy:.0f}, w={fw:.0f}, h={fh:.0f}", "debug") text_bubble_count = sum(1 for r in regions if getattr(r, 'bubble_type', None) == 'text_bubble') free_text_count = sum(1 for r in regions if getattr(r, 'bubble_type', None) == 'free_text') self._log(f"πŸ” Before reclassification: {text_bubble_count} text_bubbles, {free_text_count} free_text", "debug") if free_text_bboxes: reclassified_count = 0 for region in regions: if getattr(region, 'bubble_type', None) == 'text_bubble': # Get region center x, y, w, h = region.bounding_box cx = x + w / 2 cy = y + h / 2 self._log(f" Checking text_bubble '{region.text[:30]}...' at center ({cx:.0f}, {cy:.0f})", "debug") # Check if center is in any free_text bbox for bbox_idx, (fx, fy, fw, fh) in enumerate(free_text_bboxes): in_x = fx <= cx <= fx + fw in_y = fy <= cy <= fy + fh self._log(f" vs free_text bbox {bbox_idx+1}: in_x={in_x}, in_y={in_y}", "debug") if in_x and in_y: # Reclassify as free text old_type = region.bubble_type region.bubble_type = 'free_text' reclassified_count += 1 self._log(f" βœ… RECLASSIFIED '{region.text[:30]}...' 
from {old_type} to free_text", "info") break if reclassified_count > 0: self._log(f"πŸ”„ Reclassified {reclassified_count} overlapping regions as free_text", "info") # MERGE: Combine free_text regions that are within the same free_text bbox # Group free_text regions by which free_text bbox they belong to free_text_groups = {} other_regions = [] for region in regions: if getattr(region, 'bubble_type', None) == 'free_text': # Find which free_text bbox this region belongs to x, y, w, h = region.bounding_box cx = x + w / 2 cy = y + h / 2 for bbox_idx, (fx, fy, fw, fh) in enumerate(free_text_bboxes): if fx <= cx <= fx + fw and fy <= cy <= fy + fh: if bbox_idx not in free_text_groups: free_text_groups[bbox_idx] = [] free_text_groups[bbox_idx].append(region) break else: # Free text region not in any bbox (shouldn't happen, but handle it) other_regions.append(region) else: other_regions.append(region) # Merge each group of free_text regions merged_free_text = [] for bbox_idx, group in free_text_groups.items(): if len(group) > 1: # Merge multiple free text regions in same bbox merged_text = " ".join(r.text for r in group) min_x = min(r.bounding_box[0] for r in group) min_y = min(r.bounding_box[1] for r in group) max_x = max(r.bounding_box[0] + r.bounding_box[2] for r in group) max_y = max(r.bounding_box[1] + r.bounding_box[3] for r in group) all_vertices = [] for r in group: if hasattr(r, 'vertices') and r.vertices: all_vertices.extend(r.vertices) if not all_vertices: all_vertices = [ (min_x, min_y), (max_x, min_y), (max_x, max_y), (min_x, max_y) ] merged_region = TextRegion( text=merged_text, vertices=all_vertices, bounding_box=(min_x, min_y, max_x - min_x, max_y - min_y), confidence=0.95, region_type='text_block' ) merged_region.bubble_type = 'free_text' merged_region.should_inpaint = True merged_free_text.append(merged_region) self._log(f"πŸ”€ Merged {len(group)} free_text regions into one: '{merged_text[:50]}...'", "debug") else: # Single region, keep as-is merged_free_text.extend(group) # Combine all regions regions = other_regions + merged_free_text self._log(f"βœ… Final: {len(regions)} regions after reclassification and merging", "info") # Skip merging section and return directly return regions else: self._log("⚠️ No text found in RT-DETR regions, falling back to full-page OCR", "warning") # ROI-based concurrent OCR when bubble detection is enabled and batching is requested try: use_roi_locality = ocr_settings.get('bubble_detection_enabled', False) and ocr_settings.get('roi_locality_enabled', False) if 'ocr_batch_enabled' in ocr_settings: ocr_batch_enabled = bool(ocr_settings.get('ocr_batch_enabled')) else: ocr_batch_enabled = (os.getenv('BATCH_OCR', '0') == '1') or (os.getenv('BATCH_TRANSLATION', '0') == '1') or getattr(self, 'batch_mode', False) bs = int(ocr_settings.get('ocr_batch_size') or 0) if bs <= 0: bs = int(os.getenv('OCR_BATCH_SIZE', '0') or 0) if bs <= 0: bs = int(os.getenv('BATCH_SIZE', str(getattr(self, 'batch_size', 1))) or 1) ocr_batch_size = max(1, bs) except Exception: use_roi_locality = False ocr_batch_enabled = False ocr_batch_size = 1 if use_roi_locality and (ocr_batch_enabled or ocr_batch_size > 1): rois = self._prepare_ocr_rois_from_bubbles(image_path, ocr_settings, preprocessing, page_hash) if rois: # AZURE RATE LIMITING: Force low concurrency to prevent "Too Many Requests" # Azure has strict rate limits that vary by tier: # - Free tier: 20 requests/minute # - Standard tier: Higher but still limited try: azure_workers = int(ocr_settings.get('ocr_max_concurrency') or 
0) if azure_workers <= 0: azure_workers = 1 # Force sequential by default else: azure_workers = min(2, max(1, azure_workers)) # Cap at 2 max except Exception: azure_workers = 1 # Safe default regions = self._azure_ocr_rois_concurrent(rois, ocr_settings, azure_workers, page_hash) self._log(f"βœ… Azure OCR concurrent over {len(rois)} ROIs β†’ {len(regions)} regions (workers={azure_workers})", "info") # Force garbage collection after concurrent OCR to reduce memory spikes try: import gc gc.collect() except Exception: pass return regions # Start local inpainter preload while Azure OCR runs (background; multiple if panel-parallel) try: if not getattr(self, 'skip_inpainting', False) and not getattr(self, 'use_cloud_inpainting', False): already_loaded, _lm = self._is_local_inpainter_loaded() if not already_loaded: import threading as _threading local_method = (self.manga_settings.get('inpainting', {}) or {}).get('local_method', 'anime') model_path = self.main_gui.config.get(f'manga_{local_method}_model_path', '') if hasattr(self, 'main_gui') else '' adv = self.main_gui.config.get('manga_settings', {}).get('advanced', {}) if hasattr(self, 'main_gui') else {} desired = 1 if adv.get('parallel_panel_translation', False): try: desired = max(1, int(adv.get('panel_max_workers', 2))) except Exception: desired = 2 allow = True if desired == 1 else bool(adv.get('preload_local_inpainting_for_panels', True)) if allow: self._inpaint_preload_event = _threading.Event() def _preload_inp_many(): try: self.preload_local_inpainters_concurrent(local_method, model_path, desired) finally: try: self._inpaint_preload_event.set() except Exception: pass _threading.Thread(target=_preload_inp_many, name="InpaintPreload@AzureOCR", daemon=True).start() except Exception: pass # Ensure Azure-supported format for the BYTES we are sending. # If compression is enabled and produced an Azure-supported format (JPEG/PNG/BMP/TIFF), # DO NOT force-convert to PNG. Only convert when the current bytes are in an unsupported format. 
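# --- Illustrative sketch (not called by the pipeline) ---
# Distilled form of the decision implemented just below: probe the
# actual byte format with PIL and convert only when Azure cannot accept
# it. JPEG/PNG/BMP/TIFF bytes pass through untouched; an unsupported
# format (e.g. WEBP from the compression step) gets re-encoded.
import io
from PIL import Image

def _sketch_needs_azure_convert(image_bytes: bytes) -> bool:
    accepted = {'jpeg', 'jpg', 'png', 'bmp', 'tiff'}
    try:
        fmt = (Image.open(io.BytesIO(image_bytes)).format or '').lower()
    except Exception:
        fmt = ''  # unreadable probe -> treat as unsupported
    return fmt not in accepted
# WEBP bytes -> True (re-encode, PNG by default); JPEG bytes -> False (send as-is)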
file_ext = os.path.splitext(image_path)[1].lower() azure_supported_exts = ['.jpg', '.jpeg', '.png', '.bmp', '.pdf', '.tiff'] azure_supported_fmts = ['jpeg', 'jpg', 'png', 'bmp', 'tiff'] # Probe the actual byte format we will upload try: from PIL import Image as _PILImage img_probe = _PILImage.open(io.BytesIO(processed_image_data)) fmt = (img_probe.format or '').lower() except Exception: fmt = '' # If original is a PDF, allow as-is (Azure supports PDF streams) if file_ext == '.pdf': needs_convert = False else: # Decide based on the detected format of the processed bytes needs_convert = fmt not in azure_supported_fmts if needs_convert: # If compression settings are enabled and target format is Azure-supported, prefer that try: comp_cfg = (self.main_gui.config.get('manga_settings', {}) or {}).get('compression', {}) except Exception: comp_cfg = {} # Determine if conversion is actually needed based on compression and current format try: from PIL import Image as _PILImage img2 = _PILImage.open(io.BytesIO(processed_image_data)) fmt_lower = (img2.format or '').lower() except Exception: img2 = None fmt_lower = '' accepted = {'jpeg', 'jpg', 'png', 'bmp', 'tiff'} convert_needed = False target_fmt = None if comp_cfg.get('enabled', False): cf = str(comp_cfg.get('format', '')).lower() desired = None if cf in ('jpeg', 'jpg'): desired = 'JPEG' elif cf == 'png': desired = 'PNG' elif cf == 'bmp': desired = 'BMP' elif cf == 'tiff': desired = 'TIFF' # If WEBP or others, desired remains None and we fall back to PNG only if unsupported if desired is not None: # Skip conversion if already in the desired supported format already_matches = ((fmt_lower in ('jpeg', 'jpg') and desired == 'JPEG') or (fmt_lower == desired.lower())) if not already_matches: convert_needed = True target_fmt = desired else: # Compression format not supported by Azure (e.g., WEBP); convert only if unsupported if fmt_lower not in accepted: convert_needed = True target_fmt = 'PNG' else: # No compression preference; convert only if unsupported by Azure if fmt_lower not in accepted: convert_needed = True target_fmt = 'PNG' if convert_needed: self._log(f"⚠️ Converting image to {target_fmt} for Azure compatibility") try: if img2 is None: from PIL import Image as _PILImage img2 = _PILImage.open(io.BytesIO(processed_image_data)) buffer = io.BytesIO() if target_fmt == 'JPEG' and img2.mode != 'RGB': img2 = img2.convert('RGB') img2.save(buffer, format=target_fmt) processed_image_data = buffer.getvalue() except Exception: pass # Create stream from image data image_stream = io.BytesIO(processed_image_data) # Get Azure-specific settings reading_order = ocr_settings.get('azure_reading_order', 'natural') model_version = ocr_settings.get('azure_model_version', 'latest') max_wait = ocr_settings.get('azure_max_wait', 60) poll_interval = ocr_settings.get('azure_poll_interval', 0.5) # Map language hints to Azure language codes language_hints = ocr_settings.get('language_hints', ['ja', 'ko', 'zh']) # Build parameters dictionary read_params = { 'raw': True, 'readingOrder': reading_order } # Add model version if not using latest if model_version != 'latest': read_params['model-version'] = model_version # Use language parameter only if single language is selected if len(language_hints) == 1: azure_lang = language_hints[0] # Map to Azure language codes lang_mapping = { 'zh': 'zh-Hans', 'zh-TW': 'zh-Hant', 'zh-CN': 'zh-Hans', 'ja': 'ja', 'ko': 'ko', 'en': 'en' } azure_lang = lang_mapping.get(azure_lang, azure_lang) read_params['language'] = azure_lang self._log(f" 
Using Azure Read API with language: {azure_lang}, order: {reading_order}") else: self._log(f" Using Azure Read API (auto-detect for {len(language_hints)} languages, order: {reading_order})") # Start Read operation with error handling and rate limit retry # Use max_retries from config (default 7, configurable in Other Settings) max_retries = self.main_gui.config.get('max_retries', 7) retry_delay = 60 # Start with 60 seconds for rate limits read_response = None for retry_attempt in range(max_retries): try: # Ensure client is alive before starting if getattr(self, 'vision_client', None) is None: self._log("⚠️ Azure client missing before read; reinitializing...", "warning") self._ensure_azure_client() if getattr(self, 'vision_client', None) is None: raise RuntimeError("Azure Computer Vision client is not initialized. Check your key/endpoint and azure-cognitiveservices-vision-computervision installation.") # Reset stream position for retry image_stream.seek(0) read_response = self.vision_client.read_in_stream( image_stream, **read_params ) # Success! Break out of retry loop break except Exception as e: error_msg = str(e) # Handle rate limit errors with fixed 60s wait if 'Too Many Requests' in error_msg or '429' in error_msg: if retry_attempt < max_retries - 1: wait_time = retry_delay # Fixed 60s wait each time self._log(f"⚠️ Azure rate limit hit. Waiting {wait_time}s before retry {retry_attempt + 1}/{max_retries}...", "warning") time.sleep(wait_time) continue else: self._log(f"❌ Azure rate limit: Exhausted {max_retries} retries", "error") raise # Handle bad request errors elif 'Bad Request' in error_msg: self._log("⚠️ Azure Read API Bad Request - likely invalid image format or too small. Retrying without language parameter...", "warning") # Retry without language parameter image_stream.seek(0) read_params.pop('language', None) if getattr(self, 'vision_client', None) is None: self._ensure_azure_client() read_response = self.vision_client.read_in_stream( image_stream, **read_params ) break else: raise if read_response is None: raise RuntimeError("Failed to get response from Azure Read API after retries") # Get operation ID operation_location = read_response.headers.get("Operation-Location") if hasattr(read_response, 'headers') else None if not operation_location: raise RuntimeError("Azure Read API did not return Operation-Location header") operation_id = operation_location.split("/")[-1] # Poll for results with configurable timeout self._log(f" Waiting for Azure OCR to complete (max {max_wait}s)...") wait_time = 0 last_status = None result = None while wait_time < max_wait: try: if getattr(self, 'vision_client', None) is None: # Client got cleaned up mid-poll; reinitialize and continue self._log("⚠️ Azure client became None during polling; reinitializing...", "warning") self._ensure_azure_client() if getattr(self, 'vision_client', None) is None: raise AttributeError("Azure client lost and could not be reinitialized") result = self.vision_client.get_read_result(operation_id) except AttributeError as e: # Defensive: reinitialize once and retry this iteration self._log(f"⚠️ {e} β€” reinitializing Azure client and retrying once", "warning") self._ensure_azure_client() if getattr(self, 'vision_client', None) is None: raise result = self.vision_client.get_read_result(operation_id) # Log status changes if result.status != last_status: self._log(f" Status: {result.status}") last_status = result.status if result.status not in [OperationStatusCodes.running, OperationStatusCodes.not_started]: break 
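# --- Illustrative sketch (not called by the pipeline) ---
# The rate-limit retry policy above, distilled: a fixed 60 s wait per
# 429 response (no exponential growth), up to the configured
# max_retries. `submit` is a hypothetical stand-in for the
# read_in_stream call.
import time

def _sketch_retry_on_429(submit, max_retries: int = 7, wait_s: float = 60.0):
    for attempt in range(max_retries):
        try:
            return submit()
        except Exception as e:
            if ('429' in str(e) or 'Too Many Requests' in str(e)) \
                    and attempt < max_retries - 1:
                time.sleep(wait_s)  # fixed wait, matching the code above
                continue
            raise
# On Azure's free tier (20 calls/min), a full 60 s pause is generally
# enough to refill the rate window, which is why the wait is not scaled.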
time.sleep(poll_interval) self._log("πŸ’€ Azure OCR polling pausing briefly for stability", "debug") wait_time += poll_interval if not result: raise RuntimeError("Azure Read API polling did not return a result") if result.status == OperationStatusCodes.succeeded: # Track statistics total_lines = 0 handwritten_lines = 0 for page_num, page in enumerate(result.analyze_result.read_results): if len(result.analyze_result.read_results) > 1: self._log(f" Processing page {page_num + 1}/{len(result.analyze_result.read_results)}") for line in page.lines: # CLEAN ORIGINAL OCR TEXT FOR AZURE - Fix cube characters and encoding issues original_azure_text = line.text cleaned_line_text = self._fix_encoding_issues(line.text) cleaned_line_text = self._sanitize_unicode_characters(cleaned_line_text) # Log cleaning if changes were made if cleaned_line_text != original_azure_text: self._log(f"🧹 Cleaned Azure OCR text: '{original_azure_text[:30]}...' β†’ '{cleaned_line_text[:30]}...'", "debug") # TEXT FILTERING FOR AZURE # Skip if text is too short (after cleaning) if len(cleaned_line_text.strip()) < min_text_length: if not getattr(self, 'concise_logs', False): self._log(f" Skipping short text ({len(cleaned_line_text)} chars): {cleaned_line_text}") continue # Skip if primarily English and exclude_english is enabled (use cleaned text) if exclude_english and self._is_primarily_english(cleaned_line_text): if not getattr(self, 'concise_logs', False): self._log(f" Skipping English text: {cleaned_line_text[:50]}...") continue # Azure provides 8-point bounding box bbox = line.bounding_box vertices = [ (bbox[0], bbox[1]), (bbox[2], bbox[3]), (bbox[4], bbox[5]), (bbox[6], bbox[7]) ] # Calculate rectangular bounding box xs = [v[0] for v in vertices] ys = [v[1] for v in vertices] x_min, x_max = min(xs), max(xs) y_min, y_max = min(ys), max(ys) # Calculate confidence from word-level data confidence = 0.95 # Default high confidence if hasattr(line, 'words') and line.words: # Calculate average confidence from words confidences = [] for word in line.words: if hasattr(word, 'confidence'): confidences.append(word.confidence) if confidences: confidence = sum(confidences) / len(confidences) if not getattr(self, 'concise_logs', False): self._log(f" Line has {len(line.words)} words, avg confidence: {confidence:.3f}") # Check for handwriting style (if available) style = 'print' # Default style_confidence = None if hasattr(line, 'appearance') and line.appearance: if hasattr(line.appearance, 'style'): style_info = line.appearance.style if hasattr(style_info, 'name'): style = style_info.name if style == 'handwriting': handwritten_lines += 1 if hasattr(style_info, 'confidence'): style_confidence = style_info.confidence if not getattr(self, 'concise_logs', False): self._log(f" Style: {style} (confidence: {style_confidence:.2f})") # Apply confidence threshold filtering if confidence >= confidence_threshold: region = TextRegion( text=cleaned_line_text, # Use cleaned text instead of original vertices=vertices, bounding_box=(x_min, y_min, x_max - x_min, y_max - y_min), confidence=confidence, region_type='text_line' ) # Add extra attributes for Azure-specific info region.style = style region.style_confidence = style_confidence regions.append(region) total_lines += 1 # More detailed logging (use cleaned text) if not getattr(self, 'concise_logs', False): if style == 'handwriting': self._log(f" Found handwritten text ({confidence:.2f}): {cleaned_line_text[:50]}...") else: self._log(f" Found text region ({confidence:.2f}): 
{cleaned_line_text[:50]}...") else: if not getattr(self, 'concise_logs', False): self._log(f" Skipping low confidence text ({confidence:.2f}): {cleaned_line_text[:30]}...") # Log summary statistics if total_lines > 0 and not getattr(self, 'concise_logs', False): self._log(f" Total lines detected: {total_lines}") if handwritten_lines > 0: self._log(f" Handwritten lines: {handwritten_lines} ({handwritten_lines/total_lines*100:.1f}%)") elif result.status == OperationStatusCodes.failed: # More detailed error handling error_msg = "Azure OCR failed" if hasattr(result, 'message'): error_msg += f": {result.message}" if hasattr(result.analyze_result, 'errors') and result.analyze_result.errors: for error in result.analyze_result.errors: self._log(f" Error: {error}", "error") raise Exception(error_msg) else: # Timeout or other status raise Exception(f"Azure OCR ended with status: {result.status} after {wait_time}s") else: # === NEW OCR PROVIDERS === import cv2 import numpy as np from ocr_manager import OCRManager # Load image as numpy array if isinstance(processed_image_data, bytes): # Convert bytes to numpy array nparr = np.frombuffer(processed_image_data, np.uint8) image = cv2.imdecode(nparr, cv2.IMREAD_COLOR) else: # Load from file path image = cv2.imread(image_path) if image is None: # Try with PIL for Unicode paths from PIL import Image as PILImage pil_image = PILImage.open(image_path) image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR) # Ensure OCR manager is available if not hasattr(self, 'ocr_manager') or self.ocr_manager is None: try: # Prefer GUI-provided manager if available if hasattr(self, 'main_gui') and hasattr(self.main_gui, 'ocr_manager') and self.main_gui.ocr_manager is not None: self.ocr_manager = self.main_gui.ocr_manager else: from ocr_manager import OCRManager self.ocr_manager = OCRManager(log_callback=self.log_callback) self._log("Initialized internal OCRManager instance", "info") except Exception as _e: self.ocr_manager = None self._log(f"Failed to initialize OCRManager: {str(_e)}", "error") if self.ocr_manager is None: raise RuntimeError("OCRManager is not available; cannot proceed with OCR provider.") # Check provider status and load if needed provider_status = self.ocr_manager.check_provider_status(self.ocr_provider) if not provider_status['installed']: self._log(f"❌ {self.ocr_provider} is not installed", "error") self._log(f" Please install it from the GUI settings", "error") raise Exception(f"{self.ocr_provider} OCR provider is not installed") # Start local inpainter preload while provider is being readied/used (non-cloud path only; background) try: if not getattr(self, 'skip_inpainting', False) and not getattr(self, 'use_cloud_inpainting', False): already_loaded, _lm = self._is_local_inpainter_loaded() if not already_loaded: import threading as _threading local_method = (self.manga_settings.get('inpainting', {}) or {}).get('local_method', 'anime') model_path = self.main_gui.config.get(f'manga_{local_method}_model_path', '') if hasattr(self, 'main_gui') else '' adv = self.main_gui.config.get('manga_settings', {}).get('advanced', {}) if hasattr(self, 'main_gui') else {} desired = 1 if adv.get('parallel_panel_translation', False): try: desired = max(1, int(adv.get('panel_max_workers', 2))) except Exception: desired = 2 allow = True if desired == 1 else bool(adv.get('preload_local_inpainting_for_panels', True)) if allow: self._inpaint_preload_event = _threading.Event() def _preload_inp_many(): try: self.preload_local_inpainters_concurrent(local_method, model_path, 
desired) finally: try: self._inpaint_preload_event.set() except Exception: pass _threading.Thread(target=_preload_inp_many, name="InpaintPreload@OCRProvider", daemon=True).start() except Exception: pass if not provider_status['loaded']: # Check if Qwen2-VL - if it's supposedly not loaded but actually is, skip if self.ocr_provider == 'Qwen2-VL': provider = self.ocr_manager.get_provider('Qwen2-VL') if provider and hasattr(provider, 'model') and provider.model is not None: self._log("βœ… Qwen2-VL model actually already loaded, skipping reload") success = True else: # Only actually load if truly not loaded model_size = self.ocr_config.get('model_size', '2') if hasattr(self, 'ocr_config') else '2' self._log(f"Loading Qwen2-VL with model_size={model_size}") success = self.ocr_manager.load_provider(self.ocr_provider, model_size=model_size) if not success: raise Exception(f"Failed to load {self.ocr_provider} model") elif self.ocr_provider == 'custom-api': # Custom API needs to initialize UnifiedClient with credentials self._log("πŸ“‘ Loading custom-api provider...") # Try to get API key and model from GUI if available load_kwargs = {} if hasattr(self, 'main_gui'): # Get API key from GUI if hasattr(self.main_gui, 'api_key_entry'): api_key = self.main_gui.api_key_entry.get() if api_key: load_kwargs['api_key'] = api_key # Get model from GUI if hasattr(self.main_gui, 'model_var'): model = self.main_gui.model_var.get() if model: load_kwargs['model'] = model success = self.ocr_manager.load_provider(self.ocr_provider, **load_kwargs) if not success: raise Exception(f"Failed to initialize {self.ocr_provider}") else: # Other providers success = self.ocr_manager.load_provider(self.ocr_provider) if not success: raise Exception(f"Failed to load {self.ocr_provider} model") if not success: raise Exception(f"Failed to load {self.ocr_provider} model") # Initialize ocr_results here before any provider-specific code ocr_results = [] # Special handling for manga-ocr (needs region detection first) if self.ocr_provider == 'manga-ocr': # IMPORTANT: Initialize fresh results list ocr_results = [] # Check if we should use bubble detection for regions if ocr_settings.get('bubble_detection_enabled', False): self._log("πŸ“ Using bubble detection regions for manga-ocr...") # Run bubble detection to get regions if self.bubble_detector is None: from bubble_detector import BubbleDetector self.bubble_detector = BubbleDetector() # Get regions from bubble detector rtdetr_detections = self._load_bubble_detector(ocr_settings, image_path) if rtdetr_detections: # Process detections immediately and don't store all_regions = [] # ONLY ADD TEXT-CONTAINING REGIONS # Skip empty bubbles since they shouldn't have text if 'text_bubbles' in rtdetr_detections: all_regions.extend(rtdetr_detections.get('text_bubbles', [])) if 'text_free' in rtdetr_detections: all_regions.extend(rtdetr_detections.get('text_free', [])) # DO NOT ADD empty bubbles - they're duplicates of text_bubbles # if 'bubbles' in rtdetr_detections: # <-- REMOVE THIS # all_regions.extend(rtdetr_detections.get('bubbles', [])) self._log(f"πŸ“Š Processing {len(all_regions)} text-containing regions (skipping empty bubbles)") # Clear detection results after extracting regions rtdetr_detections = None # Check if parallel processing is enabled if self.parallel_processing and len(all_regions) > 1: self._log(f"πŸš€ Using PARALLEL OCR for {len(all_regions)} regions with manga-ocr") ocr_results = self._parallel_ocr_regions(image, all_regions, 'manga-ocr', confidence_threshold) else: # 
Process each region with manga-ocr for i, (x, y, w, h) in enumerate(all_regions): cropped = self._safe_crop_region(image, x, y, w, h) if cropped is None: continue result = self.ocr_manager.detect_text(cropped, 'manga-ocr', confidence=confidence_threshold) if result and len(result) > 0 and result[0].text.strip(): result[0].bbox = (x, y, w, h) result[0].vertices = [(x, y), (x+w, y), (x+w, y+h), (x, y+h)] # CRITICAL: Store RT-DETR bubble bounds for rendering # The bbox/vertices are the small OCR polygon, but bubble_bounds is the full RT-DETR bubble result[0].bubble_bounds = (x, y, w, h) ocr_results.append(result[0]) self._log(f"πŸ” Processing region {i+1}/{len(all_regions)} with manga-ocr...") self._log(f"βœ… Detected text: {result[0].text[:50]}...") # Clear regions list after processing all_regions = None else: # NO bubble detection - just process full image self._log("πŸ“ Processing full image with manga-ocr (no bubble detection)") ocr_results = self.ocr_manager.detect_text(image, self.ocr_provider, confidence=confidence_threshold) elif self.ocr_provider == 'Qwen2-VL': # Initialize results list ocr_results = [] # Configure Qwen2-VL for Korean text language_hints = ocr_settings.get('language_hints', ['ko']) self._log("🍩 Qwen2-VL OCR for Korean text recognition") # Check if we should use bubble detection for regions if ocr_settings.get('bubble_detection_enabled', False): self._log("πŸ“ Using bubble detection regions for Qwen2-VL...") # Run bubble detection to get regions (thread-local) _ = self._get_thread_bubble_detector() # Get regions from bubble detector rtdetr_detections = self._load_bubble_detector(ocr_settings, image_path) if rtdetr_detections: # Process only text-containing regions all_regions = [] if 'text_bubbles' in rtdetr_detections: all_regions.extend(rtdetr_detections.get('text_bubbles', [])) if 'text_free' in rtdetr_detections: all_regions.extend(rtdetr_detections.get('text_free', [])) self._log(f"πŸ“Š Processing {len(all_regions)} text regions with Qwen2-VL") # Check if parallel processing is enabled if self.parallel_processing and len(all_regions) > 1: self._log(f"πŸš€ Using PARALLEL OCR for {len(all_regions)} regions with Qwen2-VL") ocr_results = self._parallel_ocr_regions(image, all_regions, 'Qwen2-VL', confidence_threshold) else: # Process each region with Qwen2-VL for i, (x, y, w, h) in enumerate(all_regions): cropped = self._safe_crop_region(image, x, y, w, h) if cropped is None: continue result = self.ocr_manager.detect_text(cropped, 'Qwen2-VL', confidence=confidence_threshold) if result and len(result) > 0 and result[0].text.strip(): result[0].bbox = (x, y, w, h) result[0].vertices = [(x, y), (x+w, y), (x+w, y+h), (x, y+h)] ocr_results.append(result[0]) self._log(f"βœ… Region {i+1}: {result[0].text[:50]}...") else: # Process full image without bubble detection self._log("πŸ“ Processing full image with Qwen2-VL") ocr_results = self.ocr_manager.detect_text(image, self.ocr_provider) elif self.ocr_provider == 'custom-api': # Initialize results list ocr_results = [] # Configure Custom API for text extraction self._log("πŸ”Œ Using Custom API for OCR") # Check if we should use bubble detection for regions if ocr_settings.get('bubble_detection_enabled', False): self._log("πŸ“ Using bubble detection regions for Custom API...") # Run bubble detection to get regions (thread-local) _ = self._get_thread_bubble_detector() # Get regions from bubble detector rtdetr_detections = self._load_bubble_detector(ocr_settings, image_path) if rtdetr_detections: # Process only 
text-containing regions all_regions = [] if 'text_bubbles' in rtdetr_detections: all_regions.extend(rtdetr_detections.get('text_bubbles', [])) if 'text_free' in rtdetr_detections: all_regions.extend(rtdetr_detections.get('text_free', [])) self._log(f"πŸ“Š Processing {len(all_regions)} text regions with Custom API") # Clear detections after extracting regions rtdetr_detections = None # Decide parallelization for custom-api: # Use API batch mode OR local parallel toggle so that API calls can run in parallel if (getattr(self, 'batch_mode', False) or self.parallel_processing) and len(all_regions) > 1: self._log(f"πŸš€ Using PARALLEL OCR for {len(all_regions)} regions (custom-api; API batch mode honored)") ocr_results = self._parallel_ocr_regions(image, all_regions, 'custom-api', confidence_threshold) else: # Original sequential processing for i, (x, y, w, h) in enumerate(all_regions): cropped = self._safe_crop_region(image, x, y, w, h) if cropped is None: continue result = self.ocr_manager.detect_text( cropped, 'custom-api', confidence=confidence_threshold ) if result and len(result) > 0 and result[0].text.strip(): result[0].bbox = (x, y, w, h) result[0].vertices = [(x, y), (x+w, y), (x+w, y+h), (x, y+h)] ocr_results.append(result[0]) self._log(f"πŸ” Region {i+1}/{len(all_regions)}: {result[0].text[:50]}...") # Clear regions list after processing all_regions = None else: # Process full image without bubble detection self._log("πŸ“ Processing full image with Custom API") ocr_results = self.ocr_manager.detect_text( image, 'custom-api', confidence=confidence_threshold ) elif self.ocr_provider == 'easyocr': # Initialize results list ocr_results = [] # Configure EasyOCR languages language_hints = ocr_settings.get('language_hints', ['ja', 'en']) validated_languages = self._validate_easyocr_languages(language_hints) easyocr_provider = self.ocr_manager.get_provider('easyocr') if easyocr_provider: if easyocr_provider.languages != validated_languages: easyocr_provider.languages = validated_languages easyocr_provider.is_loaded = False self._log(f"πŸ”₯ Reloading EasyOCR with languages: {validated_languages}") self.ocr_manager.load_provider('easyocr') # Check if we should use bubble detection if ocr_settings.get('bubble_detection_enabled', False): self._log("πŸ“ Using bubble detection regions for EasyOCR...") # Run bubble detection to get regions (thread-local) _ = self._get_thread_bubble_detector() # Get regions from bubble detector rtdetr_detections = self._load_bubble_detector(ocr_settings, image_path) if rtdetr_detections: # Process only text-containing regions all_regions = [] if 'text_bubbles' in rtdetr_detections: all_regions.extend(rtdetr_detections.get('text_bubbles', [])) if 'text_free' in rtdetr_detections: all_regions.extend(rtdetr_detections.get('text_free', [])) self._log(f"πŸ“Š Processing {len(all_regions)} text regions with EasyOCR") # Check if parallel processing is enabled if self.parallel_processing and len(all_regions) > 1: self._log(f"πŸš€ Using PARALLEL OCR for {len(all_regions)} regions with EasyOCR") ocr_results = self._parallel_ocr_regions(image, all_regions, 'easyocr', confidence_threshold) else: # Process each region with EasyOCR for i, (x, y, w, h) in enumerate(all_regions): cropped = self._safe_crop_region(image, x, y, w, h) if cropped is None: continue result = self.ocr_manager.detect_text(cropped, 'easyocr', confidence=confidence_threshold) if result and len(result) > 0 and result[0].text.strip(): result[0].bbox = (x, y, w, h) result[0].vertices = [(x, y), (x+w, y), (x+w, 
y+h), (x, y+h)] ocr_results.append(result[0]) self._log(f"βœ… Region {i+1}: {result[0].text[:50]}...") else: # Process full image without bubble detection self._log("πŸ“ Processing full image with EasyOCR") ocr_results = self.ocr_manager.detect_text(image, self.ocr_provider) elif self.ocr_provider == 'paddleocr': # Initialize results list ocr_results = [] # Configure PaddleOCR language language_hints = ocr_settings.get('language_hints', ['ja']) lang_map = {'ja': 'japan', 'ko': 'korean', 'zh': 'ch', 'en': 'en'} paddle_lang = lang_map.get(language_hints[0] if language_hints else 'ja', 'japan') # Reload if language changed paddle_provider = self.ocr_manager.get_provider('paddleocr') if paddle_provider and paddle_provider.is_loaded: if hasattr(paddle_provider.model, 'lang') and paddle_provider.model.lang != paddle_lang: from paddleocr import PaddleOCR paddle_provider.model = PaddleOCR( use_angle_cls=True, lang=paddle_lang, use_gpu=True, show_log=False ) self._log(f"πŸ”₯ Reloaded PaddleOCR with language: {paddle_lang}") # Check if we should use bubble detection if ocr_settings.get('bubble_detection_enabled', False): self._log("πŸ“ Using bubble detection regions for PaddleOCR...") # Run bubble detection to get regions (thread-local) _ = self._get_thread_bubble_detector() # Get regions from bubble detector rtdetr_detections = self._load_bubble_detector(ocr_settings, image_path) if rtdetr_detections: # Process only text-containing regions all_regions = [] if 'text_bubbles' in rtdetr_detections: all_regions.extend(rtdetr_detections.get('text_bubbles', [])) if 'text_free' in rtdetr_detections: all_regions.extend(rtdetr_detections.get('text_free', [])) self._log(f"πŸ“Š Processing {len(all_regions)} text regions with PaddleOCR") # Check if parallel processing is enabled if self.parallel_processing and len(all_regions) > 1: self._log(f"πŸš€ Using PARALLEL OCR for {len(all_regions)} regions with PaddleOCR") ocr_results = self._parallel_ocr_regions(image, all_regions, 'paddleocr', confidence_threshold) else: # Process each region with PaddleOCR for i, (x, y, w, h) in enumerate(all_regions): cropped = self._safe_crop_region(image, x, y, w, h) if cropped is None: continue result = self.ocr_manager.detect_text(cropped, 'paddleocr', confidence=confidence_threshold) if result and len(result) > 0 and result[0].text.strip(): result[0].bbox = (x, y, w, h) result[0].vertices = [(x, y), (x+w, y), (x+w, y+h), (x, y+h)] ocr_results.append(result[0]) self._log(f"βœ… Region {i+1}: {result[0].text[:50]}...") else: # Process full image without bubble detection self._log("πŸ“ Processing full image with PaddleOCR") ocr_results = self.ocr_manager.detect_text(image, self.ocr_provider) elif self.ocr_provider == 'doctr': # Initialize results list ocr_results = [] self._log("πŸ“„ DocTR OCR for document text recognition") # Check if we should use bubble detection if ocr_settings.get('bubble_detection_enabled', False): self._log("πŸ“ Using bubble detection regions for DocTR...") # Run bubble detection to get regions (thread-local) _ = self._get_thread_bubble_detector() # Get regions from bubble detector rtdetr_detections = self._load_bubble_detector(ocr_settings, image_path) if rtdetr_detections: # Process only text-containing regions all_regions = [] if 'text_bubbles' in rtdetr_detections: all_regions.extend(rtdetr_detections.get('text_bubbles', [])) if 'text_free' in rtdetr_detections: all_regions.extend(rtdetr_detections.get('text_free', [])) self._log(f"πŸ“Š Processing {len(all_regions)} text regions with DocTR") # Check 
if parallel processing is enabled if self.parallel_processing and len(all_regions) > 1: self._log(f"πŸš€ Using PARALLEL OCR for {len(all_regions)} regions with DocTR") ocr_results = self._parallel_ocr_regions(image, all_regions, 'doctr', confidence_threshold) else: # Process each region with DocTR for i, (x, y, w, h) in enumerate(all_regions): cropped = self._safe_crop_region(image, x, y, w, h) if cropped is None: continue result = self.ocr_manager.detect_text(cropped, 'doctr', confidence=confidence_threshold) if result and len(result) > 0 and result[0].text.strip(): result[0].bbox = (x, y, w, h) result[0].vertices = [(x, y), (x+w, y), (x+w, y+h), (x, y+h)] ocr_results.append(result[0]) self._log(f"βœ… Region {i+1}: {result[0].text[:50]}...") else: # Process full image without bubble detection self._log("πŸ“ Processing full image with DocTR") ocr_results = self.ocr_manager.detect_text(image, self.ocr_provider) elif self.ocr_provider == 'rapidocr': # Initialize results list ocr_results = [] # Get RapidOCR settings use_recognition = self.main_gui.config.get('rapidocr_use_recognition', True) language = self.main_gui.config.get('rapidocr_language', 'auto') detection_mode = self.main_gui.config.get('rapidocr_detection_mode', 'document') self._log(f"⚑ RapidOCR - Recognition: {'Full' if use_recognition else 'Detection Only'}") # ALWAYS process full image with RapidOCR for best results self._log("πŸ“Š Processing full image with RapidOCR") ocr_results = self.ocr_manager.detect_text( image, 'rapidocr', confidence=confidence_threshold, use_recognition=use_recognition, language=language, detection_mode=detection_mode ) # RT-DETR detection only affects merging, not OCR if ocr_settings.get('bubble_detection_enabled', False): self._log("πŸ€– RT-DETR will be used for bubble-based merging") else: # Default processing for any other providers ocr_results = self.ocr_manager.detect_text(image, self.ocr_provider) # Convert OCR results to TextRegion format for result in ocr_results: # CLEAN ORIGINAL OCR TEXT - Fix cube characters and encoding issues original_ocr_text = result.text cleaned_result_text = self._fix_encoding_issues(result.text) cleaned_result_text = self._normalize_unicode_width(cleaned_result_text) cleaned_result_text = self._sanitize_unicode_characters(cleaned_result_text) # Log cleaning if changes were made if cleaned_result_text != original_ocr_text: self._log(f"🧹 Cleaned OCR manager text: '{original_ocr_text[:30]}...' 
β†’ '{cleaned_result_text[:30]}...'", "debug") # Apply filtering (use cleaned text) if len(cleaned_result_text.strip()) < min_text_length: if not getattr(self, 'concise_logs', False): self._log(f" Skipping short text ({len(cleaned_result_text)} chars): {cleaned_result_text}") continue if exclude_english and self._is_primarily_english(cleaned_result_text): if not getattr(self, 'concise_logs', False): self._log(f" Skipping English text: {cleaned_result_text[:50]}...") continue if result.confidence < confidence_threshold: if not getattr(self, 'concise_logs', False): self._log(f" Skipping low confidence ({result.confidence:.2f}): {cleaned_result_text[:30]}...") continue # Create TextRegion (use cleaned text) # CRITICAL: Preserve bubble_bounds if it was set during OCR (e.g., manga-ocr with RT-DETR) region_kwargs = { 'text': cleaned_result_text, # Use cleaned text instead of original 'vertices': result.vertices if result.vertices else [ (result.bbox[0], result.bbox[1]), (result.bbox[0] + result.bbox[2], result.bbox[1]), (result.bbox[0] + result.bbox[2], result.bbox[1] + result.bbox[3]), (result.bbox[0], result.bbox[1] + result.bbox[3]) ], 'bounding_box': result.bbox, 'confidence': result.confidence, 'region_type': 'text_block' } # Preserve bubble_bounds from OCR result if present if hasattr(result, 'bubble_bounds') and result.bubble_bounds is not None: region_kwargs['bubble_bounds'] = result.bubble_bounds self._log(f" πŸ” Preserved bubble_bounds from OCR: {result.bubble_bounds}", "debug") else: if hasattr(result, 'bubble_bounds'): self._log(f" ⚠️ OCR result has bubble_bounds but it's None!", "debug") else: self._log(f" ℹ️ OCR result has no bubble_bounds attribute", "debug") region = TextRegion(**region_kwargs) regions.append(region) if not getattr(self, 'concise_logs', False): self._log(f" Found text ({result.confidence:.2f}): {cleaned_result_text[:50]}...") # MERGING SECTION (applies to all providers) # Check if bubble detection is enabled if ocr_settings.get('bubble_detection_enabled', False): # For manga-ocr and similar providers, skip merging since regions already have bubble_bounds from OCR # Only Azure and Google need merging because they return line-level OCR results if self.ocr_provider in ['manga-ocr', 'Qwen2-VL', 'custom-api', 'easyocr', 'paddleocr', 'doctr']: self._log("🎯 Skipping bubble detection merge for manga-ocr (regions already aligned with RT-DETR)") # Regions already have bubble_bounds set from OCR phase - no need to merge else: # Azure and Google return line-level results that need to be merged into bubbles self._log("πŸ€– Using AI bubble detection for merging") regions = self._merge_with_bubble_detection(regions, image_path) else: # Traditional merging merge_threshold = ocr_settings.get('merge_nearby_threshold', 20) # Apply provider-specific adjustments if self.ocr_provider == 'azure': azure_multiplier = ocr_settings.get('azure_merge_multiplier', 2.0) merge_threshold = int(merge_threshold * azure_multiplier) self._log(f"πŸ“‹ Using Azure-adjusted merge threshold: {merge_threshold}px") # Pre-group Azure lines if the method exists if hasattr(self, '_pregroup_azure_lines'): regions = self._pregroup_azure_lines(regions, merge_threshold) elif self.ocr_provider in ['paddleocr', 'easyocr', 'doctr']: # These providers often return smaller text segments line_multiplier = ocr_settings.get('line_ocr_merge_multiplier', 1.5) merge_threshold = int(merge_threshold * line_multiplier) self._log(f"πŸ“‹ Using line-based OCR adjusted threshold: {merge_threshold}px") # Apply standard merging 
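# The provider-specific threshold adjustment above reduces to a small pure function.
# Illustrative sketch only (the helper name is hypothetical; the default multipliers
# mirror the fallback values used above):
#
#     def effective_merge_threshold(base: int, provider: str,
#                                   azure_multiplier: float = 2.0,
#                                   line_ocr_multiplier: float = 1.5) -> int:
#         if provider == 'azure':
#             return int(base * azure_multiplier)      # Azure returns sparse line-level results
#         if provider in ('paddleocr', 'easyocr', 'doctr'):
#             return int(base * line_ocr_multiplier)   # line-based OCR yields small fragments
#         return base
#
#     # e.g. effective_merge_threshold(20, 'azure') == 40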
regions = self._merge_nearby_regions(regions, threshold=merge_threshold) self._log(f"βœ… Detected {len(regions)} text regions after merging") # NOTE: Debug images are saved in process_image() with correct output_dir # Removed duplicate save here to avoid creating unexpected 'translated_images' folders return regions except Exception as e: self._log(f"❌ Error detecting text: {str(e)}", "error") import traceback self._log(traceback.format_exc(), "error") raise def _validate_easyocr_languages(self, languages): """Validate EasyOCR language combinations""" # EasyOCR compatibility rules incompatible_sets = [ {'ja', 'ko'}, # Japanese + Korean {'ja', 'zh'}, # Japanese + Chinese {'ko', 'zh'} # Korean + Chinese ] lang_set = set(languages) for incompatible in incompatible_sets: if incompatible.issubset(lang_set): # Conflict detected - keep first language + English primary_lang = languages[0] if languages else 'en' result = [primary_lang, 'en'] if primary_lang != 'en' else ['en'] self._log(f"⚠️ EasyOCR: {' + '.join(incompatible)} not compatible", "warning") self._log(f"πŸ”§ Auto-adjusted from {languages} to {result}", "info") return result return languages def _parallel_ocr_regions(self, image: np.ndarray, regions: List, provider: str, confidence_threshold: float) -> List: """Process multiple regions in parallel using ThreadPoolExecutor""" from concurrent.futures import ThreadPoolExecutor, as_completed ocr_results = [] def process_single_region(index: int, bbox: Tuple[int, int, int, int]): """Process a single region with OCR; always returns an (index, result_or_None) tuple""" x, y, w, h = bbox try: # Use the safe crop method cropped = self._safe_crop_region(image, x, y, w, h) # Skip if crop failed (return a tuple so the caller's unpacking never fails) if cropped is None: self._log(f"⚠️ Skipping region {index} - invalid crop", "warning") return (index, None) # Run OCR on this region result = self.ocr_manager.detect_text( cropped, provider, confidence=confidence_threshold ) if result and len(result) > 0 and result[0].text.strip(): # Adjust coordinates to full image space result[0].bbox = (x, y, w, h) result[0].vertices = [(x, y), (x+w, y), (x+w, y+h), (x, y+h)] # CRITICAL: Store RT-DETR bubble bounds for rendering (for non-Azure/Google providers) result[0].bubble_bounds = (x, y, w, h) return (index, result[0]) return (index, None) except Exception as e: self._log(f"Error processing region {index}: {str(e)}", "error") return (index, None) # Process regions in parallel (results are collected only in the main thread, so no lock is needed) max_workers = self.manga_settings.get('advanced', {}).get('max_workers', 4) # For custom-api, treat OCR calls as API calls: use batch size when batch mode is enabled try: if provider == 'custom-api': # prefer MangaTranslator.batch_size (from env BATCH_SIZE) bs = int(getattr(self, 'batch_size', 0) or int(os.getenv('BATCH_SIZE', '0'))) if bs and bs > 0: max_workers = bs except Exception: pass # Never spawn more workers than regions max_workers = max(1, min(max_workers, len(regions))) with ThreadPoolExecutor(max_workers=max_workers) as executor: # Submit all tasks future_to_index = {} for i, bbox in enumerate(regions): future = executor.submit(process_single_region, i, bbox) future_to_index[future] = i # Collect results results_dict = {} completed = 0 for future in as_completed(future_to_index): try: index, result = future.result(timeout=30) if result: results_dict[index] = result completed += 1 self._log(f"βœ… [{completed}/{len(regions)}] Processed region {index+1}") except Exception as e: self._log(f"Failed to process region: {str(e)}", "error") # Sort results by index to maintain order for i in
range(len(regions)): if i in results_dict: ocr_results.append(results_dict[i]) self._log(f"πŸ“Š Parallel OCR complete: {len(ocr_results)}/{len(regions)} regions extracted") return ocr_results def _pregroup_azure_lines(self, lines: List[TextRegion], base_threshold: int) -> List[TextRegion]: """Pre-group Azure lines that are obviously part of the same text block This makes them more like Google's blocks before the main merge logic""" if len(lines) <= 1: return lines # Sort by vertical position first, then horizontal lines.sort(key=lambda r: (r.bounding_box[1], r.bounding_box[0])) pregrouped = [] i = 0 while i < len(lines): current_group = [lines[i]] current_bbox = list(lines[i].bounding_box) # Look ahead for lines that should obviously be grouped j = i + 1 while j < len(lines): x1, y1, w1, h1 = current_bbox x2, y2, w2, h2 = lines[j].bounding_box # Calculate gaps vertical_gap = y2 - (y1 + h1) if y2 > y1 + h1 else 0 # Check horizontal alignment center_x1 = x1 + w1 / 2 center_x2 = x2 + w2 / 2 horizontal_offset = abs(center_x1 - center_x2) avg_width = (w1 + w2) / 2 # Group if: # 1. Lines are vertically adjacent (small gap) # 2. Lines are well-aligned horizontally (likely same bubble) if (vertical_gap < h1 * 0.5 and # Less than half line height gap horizontal_offset < avg_width * 0.5): # Well centered # Add to group current_group.append(lines[j]) # Update bounding box to include new line min_x = min(x1, x2) min_y = min(y1, y2) max_x = max(x1 + w1, x2 + w2) max_y = max(y1 + h1, y2 + h2) current_bbox = [min_x, min_y, max_x - min_x, max_y - min_y] j += 1 else: break # Create merged region from group if len(current_group) > 1: merged_text = " ".join([line.text for line in current_group]) all_vertices = [] for line in current_group: all_vertices.extend(line.vertices) merged_region = TextRegion( text=merged_text, vertices=all_vertices, bounding_box=tuple(current_bbox), confidence=0.95, region_type='pregrouped_lines' ) pregrouped.append(merged_region) self._log(f" Pre-grouped {len(current_group)} Azure lines into block") else: # Single line, keep as is pregrouped.append(lines[i]) i = j if j > i + 1 else i + 1 self._log(f" Azure pre-grouping: {len(lines)} lines β†’ {len(pregrouped)} blocks") return pregrouped def _safe_crop_region(self, image, x, y, w, h): """Safely crop a region from image with validation""" img_h, img_w = image.shape[:2] # Validate and clamp coordinates x = max(0, min(x, img_w - 1)) y = max(0, min(y, img_h - 1)) x2 = min(x + w, img_w) y2 = min(y + h, img_h) # Ensure valid region if x2 <= x or y2 <= y: self._log(f"⚠️ Invalid crop region: ({x},{y},{w},{h}) for image {img_w}x{img_h}", "warning") return None # Minimum size check if (x2 - x) < 5 or (y2 - y) < 5: self._log(f"⚠️ Region too small: {x2-x}x{y2-y} pixels", "warning") return None cropped = image[y:y2, x:x2] if cropped.size == 0: self._log(f"⚠️ Empty crop result", "warning") return None return cropped def _prepare_ocr_rois_from_bubbles(self, image_path: str, ocr_settings: Dict, preprocessing: Dict, page_hash: str) -> List[Dict[str, Any]]: """Prepare ROI crops (bytes) from bubble detection to use with OCR locality. - Enhancements/resizing are gated by preprocessing['enabled']. - Compression/encoding is controlled by manga_settings['compression'] independently. 
Returns list of dicts: {id, bbox, bytes, type} """ try: # Run bubble detector and collect text-containing boxes detections = self._load_bubble_detector(ocr_settings, image_path) if not detections: return [] regions = [] for key in ('text_bubbles', 'text_free'): for i, (bx, by, bw, bh) in enumerate(detections.get(key, []) or []): regions.append({'type': 'text_bubble' if key == 'text_bubbles' else 'free_text', 'bbox': (int(bx), int(by), int(bw), int(bh)), 'id': f"{key}_{i}"}) if not regions: return [] # Open original image once pil = Image.open(image_path) if pil.mode != 'RGB': pil = pil.convert('RGB') pad_ratio = float(ocr_settings.get('roi_padding_ratio', 0.08)) # 8% padding default preproc_enabled = bool(preprocessing.get('enabled', False)) # Compression settings (separate from preprocessing) comp = {} try: comp = (self.main_gui.config.get('manga_settings', {}) or {}).get('compression', {}) except Exception: comp = {} comp_enabled = bool(comp.get('enabled', False)) comp_format = str(comp.get('format', 'jpeg')).lower() jpeg_q = int(comp.get('jpeg_quality', 85)) png_lvl = int(comp.get('png_compress_level', 6)) webp_q = int(comp.get('webp_quality', 85)) out = [] W, H = pil.size # Pre-filter tiny ROIs (skip before cropping) min_side_px = int(ocr_settings.get('roi_min_side_px', 12)) min_area_px = int(ocr_settings.get('roi_min_area_px', 100)) for rec in regions: x, y, w, h = rec['bbox'] if min(w, h) < max(1, min_side_px) or (w * h) < max(1, min_area_px): # Skip tiny ROI continue # Apply padding px = int(w * pad_ratio) py = int(h * pad_ratio) x1 = max(0, x - px) y1 = max(0, y - py) x2 = min(W, x + w + px) y2 = min(H, y + h + py) if x2 <= x1 or y2 <= y1: continue crop = pil.crop((x1, y1, x2, y2)) # Quality-affecting steps only when preprocessing enabled if preproc_enabled: try: # Enhance contrast/sharpness/brightness if configured c = float(preprocessing.get('contrast_threshold', 0.4)) s = float(preprocessing.get('sharpness_threshold', 0.3)) g = float(preprocessing.get('enhancement_strength', 1.5)) if c: crop = ImageEnhance.Contrast(crop).enhance(1 + c) if s: crop = ImageEnhance.Sharpness(crop).enhance(1 + s) if g and g != 1.0: crop = ImageEnhance.Brightness(crop).enhance(g) # Optional ROI resize limit (short side cap) roi_max_side = int(ocr_settings.get('roi_max_side', 0) or 0) if roi_max_side and (crop.width > roi_max_side or crop.height > roi_max_side): ratio = min(roi_max_side / crop.width, roi_max_side / crop.height) crop = crop.resize((max(1, int(crop.width * ratio)), max(1, int(crop.height * ratio))), Image.Resampling.LANCZOS) except Exception: pass # Encoding/Compression independent of preprocessing from io import BytesIO buf = BytesIO() try: if comp_enabled: if comp_format in ('jpeg', 'jpg'): if crop.mode != 'RGB': crop = crop.convert('RGB') crop.save(buf, format='JPEG', quality=max(1, min(95, jpeg_q)), optimize=True, progressive=True) elif comp_format == 'png': crop.save(buf, format='PNG', optimize=True, compress_level=max(0, min(9, png_lvl))) elif comp_format == 'webp': crop.save(buf, format='WEBP', quality=max(1, min(100, webp_q))) else: crop.save(buf, format='PNG', optimize=True) else: # Default lossless PNG crop.save(buf, format='PNG', optimize=True) img_bytes = buf.getvalue() except Exception: buf = BytesIO() crop.save(buf, format='PNG', optimize=True) img_bytes = buf.getvalue() out.append({ 'id': rec['id'], 'bbox': (x, y, w, h), # keep original bbox without padding for placement 'bytes': img_bytes, 'type': rec['type'], 'page_hash': page_hash }) return out except Exception as e: 
self._log(f"⚠️ ROI preparation failed: {e}", "warning") return [] def _google_ocr_rois_batched(self, rois: List[Dict[str, Any]], ocr_settings: Dict, batch_size: int, max_concurrency: int, page_hash: str) -> List[TextRegion]: """Batch OCR of ROI crops using Google Vision batchAnnotateImages. - Uses bounded concurrency for multiple batches in flight. - Consults and updates an in-memory ROI OCR cache. """ try: from google.cloud import vision as _vision except Exception: self._log("❌ Google Vision SDK not available for ROI batching", "error") return [] lang_hints = ocr_settings.get('language_hints', ['ja', 'ko', 'zh']) detection_mode = ocr_settings.get('text_detection_mode', 'document') feature_type = _vision.Feature.Type.DOCUMENT_TEXT_DETECTION if detection_mode == 'document' else _vision.Feature.Type.TEXT_DETECTION feature = _vision.Feature(type=feature_type) results: List[TextRegion] = [] min_text_length = int(ocr_settings.get('min_text_length', 2)) exclude_english = bool(ocr_settings.get('exclude_english_text', True)) # Check cache first and build work list of uncached ROIs work_rois = [] for roi in rois: x, y, w, h = roi['bbox'] # Include region type in cache key to prevent mismapping cache_key = ("google", page_hash, x, y, w, h, tuple(lang_hints), detection_mode, roi.get('type', 'unknown')) # THREAD-SAFE: Use lock for cache access in parallel panel translation with self._cache_lock: cached_text = self.ocr_roi_cache.get(cache_key) if cached_text: region = TextRegion( text=cached_text, vertices=[(x, y), (x+w, y), (x+w, y+h), (x, y+h)], bounding_box=(x, y, w, h), confidence=0.95, region_type='ocr_roi' ) try: region.bubble_type = 'free_text' if roi.get('type') == 'free_text' else 'text_bubble' region.should_inpaint = True except Exception: pass results.append(region) else: roi['cache_key'] = cache_key work_rois.append(roi) if not work_rois: return results # Create batches batch_size = max(1, batch_size) batches = [work_rois[i:i+batch_size] for i in range(0, len(work_rois), batch_size)] max_concurrency = max(1, int(max_concurrency or 1)) def do_batch(batch): # RATE LIMITING: Add small delay before batch submission import time import random time.sleep(0.1 + random.random() * 0.2) # 0.1-0.3s random delay requests = [] for roi in batch: img = _vision.Image(content=roi['bytes']) ctx = _vision.ImageContext(language_hints=list(lang_hints)) req = _vision.AnnotateImageRequest(image=img, features=[feature], image_context=ctx) requests.append(req) return self.vision_client.batch_annotate_images(requests=requests), batch # Execute with concurrency if max_concurrency == 1 or len(batches) == 1: iter_batches = [(self.vision_client.batch_annotate_images(requests=[ _vision.AnnotateImageRequest(image=_vision.Image(content=roi['bytes']), features=[feature], image_context=_vision.ImageContext(language_hints=list(lang_hints))) for roi in batch ]), batch) for batch in batches] else: from concurrent.futures import ThreadPoolExecutor, as_completed iter_batches = [] with ThreadPoolExecutor(max_workers=max_concurrency) as ex: futures = [ex.submit(do_batch, b) for b in batches] for fut in as_completed(futures): try: iter_batches.append(fut.result()) except Exception as e: self._log(f"⚠️ Google batch failed: {e}", "warning") continue # Consume responses and update cache for resp, batch in iter_batches: for roi, ann in zip(batch, resp.responses): if getattr(ann, 'error', None) and ann.error.message: self._log(f"⚠️ ROI OCR error: {ann.error.message}", "warning") continue text = '' try: if getattr(ann, 
'full_text_annotation', None) and ann.full_text_annotation.text: text = ann.full_text_annotation.text elif ann.text_annotations: text = ann.text_annotations[0].description except Exception: text = '' text = (text or '').strip() text_clean = self._sanitize_unicode_characters(self._fix_encoding_issues(text)) if len(text_clean.strip()) < min_text_length: continue if exclude_english and self._is_primarily_english(text_clean): continue x, y, w, h = roi['bbox'] # Update cache # THREAD-SAFE: Use lock for cache write in parallel panel translation try: ck = roi.get('cache_key') or ("google", page_hash, x, y, w, h, tuple(lang_hints), detection_mode) with self._cache_lock: self.ocr_roi_cache[ck] = text_clean except Exception: pass region = TextRegion( text=text_clean, vertices=[(x, y), (x+w, y), (x+w, y+h), (x, y+h)], bounding_box=(x, y, w, h), confidence=0.95, region_type='ocr_roi' ) try: region.bubble_type = 'free_text' if roi.get('type') == 'free_text' else 'text_bubble' region.should_inpaint = True except Exception: pass results.append(region) return results def _azure_ocr_rois_concurrent(self, rois: List[Dict[str, Any]], ocr_settings: Dict, max_workers: int, page_hash: str) -> List[TextRegion]: """Concurrent ROI OCR for Azure Read API. Each ROI is sent as a separate call. Concurrency is bounded by max_workers. Consults/updates cache. """ from concurrent.futures import ThreadPoolExecutor, as_completed from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes import io results: List[TextRegion] = [] # Read settings reading_order = ocr_settings.get('azure_reading_order', 'natural') model_version = ocr_settings.get('azure_model_version', 'latest') language_hints = ocr_settings.get('language_hints', ['ja']) read_params = {'raw': True, 'readingOrder': reading_order} if model_version != 'latest': read_params['model-version'] = model_version if len(language_hints) == 1: lang_mapping = {'zh': 'zh-Hans', 'zh-TW': 'zh-Hant', 'zh-CN': 'zh-Hans', 'ja': 'ja', 'ko': 'ko', 'en': 'en'} read_params['language'] = lang_mapping.get(language_hints[0], language_hints[0]) min_text_length = int(ocr_settings.get('min_text_length', 2)) exclude_english = bool(ocr_settings.get('exclude_english_text', True)) # Check cache first and split into cached vs work rois cached_regions: List[TextRegion] = [] work_rois: List[Dict[str, Any]] = [] for roi in rois: x, y, w, h = roi['bbox'] # Include region type in cache key to prevent mismapping cache_key = ("azure", page_hash, x, y, w, h, reading_order, roi.get('type', 'unknown')) # THREAD-SAFE: Use lock for cache access in parallel panel translation with self._cache_lock: text_cached = self.ocr_roi_cache.get(cache_key) if text_cached: region = TextRegion( text=text_cached, vertices=[(x, y), (x+w, y), (x+w, y+h), (x, y+h)], bounding_box=(x, y, w, h), confidence=0.95, region_type='ocr_roi' ) try: region.bubble_type = 'free_text' if roi.get('type') == 'free_text' else 'text_bubble' region.should_inpaint = True except Exception: pass cached_regions.append(region) else: roi['cache_key'] = cache_key work_rois.append(roi) def ocr_one(roi): try: # RATE LIMITING: Add delay between Azure API calls to avoid "Too Many Requests" import time import random # Stagger requests with randomized delay time.sleep(0.1 + random.random() * 0.2) # 0.1-0.3s random delay # Ensure Azure-supported format for ROI bytes; honor compression preference when possible data = roi['bytes'] try: from PIL import Image as _PILImage im = _PILImage.open(io.BytesIO(data)) fmt = (im.format or 
'').lower() if fmt not in ['jpeg', 'jpg', 'png', 'bmp', 'tiff']: # Choose conversion target based on compression settings if available try: comp_cfg = (self.main_gui.config.get('manga_settings', {}) or {}).get('compression', {}) except Exception: comp_cfg = {} target_fmt = 'PNG' try: if comp_cfg.get('enabled', False): cf = str(comp_cfg.get('format', '')).lower() if cf in ('jpeg', 'jpg'): target_fmt = 'JPEG' elif cf == 'png': target_fmt = 'PNG' elif cf == 'bmp': target_fmt = 'BMP' elif cf == 'tiff': target_fmt = 'TIFF' except Exception: pass buf2 = io.BytesIO() if target_fmt == 'JPEG' and im.mode != 'RGB': im = im.convert('RGB') im.save(buf2, format=target_fmt) data = buf2.getvalue() except Exception: pass stream = io.BytesIO(data) read_response = self.vision_client.read_in_stream(stream, **read_params) op_loc = read_response.headers.get('Operation-Location') if hasattr(read_response, 'headers') else None if not op_loc: return None op_id = op_loc.split('/')[-1] # Poll import time waited = 0.0 poll_interval = float(ocr_settings.get('azure_poll_interval', 0.5)) max_wait = float(ocr_settings.get('azure_max_wait', 60)) while waited < max_wait: result = self.vision_client.get_read_result(op_id) if result.status not in [OperationStatusCodes.running, OperationStatusCodes.not_started]: break time.sleep(poll_interval) waited += poll_interval if result.status != OperationStatusCodes.succeeded: return None # Aggregate text lines texts = [] for page in result.analyze_result.read_results: for line in page.lines: t = self._sanitize_unicode_characters(self._fix_encoding_issues(line.text or '')) if t: texts.append(t) text_all = ' '.join(texts).strip() if len(text_all) < min_text_length: return None if exclude_english and self._is_primarily_english(text_all): return None x, y, w, h = roi['bbox'] # Update cache # THREAD-SAFE: Use lock for cache write in parallel panel translation try: ck = roi.get('cache_key') if ck: with self._cache_lock: self.ocr_roi_cache[ck] = text_all except Exception: pass region = TextRegion( text=text_all, vertices=[(x, y), (x+w, y), (x+w, y+h), (x, y+h)], bounding_box=(x, y, w, h), confidence=0.95, region_type='ocr_roi' ) try: region.bubble_type = 'free_text' if roi.get('type') == 'free_text' else 'text_bubble' region.should_inpaint = True except Exception: pass return region except Exception: return None # Combine cached and new results results.extend(cached_regions) if work_rois: max_workers = max(1, min(max_workers, len(work_rois))) with ThreadPoolExecutor(max_workers=max_workers) as ex: fut_map = {ex.submit(ocr_one, r): r for r in work_rois} for fut in as_completed(fut_map): reg = fut.result() if reg is not None: results.append(reg) return results def _detect_text_azure(self, image_data: bytes, ocr_settings: dict) -> List[TextRegion]: """Detect text using Azure Computer Vision""" import io from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes stream = io.BytesIO(image_data) # Use Read API for better manga text detection read_result = self.vision_client.read_in_stream( stream, raw=True, language='ja' # or from ocr_settings ) # Get operation ID from headers operation_location = read_result.headers["Operation-Location"] operation_id = operation_location.split("/")[-1] # Wait for completion import time while True: result = self.vision_client.get_read_result(operation_id) if result.status not in [OperationStatusCodes.running, OperationStatusCodes.not_started]: break time.sleep(0.1) # Brief pause for stability logger.debug("πŸ’€ Azure text detection 
pausing briefly for stability") regions = [] confidence_threshold = ocr_settings.get('confidence_threshold', 0.6) if result.status == OperationStatusCodes.succeeded: for page in result.analyze_result.read_results: for line in page.lines: # Azure returns bounding box as 8 coordinates bbox = line.bounding_box vertices = [ (bbox[0], bbox[1]), (bbox[2], bbox[3]), (bbox[4], bbox[5]), (bbox[6], bbox[7]) ] xs = [v[0] for v in vertices] ys = [v[1] for v in vertices] x_min, x_max = min(xs), max(xs) y_min, y_max = min(ys), max(ys) # Azure doesn't provide per-line confidence in Read API confidence = 0.95 # Default high confidence if confidence >= confidence_threshold: region = TextRegion( text=line.text, vertices=vertices, bounding_box=(x_min, y_min, x_max - x_min, y_max - y_min), confidence=confidence, region_type='text_line' ) regions.append(region) return regions def _load_image_with_compression_only(self, image_path: str, comp: Dict) -> bytes: """Load image and apply compression settings only (no enhancements/resizing).""" from io import BytesIO pil = Image.open(image_path) if pil.mode != 'RGB': pil = pil.convert('RGB') buf = BytesIO() try: fmt = str(comp.get('format', 'jpeg')).lower() if fmt in ('jpeg', 'jpg'): q = max(1, min(95, int(comp.get('jpeg_quality', 85)))) pil.save(buf, format='JPEG', quality=q, optimize=True, progressive=True) elif fmt == 'png': lvl = max(0, min(9, int(comp.get('png_compress_level', 6)))) pil.save(buf, format='PNG', optimize=True, compress_level=lvl) elif fmt == 'webp': wq = max(1, min(100, int(comp.get('webp_quality', 85)))) pil.save(buf, format='WEBP', quality=wq) else: pil.save(buf, format='PNG', optimize=True) except Exception: pil.save(buf, format='PNG', optimize=True) return buf.getvalue() def _preprocess_image(self, image_path: str, preprocessing_settings: Dict) -> bytes: """Preprocess image for better OCR results - Enhancements/resizing controlled by preprocessing_settings - Compression controlled by manga_settings['compression'] independently """ try: # Open image with PIL pil_image = Image.open(image_path) # Convert to RGB if necessary if pil_image.mode != 'RGB': pil_image = pil_image.convert('RGB') # Auto-detect quality issues if enabled if preprocessing_settings.get('auto_detect_quality', True): needs_enhancement = self._detect_quality_issues(pil_image, preprocessing_settings) if needs_enhancement: self._log(" Auto-detected quality issues - applying enhancements") else: needs_enhancement = True if needs_enhancement: # Apply contrast enhancement contrast_threshold = preprocessing_settings.get('contrast_threshold', 0.4) enhancer = ImageEnhance.Contrast(pil_image) pil_image = enhancer.enhance(1 + contrast_threshold) # Apply sharpness enhancement sharpness_threshold = preprocessing_settings.get('sharpness_threshold', 0.3) enhancer = ImageEnhance.Sharpness(pil_image) pil_image = enhancer.enhance(1 + sharpness_threshold) # Apply general enhancement strength enhancement_strength = preprocessing_settings.get('enhancement_strength', 1.5) if enhancement_strength != 1.0: # Brightness adjustment enhancer = ImageEnhance.Brightness(pil_image) pil_image = enhancer.enhance(enhancement_strength) # Resize if too large max_dimension = preprocessing_settings.get('max_image_dimension', 2000) if pil_image.width > max_dimension or pil_image.height > max_dimension: ratio = min(max_dimension / pil_image.width, max_dimension / pil_image.height) new_size = (int(pil_image.width * ratio), int(pil_image.height * ratio)) pil_image = pil_image.resize(new_size, Image.Resampling.LANCZOS) 
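# The resize above preserves aspect ratio by applying the same scale factor to both
# sides. Minimal standalone sketch of that computation (assumes Pillow, as used
# throughout this module; the helper name `cap_dimension` is hypothetical):
#
#     from PIL import Image
#
#     def cap_dimension(img: Image.Image, max_dim: int) -> Image.Image:
#         """Downscale img so its longest side is at most max_dim, keeping aspect ratio."""
#         ratio = min(max_dim / img.width, max_dim / img.height)
#         if ratio >= 1:
#             return img  # already within bounds
#         new_size = (int(img.width * ratio), int(img.height * ratio))
#         return img.resize(new_size, Image.Resampling.LANCZOS)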
self._log(f" Resized image to {new_size[0]}x{new_size[1]}") # Convert back to bytes with compression settings from global config from io import BytesIO buffered = BytesIO() comp = {} try: comp = (self.main_gui.config.get('manga_settings', {}) or {}).get('compression', {}) except Exception: comp = {} try: if comp.get('enabled', False): fmt = str(comp.get('format', 'jpeg')).lower() if fmt in ('jpeg', 'jpg'): if pil_image.mode != 'RGB': pil_image = pil_image.convert('RGB') quality = max(1, min(95, int(comp.get('jpeg_quality', 85)))) pil_image.save(buffered, format='JPEG', quality=quality, optimize=True, progressive=True) self._log(f" Compressed image as JPEG (q={quality})") elif fmt == 'png': level = max(0, min(9, int(comp.get('png_compress_level', 6)))) pil_image.save(buffered, format='PNG', optimize=True, compress_level=level) self._log(f" Compressed image as PNG (level={level})") elif fmt == 'webp': q = max(1, min(100, int(comp.get('webp_quality', 85)))) pil_image.save(buffered, format='WEBP', quality=q) self._log(f" Compressed image as WEBP (q={q})") else: pil_image.save(buffered, format='PNG', optimize=True) self._log(" Unknown compression format; saved as optimized PNG") else: pil_image.save(buffered, format='PNG', optimize=True) except Exception as _e: self._log(f" ⚠️ Compression failed ({_e}); saved as optimized PNG", "warning") pil_image.save(buffered, format='PNG', optimize=True) return buffered.getvalue() except Exception as e: self._log(f"⚠️ Preprocessing failed: {str(e)}, using original image", "warning") with open(image_path, 'rb') as f: return f.read() def _detect_quality_issues(self, image: Image.Image, settings: Dict) -> bool: """Auto-detect if image needs quality enhancement""" # Convert to grayscale for analysis gray = image.convert('L') # Get histogram hist = gray.histogram() # Calculate contrast (simplified) pixels = sum(hist) mean = sum(i * hist[i] for i in range(256)) / pixels variance = sum(hist[i] * (i - mean) ** 2 for i in range(256)) / pixels std_dev = variance ** 0.5 # Low contrast if std deviation is low contrast_threshold = settings.get('contrast_threshold', 0.4) * 100 if std_dev < contrast_threshold: self._log(" Low contrast detected") return True # Check for blur using Laplacian variance import numpy as np gray_array = np.array(gray) laplacian = cv2.Laplacian(gray_array, cv2.CV_64F) variance = laplacian.var() sharpness_threshold = settings.get('sharpness_threshold', 0.3) * 100 if variance < sharpness_threshold: self._log(" Blur detected") return True return False def _save_debug_image(self, image_path: str, regions: List[TextRegion], debug_base_dir: str = None): """Save debug image with detected regions highlighted, respecting save_intermediate toggle. 
All files are written under /debug (or provided debug_base_dir).""" advanced_settings = self.manga_settings.get('advanced', {}) # Skip debug images in batch mode unless explicitly requested if self.batch_mode and not advanced_settings.get('force_debug_batch', False): return # Respect the 'Save intermediate images' toggle only if not advanced_settings.get('save_intermediate', False): return # Compute debug directory under translated_images if debug_base_dir is None: translated_dir = os.path.join(os.path.dirname(image_path), 'translated_images') debug_dir = os.path.join(translated_dir, 'debug') else: debug_dir = os.path.join(debug_base_dir, 'debug') os.makedirs(debug_dir, exist_ok=True) base_name = os.path.splitext(os.path.basename(image_path))[0] try: import cv2 import numpy as np from PIL import Image as PILImage # Handle Unicode paths try: img = cv2.imread(image_path) if img is None: # Fallback to PIL for Unicode paths pil_image = PILImage.open(image_path) img = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR) except Exception as e: self._log(f" Failed to load image for debug: {str(e)}", "warning") return # Debug directory prepared earlier; compute base name # base_name already computed above # Draw rectangles around detected text regions overlay = img.copy() # Calculate statistics total_chars = sum(len(r.text) for r in regions) avg_confidence = np.mean([r.confidence for r in regions]) if regions else 0 for i, region in enumerate(regions): # Convert to int to avoid OpenCV type errors x, y, w, h = map(int, region.bounding_box) # Color based on confidence if region.confidence > 0.95: color = (0, 255, 0) # Green - high confidence elif region.confidence > 0.8: color = (0, 165, 255) # Orange - medium confidence else: color = (0, 0, 255) # Red - low confidence # Draw rectangle cv2.rectangle(overlay, (x, y), (x + w, y + h), color, 2) # Add region info info_text = f"#{i} ({region.confidence:.2f})" cv2.putText(overlay, info_text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1, cv2.LINE_AA) # Add character count char_count = len(region.text.strip()) cv2.putText(overlay, f"{char_count} chars", (x, y + h + 15), cv2.FONT_HERSHEY_SIMPLEX, 0.4, color, 1, cv2.LINE_AA) # Add detected text preview if in verbose debug mode if self.manga_settings.get('advanced', {}).get('save_intermediate', False): text_preview = region.text[:20] + "..." 
if len(region.text) > 20 else region.text cv2.putText(overlay, text_preview, (x, y + h + 30), cv2.FONT_HERSHEY_SIMPLEX, 0.4, color, 1, cv2.LINE_AA) # Add overall statistics to the image stats_bg = overlay.copy() cv2.rectangle(stats_bg, (10, 10), (300, 90), (0, 0, 0), -1) cv2.addWeighted(stats_bg, 0.7, overlay, 0.3, 0, overlay) stats_text = [ f"Regions: {len(regions)}", f"Total chars: {total_chars}", f"Avg confidence: {avg_confidence:.2f}" ] for i, text in enumerate(stats_text): cv2.putText(overlay, text, (20, 35 + i*20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1, cv2.LINE_AA) # Save main debug image (always under translated_images/debug when enabled) debug_path = os.path.join(debug_dir, f"{base_name}_debug_regions.png") cv2.imwrite(debug_path, overlay) self._log(f" πŸ“Έ Saved debug image: {debug_path}") # Save text mask mask = self.create_text_mask(img, regions) mask_debug_path = debug_path.replace('_debug', '_mask') cv2.imwrite(mask_debug_path, mask) mask_percentage = ((mask > 0).sum() / mask.size) * 100 self._log(f" 🎭 Saved mask image: {mask_debug_path}", "info") self._log(f" πŸ“Š Mask coverage: {mask_percentage:.1f}% of image", "info") # If save_intermediate is enabled, save additional debug images if self.manga_settings.get('advanced', {}).get('save_intermediate', False): # Save confidence heatmap heatmap = self._create_confidence_heatmap(img, regions) heatmap_path = os.path.join(debug_dir, f"{base_name}_confidence_heatmap.png") cv2.imwrite(heatmap_path, heatmap) self._log(f" 🌑️ Saved confidence heatmap: {heatmap_path}") # Save polygon visualization with safe text areas if any(hasattr(r, 'vertices') and r.vertices for r in regions): polygon_img = img.copy() for region in regions: if hasattr(region, 'vertices') and region.vertices: # Draw polygon pts = np.array(region.vertices, np.int32) pts = pts.reshape((-1, 1, 2)) # Fill with transparency overlay_poly = polygon_img.copy() cv2.fillPoly(overlay_poly, [pts], (0, 255, 255)) cv2.addWeighted(overlay_poly, 0.2, polygon_img, 0.8, 0, polygon_img) # Draw outline cv2.polylines(polygon_img, [pts], True, (255, 0, 0), 2) # Draw safe text area try: safe_x, safe_y, safe_w, safe_h = self.get_safe_text_area(region) # Convert to int for OpenCV safe_x, safe_y, safe_w, safe_h = map(int, (safe_x, safe_y, safe_w, safe_h)) cv2.rectangle(polygon_img, (safe_x, safe_y), (safe_x + safe_w, safe_y + safe_h), (0, 255, 0), 1) except: pass # Skip if get_safe_text_area fails # Add legend to explain colors legend_bg = polygon_img.copy() legend_height = 140 legend_width = 370 cv2.rectangle(legend_bg, (10, 10), (10 + legend_width, 10 + legend_height), (0, 0, 0), -1) cv2.addWeighted(legend_bg, 0.8, polygon_img, 0.2, 0, polygon_img) # Add legend items # Note: OpenCV uses BGR format, so (255, 0, 0) = Blue, (0, 0, 255) = Red legend_items = [ ("Blue outline: OCR polygon (detected text)", (255, 0, 0)), ("Yellow fill: Mask area (will be inpainted)", (0, 255, 255)), ("Green rect: Safe text area (algorithm-based)", (0, 255, 0)), ("Magenta rect: Mask bounds (actual render area)", (255, 0, 255)) ] for i, (text, color) in enumerate(legend_items): y_pos = 30 + i * 30 # Draw color sample if i == 1: # Yellow fill cv2.rectangle(polygon_img, (20, y_pos - 8), (35, y_pos + 8), color, -1) else: cv2.rectangle(polygon_img, (20, y_pos - 8), (35, y_pos + 8), color, 2) # Draw text cv2.putText(polygon_img, text, (45, y_pos + 5), cv2.FONT_HERSHEY_SIMPLEX, 0.45, (255, 255, 255), 1, cv2.LINE_AA) polygon_path = os.path.join(debug_dir, f"{base_name}_polygons.png") cv2.imwrite(polygon_path, 
polygon_img) self._log(f" πŸ”· Saved polygon visualization: {polygon_path}") # Save individual region crops with more info regions_dir = os.path.join(debug_dir, 'regions') os.makedirs(regions_dir, exist_ok=True) for i, region in enumerate(regions[:10]): # Limit to first 10 regions # Convert to int to avoid OpenCV type errors x, y, w, h = map(int, region.bounding_box) # Add padding pad = 10 x1 = max(0, x - pad) y1 = max(0, y - pad) x2 = min(img.shape[1], x + w + pad) y2 = min(img.shape[0], y + h + pad) region_crop = img[y1:y2, x1:x2].copy() # Draw bounding box on crop cv2.rectangle(region_crop, (pad, pad), (pad + w, pad + h), (0, 255, 0), 2) # Add text info on the crop info = f"Conf: {region.confidence:.2f} | Chars: {len(region.text)}" cv2.putText(region_crop, info, (5, 15), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1, cv2.LINE_AA) # Save with meaningful filename safe_text = region.text[:20].replace('/', '_').replace('\\', '_').strip() region_path = os.path.join(regions_dir, f"region_{i:03d}_{safe_text}.png") cv2.imwrite(region_path, region_crop) self._log(f" πŸ“ Saved individual region crops to: {regions_dir}") except Exception as e: self._log(f" ❌ Failed to save debug image: {str(e)}", "warning") if self.manga_settings.get('advanced', {}).get('debug_mode', False): # If debug mode is on, log the full traceback import traceback self._log(traceback.format_exc(), "warning") def _create_confidence_heatmap(self, img, regions): """Create a heatmap showing OCR confidence levels""" heatmap = np.zeros_like(img[:, :, 0], dtype=np.float32) for region in regions: # Convert to int for array indexing x, y, w, h = map(int, region.bounding_box) confidence = region.confidence heatmap[y:y+h, x:x+w] = confidence # Convert to color heatmap heatmap_normalized = (heatmap * 255).astype(np.uint8) heatmap_colored = cv2.applyColorMap(heatmap_normalized, cv2.COLORMAP_JET) # Blend with original image result = cv2.addWeighted(img, 0.7, heatmap_colored, 0.3, 0) return result def _get_translation_history_context(self) -> List[Dict[str, str]]: """Get translation history context from HistoryManager""" if not self.history_manager or not self.contextual_enabled: return [] try: # Load full history full_history = self.history_manager.load_history() if not full_history: return [] # Extract only the contextual messages up to the limit context = [] exchange_count = 0 # Process history in pairs (user + assistant messages) for i in range(0, len(full_history), 2): if i + 1 < len(full_history): user_msg = full_history[i] assistant_msg = full_history[i + 1] if user_msg.get("role") == "user" and assistant_msg.get("role") == "assistant": context.extend([user_msg, assistant_msg]) exchange_count += 1 # Only keep up to the history limit if exchange_count >= self.translation_history_limit: # Get only the most recent exchanges context = context[-(self.translation_history_limit * 2):] break return context except Exception as e: self._log(f"⚠️ Error loading history context: {str(e)}", "warning") return [] def translate_text(self, text: str, context: Optional[List[Dict]] = None, image_path: str = None, region: TextRegion = None) -> str: """Translate text using API with GUI system prompt and full image context""" try: # Build per-request log prefix for clearer parallel logs try: import threading thread_name = threading.current_thread().name except Exception: thread_name = "MainThread" bbox_info = "" try: if region and hasattr(region, 'bounding_box') and region.bounding_box: x, y, w, h = region.bounding_box bbox_info = f" 
[bbox={x},{y},{w}x{h}]" except Exception: pass prefix = f"[{thread_name}]{bbox_info}" self._log(f"\n{prefix} 🌐 Starting translation for text: '{text[:50]}...'") # CHECK 1: Before starting if self._check_stop(): self._log("⏹️ Translation stopped before full page context processing", "warning") return {} # Get system prompt from GUI profile profile_name = self.main_gui.profile_var.get() # Get the prompt from prompt_profiles dictionary system_prompt = '' if hasattr(self.main_gui, 'prompt_profiles') and profile_name in self.main_gui.prompt_profiles: system_prompt = self.main_gui.prompt_profiles[profile_name] self._log(f"πŸ“‹ Using profile: {profile_name}") else: self._log(f"⚠️ Profile '{profile_name}' not found in prompt_profiles", "warning") self._log(f"{prefix} πŸ“ System prompt: {system_prompt[:100]}..." if system_prompt else f"{prefix} πŸ“ No system prompt configured") if system_prompt: messages = [{"role": "system", "content": system_prompt}] else: messages = [] # Add contextual translations if enabled if self.contextual_enabled and self.history_manager: # Get history from HistoryManager history_context = self._get_translation_history_context() if history_context: context_count = len(history_context) // 2 # Each exchange is 2 messages self._log(f"πŸ”— Adding {context_count} previous exchanges from history (limit: {self.translation_history_limit})") messages.extend(history_context) else: self._log(f"πŸ”— Contextual enabled but no history available yet") else: self._log(f"{prefix} πŸ”— Contextual: {'Disabled' if not self.contextual_enabled else 'No HistoryManager'}") # Add full image context if available AND visual context is enabled if image_path and self.visual_context_enabled: try: import base64 from PIL import Image as PILImage self._log(f"{prefix} πŸ“· Adding full page visual context for translation") # Read and encode the full image with open(image_path, 'rb') as img_file: img_data = img_file.read() # Check image size img_size_mb = len(img_data) / (1024 * 1024) self._log(f"{prefix} πŸ“Š Image size: {img_size_mb:.2f} MB") # Optionally resize if too large (Gemini has limits) if img_size_mb > 10: # If larger than 10MB self._log(f"πŸ“‰ Resizing large image for API limits...") pil_image = PILImage.open(image_path) # Calculate new size (max 2048px on longest side) max_size = 2048 ratio = min(max_size / pil_image.width, max_size / pil_image.height) if ratio < 1: new_size = (int(pil_image.width * ratio), int(pil_image.height * ratio)) pil_image = pil_image.resize(new_size, PILImage.Resampling.LANCZOS) # Re-encode from io import BytesIO buffered = BytesIO() pil_image.save(buffered, format="PNG", optimize=True) img_data = buffered.getvalue() self._log(f"{prefix} βœ… Resized to {new_size[0]}x{new_size[1]}px ({len(img_data)/(1024*1024):.2f} MB)") # Encode to base64 img_base64 = base64.b64encode(img_data).decode('utf-8') # Build the message with image and text location info location_description = "" if region: x, y, w, h = region.bounding_box # Describe where on the page this text is located page_width = PILImage.open(image_path).width page_height = PILImage.open(image_path).height # Determine position h_pos = "left" if x < page_width/3 else "center" if x < 2*page_width/3 else "right" v_pos = "top" if y < page_height/3 else "middle" if y < 2*page_height/3 else "bottom" location_description = f"\n\nThe text to translate is located in the {v_pos}-{h_pos} area of the page, " location_description += f"at coordinates ({x}, {y}) with size {w}x{h} pixels." 
# Add image and text to translate messages.append({ "role": "user", "content": [ { "type": "image_url", "image_url": { "url": f"data:image/png;base64,{img_base64}" } }, { "type": "text", "text": f"Looking at this full manga page, translate the following text: '{text}'{location_description}" } ] }) self._log(f"{prefix} βœ… Added full page image as visual context") except Exception as e: self._log(f"⚠️ Failed to add image context: {str(e)}", "warning") self._log(f" Error type: {type(e).__name__}", "warning") import traceback self._log(traceback.format_exc(), "warning") # Fall back to text-only translation messages.append({"role": "user", "content": text}) elif image_path and not self.visual_context_enabled: # Visual context disabled - text-only mode self._log(f"{prefix} πŸ“ Text-only mode (visual context disabled)") messages.append({"role": "user", "content": text}) else: # No image path provided - text-only translation messages.append({"role": "user", "content": text}) # Check input token limit text_tokens = 0 image_tokens = 0 for msg in messages: if isinstance(msg.get("content"), str): # Simple text message text_tokens += len(msg["content"]) // 4 elif isinstance(msg.get("content"), list): # Message with mixed content (text + image) for content_part in msg["content"]: if content_part.get("type") == "text": text_tokens += len(content_part.get("text", "")) // 4 elif content_part.get("type") == "image_url": # Only count image tokens if visual context is enabled if self.visual_context_enabled: image_tokens += 258 estimated_tokens = text_tokens + image_tokens # Check token limit only if it's enabled if self.input_token_limit is None: self._log(f"{prefix} πŸ“Š Token estimate - Text: {text_tokens}, Images: {image_tokens} (Total: {estimated_tokens} / unlimited)") else: self._log(f"{prefix} πŸ“Š Token estimate - Text: {text_tokens}, Images: {image_tokens} (Total: {estimated_tokens} / {self.input_token_limit})") if estimated_tokens > self.input_token_limit: self._log(f"⚠️ Token limit exceeded, trimming context", "warning") # Keep system prompt, image, and current text only if image_path: messages = [messages[0], messages[-1]] else: messages = [messages[0], {"role": "user", "content": text}] # Recalculate tokens after trimming text_tokens = len(messages[0]["content"]) // 4 if isinstance(messages[-1].get("content"), str): text_tokens += len(messages[-1]["content"]) // 4 else: text_tokens += len(messages[-1]["content"][0]["text"]) // 4 estimated_tokens = text_tokens + image_tokens self._log(f"πŸ“Š Trimmed token estimate: {estimated_tokens}") start_time = time.time() api_time = 0 # Initialize to avoid NameError try: response = send_with_interrupt( messages=messages, client=self.client, temperature=self.temperature, max_tokens=self.max_tokens, stop_check_fn=self._check_stop ) api_time = time.time() - start_time self._log(f"{prefix} βœ… API responded in {api_time:.2f} seconds") # Normalize response to plain text (handle tuples and bytes) if hasattr(response, 'content'): response_text = response.content else: response_text = response # Handle tuple response like (text, 'stop') from some clients if isinstance(response_text, tuple): response_text = response_text[0] # Decode bytes/bytearray if isinstance(response_text, (bytes, bytearray)): try: response_text = response_text.decode('utf-8', errors='replace') except Exception: response_text = str(response_text) # Ensure string if not isinstance(response_text, str): response_text = str(response_text) response_text = response_text.strip() # If it's a stringified 
tuple like "('text', 'stop')", extract the first element if response_text.startswith("('") or response_text.startswith('("'): import ast, re try: parsed_tuple = ast.literal_eval(response_text) if isinstance(parsed_tuple, tuple) and parsed_tuple: response_text = str(parsed_tuple[0]) self._log("πŸ“¦ Extracted response from tuple literal", "debug") except Exception: match = re.match(r"^\('(.+?)',\s*'.*'\)$", response_text, re.DOTALL) if match: tmp = match.group(1) tmp = tmp.replace('\\n', '\n').replace("\\'", "'").replace('\\\"', '"').replace('\\\\', '\\') response_text = tmp self._log("πŸ“¦ Extracted response using regex from tuple literal", "debug") self._log(f"{prefix} πŸ“₯ Received response ({len(response_text)} chars)") except Exception as api_error: api_time = time.time() - start_time error_str = str(api_error).lower() error_type = type(api_error).__name__ # Check for specific error types if "429" in error_str or "rate limit" in error_str: self._log(f"⚠️ RATE LIMIT ERROR (429) after {api_time:.2f}s", "error") self._log(f" The API rate limit has been exceeded", "error") self._log(f" Please wait before retrying or reduce request frequency", "error") self._log(f" Error details: {str(api_error)}", "error") raise Exception(f"Rate limit exceeded (429): {str(api_error)}") elif "401" in error_str or "unauthorized" in error_str: self._log(f"❌ AUTHENTICATION ERROR (401) after {api_time:.2f}s", "error") self._log(f" Invalid API key or authentication failed", "error") self._log(f" Please check your API key in settings", "error") self._log(f" Error details: {str(api_error)}", "error") raise Exception(f"Authentication failed (401): {str(api_error)}") elif "403" in error_str or "forbidden" in error_str: self._log(f"❌ FORBIDDEN ERROR (403) after {api_time:.2f}s", "error") self._log(f" Access denied - check API permissions", "error") self._log(f" Error details: {str(api_error)}", "error") raise Exception(f"Access forbidden (403): {str(api_error)}") elif "400" in error_str or "bad request" in error_str: self._log(f"❌ BAD REQUEST ERROR (400) after {api_time:.2f}s", "error") self._log(f" Invalid request format or parameters", "error") self._log(f" Error details: {str(api_error)}", "error") raise Exception(f"Bad request (400): {str(api_error)}") elif "timeout" in error_str: self._log(f"⏱️ TIMEOUT ERROR after {api_time:.2f}s", "error") self._log(f" API request timed out", "error") self._log(f" Consider increasing timeout or retry", "error") self._log(f" Error details: {str(api_error)}", "error") raise Exception(f"Request timeout: {str(api_error)}") else: # Generic API error self._log(f"❌ API ERROR ({error_type}) after {api_time:.2f}s", "error") self._log(f" Error details: {str(api_error)}", "error") self._log(f" Full traceback:", "error") self._log(traceback.format_exc(), "error") raise # Initialize translated with extracted response text to avoid UnboundLocalError if response_text is None: translated = "" elif isinstance(response_text, str): translated = response_text elif isinstance(response_text, (bytes, bytearray)): try: translated = response_text.decode('utf-8', errors='replace') except Exception: translated = str(response_text) else: translated = str(response_text) # ADD THIS DEBUG CODE: self._log(f"πŸ” RAW API RESPONSE DEBUG:", "debug") self._log(f" Type: {type(translated)}", "debug") #self._log(f" Raw content length: {len(translated)}", "debug") #self._log(f" First 200 chars: {translated[:200]}", "debug") #self._log(f" Last 200 chars: {translated[-200:]}", "debug") # Check if both Japanese and 
English are present has_japanese = any('\u3040' <= c <= '\u9fff' or '\uac00' <= c <= '\ud7af' for c in translated) has_english = any('a' <= c.lower() <= 'z' for c in translated) if has_japanese and has_english: self._log(f" ⚠️ WARNING: Response contains BOTH Japanese AND English!", "warning") self._log(f" This might be causing the duplicate text issue", "warning") # Check if response looks like JSON (contains both { and } and : characters) if '{' in translated and '}' in translated and ':' in translated: try: # It might be JSON, try to fix and parse it fixed_json = self._fix_json_response(translated) import json parsed = json.loads(fixed_json) # If it's a dict with a single translation, extract it if isinstance(parsed, dict) and len(parsed) == 1: translated = list(parsed.values())[0] translated = self._clean_translation_text(translated) self._log("πŸ“¦ Extracted translation from JSON response", "debug") except: # Not JSON or failed to parse, use as-is pass self._log(f"{prefix} πŸ” Raw response type: {type(translated)}") self._log(f"{prefix} πŸ” Raw response content: '{translated[:5000]}...'") # Check if the response looks like a Python literal (tuple/string representation) if translated.startswith("('") or translated.startswith('("') or translated.startswith("('''"): self._log(f"⚠️ Detected Python literal in response, attempting to extract actual text", "warning") original = translated try: # Try to evaluate it as a Python literal import ast evaluated = ast.literal_eval(translated) self._log(f"πŸ“¦ Evaluated type: {type(evaluated)}") if isinstance(evaluated, tuple): # Take the first element of the tuple translated = str(evaluated[0]) self._log(f"πŸ“¦ Extracted from tuple: '{translated[:50]}...'") elif isinstance(evaluated, str): translated = evaluated self._log(f"πŸ“¦ Extracted string: '{translated[:50]}...'") else: self._log(f"⚠️ Unexpected type after eval: {type(evaluated)}", "warning") except Exception as e: self._log(f"⚠️ Failed to parse Python literal: {e}", "warning") self._log(f"⚠️ Original content: {original[:200]}", "warning") # Try multiple levels of unescaping temp = translated for i in range(5): # Try up to 5 levels of unescaping if temp.startswith("('") or temp.startswith('("'): # Try regex as fallback import re match = re.search(r"^\(['\"](.+)['\"]\)$", temp, re.DOTALL) if match: temp = match.group(1) self._log(f"πŸ“¦ Regex extracted (level {i+1}): '{temp[:50]}...'") else: break else: break translated = temp # Additional check for escaped content #if '\\\\' in translated or '\\n' in translated or "\\'" in translated or '\\"' in translated: # self._log(f"⚠️ Detected escaped content, unescaping...", "warning") # try: # before = translated # # # Handle quotes and apostrophes # translated = translated.replace("\\'", "'") # translated = translated.replace('\\"', '"') # translated = translated.replace("\\`", "`") # DON'T UNESCAPE NEWLINES BEFORE JSON PARSING! # translated = translated.replace('\\n', '\n') # COMMENT THIS OUT # translated = translated.replace('\\\\', '\\') # translated = translated.replace('\\/', '/') # translated = translated.replace('\\t', '\t') # COMMENT THIS OUT TOO # translated = translated.replace('\\r', '\r') # AND THIS # self._log(f"πŸ“¦ Unescaped safely: '{before[:50]}...' 
-> '{translated[:50]}...'") # except Exception as e: # self._log(f"⚠️ Failed to unescape: {e}", "warning") # Clean up unwanted trailing apostrophes/quotes import re response_text = translated response_text = re.sub(r"['''\"`]$", "", response_text.strip()) # Remove trailing response_text = re.sub(r"^['''\"`]", "", response_text.strip()) # Remove leading response_text = re.sub(r"\s+['''\"`]\s+", " ", response_text) # Remove isolated translated = response_text translated = self._clean_translation_text(translated) # Apply glossary if available if hasattr(self.main_gui, 'manual_glossary') and self.main_gui.manual_glossary: glossary_count = len(self.main_gui.manual_glossary) self._log(f"πŸ“š Applying glossary with {glossary_count} entries") replacements = 0 for entry in self.main_gui.manual_glossary: if 'source' in entry and 'target' in entry: if entry['source'] in translated: translated = translated.replace(entry['source'], entry['target']) replacements += 1 if replacements > 0: self._log(f" ✏️ Made {replacements} glossary replacements") translated = self._clean_translation_text(translated) # Store in history if HistoryManager is available if self.history_manager and self.contextual_enabled: try: # Append to history with proper limit handling self.history_manager.append_to_history( user_content=text, assistant_content=translated, hist_limit=self.translation_history_limit, reset_on_limit=not self.rolling_history_enabled, rolling_window=self.rolling_history_enabled ) # Check if we're about to hit the limit if self.history_manager.will_reset_on_next_append( self.translation_history_limit, self.rolling_history_enabled ): mode = "roll over" if self.rolling_history_enabled else "reset" self._log(f"πŸ“š History will {mode} on next translation (at limit: {self.translation_history_limit})") except Exception as e: self._log(f"⚠️ Failed to save to history: {str(e)}", "warning") # Also store in legacy context for compatibility self.translation_context.append({ "original": text, "translated": translated }) return translated except Exception as e: self._log(f"❌ Translation error: {str(e)}", "error") self._log(f" Error type: {type(e).__name__}", "error") import traceback self._log(f" Traceback: {traceback.format_exc()}", "error") return text def translate_full_page_context(self, regions: List[TextRegion], image_path: str, _in_fallback=False) -> Dict[str, str]: """Translate all text regions with full page context in a single request Args: regions: List of text regions to translate image_path: Path to the manga page image _in_fallback: Internal flag to prevent infinite recursion during fallback attempts """ try: import time import traceback import json # Initialize response_text at the start response_text = "" self._log(f"\nπŸ“„ Full page context translation of {len(regions)} text regions") # Get system prompt from GUI profile profile_name = self.main_gui.profile_var.get() # Ensure visual_context_enabled exists (temporary fix) if not hasattr(self, 'visual_context_enabled'): self.visual_context_enabled = self.main_gui.config.get('manga_visual_context_enabled', True) # Try to get the prompt from prompt_profiles dictionary (for all profiles including custom ones) system_prompt = '' if hasattr(self.main_gui, 'prompt_profiles') and profile_name in self.main_gui.prompt_profiles: system_prompt = self.main_gui.prompt_profiles[profile_name] self._log(f"πŸ“‹ Using profile: {profile_name}") else: # Fallback to check if it's stored as a direct attribute (legacy support) system_prompt = getattr(self.main_gui, 
profile_name.replace(' ', '_'), '') if system_prompt: self._log(f"πŸ“‹ Using profile (legacy): {profile_name}") else: self._log(f"⚠️ Profile '{profile_name}' not found, using empty prompt", "warning") # Combine with full page context instructions if system_prompt: system_prompt = f"{system_prompt}\n\n{self.full_page_context_prompt}" else: system_prompt = self.full_page_context_prompt messages = [{"role": "system", "content": system_prompt}] # CHECK 2: Before adding context if self._check_stop(): self._log("⏹️ Translation stopped during context preparation", "warning") return {} # Add contextual translations if enabled if self.contextual_enabled and self.history_manager: history_context = self._get_translation_history_context() if history_context: context_count = len(history_context) // 2 self._log(f"πŸ”— Adding {context_count} previous exchanges from history") messages.extend(history_context) # Prepare text segments with indices all_texts = {} text_list = [] for i, region in enumerate(regions): # Use index-based key to handle duplicate texts key = f"[{i}] {region.text}" all_texts[key] = region.text text_list.append(f"{key}") # CHECK 3: Before image processing if self._check_stop(): self._log("⏹️ Translation stopped before image processing", "warning") return {} # Create the full context message text context_text = "\n".join(text_list) # Log text content info total_chars = sum(len(region.text) for region in regions) self._log(f"πŸ“ Text content: {len(regions)} regions, {total_chars} total characters") # Process image if visual context is enabled if self.visual_context_enabled: try: import base64 from PIL import Image as PILImage self._log(f"πŸ“· Adding full page visual context for translation") # Read and encode the image with open(image_path, 'rb') as img_file: img_data = img_file.read() # Check image size img_size_mb = len(img_data) / (1024 * 1024) self._log(f"πŸ“Š Image size: {img_size_mb:.2f} MB") # Get image dimensions pil_image = PILImage.open(image_path) self._log(f" Image dimensions: {pil_image.width}x{pil_image.height}") # CHECK 4: Before resizing (which can take time) if self._check_stop(): self._log("⏹️ Translation stopped during image preparation", "warning") return {} # Resize if needed if img_size_mb > 10: self._log(f"πŸ“‰ Resizing large image for API limits...") max_size = 2048 ratio = min(max_size / pil_image.width, max_size / pil_image.height) if ratio < 1: new_size = (int(pil_image.width * ratio), int(pil_image.height * ratio)) pil_image = pil_image.resize(new_size, PILImage.Resampling.LANCZOS) from io import BytesIO buffered = BytesIO() pil_image.save(buffered, format="PNG", optimize=True) img_data = buffered.getvalue() self._log(f"βœ… Resized to {new_size[0]}x{new_size[1]}px ({len(img_data)/(1024*1024):.2f} MB)") # Convert to base64 img_b64 = base64.b64encode(img_data).decode('utf-8') # Create message with both text and image messages.append({ "role": "user", "content": [ {"type": "text", "text": context_text}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}} ] }) self._log(f"βœ… Added full page image as visual context") except Exception as e: self._log(f"⚠️ Failed to add image context: {str(e)}", "warning") self._log(f" Error type: {type(e).__name__}", "warning") import traceback self._log(traceback.format_exc(), "warning") self._log(f" Falling back to text-only translation", "warning") # Fall back to text-only translation messages.append({"role": "user", "content": context_text}) else: # Visual context disabled - send text only 
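# (For non-vision models the page still goes out as a single batched request; only
# the base64 image part is omitted, so the indexed "[N] text" keys built above keep
# the region-to-translation mapping stable either way.)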
self._log(f"πŸ“ Text-only mode (visual context disabled for non-vision models)") messages.append({"role": "user", "content": context_text}) # CHECK 5: Before API call if self._check_stop(): self._log("⏹️ Translation stopped before API call", "warning") return {} # Store original model for fallback original_model = self.client.model if hasattr(self.client, 'model') else None # Check input token limit text_tokens = 0 image_tokens = 0 for msg in messages: if isinstance(msg.get("content"), str): # Simple text message text_tokens += len(msg["content"]) // 4 elif isinstance(msg.get("content"), list): # Message with mixed content (text + image) for content_part in msg["content"]: if content_part.get("type") == "text": text_tokens += len(content_part.get("text", "")) // 4 elif content_part.get("type") == "image_url": # Only count image tokens if visual context is enabled if self.visual_context_enabled: image_tokens += 258 estimated_tokens = text_tokens + image_tokens # Check token limit only if it's enabled if self.input_token_limit is None: self._log(f"πŸ“Š Token estimate - Text: {text_tokens}, Images: {image_tokens} (Total: {estimated_tokens} / unlimited)") else: self._log(f"πŸ“Š Token estimate - Text: {text_tokens}, Images: {image_tokens} (Total: {estimated_tokens} / {self.input_token_limit})") if estimated_tokens > self.input_token_limit: self._log(f"⚠️ Token limit exceeded, trimming context", "warning") # Keep system prompt and current message only messages = [messages[0], messages[-1]] # Recalculate tokens text_tokens = len(messages[0]["content"]) // 4 if isinstance(messages[-1]["content"], str): text_tokens += len(messages[-1]["content"]) // 4 else: for content_part in messages[-1]["content"]: if content_part.get("type") == "text": text_tokens += len(content_part.get("text", "")) // 4 estimated_tokens = text_tokens + image_tokens self._log(f"πŸ“Š Trimmed token estimate: {estimated_tokens}") # Make API call using the client's send method (matching translate_text) self._log(f"🌐 Sending full page context to API...") self._log(f" API Model: {self.client.model if hasattr(self.client, 'model') else 'unknown'}") self._log(f" Temperature: {self.temperature}") self._log(f" Max Output Tokens: {self.max_tokens}") start_time = time.time() api_time = 0 # Initialize to avoid NameError try: response = send_with_interrupt( messages=messages, client=self.client, temperature=self.temperature, max_tokens=self.max_tokens, stop_check_fn=self._check_stop ) api_time = time.time() - start_time # Extract content from response if hasattr(response, 'content'): response_text = response.content # Check if it's a tuple representation if isinstance(response_text, tuple): response_text = response_text[0] # Get first element of tuple response_text = response_text.strip() elif hasattr(response, 'text'): # Gemini responses have .text attribute response_text = response.text.strip() elif hasattr(response, 'candidates') and response.candidates: # Handle Gemini GenerateContentResponse structure try: response_text = response.candidates[0].content.parts[0].text.strip() except (IndexError, AttributeError): response_text = str(response).strip() else: # If response is a string or other format response_text = str(response).strip() # Check if it's a stringified tuple if response_text.startswith("('") or response_text.startswith('("'): # It's a tuple converted to string, extract the JSON part import ast try: parsed_tuple = ast.literal_eval(response_text) if isinstance(parsed_tuple, tuple): response_text = parsed_tuple[0] # Get first 
element self._log("πŸ“¦ Extracted response from tuple format", "debug") except: # If literal_eval fails, try regex import re match = re.match(r"^\('(.+)', '.*'\)$", response_text, re.DOTALL) if match: response_text = match.group(1) # Unescape the string response_text = response_text.replace('\\n', '\n') response_text = response_text.replace("\\'", "'") response_text = response_text.replace('\\"', '"') response_text = response_text.replace('\\\\', '\\') self._log("πŸ“¦ Extracted response using regex from tuple string", "debug") # CHECK 6: Immediately after API response if self._check_stop(): self._log(f"⏹️ Translation stopped after API call ({api_time:.2f}s)", "warning") return {} self._log(f"βœ… API responded in {api_time:.2f} seconds") self._log(f"πŸ“₯ Received response ({len(response_text)} chars)") except Exception as api_error: api_time = time.time() - start_time # CHECK 7: After API error if self._check_stop(): self._log(f"⏹️ Translation stopped during API error handling", "warning") return {} error_str = str(api_error).lower() error_type = type(api_error).__name__ # Check for specific error types if "429" in error_str or "rate limit" in error_str: self._log(f"⚠️ RATE LIMIT ERROR (429) after {api_time:.2f}s", "error") self._log(f" The API rate limit has been exceeded", "error") self._log(f" Please wait before retrying or reduce request frequency", "error") self._log(f" Error details: {str(api_error)}", "error") raise Exception(f"Rate limit exceeded (429): {str(api_error)}") elif "401" in error_str or "unauthorized" in error_str: self._log(f"❌ AUTHENTICATION ERROR (401) after {api_time:.2f}s", "error") self._log(f" Invalid API key or authentication failed", "error") self._log(f" Please check your API key in settings", "error") self._log(f" Error details: {str(api_error)}", "error") raise Exception(f"Authentication failed (401): {str(api_error)}") elif "403" in error_str or "forbidden" in error_str: self._log(f"❌ FORBIDDEN ERROR (403) after {api_time:.2f}s", "error") self._log(f" Access denied - check API permissions", "error") self._log(f" Error details: {str(api_error)}", "error") raise Exception(f"Access forbidden (403): {str(api_error)}") elif "400" in error_str or "bad request" in error_str: self._log(f"❌ BAD REQUEST ERROR (400) after {api_time:.2f}s", "error") self._log(f" Invalid request format or parameters", "error") self._log(f" Error details: {str(api_error)}", "error") raise Exception(f"Bad request (400): {str(api_error)}") elif "timeout" in error_str: self._log(f"⏱️ TIMEOUT ERROR after {api_time:.2f}s", "error") self._log(f" API request timed out", "error") self._log(f" Consider increasing timeout or retry", "error") self._log(f" Error details: {str(api_error)}", "error") raise Exception(f"Request timeout: {str(api_error)}") else: # Generic API error self._log(f"❌ API ERROR ({error_type}) after {api_time:.2f}s", "error") self._log(f" Error details: {str(api_error)}", "error") self._log(f" Full traceback:", "error") self._log(traceback.format_exc(), "error") raise # CHECK 8: Before parsing response if self._check_stop(): self._log("⏹️ Translation stopped before parsing response", "warning") return {} # Check if we got a response if not response_text: self._log("❌ Empty response from API", "error") return {} self._log(f"πŸ” Raw response type: {type(response_text)}") self._log(f"πŸ” Raw response preview: '{response_text[:2000]}...'") # Clean up response_text (handle Python literals, escapes, etc.) 
if response_text.startswith("('") or response_text.startswith('("') or response_text.startswith("('''"): self._log(f"⚠️ Detected Python literal in response, attempting to extract actual text", "warning") try: import ast evaluated = ast.literal_eval(response_text) if isinstance(evaluated, tuple): response_text = str(evaluated[0]) elif isinstance(evaluated, str): response_text = evaluated except Exception as e: self._log(f"⚠️ Failed to parse Python literal: {e}", "warning") # Handle escaped content #if '\\\\' in response_text or '\\n' in response_text or "\\'" in response_text or '\\"' in response_text: # self._log(f"⚠️ Detected escaped content, unescaping...", "warning") # response_text = response_text.replace("\\'", "'") # response_text = response_text.replace('\\"', '"') # response_text = response_text.replace('\\n', '\n') # response_text = response_text.replace('\\\\', '\\') # response_text = response_text.replace('\\/', '/') # response_text = response_text.replace('\\t', '\t') # response_text = response_text.replace('\\r', '\r') # Clean up quotes import re response_text = re.sub(r"['''\"`]$", "", response_text.strip()) response_text = re.sub(r"^['''\"`]", "", response_text.strip()) response_text = re.sub(r"\s+['''\"`]\s+", " ", response_text) # Try to parse as JSON translations = {} try: # Strip markdown blocks more aggressively import re import json # CRITICAL: Strip markdown code blocks FIRST, before attempting JSON extraction cleaned = response_text # Remove markdown code blocks (handles ```json, ``json, ```, ``, etc.) if '```' in cleaned or '``' in cleaned: patterns = [ r'```json\s*\n?(.*?)```', r'``json\s*\n?(.*?)``', r'```\s*\n?(.*?)```', r'``\s*\n?(.*?)``' ] for pattern in patterns: match = re.search(pattern, cleaned, re.DOTALL) if match: cleaned = match.group(1).strip() self._log(f"πŸ”§ Stripped markdown wrapper using pattern: {pattern[:20]}...") break # Method 1: Try to parse the cleaned text directly try: translations = json.loads(cleaned) self._log(f"βœ… Successfully parsed {len(translations)} translations (direct parse)") except json.JSONDecodeError: # Method 2: Extract JSON object if direct parse failed json_match = re.search(r'\{.*\}', cleaned, re.DOTALL) if json_match: json_text = json_match.group(0) try: translations = json.loads(json_text) self._log(f"βœ… Successfully parsed {len(translations)} translations (regex extraction)") except json.JSONDecodeError: # Try to fix the extracted JSON json_text = self._fix_json_response(json_text) translations = json.loads(json_text) self._log(f"βœ… Successfully parsed {len(translations)} translations (after fix)") else: # No JSON object found raise json.JSONDecodeError("No JSON object found", cleaned, 0) # Handle different response formats if isinstance(translations, list): # Array of translations only - map by position temp = {} for i, region in enumerate(regions): if i < len(translations): temp[region.text] = translations[i] translations = temp self._log(f"πŸ“Š Total translations: {len(translations)}") except Exception as e: self._log(f"❌ Failed to parse JSON: {str(e)}", "error") self._log(f"Response preview: {response_text[:500]}...", "warning") # CRITICAL: Check if this is a refusal message BEFORE regex fallback # OpenAI and other APIs refuse certain content with text responses instead of JSON # ONLY check if response looks like plain text refusal (not malformed JSON with translations) import re response_lower = response_text.lower() # Quick check: if response starts with refusal keywords, it's definitely a refusal 
refusal_starts = ['sorry', 'i cannot', "i can't", 'i apologize', 'i am unable', "i'm unable"] if any(response_lower.strip().startswith(start) for start in refusal_starts): # Very likely a refusal - raise immediately from unified_api_client import UnifiedClientError raise UnifiedClientError( f"Content refused by API", error_type="prohibited_content", details={"refusal_message": response_text[:500]} ) # Skip refusal check if response contains valid-looking JSON structure with translations # (indicates malformed JSON that should go to regex fallback, not a refusal) has_json_structure = ( (response_text.strip().startswith('{') and ':' in response_text and '"' in response_text) or (response_text.strip().startswith('[') and ':' in response_text and '"' in response_text) ) # Also check if response contains short translations (not refusal paragraphs) # Refusals are typically long paragraphs, translations are short avg_value_length = 0 if has_json_structure: # Quick estimate: count chars between quotes import re values = re.findall(r'"([^"]{1,200})"\s*[,}]', response_text) if values: avg_value_length = sum(len(v) for v in values) / len(values) # If looks like JSON with short values, skip refusal check (go to regex fallback) if has_json_structure and avg_value_length > 0 and avg_value_length < 150: self._log(f"πŸ” Detected malformed JSON with translations (avg len: {avg_value_length:.0f}), trying regex fallback", "debug") # Skip refusal detection, go straight to regex fallback pass else: # Check for refusal patterns # Refusal patterns - both simple strings and regex patterns # Must be strict to avoid false positives on valid translations refusal_patterns = [ "i cannot assist", "i can't assist", "i cannot help", "i can't help", r"sorry.{0,10}i can't (assist|help|translate)", # OpenAI specific "i'm unable to translate", "i am unable to translate", "i apologize, but i cannot", "i'm sorry, but i cannot", "i don't have the ability to", "this request cannot be", "unable to process this", "cannot complete this", r"against.{0,20}(content )?policy", # "against policy" or "against content policy" "violates.*policy", r"(can't|cannot).{0,30}(sexual|explicit|inappropriate)", # "can't translate sexual" "appears to sexualize", "who appear to be", "prohibited content", "content blocked", ] # Check both simple string matching and regex patterns is_refusal = False for pattern in refusal_patterns: if '.*' in pattern or r'.{' in pattern: # It's a regex pattern if re.search(pattern, response_lower): is_refusal = True break else: # Simple string match if pattern in response_lower: is_refusal = True break if is_refusal: # Raise UnifiedClientError with prohibited_content type # Fallback mechanism will handle this automatically from unified_api_client import UnifiedClientError raise UnifiedClientError( f"Content refused by API", error_type="prohibited_content", details={"refusal_message": response_text[:500]} ) # Fallback: try regex extraction (handles both quoted and unquoted keys) try: import re translations = {} # Try 1: Standard quoted keys and values pattern1 = r'"([^"]+)"\s*:\s*"([^"]*(?:\\.[^"]*)*)"' matches = re.findall(pattern1, response_text) if matches: for key, value in matches: value = value.replace('\\n', '\n').replace('\\"', '"').replace('\\\\', '\\') translations[key] = value self._log(f"βœ… Recovered {len(translations)} translations using regex (quoted keys)") else: # Try 2: Unquoted keys (for invalid JSON like: key: "value") pattern2 = r'([^\s:{}]+)\s*:\s*([^\n}]+)' matches = re.findall(pattern2, 
response_text) for key, value in matches: # Clean up key and value key = key.strip() value = value.strip().rstrip(',') # Remove quotes from value if present if value.startswith('"') and value.endswith('"'): value = value[1:-1] elif value.startswith("'") and value.endswith("'"): value = value[1:-1] translations[key] = value if translations: self._log(f"βœ… Recovered {len(translations)} translations using regex (unquoted keys)") if not translations: self._log("❌ All parsing attempts failed", "error") return {} except Exception as e: self._log(f"❌ Failed to recover JSON: {e}", "error") return {} # Map translations back to regions result = {} all_originals = [] all_translations = [] # Extract translation values in order translation_values = list(translations.values()) if translations else [] # DEBUG: Log what we extracted self._log(f"πŸ“Š Extracted {len(translation_values)} translation values", "debug") for i, val in enumerate(translation_values[:1000]): # First 1000 for debugging # Safely handle None values val_str = str(val) if val is not None else "" self._log(f" Translation {i}: '{val_str[:1000]}...'", "debug") # Clean all translation values to remove quotes # CRITICAL: Also clean the keys in the dictionary to maintain correct mapping cleaned_translations = {} for key, value in translations.items(): cleaned_key = key cleaned_value = self._clean_translation_text(value) # Only add if the cleaned value is not empty (avoid misalignment) if cleaned_value: cleaned_translations[cleaned_key] = cleaned_value else: self._log(f"πŸ” Skipping empty translation after cleaning: '{key}' β†’ ''", "debug") # Replace original dict with cleaned version translations = cleaned_translations translation_values = list(translations.values()) if translations else [] self._log(f"πŸ” DEBUG: translation_values after cleaning:", "debug") for i, val in enumerate(translation_values): self._log(f" [{i}]: {repr(val)}", "debug") # CRITICAL: Check if translation values are actually refusal messages # API sometimes returns valid JSON where each "translation" is a refusal if translation_values: # Check first few translations for refusal patterns import re refusal_patterns = [ "i cannot", "i can't", r"sorry.{0,5}i can't help", r"sorry.{0,5}i can't", "sexually explicit", "content policy", "prohibited content", "appears to be", "who appear to be", ] # Sample first 3 translations (or all if fewer) sample_size = min(3, len(translation_values)) refusal_count = 0 for sample_val in translation_values[:sample_size]: if sample_val: val_lower = sample_val.lower() for pattern in refusal_patterns: if '.*' in pattern or r'.{' in pattern: if re.search(pattern, val_lower): refusal_count += 1 break else: if pattern in val_lower: refusal_count += 1 break # If most translations are refusals, treat as refusal if refusal_count >= sample_size * 0.5: # 50% threshold # Raise UnifiedClientError with prohibited_content type # Fallback mechanism will handle this automatically from unified_api_client import UnifiedClientError raise UnifiedClientError( f"Content refused by API", error_type="prohibited_content", details={"refusal_message": translation_values[0][:500]} ) # Key-based mapping (prioritize indexed format as requested in prompt) self._log(f"πŸ“‹ Mapping {len(translations)} translations to {len(regions)} regions") for i, region in enumerate(regions): if i % 10 == 0 and self._check_stop(): self._log(f"⏹️ Translation stopped during mapping (processed {i}/{len(regions)} regions)", "warning") return result # Get translation using multiple strategies 
(indexed format is most reliable) translated = "" # Strategy 1: Indexed key format "[N] original_text" (NEW STANDARD - most reliable) key = f"[{i}] {region.text}" if key in translations: translated = translations[key] self._log(f" βœ… Matched indexed key: '{key[:40]}...'", "debug") # Strategy 2: Direct key match without index (backward compatibility) elif region.text in translations: translated = translations[region.text] self._log(f" βœ… Matched direct key: '{region.text[:40]}...'", "debug") # Strategy 3: Position-based fallback (least reliable, only if counts match exactly) elif i < len(translation_values) and len(translation_values) == len(regions): translated = translation_values[i] self._log(f" ⚠️ Using position-based fallback for region {i}", "debug") # Only mark as missing if we genuinely have no translation # NOTE: Keep translation even if it matches original (e.g., numbers, names, SFX) if not translated: self._log(f" ⚠️ No translation for region {i}, leaving empty", "warning") translated = "" # Apply glossary if we have a translation if translated and hasattr(self.main_gui, 'manual_glossary') and self.main_gui.manual_glossary: for entry in self.main_gui.manual_glossary: if 'source' in entry and 'target' in entry: if entry['source'] in translated: translated = translated.replace(entry['source'], entry['target']) result[region.text] = translated region.translated_text = translated if translated: all_originals.append(f"[{i+1}] {region.text}") all_translations.append(f"[{i+1}] {translated}") self._log(f" βœ… Translated: '{region.text[:30]}...' β†’ '{translated[:30]}...'", "debug") # Save history if enabled if self.history_manager and self.contextual_enabled and all_originals: try: combined_original = "\n".join(all_originals) combined_translation = "\n".join(all_translations) self.history_manager.append_to_history( user_content=combined_original, assistant_content=combined_translation, hist_limit=self.translation_history_limit, reset_on_limit=not self.rolling_history_enabled, rolling_window=self.rolling_history_enabled ) self._log(f"πŸ“š Saved {len(all_originals)} translations as 1 combined history entry", "success") except Exception as e: self._log(f"⚠️ Failed to save page to history: {str(e)}", "warning") return result except Exception as e: if self._check_stop(): self._log("⏹️ Translation stopped due to user request", "warning") return {} # Check if this is a prohibited_content error from unified_api_client import UnifiedClientError if isinstance(e, UnifiedClientError) and getattr(e, "error_type", None) == "prohibited_content": # Check if USE_FALLBACK_KEYS is enabled and we're not already in a fallback attempt use_fallback = os.getenv('USE_FALLBACK_KEYS', '0') == '1' if use_fallback and not _in_fallback: self._log(f"β›” Content refused by primary model, trying fallback keys...", "warning") # Store original credentials to restore after fallback attempts original_api_key = self.client.api_key original_model = self.client.model # Try to get fallback keys from environment try: fallback_keys_json = os.getenv('FALLBACK_KEYS', '[]') fallback_keys = json.loads(fallback_keys_json) if fallback_keys_json != '[]' else [] if fallback_keys: for idx, fallback in enumerate(fallback_keys, 1): if self._check_stop(): self._log("⏹️ Translation stopped during fallback", "warning") return {} fallback_model = fallback.get('model') fallback_key = fallback.get('api_key') if not fallback_model or not fallback_key: continue self._log(f"πŸ”„ Trying fallback {idx}/{len(fallback_keys)}: {fallback_model}", 
"info") try: # Temporarily switch to fallback model old_key = self.client.api_key old_model = self.client.model self.client.api_key = fallback_key self.client.model = fallback_model # Re-setup client with new credentials if hasattr(self.client, '_setup_client'): self.client._setup_client() # Retry the translation with fallback model (mark as in_fallback to prevent recursion) return self.translate_full_page_context(regions, image_path, _in_fallback=True) except UnifiedClientError as fallback_err: if getattr(fallback_err, "error_type", None) == "prohibited_content": self._log(f" β›” Fallback {idx} also refused", "warning") # Restore original credentials and try next fallback self.client.api_key = old_key self.client.model = old_model if hasattr(self.client, '_setup_client'): self.client._setup_client() continue else: # Other error, restore and raise self.client.api_key = old_key self.client.model = old_model if hasattr(self.client, '_setup_client'): self.client._setup_client() raise except Exception as fallback_err: self._log(f" ❌ Fallback {idx} error: {str(fallback_err)[:100]}", "error") # Restore original credentials and try next fallback self.client.api_key = old_key self.client.model = old_model if hasattr(self.client, '_setup_client'): self.client._setup_client() continue self._log(f"❌ All fallback keys refused content", "error") else: self._log(f"⚠️ No fallback keys configured", "warning") except Exception as fallback_error: self._log(f"❌ Error processing fallback keys: {str(fallback_error)}", "error") finally: # Always restore original credentials after fallback attempts try: self.client.api_key = original_api_key self.client.model = original_model if hasattr(self.client, '_setup_client'): self.client._setup_client() except Exception: pass # Ignore errors during credential restoration # If we get here, all fallbacks failed or weren't configured self._log(f"❌ Content refused by API", "error") return {} self._log(f"❌ Full page context translation error: {str(e)}", "error") self._log(traceback.format_exc(), "error") return {} def _fix_json_response(self, response_text: str) -> str: import re import json # Debug: Show what we received self._log(f"DEBUG: Original length: {len(response_text)}", "debug") self._log(f"DEBUG: First 50 chars: [{response_text[:50]}]", "debug") cleaned = response_text if "```json" in cleaned: match = re.search(r'```json\s*(.*?)```', cleaned, re.DOTALL) if match: cleaned = match.group(1).strip() self._log(f"DEBUG: Extracted {len(cleaned)} chars from markdown", "debug") else: self._log("DEBUG: Regex didn't match!", "warning") # Try to parse try: result = json.loads(cleaned) self._log(f"βœ… Parsed JSON with {len(result)} entries", "info") return cleaned except json.JSONDecodeError as e: self._log(f"⚠️ JSON invalid: {str(e)}", "warning") self._log(f"DEBUG: Cleaned text starts with: [{cleaned[:20]}]", "debug") return cleaned def _clean_translation_text(self, text: str) -> str: """Remove unnecessary quotation marks, dots, and invalid characters from translated text""" if not text: return text # Log what we're cleaning original = text # First, fix encoding issues text = self._fix_encoding_issues(text) # Normalize width/compatibility (e.g., fullwidth β†’ ASCII, circled numbers β†’ digits) text = self._normalize_unicode_width(text) # Remove Unicode replacement characters and invalid symbols text = self._sanitize_unicode_characters(text) # Remove leading and trailing whitespace text = text.strip() # Remove ALL types of quotes and dots from start/end # Keep removing until no 
more quotes/dots at edges while len(text) > 0: old_len = len(text) # Remove from start text = text.lstrip('"\'`''""γ€Œγ€γ€Žγ€γ€γ€‘γ€Šγ€‹γ€ˆγ€‰.Β·β€’Β°') # Remove from end (but preserve ... and !!) if not text.endswith('...') and not text.endswith('!!'): text = text.rstrip('"\'`''""γ€Œγ€γ€Žγ€γ€γ€‘γ€Šγ€‹γ€ˆγ€‰.Β·β€’Β°') # If nothing changed, we're done if len(text) == old_len: break # Final strip text = text.strip() # Log if we made changes if text != original: self._log(f"🧹 Cleaned text: '{original}' β†’ '{text}'", "debug") return text def _sanitize_unicode_characters(self, text: str) -> str: """Remove invalid Unicode characters, replacement characters, and box symbols. Also more aggressively exclude square-like glyphs that leak as 'cubes' in some fonts. """ if not text: return text import re original = text # Remove Unicode replacement character (οΏ½) and similar invalid symbols text = text.replace('\ufffd', '') # Unicode replacement character # Geometric squares and variants (broad sweep) geo_squares = [ 'β–‘','β– ','β–’','β–£','β–€','β–₯','β–¦','β–§','β–¨','β–©','β—»','⬛','⬜', '\u25a1','\u25a0','\u2b1c','\u2b1b' ] for s in geo_squares: text = text.replace(s, '') # Extra cube-like CJK glyphs commonly misrendered in non-CJK fonts # (unconditionally removed per user request) cube_likes = [ '口', # U+53E3 'ε›—', # U+56D7 'ζ—₯', # U+65E5 (often boxy) 'ζ›°', # U+66F0 'η”°', # U+7530 'ε›ž', # U+56DE 'γƒ­', # U+30ED (Katakana RO) 'οΎ›', # U+FF9B (Halfwidth RO) 'ㅁ', # U+3141 (Hangul MIEUM) 'δΈ¨', # U+4E28 (CJK radical two) tall bar ] for s in cube_likes: text = text.replace(s, '') # Remove entire ranges that commonly render as boxes/blocks # Box Drawing, Block Elements, Geometric Shapes (full range), plus a common white/black large square range already handled text = re.sub(r'[\u2500-\u257F\u2580-\u259F\u25A0-\u25FF]', '', text) # Optional debug: log culprits found in original text (before removal) try: culprits = re.findall(r'[\u2500-\u257F\u2580-\u259F\u25A0-\u25FF\u2B1B\u2B1C\u53E3\u56D7\u65E5\u66F0\u7530\u56DE\u30ED\uFF9B\u3141\u4E28]', original) if culprits: as_codes = [f'U+{ord(c):04X}' for c in culprits] self._log(f"🧊 Removed box-like glyphs: {', '.join(as_codes)}", "debug") except Exception: pass # If line is mostly ASCII, strip any remaining single CJK ideographs that stand alone try: ascii_count = sum(1 for ch in text if ord(ch) < 128) ratio = ascii_count / max(1, len(text)) if ratio >= 0.8: text = re.sub(r'(?:(?<=\s)|^)[\u3000-\u303F\u3040-\u30FF\u3400-\u9FFF\uFF00-\uFFEF](?=(?:\s)|$)', '', text) except Exception: pass # Remove invisible and zero-width characters text = re.sub(r'[\u200b-\u200f\u2028-\u202f\u205f-\u206f\ufeff]', '', text) # Remove remaining control characters (except common ones like newline, tab) text = re.sub(r'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\x9F]', '', text) # Remove any remaining characters that can't be properly encoded try: text = text.encode('utf-8', errors='ignore').decode('utf-8') except UnicodeError: pass if text != original: try: self._log(f"πŸ”§ Sanitized Unicode: '{original}' β†’ '{text}'", "debug") except Exception: pass return text def _normalize_unicode_width(self, text: str) -> str: """Normalize Unicode to NFKC to 'unsquare' fullwidth/stylized forms while preserving CJK text""" if not text: return text try: import unicodedata original = text # NFKC folds compatibility characters (fullwidth forms, circled digits, etc.) 
to standard forms text = unicodedata.normalize('NFKC', text) if text != original: try: self._log(f"πŸ”€ Normalized width/compat: '{original[:30]}...' β†’ '{text[:30]}...'", "debug") except Exception: pass return text except Exception: return text def _fix_encoding_issues(self, text: str) -> str: """Fix common encoding issues in text, especially for Korean""" if not text: return text # Check for mojibake indicators (UTF-8 misinterpreted as Latin-1) mojibake_indicators = ['Γ«', 'Γ¬', 'ΓͺΒ°', 'Γ£', 'Γƒ', 'Γ’', 'Γ€', 'Γ°', 'Γ­', 'ë­', 'ì´'] if any(indicator in text for indicator in mojibake_indicators): self._log("πŸ”§ Detected mojibake encoding issue, attempting fixes...", "debug") # Try multiple encoding fixes encodings_to_try = [ ('latin-1', 'utf-8'), ('windows-1252', 'utf-8'), ('iso-8859-1', 'utf-8'), ('cp1252', 'utf-8') ] for from_enc, to_enc in encodings_to_try: try: fixed = text.encode(from_enc, errors='ignore').decode(to_enc, errors='ignore') # Check if the fix actually improved things # Should have Korean characters (Hangul range) or be cleaner if any('\uAC00' <= c <= '\uD7AF' for c in fixed) or fixed.count('οΏ½') < text.count('οΏ½'): self._log(f"βœ… Fixed encoding using {from_enc} -> {to_enc}", "debug") return fixed except: continue # Clean up any remaining control characters and replacement characters import re text = re.sub(r'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]', '', text) # Additional cleanup for common encoding artifacts # Remove sequences that commonly appear from encoding errors text = re.sub(r'\ufffd+', '', text) # Remove multiple replacement characters text = re.sub(r'[\u25a0-\u25ff]+', '', text) # Remove geometric shapes (common fallbacks) # Clean up double spaces and normalize whitespace text = re.sub(r'\s+', ' ', text).strip() return text def create_text_mask(self, image: np.ndarray, regions: List[TextRegion]) -> np.ndarray: """Create mask with comprehensive per-text-type dilation settings""" mask = np.zeros(image.shape[:2], dtype=np.uint8) regions_masked = 0 regions_skipped = 0 self._log(f"🎭 Creating text mask for {len(regions)} regions", "info") # Get manga settings manga_settings = self.main_gui.config.get('manga_settings', {}) # Get dilation settings base_dilation_size = manga_settings.get('mask_dilation', 15) # If Auto Iterations is enabled, auto-set dilation by OCR provider and RT-DETR guide status auto_iterations = manga_settings.get('auto_iterations', True) if auto_iterations: try: ocr_settings = manga_settings.get('ocr', {}) use_rtdetr_guide = ocr_settings.get('use_rtdetr_for_ocr_regions', True) bubble_detection_enabled = ocr_settings.get('bubble_detection_enabled', False) # If RT-DETR guide is enabled for Google/Azure, force dilation to 0 if (getattr(self, 'ocr_provider', '').lower() in ('azure', 'google') and bubble_detection_enabled and use_rtdetr_guide): base_dilation_size = 0 self._log(f"πŸ“ Auto dilation (RT-DETR guided): 0px (using iterations only)", "info") elif getattr(self, 'ocr_provider', '').lower() in ('azure', 'google'): # CRITICAL: Without RT-DETR, Azure/Google OCR is very conservative # Use base dilation to expand masks to actual bubble size base_dilation_size = 15 # Base expansion for Azure/Google without RT-DETR self._log(f"πŸ“ Auto dilation by provider ({self.ocr_provider}, no RT-DETR): {base_dilation_size}px", "info") else: base_dilation_size = 0 self._log(f"πŸ“ Auto dilation by provider ({self.ocr_provider}): {base_dilation_size}px", "info") except Exception: pass # Auto iterations: decide by image color vs B&W auto_iterations = 
manga_settings.get('auto_iterations', True) if auto_iterations: try: # Heuristic: consider image B&W if RGB channels are near-equal if len(image.shape) < 3 or image.shape[2] == 1: is_bw = True else: # Compute mean absolute differences between channels ch0 = image[:, :, 0].astype(np.int16) ch1 = image[:, :, 1].astype(np.int16) ch2 = image[:, :, 2].astype(np.int16) diff01 = np.mean(np.abs(ch0 - ch1)) diff12 = np.mean(np.abs(ch1 - ch2)) diff02 = np.mean(np.abs(ch0 - ch2)) # If channels are essentially the same, treat as B&W is_bw = max(diff01, diff12, diff02) < 2.0 if is_bw: text_bubble_iterations = 2 empty_bubble_iterations = 2 free_text_iterations = 0 self._log("πŸ“ Auto iterations (B&W): text=2, empty=2, free=0", "info") else: text_bubble_iterations = 4 empty_bubble_iterations = 4 free_text_iterations = 4 self._log("πŸ“ Auto iterations (Color): all=4", "info") except Exception: # Fallback to configured behavior on any error auto_iterations = False if not auto_iterations: # Check if using uniform iterations for all text types use_all_iterations = manga_settings.get('use_all_iterations', False) if use_all_iterations: # Use the same iteration count for all text types all_iterations = manga_settings.get('all_iterations', 2) text_bubble_iterations = all_iterations empty_bubble_iterations = all_iterations free_text_iterations = all_iterations self._log(f"πŸ“ Using uniform iterations: {all_iterations} for all text types", "info") else: # Use individual iteration settings text_bubble_iterations = manga_settings.get('text_bubble_dilation_iterations', manga_settings.get('bubble_dilation_iterations', 2)) empty_bubble_iterations = manga_settings.get('empty_bubble_dilation_iterations', 3) free_text_iterations = manga_settings.get('free_text_dilation_iterations', 0) self._log(f"πŸ“ Using individual iterations - Text bubbles: {text_bubble_iterations}, " f"Empty bubbles: {empty_bubble_iterations}, Free text: {free_text_iterations}", "info") # Create separate masks for different text types text_bubble_mask = np.zeros(image.shape[:2], dtype=np.uint8) empty_bubble_mask = np.zeros(image.shape[:2], dtype=np.uint8) free_text_mask = np.zeros(image.shape[:2], dtype=np.uint8) text_bubble_count = 0 empty_bubble_count = 0 free_text_count = 0 for i, region in enumerate(regions): # CHECK: Should this region be inpainted? 
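# (should_inpaint is set upstream by the text-type filtering settings; getattr
# defaults to True so regions that never went through filtering are still masked.)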
if not getattr(region, 'should_inpaint', True): # Skip this region - it shouldn't be inpainted regions_skipped += 1 self._log(f" Region {i+1}: SKIPPED (filtered by settings)", "debug") continue regions_masked += 1 # Determine text type text_type = 'free_text' # default # Check if region has bubble_type attribute (from bubble detection) if hasattr(region, 'bubble_type'): # RT-DETR classifications if region.bubble_type == 'empty_bubble': text_type = 'empty_bubble' elif region.bubble_type == 'text_bubble': text_type = 'text_bubble' else: # 'free_text' or others text_type = 'free_text' else: # Fallback: use simple heuristics if no bubble detection x, y, w, h = region.bounding_box x, y, w, h = int(x), int(y), int(w), int(h) aspect_ratio = w / h if h > 0 else 1 # Check if region has text has_text = hasattr(region, 'text') and region.text and len(region.text.strip()) > 0 # Heuristic: bubbles tend to be more square-ish or tall # Free text tends to be wide and short if aspect_ratio < 2.5 and w > 50 and h > 50: if has_text: text_type = 'text_bubble' else: # Could be empty bubble if it's round/oval shaped text_type = 'empty_bubble' else: text_type = 'free_text' # Select appropriate mask and increment counter if text_type == 'text_bubble': target_mask = text_bubble_mask text_bubble_count += 1 mask_type = "TEXT BUBBLE" elif text_type == 'empty_bubble': target_mask = empty_bubble_mask empty_bubble_count += 1 mask_type = "EMPTY BUBBLE" else: target_mask = free_text_mask free_text_count += 1 mask_type = "FREE TEXT" # Check if this is a merged region with original regions if hasattr(region, 'original_regions') and region.original_regions: # Use original regions for precise masking self._log(f" Region {i+1} ({mask_type}): Using {len(region.original_regions)} original regions", "debug") for orig_region in region.original_regions: if hasattr(orig_region, 'vertices') and orig_region.vertices: pts = np.array(orig_region.vertices, np.int32) pts = pts.reshape((-1, 1, 2)) cv2.fillPoly(target_mask, [pts], 255) else: x, y, w, h = orig_region.bounding_box x, y, w, h = int(x), int(y), int(w), int(h) cv2.rectangle(target_mask, (x, y), (x + w, y + h), 255, -1) else: # Normal region if hasattr(region, 'vertices') and region.vertices and len(region.vertices) <= 8: pts = np.array(region.vertices, np.int32) pts = pts.reshape((-1, 1, 2)) cv2.fillPoly(target_mask, [pts], 255) self._log(f" Region {i+1} ({mask_type}): Using polygon", "debug") else: x, y, w, h = region.bounding_box x, y, w, h = int(x), int(y), int(w), int(h) cv2.rectangle(target_mask, (x, y), (x + w, y + h), 255, -1) self._log(f" Region {i+1} ({mask_type}): Using bounding box", "debug") self._log(f"πŸ“Š Mask breakdown: {text_bubble_count} text bubbles, {empty_bubble_count} empty bubbles, " f"{free_text_count} free text regions, {regions_skipped} skipped", "info") # Apply different dilation settings to each mask type if base_dilation_size > 0: kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (base_dilation_size, base_dilation_size)) # Apply dilation to text bubble mask if text_bubble_count > 0 and text_bubble_iterations > 0: self._log(f"πŸ“ Applying text bubble dilation: {base_dilation_size}px, {text_bubble_iterations} iterations", "info") text_bubble_mask = cv2.dilate(text_bubble_mask, kernel, iterations=text_bubble_iterations) # Apply dilation to empty bubble mask if empty_bubble_count > 0 and empty_bubble_iterations > 0: self._log(f"πŸ“ Applying empty bubble dilation: {base_dilation_size}px, {empty_bubble_iterations} iterations", "info") 
empty_bubble_mask = cv2.dilate(empty_bubble_mask, kernel, iterations=empty_bubble_iterations) # Apply dilation to free text mask if free_text_count > 0 and free_text_iterations > 0: self._log(f"πŸ“ Applying free text dilation: {base_dilation_size}px, {free_text_iterations} iterations", "info") free_text_mask = cv2.dilate(free_text_mask, kernel, iterations=free_text_iterations) elif free_text_count > 0 and free_text_iterations == 0: self._log(f"πŸ“ No dilation for free text (iterations=0, perfect for B&W panels)", "info") # Combine all masks mask = cv2.bitwise_or(text_bubble_mask, empty_bubble_mask) mask = cv2.bitwise_or(mask, free_text_mask) coverage_percent = (np.sum(mask > 0) / mask.size) * 100 self._log(f"πŸ“Š Final mask coverage: {coverage_percent:.1f}% of image", "info") return mask def _get_or_init_shared_local_inpainter(self, local_method: str, model_path: str, force_reload: bool = False): """Return a shared LocalInpainter for (local_method, model_path) with minimal locking. If another thread is loading the same model, wait on its event instead of competing. Set force_reload=True only when the method or model_path actually changed. If spare instances are available in the pool, check one out for use. The instance will stay assigned to this translator until cleanup. """ from local_inpainter import LocalInpainter key = (local_method, model_path or '') # FIRST: Try to check out a spare instance if available (for true parallelism) # Don't pop it - instead mark it as 'in use' so it stays in memory with MangaTranslator._inpaint_pool_lock: rec = MangaTranslator._inpaint_pool.get(key) if rec and rec.get('spares'): spares = rec.get('spares') or [] # Initialize checked_out list if it doesn't exist if 'checked_out' not in rec: rec['checked_out'] = [] checked_out = rec['checked_out'] # Look for an available spare (not checked out) for spare in spares: if spare not in checked_out and spare and getattr(spare, 'model_loaded', False): # Mark as checked out checked_out.append(spare) self._log(f"🧰 Checked out spare inpainter ({len(checked_out)}/{len(spares)} in use)", "debug") # Store reference for later return self._checked_out_inpainter = spare self._inpainter_pool_key = key return spare # FALLBACK: Use the shared instance rec = MangaTranslator._inpaint_pool.get(key) if rec and rec.get('loaded') and rec.get('inpainter'): # Already loaded - do NOT force reload! return rec['inpainter'] # Create or wait for loader with MangaTranslator._inpaint_pool_lock: rec = MangaTranslator._inpaint_pool.get(key) if rec and rec.get('loaded') and rec.get('inpainter'): # Already loaded - do NOT force reload! 
return rec['inpainter'] if not rec: # Register loading record rec = {'inpainter': None, 'loaded': False, 'event': threading.Event()} MangaTranslator._inpaint_pool[key] = rec is_loader = True else: is_loader = False event = rec['event'] # Loader performs heavy work without holding the lock if is_loader: try: inp = LocalInpainter() # Apply tiling settings once to the shared instance tiling_settings = self.manga_settings.get('tiling', {}) inp.tiling_enabled = tiling_settings.get('enabled', False) inp.tile_size = tiling_settings.get('tile_size', 512) inp.tile_overlap = tiling_settings.get('tile_overlap', 64) # Ensure model path if not model_path or not os.path.exists(model_path): try: model_path = inp.download_jit_model(local_method) except Exception as e: self._log(f"⚠️ JIT download failed: {e}", "warning") model_path = None # Load model - NEVER force reload for first-time shared pool loading loaded_ok = False if model_path and os.path.exists(model_path): try: self._log(f"πŸ“¦ Loading inpainter model...", "debug") self._log(f" Method: {local_method}", "debug") self._log(f" Path: {model_path}", "debug") # Only force reload if explicitly requested AND this is not the first load # For shared pool, we should never force reload on initial load loaded_ok = inp.load_model_with_retry(local_method, model_path, force_reload=force_reload) if not loaded_ok: # Retry with force_reload if initial load failed self._log(f"πŸ”„ Initial load failed, retrying with force_reload=True", "warning") loaded_ok = inp.load_model_with_retry(local_method, model_path, force_reload=True) if not loaded_ok: self._log(f"❌ Both load attempts failed", "error") # Check file validity try: size_mb = os.path.getsize(model_path) / (1024 * 1024) self._log(f" File size: {size_mb:.2f} MB", "info") if size_mb < 1: self._log(f" ⚠️ File may be corrupted (too small)", "warning") except Exception: self._log(f" ⚠️ Could not read model file", "warning") except Exception as e: self._log(f"⚠️ Inpainter load exception: {e}", "warning") import traceback self._log(traceback.format_exc(), "debug") loaded_ok = False elif not model_path: self._log(f"⚠️ No model path configured for {local_method}", "warning") elif not os.path.exists(model_path): self._log(f"⚠️ Model file does not exist: {model_path}", "warning") # Publish result with MangaTranslator._inpaint_pool_lock: rec = MangaTranslator._inpaint_pool.get(key) or rec rec['inpainter'] = inp rec['loaded'] = bool(loaded_ok) rec['event'].set() return inp except Exception as e: with MangaTranslator._inpaint_pool_lock: rec = MangaTranslator._inpaint_pool.get(key) or rec rec['inpainter'] = None rec['loaded'] = False rec['event'].set() self._log(f"⚠️ Shared inpainter setup failed: {e}", "warning") return None else: # Wait for loader to finish (without holding the lock) success = event.wait(timeout=120) if not success: self._log(f"⏱️ Timeout waiting for inpainter to load (120s)", "warning") return None # Check if load was successful rec2 = MangaTranslator._inpaint_pool.get(key) if not rec2: self._log(f"⚠️ Inpainter pool record disappeared after load", "warning") return None inp = rec2.get('inpainter') loaded = rec2.get('loaded', False) if inp and loaded: # Successfully loaded by another thread return inp elif inp and not loaded: # Inpainter created but model failed to load # Try to load it ourselves self._log(f"⚠️ Inpainter exists but model not loaded, attempting to load", "debug") if model_path and os.path.exists(model_path): try: loaded_ok = inp.load_model_with_retry(local_method, model_path, 
force_reload=True) if loaded_ok: # Update the pool record with MangaTranslator._inpaint_pool_lock: rec2['loaded'] = True self._log(f"βœ… Successfully loaded model on retry in waiting thread", "info") return inp except Exception as e: self._log(f"❌ Failed to load in waiting thread: {e}", "warning") return inp # Return anyway, inpaint will no-op else: self._log(f"⚠️ Loader thread failed to create inpainter", "warning") return None @classmethod def _count_preloaded_inpainters(cls) -> int: try: with cls._inpaint_pool_lock: total = 0 for rec in cls._inpaint_pool.values(): try: total += len(rec.get('spares') or []) except Exception: pass return total except Exception: return 0 def preload_local_inpainters(self, local_method: str, model_path: str, count: int) -> int: """Preload N local inpainting instances sequentially into the shared pool for parallel panel translation. Returns the number of instances successfully preloaded. """ # Respect singleton mode: do not create extra instances/spares if getattr(self, 'use_singleton_models', False): try: self._log("🧰 Skipping local inpainting preload (singleton mode)", "debug") except Exception: pass return 0 try: from local_inpainter import LocalInpainter except Exception: self._log("❌ Local inpainter module not available for preloading", "error") return 0 key = (local_method, model_path or '') created = 0 # FIRST: Ensure the shared instance is initialized and ready # This prevents race conditions when spare instances run out with MangaTranslator._inpaint_pool_lock: rec = MangaTranslator._inpaint_pool.get(key) if not rec or not rec.get('loaded') or not rec.get('inpainter'): # Need to create the shared instance if not rec: rec = {'inpainter': None, 'loaded': False, 'event': threading.Event(), 'spares': []} MangaTranslator._inpaint_pool[key] = rec need_init_shared = True else: need_init_shared = not (rec.get('loaded') and rec.get('inpainter')) else: need_init_shared = False if need_init_shared: self._log(f"πŸ“¦ Initializing shared inpainter instance first...", "info") try: shared_inp = self._get_or_init_shared_local_inpainter(local_method, model_path, force_reload=False) if shared_inp and getattr(shared_inp, 'model_loaded', False): self._log(f"βœ… Shared instance initialized and model loaded", "info") # Verify the pool record is updated with MangaTranslator._inpaint_pool_lock: rec_check = MangaTranslator._inpaint_pool.get(key) if rec_check: self._log(f" Pool record: loaded={rec_check.get('loaded')}, has_inpainter={rec_check.get('inpainter') is not None}", "debug") else: self._log(f"⚠️ Shared instance initialization returned but model not loaded", "warning") if shared_inp: self._log(f" Instance exists but model_loaded={getattr(shared_inp, 'model_loaded', 'ATTR_MISSING')}", "debug") except Exception as e: self._log(f"⚠️ Shared instance initialization failed: {e}", "warning") import traceback self._log(traceback.format_exc(), "debug") # Ensure pool record and spares list exist with MangaTranslator._inpaint_pool_lock: rec = MangaTranslator._inpaint_pool.get(key) if not rec: rec = {'inpainter': None, 'loaded': False, 'event': threading.Event(), 'spares': []} MangaTranslator._inpaint_pool[key] = rec if 'spares' not in rec or rec['spares'] is None: rec['spares'] = [] spares = rec.get('spares') # Prepare tiling settings tiling_settings = self.manga_settings.get('tiling', {}) if hasattr(self, 'manga_settings') else {} desired = max(0, int(count) - len(spares)) if desired <= 0: return 0 ctx = " for parallel panels" if int(count) > 1 else "" self._log(f"🧰 Preloading 
{desired} local inpainting instance(s){ctx}", "info") for i in range(desired): try: inp = LocalInpainter() inp.tiling_enabled = tiling_settings.get('enabled', False) inp.tile_size = tiling_settings.get('tile_size', 512) inp.tile_overlap = tiling_settings.get('tile_overlap', 64) # Resolve model path if needed resolved = model_path if not resolved or not os.path.exists(resolved): try: resolved = inp.download_jit_model(local_method) except Exception as e: self._log(f"⚠️ Preload JIT download failed: {e}", "warning") resolved = None if resolved and os.path.exists(resolved): ok = inp.load_model_with_retry(local_method, resolved, force_reload=False) if ok and getattr(inp, 'model_loaded', False): with MangaTranslator._inpaint_pool_lock: rec = MangaTranslator._inpaint_pool.get(key) or {'spares': []} if 'spares' not in rec or rec['spares'] is None: rec['spares'] = [] rec['spares'].append(inp) MangaTranslator._inpaint_pool[key] = rec created += 1 elif ok and not getattr(inp, 'model_loaded', False): self._log(f"⚠️ Preload: load_model_with_retry returned True but model_loaded is False", "warning") elif not ok: self._log(f"⚠️ Preload: load_model_with_retry returned False", "warning") else: self._log("⚠️ Preload skipped: no model path available", "warning") except Exception as e: self._log(f"⚠️ Preload error: {e}", "warning") self._log(f"βœ… Preloaded {created} local inpainting instance(s)", "info") return created def preload_local_inpainters_concurrent(self, local_method: str, model_path: str, count: int, max_parallel: int = None) -> int: """Preload N local inpainting instances concurrently into the shared pool. Honors advanced toggles for panel/region parallelism to pick a reasonable parallelism. Returns number of instances successfully preloaded. """ # Respect singleton mode: do not create extra instances/spares if getattr(self, 'use_singleton_models', False): try: self._log("🧰 Skipping concurrent local inpainting preload (singleton mode)", "debug") except Exception: pass return 0 try: from local_inpainter import LocalInpainter except Exception: self._log("❌ Local inpainter module not available for preloading", "error") return 0 key = (local_method, model_path or '') # Determine desired number based on existing spares with MangaTranslator._inpaint_pool_lock: rec = MangaTranslator._inpaint_pool.get(key) if not rec: rec = {'inpainter': None, 'loaded': False, 'event': threading.Event(), 'spares': []} MangaTranslator._inpaint_pool[key] = rec spares = (rec.get('spares') or []) desired = max(0, int(count) - len(spares)) if desired <= 0: return 0 # Determine max_parallel from advanced settings if not provided if max_parallel is None: adv = {} try: adv = self.main_gui.config.get('manga_settings', {}).get('advanced', {}) if hasattr(self, 'main_gui') else {} except Exception: adv = {} if adv.get('parallel_panel_translation', False): try: max_parallel = max(1, int(adv.get('panel_max_workers', 2))) except Exception: max_parallel = 2 elif adv.get('parallel_processing', False): try: max_parallel = max(1, int(adv.get('max_workers', 4))) except Exception: max_parallel = 2 else: max_parallel = 1 max_parallel = max(1, min(int(max_parallel), int(desired))) ctx = " for parallel panels" if int(count) > 1 else "" self._log(f"🧰 Preloading {desired} local inpainting instance(s){ctx} (parallel={max_parallel})", "info") # Resolve model path once resolved_path = model_path if not resolved_path or not os.path.exists(resolved_path): try: probe_inp = LocalInpainter() resolved_path = probe_inp.download_jit_model(local_method) 
except Exception as e:
                self._log(f"⚠️ JIT download failed for concurrent preload: {e}", "warning")
                resolved_path = None
        tiling_settings = self.manga_settings.get('tiling', {}) if hasattr(self, 'manga_settings') else {}
        from concurrent.futures import ThreadPoolExecutor, as_completed
        created = 0

        def _one():
            try:
                inp = LocalInpainter()
                inp.tiling_enabled = tiling_settings.get('enabled', False)
                inp.tile_size = tiling_settings.get('tile_size', 512)
                inp.tile_overlap = tiling_settings.get('tile_overlap', 64)
                if resolved_path and os.path.exists(resolved_path):
                    ok = inp.load_model_with_retry(local_method, resolved_path, force_reload=False)
                    if ok and getattr(inp, 'model_loaded', False):
                        with MangaTranslator._inpaint_pool_lock:
                            rec2 = MangaTranslator._inpaint_pool.get(key) or {'spares': []}
                            if 'spares' not in rec2 or rec2['spares'] is None:
                                rec2['spares'] = []
                            rec2['spares'].append(inp)
                            MangaTranslator._inpaint_pool[key] = rec2
                        return True
            except Exception as e:
                self._log(f"⚠️ Concurrent preload error: {e}", "warning")
            return False

        with ThreadPoolExecutor(max_workers=max_parallel) as ex:
            futs = [ex.submit(_one) for _ in range(desired)]
            for f in as_completed(futs):
                try:
                    if f.result():
                        created += 1
                except Exception:
                    pass
        self._log(f"βœ… Preloaded {created} local inpainting instance(s)", "info")
        return created

    @classmethod
    def _count_preloaded_detectors(cls) -> int:
        try:
            with cls._detector_pool_lock:
                return sum(len((rec or {}).get('spares') or []) for rec in cls._detector_pool.values())
        except Exception:
            return 0

    @classmethod
    def get_preload_counters(cls) -> Dict[str, int]:
        """Return current counters for preloaded instances (for diagnostics/logging)."""
        try:
            with cls._inpaint_pool_lock:
                inpaint_spares = sum(len((rec or {}).get('spares') or []) for rec in cls._inpaint_pool.values())
                inpaint_keys = len(cls._inpaint_pool)
            with cls._detector_pool_lock:
                detector_spares = sum(len((rec or {}).get('spares') or []) for rec in cls._detector_pool.values())
                detector_keys = len(cls._detector_pool)
            return {
                'inpaint_spares': inpaint_spares,
                'inpaint_keys': inpaint_keys,
                'detector_spares': detector_spares,
                'detector_keys': detector_keys,
            }
        except Exception:
            return {'inpaint_spares': 0, 'inpaint_keys': 0, 'detector_spares': 0, 'detector_keys': 0}

    def preload_bubble_detectors(self, ocr_settings: Dict[str, Any], count: int) -> int:
        """Preload N bubble detector instances (non-singleton) for panel parallelism.

        Only applies when not using singleton models.
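
        A hedged usage sketch (the keys shown are the ones this method actually
        reads; the values are illustrative, not defaults):

            ocr_settings = {
                'detector_type': 'rtdetr_onnx',
                'rtdetr_model_url': '<model id or local path>',
            }
            n = translator.preload_bubble_detectors(ocr_settings, count=2)
            # n == number of spare detectors successfully loaded into the pool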
""" try: from bubble_detector import BubbleDetector except Exception: self._log("❌ BubbleDetector module not available for preloading", "error") return 0 # Skip if singleton mode if getattr(self, 'use_singleton_models', False): return 0 det_type = (ocr_settings or {}).get('detector_type', 'rtdetr_onnx') model_id = (ocr_settings or {}).get('rtdetr_model_url') or (ocr_settings or {}).get('bubble_model_path') or '' key = (det_type, model_id) created = 0 with MangaTranslator._detector_pool_lock: rec = MangaTranslator._detector_pool.get(key) if not rec: rec = {'spares': []} MangaTranslator._detector_pool[key] = rec spares = rec.get('spares') if spares is None: spares = [] rec['spares'] = spares desired = max(0, int(count) - len(spares)) if desired <= 0: return 0 self._log(f"🧰 Preloading {desired} bubble detector instance(s) [{det_type}]", "info") for i in range(desired): try: bd = BubbleDetector() ok = False if det_type == 'rtdetr_onnx': ok = bool(bd.load_rtdetr_onnx_model(model_id=model_id)) elif det_type == 'rtdetr': ok = bool(bd.load_rtdetr_model(model_id=model_id)) elif det_type == 'yolo': if model_id: ok = bool(bd.load_model(model_id)) else: # auto: prefer RT-DETR ok = bool(bd.load_rtdetr_model(model_id=model_id)) if ok: with MangaTranslator._detector_pool_lock: rec = MangaTranslator._detector_pool.get(key) or {'spares': []} if 'spares' not in rec or rec['spares'] is None: rec['spares'] = [] rec['spares'].append(bd) MangaTranslator._detector_pool[key] = rec created += 1 except Exception as e: self._log(f"⚠️ Bubble detector preload error: {e}", "warning") self._log(f"βœ… Preloaded {created} bubble detector instance(s)", "info") return created def _initialize_local_inpainter(self): """Initialize local inpainting if configured""" try: from local_inpainter import LocalInpainter, HybridInpainter, AnimeMangaInpaintModel # LOAD THE SETTINGS FROM CONFIG FIRST # The dialog saves it as 'manga_local_inpaint_model' at root level saved_local_method = self.main_gui.config.get('manga_local_inpaint_model', 'anime') saved_inpaint_method = self.main_gui.config.get('manga_inpaint_method', 'cloud') # MIGRATION: Ensure manga_ prefixed model path keys exist for ONNX methods # This fixes compatibility where model paths were saved without manga_ prefix for method_variant in ['anime', 'anime_onnx', 'lama', 'lama_onnx', 'aot', 'aot_onnx']: non_prefixed_key = f'{method_variant}_model_path' prefixed_key = f'manga_{method_variant}_model_path' # If we have the non-prefixed but not the prefixed, migrate it if non_prefixed_key in self.main_gui.config and prefixed_key not in self.main_gui.config: self.main_gui.config[prefixed_key] = self.main_gui.config[non_prefixed_key] self._log(f"πŸ”„ Migrated model path config: {non_prefixed_key} β†’ {prefixed_key}", "debug") # Update manga_settings with the saved values # ALWAYS use the top-level saved config to ensure correct model is loaded if 'inpainting' not in self.manga_settings: self.manga_settings['inpainting'] = {} # Always override with saved values from top-level config # This ensures the user's model selection in the settings dialog is respected self.manga_settings['inpainting']['method'] = saved_inpaint_method self.manga_settings['inpainting']['local_method'] = saved_local_method # Now get the values (they'll be correct now) inpaint_method = self.manga_settings.get('inpainting', {}).get('method', 'cloud') if inpaint_method == 'local': # This will now get the correct saved value local_method = self.manga_settings.get('inpainting', {}).get('local_method', 'anime') # Model 
path is saved with manga_ prefix - try both key formats for compatibility
            model_path = self.main_gui.config.get(f'manga_{local_method}_model_path', '')
            if not model_path:
                # Fallback to non-prefixed key (older format)
                model_path = self.main_gui.config.get(f'{local_method}_model_path', '')
            self._log(f"Using local method: {local_method} (loaded from config)", "info")

            # Check if we already have a loaded instance in the shared pool
            # This avoids unnecessary tracking and reloading
            inp_shared = self._get_or_init_shared_local_inpainter(local_method, model_path, force_reload=False)

            # Only track changes AFTER getting the shared instance
            # This prevents spurious reloads on first initialization
            # Default to False so the fallback path below never sees an undefined name
            need_reload = False
            if not hasattr(self, '_last_local_method'):
                self._last_local_method = local_method
                self._last_local_model_path = model_path
            else:
                # Check if settings actually changed and we need to force reload
                if self._last_local_method != local_method:
                    self._log(f"πŸ”„ Local method changed from {self._last_local_method} to {local_method}", "info")
                    need_reload = True
                    # If method changed, we need a different model - get it with force_reload
                    inp_shared = self._get_or_init_shared_local_inpainter(local_method, model_path, force_reload=True)
                elif self._last_local_model_path != model_path:
                    self._log(f"πŸ”„ Model path changed", "info")
                    if self._last_local_model_path:
                        self._log(f"   Old: {os.path.basename(self._last_local_model_path)}", "debug")
                    if model_path:
                        self._log(f"   New: {os.path.basename(model_path)}", "debug")
                    need_reload = True
                    # If path changed, reload the model
                    inp_shared = self._get_or_init_shared_local_inpainter(local_method, model_path, force_reload=True)
            # Update tracking only if changes were made
            if need_reload:
                self._last_local_method = local_method
                self._last_local_model_path = model_path

            if inp_shared is not None:
                self.local_inpainter = inp_shared
                if getattr(self.local_inpainter, 'model_loaded', False):
                    self._log(f"βœ… Using shared {local_method.upper()} inpainting model", "info")
                    return True
                else:
                    self._log(f"⚠️ Shared inpainter created but model not loaded", "warning")
                    self._log(f"πŸ”„ Attempting to retry model loading...", "info")
                    # Retry loading the model
                    if model_path and os.path.exists(model_path):
                        self._log(f"πŸ“¦ Model path: {model_path}", "info")
                        self._log(f"πŸ“‹ Method: {local_method}", "info")
                        try:
                            loaded_ok = inp_shared.load_model_with_retry(local_method, model_path, force_reload=True)
                            if loaded_ok and getattr(inp_shared, 'model_loaded', False):
                                self._log(f"βœ… Model loaded successfully on retry", "info")
                                return True
                            else:
                                self._log(f"❌ Model still not loaded after retry", "error")
                                # Check if model file exists and is valid
                                try:
                                    size_mb = os.path.getsize(model_path) / (1024 * 1024)
                                    self._log(f"πŸ“Š Model file size: {size_mb:.2f} MB", "info")
                                    if size_mb < 1:
                                        self._log(f"⚠️ Model file seems too small (< 1 MB) - may be corrupted", "warning")
                                except Exception:
                                    pass
                        except Exception as e:
                            self._log(f"❌ Retry load failed: {e}", "error")
                            import traceback
                            self._log(traceback.format_exc(), "debug")
                    elif not model_path:
                        self._log(f"❌ No model path provided", "error")
                    elif not os.path.exists(model_path):
                        self._log(f"❌ Model path does not exist: {model_path}", "error")
                        self._log(f"πŸ“₯ Tip: Try downloading the model from the Manga Settings dialog", "info")
                    # If retry failed, fall through to fallback logic below

            # Fall back to instance-level init only if shared init completely failed
            self._log("⚠️ Shared inpainter init failed, falling back to instance creation", "warning")
            try:
                from local_inpainter
import LocalInpainter # Create local inpainter instance self.local_inpainter = LocalInpainter() tiling_settings = self.manga_settings.get('tiling', {}) self.local_inpainter.tiling_enabled = tiling_settings.get('enabled', False) self.local_inpainter.tile_size = tiling_settings.get('tile_size', 512) self.local_inpainter.tile_overlap = tiling_settings.get('tile_overlap', 64) self._log(f"βœ… Set tiling: enabled={self.local_inpainter.tiling_enabled}, size={self.local_inpainter.tile_size}, overlap={self.local_inpainter.tile_overlap}", "info") # If no model path or doesn't exist, try to find or download one if not model_path or not os.path.exists(model_path): self._log(f"⚠️ Model path not found: {model_path}", "warning") self._log("πŸ“₯ Attempting to download JIT model...", "info") try: downloaded_path = self.local_inpainter.download_jit_model(local_method) except Exception as e: self._log(f"⚠️ JIT download failed: {e}", "warning") downloaded_path = None if downloaded_path: model_path = downloaded_path self._log(f"βœ… Downloaded JIT model to: {model_path}") else: self._log("⚠️ JIT model download did not return a path", "warning") # Load model with retry to avoid transient file/JSON issues under parallel init loaded_ok = False if model_path and os.path.exists(model_path): for attempt in range(2): try: self._log(f"πŸ“₯ Loading {local_method} model... (attempt {attempt+1})", "info") if self.local_inpainter.load_model(local_method, model_path, force_reload=need_reload): loaded_ok = True break except Exception as e: self._log(f"⚠️ Load attempt {attempt+1} failed: {e}", "warning") time.sleep(0.5) if loaded_ok: self._log(f"βœ… Local inpainter loaded with {local_method.upper()} (fallback instance)") else: self._log(f"⚠️ Failed to load model, but inpainter is ready", "warning") else: self._log(f"⚠️ No model available, but inpainter is initialized", "warning") return True except Exception as e: self._log(f"❌ Local inpainter module not available: {e}", "error") return False elif inpaint_method == 'hybrid': # Track hybrid settings changes if not hasattr(self, '_last_hybrid_config'): self._last_hybrid_config = None # Set tiling from tiling section tiling_settings = self.manga_settings.get('tiling', {}) self.local_inpainter.tiling_enabled = tiling_settings.get('enabled', False) self.local_inpainter.tile_size = tiling_settings.get('tile_size', 512) self.local_inpainter.tile_overlap = tiling_settings.get('tile_overlap', 64) self._log(f"βœ… Set tiling: enabled={self.local_inpainter.tiling_enabled}, size={self.local_inpainter.tile_size}, overlap={self.local_inpainter.tile_overlap}", "info") current_hybrid_config = self.manga_settings.get('inpainting', {}).get('hybrid_methods', []) # Check if hybrid config changed need_reload = self._last_hybrid_config != current_hybrid_config if need_reload: self._log("πŸ”„ Hybrid configuration changed, reloading...", "info") self.hybrid_inpainter = None # Clear old instance self._last_hybrid_config = current_hybrid_config.copy() if current_hybrid_config else [] if self.hybrid_inpainter is None: self.hybrid_inpainter = HybridInpainter() # REMOVED: No longer override tiling settings for HybridInpainter # Load multiple methods methods = self.manga_settings.get('inpainting', {}).get('hybrid_methods', []) loaded = 0 for method_config in methods: method = method_config.get('method') model_path = method_config.get('model_path') if method and model_path: if self.hybrid_inpainter.add_method(method, method, model_path): loaded += 1 self._log(f"βœ… Added {method.upper()} to hybrid 
inpainter") if loaded > 0: self._log(f"βœ… Hybrid inpainter ready with {loaded} methods") else: self._log("⚠️ Hybrid inpainter initialized but no methods loaded", "warning") return True return False except ImportError: self._log("❌ Local inpainter module not available", "error") return False except Exception as e: self._log(f"❌ Error initializing inpainter: {e}", "error") return False def inpaint_regions(self, image: np.ndarray, mask: np.ndarray) -> np.ndarray: """Inpaint using configured method (cloud, local, or hybrid)""" # Primary source of truth is the runtime flags set by the UI. if getattr(self, 'skip_inpainting', False): self._log(" ⏭️ Skipping inpainting (preserving original art)", "info") return image.copy() # Cloud mode explicitly selected in UI if getattr(self, 'use_cloud_inpainting', False): return self._cloud_inpaint(image, mask) # Hybrid mode if UI requested it (fallback to settings key if present) mode = getattr(self, 'inpaint_mode', None) or self.manga_settings.get('inpainting', {}).get('method') if mode == 'hybrid' and hasattr(self, 'hybrid_inpainter'): self._log(" πŸ”„ Using hybrid ensemble inpainting", "info") return self.hybrid_inpainter.inpaint_ensemble(image, mask) # If a background preload is running, wait until it's finished before inpainting try: if hasattr(self, '_inpaint_preload_event') and self._inpaint_preload_event and not self._inpaint_preload_event.is_set(): self._log(" ⏳ Waiting for local inpainting models to finish preloading...", "info") # Wait with a generous timeout, but proceed afterward regardless self._inpaint_preload_event.wait(timeout=300) except Exception: pass # Default to local inpainting local_method = self.manga_settings.get('inpainting', {}).get('local_method', 'anime') model_path = self.main_gui.config.get(f'manga_{local_method}_model_path', '') # Use a thread-local inpainter instance inp = self._get_thread_local_inpainter(local_method, model_path) if inp and getattr(inp, 'model_loaded', False): self._log(" 🧽 Using local inpainting", "info") return inp.inpaint(image, mask) else: # Conservative fallback: try shared instance only; do not attempt risky reloads that can corrupt output try: shared_inp = self._get_or_init_shared_local_inpainter(local_method, model_path) if shared_inp and getattr(shared_inp, 'model_loaded', False): self._log(" βœ… Using shared inpainting instance", "info") return shared_inp.inpaint(image, mask) except Exception: pass # RETRY LOGIC: Attempt to reload model with multiple strategies self._log(" ⚠️ Local inpainting model not loaded; attempting retry...", "warning") retry_attempts = [ {'force_reload': True, 'desc': 'force reload'}, {'force_reload': True, 'desc': 'force reload with delay', 'delay': 1.0}, {'force_reload': False, 'desc': 'standard reload'}, ] for attempt_num, retry_config in enumerate(retry_attempts, 1): try: self._log(f" πŸ”„ Retry attempt {attempt_num}/{len(retry_attempts)}: {retry_config['desc']}", "info") # Apply delay if specified if retry_config.get('delay'): import time time.sleep(retry_config['delay']) # Try to get or create a fresh inpainter instance retry_inp = self._get_or_init_shared_local_inpainter( local_method, model_path, force_reload=retry_config['force_reload'] ) if retry_inp: # Check if model is loaded if getattr(retry_inp, 'model_loaded', False): self._log(f" βœ… Model loaded successfully on retry attempt {attempt_num}", "info") return retry_inp.inpaint(image, mask) else: # Model exists but not loaded - try loading it directly self._log(f" πŸ”§ Model not loaded, attempting direct 
load...", "info") if model_path and os.path.exists(model_path): try: loaded_ok = retry_inp.load_model_with_retry( local_method, model_path, force_reload=True ) if loaded_ok and getattr(retry_inp, 'model_loaded', False): self._log(f" βœ… Direct load successful on attempt {attempt_num}", "info") return retry_inp.inpaint(image, mask) else: self._log(f" ⚠️ Direct load returned {loaded_ok}, model_loaded={getattr(retry_inp, 'model_loaded', False)}", "warning") except Exception as load_err: self._log(f" ⚠️ Direct load failed: {load_err}", "warning") else: if not model_path: self._log(f" ⚠️ No model path configured", "warning") elif not os.path.exists(model_path): self._log(f" ⚠️ Model file does not exist: {model_path}", "warning") else: self._log(f" ⚠️ Failed to get inpainter instance on attempt {attempt_num}", "warning") except Exception as retry_err: self._log(f" ⚠️ Retry attempt {attempt_num} failed: {retry_err}", "warning") import traceback self._log(traceback.format_exc(), "debug") # All retries exhausted - provide detailed diagnostic information self._log(" ❌ All retry attempts exhausted. Diagnostics:", "error") self._log(f" Method: {local_method}", "error") if model_path: self._log(f" Model path: {model_path}", "error") if os.path.exists(model_path): try: size_mb = os.path.getsize(model_path) / (1024 * 1024) self._log(f" File size: {size_mb:.2f} MB", "error") if size_mb < 1: self._log(f" ⚠️ File may be corrupted (too small)", "error") except Exception: self._log(f" ⚠️ Cannot read model file", "error") else: self._log(f" ⚠️ Model file does not exist", "error") else: self._log(f" ⚠️ No model path configured", "error") self._log(" πŸ’‘ Suggestion: Check Manga Settings and download the model if needed", "error") self._log(" ⚠️ Returning original image without inpainting", "warning") return image.copy() def _cloud_inpaint(self, image: np.ndarray, mask: np.ndarray) -> np.ndarray: """Use Replicate API for inpainting""" try: import requests import base64 from io import BytesIO from PIL import Image as PILImage import cv2 self._log(" ☁️ Cloud inpainting via Replicate API", "info") # Convert to PIL image_pil = PILImage.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) mask_pil = PILImage.fromarray(mask).convert('L') # Convert to base64 img_buffer = BytesIO() image_pil.save(img_buffer, format='PNG') img_base64 = base64.b64encode(img_buffer.getvalue()).decode() mask_buffer = BytesIO() mask_pil.save(mask_buffer, format='PNG') mask_base64 = base64.b64encode(mask_buffer.getvalue()).decode() # Get cloud settings cloud_settings = self.main_gui.config.get('manga_settings', {}) model_type = cloud_settings.get('cloud_inpaint_model', 'ideogram-v2') timeout = cloud_settings.get('cloud_timeout', 60) # Determine model identifier based on model type if model_type == 'ideogram-v2': model = 'ideogram-ai/ideogram-v2' self._log(f" Using Ideogram V2 inpainting model", "info") elif model_type == 'sd-inpainting': model = 'stability-ai/stable-diffusion-inpainting' self._log(f" Using Stable Diffusion inpainting model", "info") elif model_type == 'flux-inpainting': model = 'zsxkib/flux-dev-inpainting' self._log(f" Using FLUX inpainting model", "info") elif model_type == 'custom': model = cloud_settings.get('cloud_custom_version', '') if not model: raise Exception("No custom model identifier specified") self._log(f" Using custom model: {model}", "info") else: # Default to Ideogram V2 model = 'ideogram-ai/ideogram-v2' self._log(f" Using default Ideogram V2 model", "info") # Build input data based on model type input_data = { 
'image': f'data:image/png;base64,{img_base64}', 'mask': f'data:image/png;base64,{mask_base64}' } # Add prompt settings for models that support them if model_type in ['ideogram-v2', 'sd-inpainting', 'flux-inpainting', 'custom']: prompt = cloud_settings.get('cloud_inpaint_prompt', 'clean background, smooth surface') input_data['prompt'] = prompt self._log(f" Prompt: {prompt}", "info") # SD-specific parameters if model_type == 'sd-inpainting': negative_prompt = cloud_settings.get('cloud_negative_prompt', 'text, writing, letters') input_data['negative_prompt'] = negative_prompt input_data['num_inference_steps'] = cloud_settings.get('cloud_inference_steps', 20) self._log(f" Negative prompt: {negative_prompt}", "info") # Get the latest version of the model headers = { 'Authorization': f'Token {self.replicate_api_key}', 'Content-Type': 'application/json' } # First, get the latest version of the model model_response = requests.get( f'https://api.replicate.com/v1/models/{model}', headers=headers ) if model_response.status_code != 200: # If model lookup fails, try direct prediction with model identifier self._log(f" Model lookup returned {model_response.status_code}, trying direct prediction", "warning") version = None else: model_info = model_response.json() version = model_info.get('latest_version', {}).get('id') if not version: raise Exception(f"Could not get version for model {model}") # Create prediction prediction_data = { 'input': input_data } if version: prediction_data['version'] = version else: # For custom models, try extracting version from model string if ':' in model: # Format: owner/model:version model_name, version_id = model.split(':', 1) prediction_data['version'] = version_id else: raise Exception(f"Could not determine version for model {model}. Try using format: owner/model:version") response = requests.post( 'https://api.replicate.com/v1/predictions', headers=headers, json=prediction_data ) if response.status_code != 201: raise Exception(f"API error: {response.text}") # Get prediction URL prediction = response.json() prediction_url = prediction.get('urls', {}).get('get') or prediction.get('id') if not prediction_url: raise Exception("No prediction URL returned") # If we only got an ID, construct the URL if not prediction_url.startswith('http'): prediction_url = f'https://api.replicate.com/v1/predictions/{prediction_url}' # Poll for result with configured timeout import time for i in range(timeout): response = requests.get(prediction_url, headers=headers) result = response.json() # Log progress every 5 seconds if i % 5 == 0 and i > 0: self._log(f" ⏳ Still processing... 
({i}s elapsed)", "info") if result['status'] == 'succeeded': # Download result image (handle both single URL and list) output = result.get('output') if not output: raise Exception("No output returned from model") if isinstance(output, list): output_url = output[0] if output else None else: output_url = output if not output_url: raise Exception("No output URL in result") img_response = requests.get(output_url) # Convert back to numpy result_pil = PILImage.open(BytesIO(img_response.content)) result_bgr = cv2.cvtColor(np.array(result_pil), cv2.COLOR_RGB2BGR) self._log(" βœ… Cloud inpainting completed", "success") return result_bgr elif result['status'] == 'failed': error_msg = result.get('error', 'Unknown error') # Check for common errors if 'version' in error_msg.lower(): error_msg += f" (Try using the model identifier '{model}' in the custom field)" raise Exception(f"Inpainting failed: {error_msg}") time.sleep(1) raise Exception(f"Timeout waiting for inpainting (>{timeout}s)") except Exception as e: self._log(f" ❌ Cloud inpainting failed: {str(e)}", "error") return image.copy() def _regions_overlap(self, region1: TextRegion, region2: TextRegion) -> bool: """Check if two regions overlap""" x1, y1, w1, h1 = region1.bounding_box x2, y2, w2, h2 = region2.bounding_box # Check if rectangles overlap if (x1 + w1 < x2 or x2 + w2 < x1 or y1 + h1 < y2 or y2 + h2 < y1): return False return True def _calculate_overlap_area(self, region1: TextRegion, region2: TextRegion) -> float: """Calculate the area of overlap between two regions""" x1, y1, w1, h1 = region1.bounding_box x2, y2, w2, h2 = region2.bounding_box # Calculate intersection x_left = max(x1, x2) y_top = max(y1, y2) x_right = min(x1 + w1, x2 + w2) y_bottom = min(y1 + h1, y2 + h2) if x_right < x_left or y_bottom < y_top: return 0.0 return (x_right - x_left) * (y_bottom - y_top) def _adjust_overlapping_regions(self, regions: List[TextRegion], image_width: int, image_height: int) -> List[TextRegion]: """Adjust positions of overlapping regions to prevent overlap while preserving text mapping""" if len(regions) <= 1: return regions # Create a copy of regions with preserved indices adjusted_regions = [] for idx, region in enumerate(regions): # Create a new TextRegion with copied values adjusted_region = TextRegion( text=region.text, vertices=list(region.vertices), bounding_box=list(region.bounding_box), confidence=region.confidence, region_type=region.region_type ) if hasattr(region, 'translated_text'): adjusted_region.translated_text = region.translated_text # IMPORTANT: Preserve original index to maintain text mapping adjusted_region.original_index = idx adjusted_region.original_bbox = tuple(region.bounding_box) # Store original position adjusted_regions.append(adjusted_region) # DON'T SORT - This breaks the text-to-region mapping! 
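        # Illustrative check of the overlap helpers above (comment only, not
        # executed): two 100x100 boxes offset by 50px in both axes overlap in a
        # 50x50 patch, so _calculate_overlap_area returns 2500.
        #
        #     a = TextRegion("a", [], (0, 0, 100, 100), 1.0, "text_block")
        #     b = TextRegion("b", [], (50, 50, 100, 100), 1.0, "text_block")
        #     assert self._regions_overlap(a, b)
        #     assert self._calculate_overlap_area(a, b) == 50 * 50
        #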
# Process in original order to maintain associations # Track which regions have been moved to avoid cascade effects moved_regions = set() # Adjust overlapping regions for i in range(len(adjusted_regions)): if i in moved_regions: continue # Skip if already moved for j in range(i + 1, len(adjusted_regions)): if j in moved_regions: continue # Skip if already moved region1 = adjusted_regions[i] region2 = adjusted_regions[j] if self._regions_overlap(region1, region2): x1, y1, w1, h1 = region1.bounding_box x2, y2, w2, h2 = region2.bounding_box # Calculate centers using ORIGINAL positions for better logic orig_x1, orig_y1, _, _ = region1.original_bbox orig_x2, orig_y2, _, _ = region2.original_bbox # Determine which region to move based on original positions # Move the one that's naturally "later" in reading order if orig_y2 > orig_y1 + h1/2: # region2 is below # Move region2 down slightly min_gap = 10 new_y2 = y1 + h1 + min_gap if new_y2 + h2 <= image_height: region2.bounding_box = (x2, new_y2, w2, h2) moved_regions.add(j) self._log(f" πŸ“ Adjusted region {j} down (preserving order)", "debug") elif orig_y1 > orig_y2 + h2/2: # region1 is below # Move region1 down slightly min_gap = 10 new_y1 = y2 + h2 + min_gap if new_y1 + h1 <= image_height: region1.bounding_box = (x1, new_y1, w1, h1) moved_regions.add(i) self._log(f" πŸ“ Adjusted region {i} down (preserving order)", "debug") elif orig_x2 > orig_x1 + w1/2: # region2 is to the right # Move region2 right slightly min_gap = 10 new_x2 = x1 + w1 + min_gap if new_x2 + w2 <= image_width: region2.bounding_box = (new_x2, y2, w2, h2) moved_regions.add(j) self._log(f" πŸ“ Adjusted region {j} right (preserving order)", "debug") else: # Minimal adjustment - just separate them slightly # without changing their relative order min_gap = 5 if y2 >= y1: # region2 is lower or same level new_y2 = y2 + min_gap if new_y2 + h2 <= image_height: region2.bounding_box = (x2, new_y2, w2, h2) moved_regions.add(j) else: # region1 is lower new_y1 = y1 + min_gap if new_y1 + h1 <= image_height: region1.bounding_box = (x1, new_y1, w1, h1) moved_regions.add(i) # IMPORTANT: Return in ORIGINAL order to preserve text mapping # Sort by original_index to restore the original order adjusted_regions.sort(key=lambda r: r.original_index) return adjusted_regions # Emote-only mixed font fallback (Meiryo) β€” primary font remains unchanged def _get_emote_fallback_font(self, font_size: int): """Return a Meiryo Bold fallback font if available (preferred), else Meiryo. Does not change the primary font; used only for emote glyphs. 
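
        Illustrative usage (returns None when Meiryo is not installed, e.g. off
        Windows, so callers must handle the None case; 'draw' here is assumed to
        be a PIL ImageDraw):

            emote_font = self._get_emote_fallback_font(24)
            if emote_font is not None:
                draw.text((0, 0), '\u2665', font=emote_font, fill='black')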
""" try: from PIL import ImageFont as _ImageFont import os as _os # Prefer Meiryo Bold TTC first; try common face indices, then regular Meiryo candidates = [ ("C:/Windows/Fonts/meiryob.ttc", [0,1,2,3]), # Meiryo Bold (and variants) TTC ("C:/Windows/Fonts/meiryo.ttc", [1,0,2,3]), # Try bold-ish index first if present ] for path, idxs in candidates: if _os.path.exists(path): for idx in idxs: try: return _ImageFont.truetype(path, font_size, index=idx) except Exception: continue return None except Exception: return None def _is_emote_char(self, ch: str) -> bool: # Strict whitelist of emote-like symbols to render with Meiryo EMOTES = set([ '\u2661', # β™‘ '\u2665', # β™₯ '\u2764', # ❀ '\u2605', # β˜… '\u2606', # β˜† '\u266A', # β™ͺ '\u266B', # β™« '\u203B', # β€» ]) return ch in EMOTES def _line_width_emote_mixed(self, draw, text: str, primary_font, emote_font) -> int: if not emote_font: bbox = draw.textbbox((0, 0), text, font=primary_font) return (bbox[2] - bbox[0]) w = 0 i = 0 while i < len(text): ch = text[i] # Treat VS16/VS15 as zero-width modifiers if ch in ('\ufe0f', '\ufe0e'): i += 1 continue f = emote_font if self._is_emote_char(ch) else primary_font try: bbox = draw.textbbox((0, 0), ch, font=f) w += (bbox[2] - bbox[0]) except Exception: w += max(1, int(getattr(primary_font, 'size', 12) * 0.6)) i += 1 return w def _draw_text_line_emote_mixed(self, draw, line: str, x: int, y: int, primary_font, emote_font, fill_rgba, outline_rgba, outline_width: int, shadow_enabled: bool, shadow_color_rgba, shadow_off): cur_x = x i = 0 while i < len(line): ch = line[i] if ch in ('\ufe0f', '\ufe0e'): i += 1 continue f = emote_font if (emote_font and self._is_emote_char(ch)) else primary_font # measure try: bbox = draw.textbbox((0, 0), ch, font=f) cw = bbox[2] - bbox[0] except Exception: cw = max(1, int(getattr(primary_font, 'size', 12) * 0.6)) # shadow if shadow_enabled: sx, sy = shadow_off draw.text((cur_x + sx, y + sy), ch, font=f, fill=shadow_color_rgba) # outline if outline_width > 0: for dx in range(-outline_width, outline_width + 1): for dy in range(-outline_width, outline_width + 1): if dx == 0 and dy == 0: continue draw.text((cur_x + dx, y + dy), ch, font=f, fill=outline_rgba) # main draw.text((cur_x, y), ch, font=f, fill=fill_rgba) cur_x += cw i += 1 def render_translated_text(self, image: np.ndarray, regions: List[TextRegion]) -> np.ndarray: """Enhanced text rendering with customizable backgrounds and styles""" self._log(f"\n🎨 Starting ENHANCED text rendering with custom settings:", "info") self._log(f" βœ… Using ENHANCED renderer (not the simple version)", "info") self._log(f" Background: {self.text_bg_style} @ {int(self.text_bg_opacity/255*100)}% opacity", "info") self._log(f" Text color: RGB{self.text_color}", "info") self._log(f" Shadow: {'Enabled' if self.shadow_enabled else 'Disabled'}", "info") self._log(f" Font: {os.path.basename(self.selected_font_style) if self.selected_font_style else 'Default'}", "info") if self.force_caps_lock: self._log(f" Force Caps Lock: ENABLED", "info") # Convert to PIL for text rendering import cv2 pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) # Get image dimensions for boundary checking image_height, image_width = image.shape[:2] # Create text mask to get accurate render boundaries # This represents what will actually be inpainted try: text_mask = self.create_text_mask(image, regions) use_mask_for_rendering = True self._log(f" 🎭 Created text mask for accurate render boundaries", "info") except Exception as e: text_mask = None 
use_mask_for_rendering = False if not getattr(self, 'concise_logs', False): self._log(f" ⚠️ Failed to create mask, using polygon bounds: {e}", "warning") # Only adjust overlapping regions if constraining to bubbles if self.constrain_to_bubble: adjusted_regions = self._adjust_overlapping_regions(regions, image_width, image_height) else: # Skip adjustment when not constraining (allows overflow) adjusted_regions = regions self._log(" πŸ“ Using original regions (overflow allowed)", "info") # Check if any regions still overlap after adjustment (shouldn't happen, but let's verify) has_overlaps = False for i, region1 in enumerate(adjusted_regions): for region2 in adjusted_regions[i+1:]: if self._regions_overlap(region1, region2): has_overlaps = True self._log(" ⚠️ Regions still overlap after adjustment", "warning") break if has_overlaps: break # Handle transparency settings based on overlaps if has_overlaps and self.text_bg_opacity < 255 and self.text_bg_opacity > 0: self._log(" ⚠️ Overlapping regions detected with partial transparency", "warning") self._log(" ℹ️ Rendering with requested transparency level", "info") region_count = 0 # Decide rendering path based on transparency needs # For full transparency (opacity = 0) or no overlaps, use RGBA rendering # For overlaps with partial transparency, we still use RGBA to honor user settings use_rgba_rendering = True # Always use RGBA for consistent transparency support if use_rgba_rendering: # Transparency-enabled rendering path pil_image = pil_image.convert('RGBA') # Decide parallel rendering from advanced settings try: adv = getattr(self, 'manga_settings', {}).get('advanced', {}) if hasattr(self, 'manga_settings') else {} except Exception: adv = {} render_parallel = bool(adv.get('render_parallel', True)) max_workers = None try: max_workers = int(adv.get('max_workers', 4)) except Exception: max_workers = 4 def _render_one(region, idx): # Build a separate overlay for this region from PIL import Image as _PIL overlay = _PIL.new('RGBA', pil_image.size, (0,0,0,0)) draw = ImageDraw.Draw(overlay) # Work on local copy of text for caps lock tr_text = region.translated_text or '' if self.force_caps_lock: tr_text = tr_text.upper() # Get original bounding box x, y, w, h = region.bounding_box # CRITICAL: Always prefer mask bounds when available (most accurate) # Mask bounds are especially important for Azure/Google without RT-DETR, # where OCR polygons are unreliable. 
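            # Illustrative note (an assumption, not necessarily how get_safe_text_area
            # is implemented): deriving a render box from a binary mask typically
            # reduces to a bounding rectangle over the nonzero pixels, e.g.:
            #
            #     ys, xs = np.nonzero(text_mask[y:y + h, x:x + w])
            #     if xs.size:
            #         safe = (x + xs.min(), y + ys.min(),
            #                 xs.max() - xs.min() + 1, ys.max() - ys.min() + 1)
            #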
if use_mask_for_rendering and text_mask is not None: # Use mask bounds directly - most accurate method safe_x, safe_y, safe_w, safe_h = self.get_safe_text_area( region, use_mask_bounds=True, full_mask=text_mask ) render_x, render_y, render_w, render_h = safe_x, safe_y, safe_w, safe_h elif hasattr(region, 'vertices') and region.vertices: # Fallback: use polygon-based safe area (for RT-DETR regions) safe_x, safe_y, safe_w, safe_h = self.get_safe_text_area(region, use_mask_bounds=False) render_x, render_y, render_w, render_h = safe_x, safe_y, safe_w, safe_h else: # Last resort: use simple bounding box render_x, render_y, render_w, render_h = x, y, w, h # Fit text - use render dimensions for proper sizing if self.custom_font_size: font_size = self.custom_font_size lines = self._wrap_text(tr_text, self._get_font(font_size), render_w, draw) elif self.font_size_mode == 'multiplier': # Pass use_as_is=True since render dimensions are already safe area font_size, lines = self._fit_text_to_region(tr_text, render_w, render_h, draw, region, use_as_is=True) else: # Pass use_as_is=True since render dimensions are already safe area font_size, lines = self._fit_text_to_region(tr_text, render_w, render_h, draw, region, use_as_is=True) # Fonts font = self._get_font(font_size) emote_font = self._get_emote_fallback_font(font_size) # Layout - use render dimensions (safe area if available) # CRITICAL: Use actual text bbox height for accurate positioning line_height = font_size * 1.2 # Calculate actual total height using text bbox for first line as reference if lines: sample_bbox = draw.textbbox((0, 0), lines[0] if lines[0] else "Ay", font=font) actual_line_height = sample_bbox[3] - sample_bbox[1] # Use the larger of: computed line_height or actual_line_height line_height = max(line_height, actual_line_height * 1.1) total_height = len(lines) * line_height # Ensure text doesn't overflow vertically - constrain start_y ideal_start_y = render_y + (render_h - total_height) // 2 # Make sure text starts within render area and doesn't extend past bottom max_start_y = render_y + render_h - total_height start_y = max(render_y, min(ideal_start_y, max_start_y)) # Debug logging for vertical constraint if not getattr(self, 'concise_logs', False): end_y = start_y + total_height render_end_y = render_y + render_h overflow = max(0, end_y - render_end_y) if overflow > 0: self._log(f" ⚠️ Text would overflow by {overflow}px, constrained to render area", "debug") self._log(f" πŸ“ Render area: y={render_y}-{render_end_y} (h={render_h}), Text: y={start_y}-{end_y} (h={total_height:.0f})", "debug") # BG - use render dimensions draw_bg = self.text_bg_opacity > 0 try: if draw_bg and getattr(self, 'free_text_only_bg_opacity', False): draw_bg = self._is_free_text_region(region) except Exception: pass if draw_bg: self._draw_text_background(draw, render_x, render_y, render_w, render_h, lines, font, font_size, start_y, emote_font) # Text - use render dimensions for centering for i, line in enumerate(lines): if emote_font is not None: text_width = self._line_width_emote_mixed(draw, line, font, emote_font) else: tb = draw.textbbox((0,0), line, font=font) text_width = tb[2]-tb[0] tx = render_x + (render_w - text_width)//2 ty = start_y + i*line_height ow = max(1, font_size // self.outline_width_factor) if emote_font is not None: self._draw_text_line_emote_mixed(draw, line, tx, ty, font, emote_font, self.text_color + (255,), self.outline_color + (255,), ow, self.shadow_enabled, self.shadow_color + (255,) if isinstance(self.shadow_color, tuple) and 
len(self.shadow_color)==3 else (0,0,0,255), (self.shadow_offset_x, self.shadow_offset_y)) else: if self.shadow_enabled: self._draw_text_shadow(draw, tx, ty, line, font) for dx in range(-ow, ow+1): for dy in range(-ow, ow+1): if dx!=0 or dy!=0: draw.text((tx+dx, ty+dy), line, font=font, fill=self.outline_color + (255,)) draw.text((tx, ty), line, font=font, fill=self.text_color + (255,)) return overlay overlays = [] if render_parallel and len(adjusted_regions) > 1: from concurrent.futures import ThreadPoolExecutor, as_completed workers = max(1, min(max_workers, len(adjusted_regions))) with ThreadPoolExecutor(max_workers=workers) as ex: fut_to_idx = {ex.submit(_render_one, r, i): i for i, r in enumerate(adjusted_regions) if r.translated_text} # Collect in order temp = {} for fut in as_completed(fut_to_idx): i = fut_to_idx[fut] try: temp[i] = fut.result() except Exception: temp[i] = None overlays = [temp.get(i) for i in range(len(adjusted_regions))] else: for i, r in enumerate(adjusted_regions): if not r.translated_text: overlays.append(None) continue overlays.append(_render_one(r, i)) # Composite overlays sequentially for ov in overlays: if ov is not None: pil_image = Image.alpha_composite(pil_image, ov) region_count += 1 # Convert back to RGB pil_image = pil_image.convert('RGB') else: # This path is now deprecated but kept for backwards compatibility # Direct rendering without transparency layers draw = ImageDraw.Draw(pil_image) for region in adjusted_regions: if not region.translated_text: continue self._log(f"DEBUG: Rendering - Original: '{region.text[:30]}...' -> Translated: '{region.translated_text[:30]}...'", "debug") # APPLY CAPS LOCK TRANSFORMATION HERE if self.force_caps_lock: region.translated_text = region.translated_text.upper() region_count += 1 self._log(f" Rendering region {region_count}: {region.translated_text[:30]}...", "info") # Get original bounding box x, y, w, h = region.bounding_box # CRITICAL: Always prefer mask bounds when available (most accurate) # Mask bounds are especially important for Azure/Google without RT-DETR, # where OCR polygons are unreliable. 
if use_mask_for_rendering and text_mask is not None: # Use mask bounds directly - most accurate method safe_x, safe_y, safe_w, safe_h = self.get_safe_text_area( region, use_mask_bounds=True, full_mask=text_mask ) render_x, render_y, render_w, render_h = safe_x, safe_y, safe_w, safe_h elif hasattr(region, 'vertices') and region.vertices: # Fallback: use polygon-based safe area (for RT-DETR regions) safe_x, safe_y, safe_w, safe_h = self.get_safe_text_area(region, use_mask_bounds=False) render_x, render_y, render_w, render_h = safe_x, safe_y, safe_w, safe_h else: # Last resort: use simple bounding box render_x, render_y, render_w, render_h = x, y, w, h # Find optimal font size - use render dimensions for proper sizing if self.custom_font_size: font_size = self.custom_font_size lines = self._wrap_text(region.translated_text, self._get_font(font_size), render_w, draw) else: # Pass use_as_is=True since render dimensions are already safe area font_size, lines = self._fit_text_to_region( region.translated_text, render_w, render_h, draw, region, use_as_is=True ) # Load font font = self._get_font(font_size) # Calculate text layout - use render dimensions # CRITICAL: Use actual text bbox height for accurate positioning line_height = font_size * 1.2 # Calculate actual total height using text bbox for first line as reference if lines: sample_bbox = draw.textbbox((0, 0), lines[0] if lines[0] else "Ay", font=font) actual_line_height = sample_bbox[3] - sample_bbox[1] # Use the larger of: computed line_height or actual_line_height line_height = max(line_height, actual_line_height * 1.1) total_height = len(lines) * line_height # Ensure text doesn't overflow vertically - constrain start_y ideal_start_y = render_y + (render_h - total_height) // 2 # Make sure text starts within render area and doesn't extend past bottom max_start_y = render_y + render_h - total_height start_y = max(render_y, min(ideal_start_y, max_start_y)) # Draw opaque background (optionally only for free text) - use render dimensions draw_bg = self.text_bg_opacity > 0 try: if draw_bg and getattr(self, 'free_text_only_bg_opacity', False): draw_bg = self._is_free_text_region(region) except Exception: pass if draw_bg: self._draw_text_background(draw, render_x, render_y, render_w, render_h, lines, font, font_size, start_y) # Draw text - use render dimensions for i, line in enumerate(lines): # Mixed fallback not supported in legacy path; keep primary measurement text_bbox = draw.textbbox((0, 0), line, font=font) text_width = text_bbox[2] - text_bbox[0] text_x = render_x + (render_w - text_width) // 2 text_y = start_y + i * line_height if self.shadow_enabled: self._draw_text_shadow(draw, text_x, text_y, line, font) outline_width = max(1, font_size // self.outline_width_factor) # Draw outline for dx in range(-outline_width, outline_width + 1): for dy in range(-outline_width, outline_width + 1): if dx != 0 or dy != 0: draw.text((text_x + dx, text_y + dy), line, font=font, fill=self.outline_color) # Draw main text draw.text((text_x, text_y), line, font=font, fill=self.text_color) # Convert back to numpy array result = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR) self._log(f"βœ… ENHANCED text rendering complete - rendered {region_count} regions", "info") return result def _is_free_text_region(self, region) -> bool: """Heuristic: determine if the region is free text (not a bubble). Uses bubble_type when available; otherwise falls back to aspect ratio heuristics. 
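
        Example of the fallback heuristic (illustrative numbers): a 300x60 region
        has aspect ratio 5.0 >= 2.5 and is treated as free text, while a 120x120
        region (aspect 1.0, height >= 50) is assumed to be a bubble.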
""" try: if hasattr(region, 'bubble_type') and region.bubble_type: return region.bubble_type == 'free_text' # Fallback heuristic x, y, w, h = region.bounding_box w, h = int(w), int(h) if h <= 0: return True aspect = w / max(1, h) # Wider, shorter regions are often free text return aspect >= 2.5 or h < 50 except Exception: return False def _draw_text_background(self, draw: ImageDraw, x: int, y: int, w: int, h: int, lines: List[str], font: ImageFont, font_size: int, start_y: int, emote_font: ImageFont = None): """Draw background behind text with selected style. If emote_font is provided, measure lines with emote-only mixing. """ # Early return if opacity is 0 (fully transparent) if self.text_bg_opacity == 0: return # Calculate actual text bounds line_height = font_size * 1.2 max_width = 0 for line in lines: if emote_font is not None: line_width = self._line_width_emote_mixed(draw, line, font, emote_font) else: bbox = draw.textbbox((0, 0), line, font=font) line_width = bbox[2] - bbox[0] max_width = max(max_width, line_width) # Apply size reduction padding = int(font_size * 0.3) bg_width = int((max_width + padding * 2) * self.text_bg_reduction) bg_height = int((len(lines) * line_height + padding * 2) * self.text_bg_reduction) # Center background bg_x = x + (w - bg_width) // 2 bg_y = int(start_y - padding) # Create semi-transparent color bg_color = (255, 255, 255, self.text_bg_opacity) if self.text_bg_style == 'box': # Rounded rectangle radius = min(20, bg_width // 10, bg_height // 10) self._draw_rounded_rectangle(draw, bg_x, bg_y, bg_x + bg_width, bg_y + bg_height, radius, bg_color) elif self.text_bg_style == 'circle': # Ellipse that encompasses the text center_x = bg_x + bg_width // 2 center_y = bg_y + bg_height // 2 # Make it slightly wider to look more natural ellipse_width = int(bg_width * 1.2) ellipse_height = bg_height draw.ellipse([center_x - ellipse_width // 2, center_y - ellipse_height // 2, center_x + ellipse_width // 2, center_y + ellipse_height // 2], fill=bg_color) elif self.text_bg_style == 'wrap': # Individual background for each line for i, line in enumerate(lines): bbox = draw.textbbox((0, 0), line, font=font) line_width = bbox[2] - bbox[0] line_bg_width = int((line_width + padding) * self.text_bg_reduction) line_bg_x = x + (w - line_bg_width) // 2 line_bg_y = int(start_y + i * line_height - padding // 2) line_bg_height = int(line_height + padding // 2) # Draw rounded rectangle for each line radius = min(10, line_bg_width // 10, line_bg_height // 10) self._draw_rounded_rectangle(draw, line_bg_x, line_bg_y, line_bg_x + line_bg_width, line_bg_y + line_bg_height, radius, bg_color) def _draw_text_shadow(self, draw: ImageDraw, x: int, y: int, text: str, font: ImageFont): """Draw text shadow with optional blur effect""" if self.shadow_blur == 0: # Simple sharp shadow shadow_x = x + self.shadow_offset_x shadow_y = y + self.shadow_offset_y draw.text((shadow_x, shadow_y), text, font=font, fill=self.shadow_color) else: # Blurred shadow (simulated with multiple layers) blur_range = self.shadow_blur opacity_step = 80 // (blur_range + 1) # Distribute opacity across blur layers for blur_offset in range(blur_range, 0, -1): layer_opacity = opacity_step * (blur_range - blur_offset + 1) shadow_color_with_opacity = self.shadow_color + (layer_opacity,) # Draw shadow at multiple positions for blur effect for dx in range(-blur_offset, blur_offset + 1): for dy in range(-blur_offset, blur_offset + 1): if dx*dx + dy*dy <= blur_offset*blur_offset: # Circular blur shadow_x = x + self.shadow_offset_x + 
dx shadow_y = y + self.shadow_offset_y + dy draw.text((shadow_x, shadow_y), text, font=font, fill=shadow_color_with_opacity) def _draw_rounded_rectangle(self, draw: ImageDraw, x1: int, y1: int, x2: int, y2: int, radius: int, fill): """Draw a rounded rectangle""" # Draw the main rectangle draw.rectangle([x1 + radius, y1, x2 - radius, y2], fill=fill) draw.rectangle([x1, y1 + radius, x2, y2 - radius], fill=fill) # Draw the corners draw.pieslice([x1, y1, x1 + 2 * radius, y1 + 2 * radius], 180, 270, fill=fill) draw.pieslice([x2 - 2 * radius, y1, x2, y1 + 2 * radius], 270, 360, fill=fill) draw.pieslice([x1, y2 - 2 * radius, x1 + 2 * radius, y2], 90, 180, fill=fill) draw.pieslice([x2 - 2 * radius, y2 - 2 * radius, x2, y2], 0, 90, fill=fill) def _get_font(self, font_size: int) -> ImageFont: """Get font with specified size, using selected style if available""" font_path = self.selected_font_style or self.font_path if font_path: try: return ImageFont.truetype(font_path, font_size) except: pass return ImageFont.load_default() def _pil_word_wrap(self, text: str, font_path: str, roi_width: int, roi_height: int, init_font_size: int, min_font_size: int, draw: ImageDraw) -> Tuple[str, int]: """Comic-translate's pil_word_wrap algorithm - top-down font sizing with column wrapping. Break long text to multiple lines, and reduce point size until all text fits within bounds. This is a direct port from comic-translate for better text fitting. """ from hyphen_textwrap import wrap as hyphen_wrap mutable_message = text font_size = init_font_size def eval_metrics(txt, font): """Calculate width/height of multiline text. CRITICAL: Must match the rendering logic exactly to prevent overflow. Rendering uses font_size * 1.2 as line_height, so we must do the same here. """ lines = txt.split('\n') if not lines: return (0, 0) max_width = 0 for line in lines: bbox = draw.textbbox((0, 0), line if line else "A", font=font) line_width = bbox[2] - bbox[0] max_width = max(max_width, line_width) # Calculate height using same logic as rendering: # line_height = max(font_size * 1.2, actual_bbox_height * 1.1) sample_bbox = draw.textbbox((0, 0), lines[0] if lines[0] else "Ay", font=font) actual_line_height = sample_bbox[3] - sample_bbox[1] line_height = max(font_size * 1.2, actual_line_height * 1.1) total_height = len(lines) * line_height return (max_width, total_height) # Get initial font try: if font_path: font = ImageFont.truetype(font_path, font_size) else: font = ImageFont.load_default() except Exception: font = ImageFont.load_default() # Top-down algorithm: start with large font, shrink until it fits while font_size > min_font_size: try: if font_path: font = ImageFont.truetype(font_path, font_size) else: font = ImageFont.load_default() except Exception: font = ImageFont.load_default() width, height = eval_metrics(mutable_message, font) if height > roi_height: # Text is too tall, reduce font size font_size -= 0.75 mutable_message = text # Restore original text elif width > roi_width: # Text is too wide, try wrapping with column optimization columns = len(mutable_message) # Search for optimal column width while columns > 0: columns -= 1 if columns == 0: break # Use hyphen_wrap for smart wrapping try: wrapped = '\n'.join(hyphen_wrap( text, columns, break_on_hyphens=False, break_long_words=False, hyphenate_broken_words=True )) wrapped_width, _ = eval_metrics(wrapped, font) if wrapped_width <= roi_width: mutable_message = wrapped break except Exception: # Fallback to simple wrapping if hyphen_wrap fails break if columns < 1: # 
Couldn't find good column width, reduce font size font_size -= 0.75 mutable_message = text # Restore original text else: # Text fits! break # If we hit minimum font size, do brute-force optimization if font_size <= min_font_size: font_size = min_font_size mutable_message = text try: if font_path: font = ImageFont.truetype(font_path, font_size) else: font = ImageFont.load_default() except Exception: font = ImageFont.load_default() # Brute force: minimize cost function (width - roi_width)^2 + (height - roi_height)^2 min_cost = 1e9 min_text = text for columns in range(1, min(len(text) + 1, 100)): # Limit iterations for performance try: wrapped_text = '\n'.join(hyphen_wrap( text, columns, break_on_hyphens=False, break_long_words=False, hyphenate_broken_words=True )) wrapped_width, wrapped_height = eval_metrics(wrapped_text, font) cost = (wrapped_width - roi_width)**2 + (wrapped_height - roi_height)**2 if cost < min_cost: min_cost = cost min_text = wrapped_text except Exception: continue mutable_message = min_text return mutable_message, int(font_size) def get_mask_bounds(self, region: TextRegion, full_mask: np.ndarray) -> Tuple[int, int, int, int]: """Extract the actual mask boundaries for a region. For non-Azure/Google OCR providers (manga-ocr, etc.), use RT-DETR bubble_bounds directly. For Azure/Google, extract from the mask overlap to handle full-page OCR. """ # PRIORITY 1: For manga-ocr and other RT-DETR-guided OCR providers, use bubble_bounds directly # These providers already OCR within RT-DETR bubbles, so bubble_bounds IS the correct render area is_azure_google = getattr(self, 'ocr_provider', '').lower() in ('azure', 'google') if not is_azure_google and hasattr(region, 'bubble_bounds') and region.bubble_bounds: # Use the RT-DETR bubble bounds directly - this is the full bubble area bx, by, bw, bh = region.bubble_bounds if not getattr(self, 'concise_logs', False): self._log(f" βœ… Using RT-DETR bubble_bounds for mask: {int(bw)}Γ—{int(bh)} at ({int(bx)}, {int(by)})", "debug") return int(bx), int(by), int(bw), int(bh) elif not is_azure_google: # Debug: Why are we not using bubble_bounds? 
            if not getattr(self, 'concise_logs', False):
                has_attr = hasattr(region, 'bubble_bounds')
                is_none = getattr(region, 'bubble_bounds', None) is None if has_attr else True
                #self._log(f"   ⚠️ manga-ocr but NO bubble_bounds (has_attr={has_attr}, is_none={is_none})", "warning")

        # PRIORITY 2: For Azure/Google or when bubble_bounds not available, extract from mask
        if full_mask is not None:
            try:
                import cv2
                import numpy as np

                # Create a blank mask for this region
                region_mask = np.zeros(full_mask.shape, dtype=np.uint8)

                # Fill the region's area in the mask
                if hasattr(region, 'vertices') and region.vertices:
                    vertices_np = np.array(region.vertices, dtype=np.int32)
                    cv2.fillPoly(region_mask, [vertices_np], 255)
                else:
                    x, y, w, h = region.bounding_box
                    cv2.rectangle(region_mask, (int(x), int(y)), (int(x + w), int(y + h)), 255, -1)

                # Find where this region overlaps with the full mask
                overlap = cv2.bitwise_and(region_mask, full_mask)

                # Get bounding box of the overlap
                contours, _ = cv2.findContours(overlap, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                if contours:
                    # Get the largest contour (should be the main text region)
                    largest_contour = max(contours, key=cv2.contourArea)
                    x, y, w, h = cv2.boundingRect(largest_contour)
                    if w > 0 and h > 0:
                        return x, y, w, h
            except Exception as e:
                if not getattr(self, 'concise_logs', False):
                    self._log(f"   ⚠️ Failed to extract mask bounds: {e}, falling back", "debug")

        # Fallback to original bounding box
        x, y, w, h = region.bounding_box
        return int(x), int(y), int(w), int(h)

    def get_safe_text_area(self, region: TextRegion, use_mask_bounds: bool = False,
                           full_mask: np.ndarray = None) -> Tuple[int, int, int, int]:
        """Get safe text area with algorithm-aware shrink strategy.

        Respects font_algorithm and auto_fit_style settings:
        - conservative: Comic-translate's 15% shrink (85% usable)
        - smart: Adaptive 10-20% shrink based on bubble shape
        - aggressive: Minimal 5% shrink (95% usable)

        Also applies OCR-specific adjustments for Azure/Google without RT-DETR guidance.
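        Worked example (illustrative): with font_algorithm='smart' and
        auto_fit_style='balanced', base_margin is 0.87, so a 200x100 box is
        shrunk to a centered safe area of:

            >>> int(200 * 0.87), int(100 * 0.87)
            (174, 87)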
Args: region: The text region to calculate safe area for use_mask_bounds: If True, use actual mask boundaries instead of shrinking from polygon full_mask: The complete mask image (required if use_mask_bounds=True) """ # Get font sizing settings from config try: manga_settings = self.main_gui.config.get('manga_settings', {}) font_sizing = manga_settings.get('font_sizing', {}) rendering = manga_settings.get('rendering', {}) font_algorithm = font_sizing.get('algorithm', 'smart') auto_fit_style = rendering.get('auto_fit_style', 'balanced') # Check if using Azure/Google without RT-DETR guidance ocr_settings = manga_settings.get('ocr', {}) use_rtdetr_guide = ocr_settings.get('use_rtdetr_for_ocr_regions', True) is_azure_google = getattr(self, 'ocr_provider', '').lower() in ('azure', 'google') needs_aggressive = is_azure_google and not use_rtdetr_guide except Exception: font_algorithm = 'smart' auto_fit_style = 'balanced' needs_aggressive = False # Base margin factor by algorithm if font_algorithm == 'conservative': # Comic-translate default: 15% shrink = 85% usable base_margin = 0.85 elif font_algorithm == 'aggressive': # Aggressive: 5% shrink = 95% usable base_margin = 0.95 else: # 'smart' # Smart: adaptive based on auto_fit_style if auto_fit_style == 'compact': base_margin = 0.82 # 18% shrink - tight fit elif auto_fit_style == 'readable': base_margin = 0.92 # 8% shrink - loose fit else: # 'balanced' base_margin = 0.87 # 13% shrink - balanced # SPECIAL CASE: Azure/Google without RT-DETR guidance # Their OCR is too conservative, so we need more aggressive sizing if needs_aggressive: # Boost margin by 5-8% to compensate for conservative OCR bounds base_margin = min(0.98, base_margin + 0.08) self._log(f" 🎯 Azure/Google non-RT-DETR mode: Using aggressive {int(base_margin*100)}% margin", "debug") # OPTION 1: Use mask boundaries directly (most accurate) if use_mask_bounds and full_mask is not None: mask_x, mask_y, mask_w, mask_h = self.get_mask_bounds(region, full_mask) # Use the FULL mask bounds directly - the mask already represents the accurate # inpainted area from the inpainting process. The inpainting itself already includes # padding/margins, so we don't need to shrink further. Using 100% maximizes text # utilization and prevents the "text too small" issue. 
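        # Illustrative contrast (hypothetical numbers): a 300x150 mask region is
        # used as-is here (300x150), while the polygon path further below would
        # shrink the same box to about 261x130 at the default 0.87 'smart'/'balanced' margin.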
# CRITICAL: Use 100% of mask area for maximum text utilization # The inpainting mask already has built-in margins from the mask generation process safe_x, safe_y, safe_w, safe_h = mask_x, mask_y, mask_w, mask_h if not getattr(self, 'concise_logs', False): self._log(f" πŸ“ Using FULL mask bounds: {mask_w}Γ—{mask_h} (100% utilization)", "debug") self._log(f" Mask position: ({mask_x}, {mask_y})", "debug") if hasattr(region, 'bounding_box'): orig_x, orig_y, orig_w, orig_h = region.bounding_box self._log(f" Original bbox: {orig_w}Γ—{orig_h} at ({orig_x}, {orig_y})", "debug") return safe_x, safe_y, safe_w, safe_h # OPTION 2: Handle regions without vertices (simple bounding box) if not hasattr(region, 'vertices') or not region.vertices: x, y, w, h = region.bounding_box safe_width = int(w * base_margin) safe_height = int(h * base_margin) safe_x = x + (w - safe_width) // 2 safe_y = y + (h - safe_height) // 2 return safe_x, safe_y, safe_width, safe_height # Calculate convexity for shape-aware adjustment (only for 'smart' algorithm) margin_factor = base_margin if font_algorithm == 'smart': try: # Convert vertices to numpy array with correct dtype vertices = np.array(region.vertices, dtype=np.int32) hull = cv2.convexHull(vertices) hull_area = cv2.contourArea(hull) poly_area = cv2.contourArea(vertices) if poly_area > 0: convexity = hull_area / poly_area else: convexity = 1.0 # Adjust margin based on bubble shape if convexity < 0.85: # Speech bubble with tail # More aggressive shrink for tailed bubbles (avoid the tail) margin_factor = base_margin - 0.10 if not getattr(self, 'concise_logs', False): self._log(f" Speech bubble with tail: {int(margin_factor*100)}% usable area", "debug") elif convexity > 0.98: # Rectangular/square # Less shrink for rectangular regions margin_factor = base_margin + 0.05 if not getattr(self, 'concise_logs', False): self._log(f" Rectangular region: {int(margin_factor*100)}% usable area", "debug") else: # Regular oval bubble # Use base margin margin_factor = base_margin if not getattr(self, 'concise_logs', False): self._log(f" Regular bubble: {int(margin_factor*100)}% usable area", "debug") # Clamp margin factor margin_factor = max(0.70, min(0.98, margin_factor)) except Exception: margin_factor = base_margin # Convert vertices to numpy array for boundingRect vertices_np = np.array(region.vertices, dtype=np.int32) x, y, w, h = cv2.boundingRect(vertices_np) safe_width = int(w * margin_factor) safe_height = int(h * margin_factor) safe_x = x + (w - safe_width) // 2 safe_y = y + (h - safe_height) // 2 return safe_x, safe_y, safe_width, safe_height def _fit_text_to_region(self, text: str, max_width: int, max_height: int, draw: ImageDraw, region: TextRegion = None, use_as_is: bool = False) -> Tuple[int, List[str]]: """Find optimal font size using comic-translate's pil_word_wrap algorithm with algorithm-aware adjustments Args: text: Text to fit max_width: Maximum width available max_height: Maximum height available draw: PIL ImageDraw object region: Optional TextRegion for safe area calculation use_as_is: If True, use max_width/max_height directly without further shrinking """ # Get font sizing settings try: manga_settings = self.main_gui.config.get('manga_settings', {}) font_sizing = manga_settings.get('font_sizing', {}) font_algorithm = font_sizing.get('algorithm', 'smart') prefer_larger = font_sizing.get('prefer_larger', True) except Exception: font_algorithm = 'smart' prefer_larger = True # Get usable area if use_as_is: # Dimensions are already safe area - use them directly (no 
double shrinking) usable_width = max_width usable_height = max_height elif region and hasattr(region, 'vertices') and region.vertices: # Calculate safe area from region safe_x, safe_y, safe_width, safe_height = self.get_safe_text_area(region) usable_width = safe_width usable_height = safe_height else: # Fallback: use algorithm-aware margin if font_algorithm == 'conservative': margin = 0.85 # Comic-translate default elif font_algorithm == 'aggressive': margin = 0.95 else: # smart margin = 0.87 usable_width = int(max_width * margin) usable_height = int(max_height * margin) # Font size limits (GUI settings with algorithm adjustments) min_font_size = max(10, self.min_readable_size) # Adjust initial font size based on algorithm and prefer_larger base_init = min(40, self.max_font_size_limit) if font_algorithm == 'aggressive' and prefer_larger: # Start higher for aggressive mode init_font_size = min(int(base_init * 1.2), self.max_font_size_limit) elif font_algorithm == 'conservative': # Start lower for conservative mode init_font_size = int(base_init * 0.9) else: init_font_size = base_init # Use comic-translate's pil_word_wrap algorithm wrapped_text, final_font_size = self._pil_word_wrap( text=text, font_path=self.selected_font_style or self.font_path, roi_width=usable_width, roi_height=usable_height, init_font_size=init_font_size, min_font_size=min_font_size, draw=draw ) # Convert wrapped text to lines lines = wrapped_text.split('\n') if wrapped_text else [text] # Log font algorithm used (debug) if not getattr(self, 'concise_logs', False): self._log(f" Font algorithm: {font_algorithm}, init_size: {init_font_size}, final_size: {final_font_size}", "debug") # Apply multiplier if in multiplier mode if self.font_size_mode == 'multiplier': target_size = int(final_font_size * self.font_size_multiplier) # Check if multiplied size still fits (if constrained) if self.constrain_to_bubble: # Re-wrap at target size to check fit test_wrapped, _ = self._pil_word_wrap( text=text, font_path=self.selected_font_style or self.font_path, roi_width=usable_width, roi_height=usable_height, init_font_size=target_size, min_font_size=target_size, # Force this size draw=draw ) test_lines = test_wrapped.split('\n') if test_wrapped else [text] test_height = len(test_lines) * target_size * 1.2 if test_height <= usable_height: final_font_size = target_size lines = test_lines else: self._log(f" Multiplier {self.font_size_multiplier}x would exceed bubble", "debug") else: # Not constrained, use multiplied size final_font_size = target_size lines = wrapped_text.split('\n') if wrapped_text else [text] self._log(f" Font sizing: text_len={len(text)}, size={final_font_size}, lines={len(lines)}", "debug") return final_font_size, lines def _fit_text_simple_topdown(self, text: str, usable_width: int, usable_height: int, draw: ImageDraw, min_size: int, max_size: int) -> Tuple[int, List[str]]: """Simple top-down approach - start large and shrink only if needed""" # Start from a reasonable large size start_size = int(max_size * 0.8) for font_size in range(start_size, min_size - 1, -2): # Step by 2 for speed font = self._get_font(font_size) lines = self._wrap_text(text, font, usable_width, draw) line_height = font_size * 1.2 # Tighter for overlaps total_height = len(lines) * line_height if total_height <= usable_height: return font_size, lines # If nothing fits, use minimum font = self._get_font(min_size) lines = self._wrap_text(text, font, usable_width, draw) return min_size, lines def _check_potential_overlap(self, region: TextRegion) -> 
bool: """Check if this region might overlap with others based on position""" if not region or not hasattr(region, 'bounding_box'): return False x, y, w, h = region.bounding_box # Simple heuristic: small regions or regions at edges might overlap # You can make this smarter based on your needs if w < 100 or h < 50: # Small bubbles often overlap return True # Add more overlap detection logic here if needed # For now, default to no overlap for larger bubbles return False def _wrap_text(self, text: str, font: ImageFont, max_width: int, draw: ImageDraw) -> List[str]: """Wrap text to fit within max_width with optional strict wrapping""" # Handle empty text if not text.strip(): return [] # Only enforce width check if constrain_to_bubble is enabled if self.constrain_to_bubble and max_width <= 0: self._log(f" ⚠️ Invalid max_width: {max_width}, using fallback", "warning") return [text[:20] + "..."] if len(text) > 20 else [text] words = text.split() lines = [] current_line = [] for word in words: # Check if word alone is too long word_bbox = draw.textbbox((0, 0), word, font=font) word_width = word_bbox[2] - word_bbox[0] if word_width > max_width and len(word) > 1: # Word is too long for the bubble if current_line: # Save current line first lines.append(' '.join(current_line)) current_line = [] if self.strict_text_wrapping: # STRICT MODE: Force break the word to fit within bubble # This is the original behavior that ensures text stays within bounds broken_parts = self._force_break_word(word, font, max_width, draw) lines.extend(broken_parts) else: # RELAXED MODE: Keep word whole (may exceed bubble) lines.append(word) # self._log(f" ⚠️ Word '{word}' exceeds bubble width, keeping whole", "warning") else: # Normal word processing if current_line: test_line = ' '.join(current_line + [word]) else: test_line = word text_bbox = draw.textbbox((0, 0), test_line, font=font) text_width = text_bbox[2] - text_bbox[0] if text_width <= max_width: current_line.append(word) else: if current_line: lines.append(' '.join(current_line)) current_line = [word] else: # Single word that fits lines.append(word) if current_line: lines.append(' '.join(current_line)) return lines # Keep the existing _force_break_word method as is (the complete version from earlier): def _force_break_word(self, word: str, font: ImageFont, max_width: int, draw: ImageDraw) -> List[str]: """Force break a word that's too long to fit""" lines = [] # Binary search to find how many characters fit low = 1 high = len(word) chars_that_fit = 1 while low <= high: mid = (low + high) // 2 test_text = word[:mid] bbox = draw.textbbox((0, 0), test_text, font=font) width = bbox[2] - bbox[0] if width <= max_width: chars_that_fit = mid low = mid + 1 else: high = mid - 1 # Break the word into pieces remaining = word while remaining: if len(remaining) <= chars_that_fit: # Last piece lines.append(remaining) break else: # Find the best break point break_at = chars_that_fit # Try to break at a more natural point if possible # Look for vowel-consonant boundaries for better hyphenation for i in range(min(chars_that_fit, len(remaining) - 1), max(1, chars_that_fit - 5), -1): if i < len(remaining) - 1: current_char = remaining[i].lower() next_char = remaining[i + 1].lower() # Good hyphenation points: # - Between consonant and vowel # - After prefix (un-, re-, pre-, etc.) # - Before suffix (-ing, -ed, -er, etc.) 
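                            # Example (illustrative): with chars_that_fit=6, "translation"
                            # scans back from index 6 for a boundary and, if the hyphen
                            # still fits, breaks as "transla-" + "tion" (break_at = 7).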
                            if (current_char in 'bcdfghjklmnpqrstvwxyz' and next_char in 'aeiou') or \
                               (current_char in 'aeiou' and next_char in 'bcdfghjklmnpqrstvwxyz'):
                                break_at = i + 1
                                break

                # Add hyphen if we're breaking in the middle of a word
                if break_at < len(remaining):
                    # Check if adding hyphen still fits
                    test_with_hyphen = remaining[:break_at] + '-'
                    bbox = draw.textbbox((0, 0), test_with_hyphen, font=font)
                    width = bbox[2] - bbox[0]
                    if width <= max_width:
                        lines.append(remaining[:break_at] + '-')
                    else:
                        # Hyphen doesn't fit, break without it
                        lines.append(remaining[:break_at])
                else:
                    lines.append(remaining[:break_at])

                remaining = remaining[break_at:]

        return lines

    def _estimate_font_size_for_region(self, region: TextRegion) -> int:
        """Estimate the likely font size for a text region based on its dimensions and text content"""
        x, y, w, h = region.bounding_box
        text_length = len(region.text.strip())

        if text_length == 0:
            return self.max_font_size // 2  # Default middle size

        # Calculate area per character
        area = w * h
        area_per_char = area / text_length

        # Estimate font size based on area per character
        # These ratios are approximate and based on typical manga text
        if area_per_char > 800:
            estimated_size = int(self.max_font_size * 0.8)
        elif area_per_char > 400:
            estimated_size = int(self.max_font_size * 0.6)
        elif area_per_char > 200:
            estimated_size = int(self.max_font_size * 0.4)
        elif area_per_char > 100:
            estimated_size = int(self.max_font_size * 0.3)
        else:
            estimated_size = int(self.max_font_size * 0.2)

        # Clamp to reasonable bounds
        return max(self.min_font_size, min(estimated_size, self.max_font_size))

    def _split_bubble_if_needed(self, bubble_regions: List[TextRegion]) -> List[List[TextRegion]]:
        """Split a detected bubble if it actually contains multiple separate speech bubbles

        This happens when RT-DETR detects one large bounding box over vertically or
        horizontally stacked speech bubbles. We detect this by checking if text regions
        within the bubble have LARGE gaps between them.

        For manga-ocr and other non-Google/Azure OCR providers, RT-DETR detection is
        trusted completely and splitting is disabled.
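        Gap sketch (illustrative numbers): the grouping below uses the Euclidean
        distance between edge gaps, e.g.

            >>> round((30 ** 2 + 60 ** 2) ** 0.5)   # h-gap=30px, v-gap=60px
            67

        and 67 > 21, so absent strong overlap these regions land in separate groups.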
Returns: List of region groups - each group represents a separate bubble """ # For manga-ocr and other providers that use RT-DETR regions directly, trust RT-DETR # Splitting is only needed for Google/Azure which do full-page OCR if hasattr(self, 'ocr_provider') and self.ocr_provider not in ('google', 'azure'): return [bubble_regions] # Trust RT-DETR completely for these providers if len(bubble_regions) <= 1: return [bubble_regions] # Single region, no splitting needed # Sort regions by position (top-to-bottom, left-to-right) sorted_regions = sorted(bubble_regions, key=lambda r: (r.bounding_box[1], r.bounding_box[0])) # Group regions that should be together groups = [[sorted_regions[0]]] for i in range(1, len(sorted_regions)): current_region = sorted_regions[i] cx, cy, cw, ch = current_region.bounding_box placed = False # Try to place in an existing group for group in groups: # Check if current region should be in this group # We look at the closest region in the group min_gap = float('inf') min_vertical_gap = float('inf') min_horizontal_gap = float('inf') closest_region = None for group_region in group: gx, gy, gw, gh = group_region.bounding_box # Calculate gap between regions horizontal_gap = 0 if gx + gw < cx: horizontal_gap = cx - (gx + gw) elif cx + cw < gx: horizontal_gap = gx - (cx + cw) vertical_gap = 0 if gy + gh < cy: vertical_gap = cy - (gy + gh) elif cy + ch < gy: vertical_gap = gy - (cy + ch) # Use Euclidean distance as overall gap measure gap = (horizontal_gap ** 2 + vertical_gap ** 2) ** 0.5 if gap < min_gap: min_gap = gap closest_region = group_region # Store individual gaps for aggressive vertical splitting min_vertical_gap = vertical_gap min_horizontal_gap = horizontal_gap # AGGRESSIVE SPLIT for MANGA: Check for large vertical gaps first # Manga often has vertically stacked speech bubbles that RT-DETR detects as one if closest_region and min_vertical_gap > 50: # Large vertical gap (>50px) - likely separate bubbles stacked vertically # Check if there's NO vertical overlap (completely separate) gx, gy, gw, gh = closest_region.bounding_box vertical_overlap = min(gy + gh, cy + ch) - max(gy, cy) if vertical_overlap <= 0: # No vertical overlap at all - definitely separate bubbles # Create new group (don't merge) pass # Will create new group below else: # Some overlap despite gap - check other criteria horizontal_overlap = min(gx + gw, cx + cw) - max(gx, cx) min_width = min(gw, cw) min_height = min(gh, ch) # Only merge if there's very strong overlap (>75%) if (horizontal_overlap > min_width * 0.75 or vertical_overlap > min_height * 0.75): group.append(current_region) placed = True break # BALANCED SPLIT CRITERIA: # Split if gap is > 21px unless there's strong overlap (>62%) elif closest_region and min_gap < 21: # Within 21px - likely same bubble group.append(current_region) placed = True break elif closest_region: # Check if they have significant overlap despite the gap gx, gy, gw, gh = closest_region.bounding_box horizontal_overlap = min(gx + gw, cx + cw) - max(gx, cx) vertical_overlap = min(gy + gh, cy + ch) - max(gy, cy) min_width = min(gw, cw) min_height = min(gh, ch) # If they have strong overlap (>62%) in either direction, keep together if (horizontal_overlap > min_width * 0.62 or vertical_overlap > min_height * 0.62): group.append(current_region) placed = True break # If not placed in any existing group, create a new group if not placed: groups.append([current_region]) # Log if we split the bubble if len(groups) > 1: self._log(f" πŸ”ͺ SPLIT: Detected bubble actually 
contains {len(groups)} separate bubbles", "warning") for idx, group in enumerate(groups): group_texts = [r.text[:15] + '...' for r in group] self._log(f" Sub-bubble {idx + 1}: {len(group)} regions - {group_texts}", "info") return groups def _likely_different_bubbles(self, region1: TextRegion, region2: TextRegion) -> bool: """Detect if regions are likely in different speech bubbles based on spatial patterns""" x1, y1, w1, h1 = region1.bounding_box x2, y2, w2, h2 = region2.bounding_box # Calculate gaps and positions horizontal_gap = 0 if x1 + w1 < x2: horizontal_gap = x2 - (x1 + w1) elif x2 + w2 < x1: horizontal_gap = x1 - (x2 + w2) vertical_gap = 0 if y1 + h1 < y2: vertical_gap = y2 - (y1 + h1) elif y2 + h2 < y1: vertical_gap = y1 - (y2 + h2) # Calculate relative positions center_x1 = x1 + w1 / 2 center_x2 = x2 + w2 / 2 center_y1 = y1 + h1 / 2 center_y2 = y2 + h2 / 2 horizontal_center_diff = abs(center_x1 - center_x2) avg_width = (w1 + w2) / 2 # FIRST CHECK: Very small gaps always indicate same bubble if horizontal_gap < 15 and vertical_gap < 15: return False # Definitely same bubble # STRICTER CHECK: For regions that are horizontally far apart # Even if they pass the gap threshold, check if they're likely different bubbles if horizontal_gap > 40: # Significant horizontal gap # Unless they're VERY well aligned vertically, they're different bubbles vertical_overlap = min(y1 + h1, y2 + h2) - max(y1, y2) min_height = min(h1, h2) if vertical_overlap < min_height * 0.8: # Need 80% overlap to be same bubble return True # SPECIFIC FIX: Check for multi-line text pattern # If regions are well-aligned horizontally, they're likely in the same bubble if horizontal_center_diff < avg_width * 0.35: # Relaxed from 0.2 to 0.35 # Additional checks for multi-line text: # 1. Similar widths (common in speech bubbles) width_ratio = max(w1, w2) / min(w1, w2) if min(w1, w2) > 0 else 999 # 2. 
Reasonable vertical spacing (not too far apart) avg_height = (h1 + h2) / 2 if width_ratio < 2.0 and vertical_gap < avg_height * 1.5: # This is very likely multi-line text in the same bubble return False # Pattern 1: Side-by-side bubbles (common in manga) # Characteristics: Significant horizontal gap, similar vertical position if horizontal_gap > 50: # Increased from 25 to avoid false positives vertical_overlap = min(y1 + h1, y2 + h2) - max(y1, y2) min_height = min(h1, h2) # If they have good vertical overlap, they're likely side-by-side bubbles if vertical_overlap > min_height * 0.5: return True # Pattern 2: Stacked bubbles # Characteristics: Significant vertical gap, similar horizontal position # CRITICAL: Lower threshold to catch vertically stacked bubbles in manga if vertical_gap > 15: # Reduced from 25 to catch closer stacked bubbles horizontal_overlap = min(x1 + w1, x2 + w2) - max(x1, x2) min_width = min(w1, w2) # If they have good horizontal overlap, they're likely stacked bubbles if horizontal_overlap > min_width * 0.5: return True # Pattern 3: Diagonal arrangement (different speakers) # If regions are separated both horizontally and vertically if horizontal_gap > 20 and vertical_gap > 20: return True # Pattern 4: Large gap relative to region size avg_height = (h1 + h2) / 2 if horizontal_gap > avg_width * 0.6 or vertical_gap > avg_height * 0.6: return True return False def _regions_should_merge(self, region1: TextRegion, region2: TextRegion, threshold: int = 50) -> bool: """Determine if two regions should be merged - with bubble detection""" # First check if they're close enough spatially if not self._regions_are_nearby(region1, region2, threshold): return False x1, y1, w1, h1 = region1.bounding_box x2, y2, w2, h2 = region2.bounding_box # ONLY apply special handling if regions are from Azure if hasattr(region1, 'from_azure') and region1.from_azure: # Azure lines are typically small - be more lenient avg_height = (h1 + h2) / 2 if avg_height < 50: # Likely single lines self._log(f" Azure lines detected, using lenient merge criteria", "info") center_x1 = x1 + w1 / 2 center_x2 = x2 + w2 / 2 horizontal_center_diff = abs(center_x1 - center_x2) avg_width = (w1 + w2) / 2 # If horizontally aligned and nearby, merge them if horizontal_center_diff < avg_width * 0.7: return True # GOOGLE LOGIC - unchanged from your original # SPECIAL CASE: If one region is very small, bypass strict checks area1 = w1 * h1 area2 = w2 * h2 if area1 < 500 or area2 < 500: self._log(f" Small text region (area: {min(area1, area2)}), bypassing strict alignment checks", "info") return True # Calculate actual gaps between regions horizontal_gap = 0 if x1 + w1 < x2: horizontal_gap = x2 - (x1 + w1) elif x2 + w2 < x1: horizontal_gap = x1 - (x2 + w2) vertical_gap = 0 if y1 + h1 < y2: vertical_gap = y2 - (y1 + h1) elif y2 + h2 < y1: vertical_gap = y1 - (y2 + h2) # Calculate centers for alignment checks center_x1 = x1 + w1 / 2 center_x2 = x2 + w2 / 2 center_y1 = y1 + h1 / 2 center_y2 = y2 + h2 / 2 horizontal_center_diff = abs(center_x1 - center_x2) vertical_center_diff = abs(center_y1 - center_y2) avg_width = (w1 + w2) / 2 avg_height = (h1 + h2) / 2 # Determine text orientation and layout is_horizontal_text = horizontal_gap > vertical_gap or (horizontal_center_diff < avg_width * 0.5) is_vertical_text = vertical_gap > horizontal_gap or (vertical_center_diff < avg_height * 0.5) # PRELIMINARY CHECK: If regions overlap or are extremely close, merge them # This handles text that's clearly in the same bubble # Check for overlap 
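        # Illustrative numbers: boxes (0, 0, 100, 50) and (80, 30, 100, 50) give
        # overlap_x = min(100, 180) - max(0, 80) = 20 and
        # overlap_y = min(50, 80) - max(0, 30) = 20, so has_overlap is True.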
overlap_x = max(0, min(x1 + w1, x2 + w2) - max(x1, x2)) overlap_y = max(0, min(y1 + h1, y2 + h2) - max(y1, y2)) has_overlap = overlap_x > 0 and overlap_y > 0 if has_overlap: self._log(f" Regions overlap - definitely same bubble, merging", "info") return True # If gaps are tiny (< 10 pixels), merge regardless of other factors if horizontal_gap < 10 and vertical_gap < 10: self._log(f" Very small gaps ({horizontal_gap}, {vertical_gap}) - merging", "info") return True # BUBBLE BOUNDARY CHECK: Use spatial patterns to detect different bubbles # But be less aggressive if gaps are small # CRITICAL: Reduced threshold to allow bubble boundary detection for stacked bubbles if horizontal_gap < 12 and vertical_gap < 12: # Very close regions are almost certainly in the same bubble self._log(f" Regions very close, skipping bubble boundary check", "info") elif self._likely_different_bubbles(region1, region2): self._log(f" Regions likely in different speech bubbles", "info") return False # CHECK 1: For well-aligned text with small gaps, merge immediately # This catches multi-line text in the same bubble if is_horizontal_text and vertical_center_diff < avg_height * 0.4: # Horizontal text that's well-aligned vertically if horizontal_gap <= threshold and vertical_gap <= threshold * 0.5: self._log(f" Well-aligned horizontal text with acceptable gaps, merging", "info") return True if is_vertical_text and horizontal_center_diff < avg_width * 0.4: # Vertical text that's well-aligned horizontally if vertical_gap <= threshold and horizontal_gap <= threshold * 0.5: self._log(f" Well-aligned vertical text with acceptable gaps, merging", "info") return True # ADDITIONAL CHECK: Multi-line text in speech bubbles # Even if not perfectly aligned, check for typical multi-line patterns if horizontal_center_diff < avg_width * 0.5 and vertical_gap <= threshold: # Lines that are reasonably centered and within threshold should merge self._log(f" Multi-line text pattern detected, merging", "info") return True # CHECK 2: Check alignment quality # Poor alignment often indicates different bubbles if is_horizontal_text: # For horizontal text, check vertical alignment if vertical_center_diff > avg_height * 0.6: self._log(f" Poor vertical alignment for horizontal text", "info") return False elif is_vertical_text: # For vertical text, check horizontal alignment if horizontal_center_diff > avg_width * 0.6: self._log(f" Poor horizontal alignment for vertical text", "info") return False # CHECK 3: Font size check (but be reasonable) font_size1 = self._estimate_font_size_for_region(region1) font_size2 = self._estimate_font_size_for_region(region2) size_ratio = max(font_size1, font_size2) / max(min(font_size1, font_size2), 1) # Allow some variation for emphasis or stylistic choices if size_ratio > 2.0: self._log(f" Font sizes too different ({font_size1} vs {font_size2})", "info") return False # CHECK 4: Final sanity check on merged area merged_width = max(x1 + w1, x2 + w2) - min(x1, x2) merged_height = max(y1 + h1, y2 + h2) - min(y1, y2) merged_area = merged_width * merged_height combined_area = (w1 * h1) + (w2 * h2) # If merged area is way larger than combined areas, they're probably far apart if merged_area > combined_area * 2.5: self._log(f" Merged area indicates regions are too far apart", "info") return False # If we get here, apply standard threshold checks if horizontal_gap <= threshold and vertical_gap <= threshold: self._log(f" Standard threshold check passed, merging", "info") return True self._log(f" No merge conditions met", 
"info") return False def _merge_nearby_regions(self, regions: List[TextRegion], threshold: int = 50) -> List[TextRegion]: """Merge text regions that are likely part of the same speech bubble - with debug logging""" if len(regions) <= 1: return regions self._log(f"\n=== MERGE DEBUG: Starting merge analysis ===", "info") self._log(f" Total regions: {len(regions)}", "info") self._log(f" Threshold: {threshold}px", "info") # First, let's log what regions we have for i, region in enumerate(regions): x, y, w, h = region.bounding_box self._log(f" Region {i}: pos({x},{y}) size({w}x{h}) text='{region.text[:20]}...'", "info") # Sort regions by area (largest first) to handle contained regions properly sorted_indices = sorted(range(len(regions)), key=lambda i: regions[i].bounding_box[2] * regions[i].bounding_box[3], reverse=True) merged = [] used = set() # Process each region in order of size (largest first) for idx in sorted_indices: i = idx if i in used: continue region1 = regions[i] # Start with this region merged_text = region1.text merged_vertices = list(region1.vertices) if hasattr(region1, 'vertices') else [] regions_merged = [i] # Track which regions were merged self._log(f"\n Checking region {i} for merges:", "info") # Check against all other unused regions for j in range(len(regions)): if j == i or j in used: continue region2 = regions[j] self._log(f" Testing merge with region {j}:", "info") # Check if region2 is contained within region1 x1, y1, w1, h1 = region1.bounding_box x2, y2, w2, h2 = region2.bounding_box # Check if region2 is fully contained within region1 if (x2 >= x1 and y2 >= y1 and x2 + w2 <= x1 + w1 and y2 + h2 <= y1 + h1): self._log(f" βœ“ Region {j} is INSIDE region {i} - merging!", "success") merged_text += " " + region2.text if hasattr(region2, 'vertices'): merged_vertices.extend(region2.vertices) used.add(j) regions_merged.append(j) continue # Check if region1 is contained within region2 (shouldn't happen due to sorting, but be safe) if (x1 >= x2 and y1 >= y2 and x1 + w1 <= x2 + w2 and y1 + h1 <= y2 + h2): self._log(f" βœ“ Region {i} is INSIDE region {j} - merging!", "success") merged_text += " " + region2.text if hasattr(region2, 'vertices'): merged_vertices.extend(region2.vertices) used.add(j) regions_merged.append(j) # Update region1's bounding box to the larger region region1 = TextRegion( text=merged_text, vertices=merged_vertices, bounding_box=region2.bounding_box, confidence=region1.confidence, region_type='temp_merge' ) continue # FIX: Always check proximity against ORIGINAL regions, not the expanded one # This prevents cascade merging across bubble boundaries if self._regions_are_nearby(regions[i], region2, threshold): # Use regions[i] not region1 #self._log(f" βœ“ Regions are nearby", "info") # Then check if they should merge (also use original region) if self._regions_should_merge(regions[i], region2, threshold): # Use regions[i] not region1 #self._log(f" βœ“ Regions should merge!", "success") # Actually perform the merge merged_text += " " + region2.text if hasattr(region2, 'vertices'): merged_vertices.extend(region2.vertices) used.add(j) regions_merged.append(j) # DON'T update region1 for proximity checks - keep using original regions else: self._log(f" βœ— Regions should not merge", "warning") else: self._log(f" βœ— Regions not nearby", "warning") # Log if we merged multiple regions if len(regions_merged) > 1: self._log(f" βœ… MERGED regions {regions_merged} into one bubble", "success") else: self._log(f" ℹ️ Region {i} not merged with any other", "info") # 
Create final merged region with all the merged vertices if merged_vertices: xs = [v[0] for v in merged_vertices] ys = [v[1] for v in merged_vertices] else: # Fallback: calculate from all merged regions all_xs = [] all_ys = [] for idx in regions_merged: x, y, w, h = regions[idx].bounding_box all_xs.extend([x, x + w]) all_ys.extend([y, y + h]) xs = all_xs ys = all_ys min_x, max_x = min(xs), max(xs) min_y, max_y = min(ys), max(ys) merged_bbox = (min_x, min_y, max_x - min_x, max_y - min_y) merged_region = TextRegion( text=merged_text, vertices=merged_vertices, bounding_box=merged_bbox, confidence=regions[i].confidence, region_type='merged_text_block' if len(regions_merged) > 1 else regions[i].region_type ) # Copy over any additional attributes if hasattr(regions[i], 'translated_text'): merged_region.translated_text = regions[i].translated_text merged.append(merged_region) used.add(i) self._log(f"\n=== MERGE DEBUG: Complete ===", "info") self._log(f" Final region count: {len(merged)} (was {len(regions)})", "info") # Verify the merge worked if len(merged) == len(regions): self._log(f" ⚠️ WARNING: No regions were actually merged!", "warning") return merged def _regions_are_nearby(self, region1: TextRegion, region2: TextRegion, threshold: int = 50) -> bool: """Check if two regions are close enough to be in the same bubble - WITH DEBUG""" x1, y1, w1, h1 = region1.bounding_box x2, y2, w2, h2 = region2.bounding_box #self._log(f"\n === NEARBY CHECK DEBUG ===", "info") #self._log(f" Region 1: pos({x1},{y1}) size({w1}x{h1})", "info") #self._log(f" Region 2: pos({x2},{y2}) size({w2}x{h2})", "info") #self._log(f" Threshold: {threshold}", "info") # Calculate gaps between closest edges horizontal_gap = 0 if x1 + w1 < x2: # region1 is to the left horizontal_gap = x2 - (x1 + w1) elif x2 + w2 < x1: # region2 is to the left horizontal_gap = x1 - (x2 + w2) vertical_gap = 0 if y1 + h1 < y2: # region1 is above vertical_gap = y2 - (y1 + h1) elif y2 + h2 < y1: # region2 is above vertical_gap = y1 - (y2 + h2) #self._log(f" Horizontal gap: {horizontal_gap}", "info") #self._log(f" Vertical gap: {vertical_gap}", "info") # Detect if regions are likely vertical text based on aspect ratio aspect1 = w1 / max(h1, 1) aspect2 = w2 / max(h2, 1) # More permissive vertical text detection # Vertical text typically has aspect ratio < 1.0 (taller than wide) is_vertical_text = (aspect1 < 1.0 and aspect2 < 1.0) or (aspect1 < 0.5 or aspect2 < 0.5) # Also check if text is arranged vertically (one above the other with minimal horizontal offset) center_x1 = x1 + w1 / 2 center_x2 = x2 + w2 / 2 horizontal_center_diff = abs(center_x1 - center_x2) avg_width = (w1 + w2) / 2 # If regions are vertically stacked with aligned centers, treat as vertical text is_vertically_stacked = (horizontal_center_diff < avg_width * 1.5) and (vertical_gap >= 0) #self._log(f" Is vertical text: {is_vertical_text}", "info") #self._log(f" Is vertically stacked: {is_vertically_stacked}", "info") #self._log(f" Horizontal center diff: {horizontal_center_diff:.1f}", "info") # SIMPLE APPROACH: Just check if gaps are within threshold # Don't overthink it if horizontal_gap <= threshold and vertical_gap <= threshold: #self._log(f" βœ… NEARBY: Both gaps within threshold", "success") return True # SPECIAL CASE: Vertically stacked text with good alignment # This is specifically for multi-line text in bubbles if horizontal_center_diff < avg_width * 0.8 and vertical_gap <= threshold * 1.5: #self._log(f" βœ… NEARBY: Vertically aligned text in same bubble", "success") return True 
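        # Example (illustrative, threshold=50): horizontal_gap=20 with
        # vertical_gap=70 passes the mixed check below, since 20 <= 25 and 70 <= 75.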
# If one gap is small and the other is slightly over, still consider nearby if (horizontal_gap <= threshold * 0.5 and vertical_gap <= threshold * 1.5) or \ (vertical_gap <= threshold * 0.5 and horizontal_gap <= threshold * 1.5): #self._log(f" βœ… NEARBY: One small gap, other slightly over", "success") return True # Special case: Wide bubbles with text on sides # If regions are at nearly the same vertical position, they might be in a wide bubble if abs(y1 - y2) < 10: # Nearly same vertical position # Check if this could be a wide bubble spanning both regions if horizontal_gap <= threshold * 3: # Allow up to 3x threshold for wide bubbles #self._log(f" βœ… NEARBY: Same vertical level, possibly wide bubble", "success") return True #self._log(f" ❌ NOT NEARBY: Gaps exceed threshold", "warning") return False def _find_font(self) -> str: """Find a suitable font for text rendering""" font_candidates = [ "C:/Windows/Fonts/comicbd.ttf", # Comic Sans MS Bold as first choice "C:/Windows/Fonts/arial.ttf", "C:/Windows/Fonts/calibri.ttf", "C:/Windows/Fonts/tahoma.ttf", "/System/Library/Fonts/Helvetica.ttc", "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf", "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf" ] for font_path in font_candidates: if os.path.exists(font_path): return font_path return None # Will use default font def _get_singleton_bubble_detector(self): """Get or initialize the singleton bubble detector instance with load coordination.""" start_time = None with MangaTranslator._singleton_lock: if MangaTranslator._singleton_bubble_detector is not None: self._log("πŸ€– Using bubble detector (already loaded)", "info") MangaTranslator._singleton_refs += 1 return MangaTranslator._singleton_bubble_detector # If another thread is loading, wait for it if MangaTranslator._singleton_bd_loading: self._log("⏳ Waiting for bubble detector to finish loading (singleton)", "debug") evt = MangaTranslator._singleton_bd_event # Drop the lock while waiting pass else: # Mark as loading and proceed to load outside lock MangaTranslator._singleton_bd_loading = True MangaTranslator._singleton_bd_event.clear() start_time = time.time() # Release lock and perform heavy load pass # Outside the lock: perform load or wait if start_time is None: # We are a waiter try: MangaTranslator._singleton_bd_event.wait(timeout=300) except Exception: pass with MangaTranslator._singleton_lock: if MangaTranslator._singleton_bubble_detector is not None: MangaTranslator._singleton_refs += 1 return MangaTranslator._singleton_bubble_detector else: # We are the loader try: from bubble_detector import BubbleDetector bd = None # First, try to get a preloaded detector from the pool try: ocr_settings = self.main_gui.config.get('manga_settings', {}).get('ocr', {}) if hasattr(self, 'main_gui') else {} det_type = ocr_settings.get('detector_type', 'rtdetr_onnx') model_id = ocr_settings.get('rtdetr_model_url') or ocr_settings.get('bubble_model_path') or '' key = (det_type, model_id) self._log(f"[DEBUG] Looking for detector in pool with key: {key}", "debug") with MangaTranslator._detector_pool_lock: self._log(f"[DEBUG] Pool keys available: {list(MangaTranslator._detector_pool.keys())}", "debug") rec = MangaTranslator._detector_pool.get(key) if rec and isinstance(rec, dict): spares = rec.get('spares') or [] self._log(f"[DEBUG] Found pool record with {len(spares)} spares", "debug") # For singleton mode, we can use a pool instance without checking it out # since the singleton will keep it loaded permanently if spares: # Just use the first 
spare (don't pop or check out) # Singleton will keep it loaded, pool can still track it bd = spares[0] self._log(f"πŸ€– Using pool bubble detector for singleton (no check-out needed)", "info") else: self._log(f"[DEBUG] No pool record found for key: {key}", "debug") except Exception as e: self._log(f"Could not fetch preloaded detector: {e}", "debug") # If no preloaded detector, create a new one if bd is None: bd = BubbleDetector() self._log("πŸ€– Created new bubble detector instance", "info") # Optionally: defer model load until first actual call inside BD; keeping instance resident with MangaTranslator._singleton_lock: MangaTranslator._singleton_bubble_detector = bd MangaTranslator._singleton_refs += 1 MangaTranslator._singleton_bd_loading = False try: MangaTranslator._singleton_bd_event.set() except Exception: pass elapsed = time.time() - start_time self._log(f"πŸ€– Singleton bubble detector ready (took {elapsed:.2f}s)", "info") return bd except Exception as e: with MangaTranslator._singleton_lock: MangaTranslator._singleton_bd_loading = False try: MangaTranslator._singleton_bd_event.set() except Exception: pass self._log(f"Failed to create singleton bubble detector: {e}", "error") return None def _initialize_singleton_local_inpainter(self): """Initialize singleton local inpainter instance""" with MangaTranslator._singleton_lock: was_existing = MangaTranslator._singleton_local_inpainter is not None if MangaTranslator._singleton_local_inpainter is None: try: from local_inpainter import LocalInpainter local_method = self.manga_settings.get('inpainting', {}).get('local_method', 'anime') # LocalInpainter only accepts config_path, not method MangaTranslator._singleton_local_inpainter = LocalInpainter() # Now load the model with the specified method if local_method: # Try to load the model model_path = self.manga_settings.get('inpainting', {}).get('local_model_path') if not model_path: # Try to download if no path specified try: model_path = MangaTranslator._singleton_local_inpainter.download_jit_model(local_method) except Exception as e: self._log(f"⚠️ Failed to download model for {local_method}: {e}", "warning") if model_path and os.path.exists(model_path): success = MangaTranslator._singleton_local_inpainter.load_model_with_retry(local_method, model_path) if success: self._log(f"🎨 Created singleton local inpainter with {local_method} model", "info") else: self._log(f"⚠️ Failed to load {local_method} model", "warning") else: self._log(f"🎨 Created singleton local inpainter (no model loaded yet)", "info") else: self._log(f"🎨 Created singleton local inpainter (default)", "info") except Exception as e: self._log(f"Failed to create singleton local inpainter: {e}", "error") return # Use the singleton instance self.local_inpainter = MangaTranslator._singleton_local_inpainter self.inpainter = self.local_inpainter MangaTranslator._singleton_refs += 1 if was_existing: self._log("🎨 Using local inpainter (already loaded)", "info") def _get_thread_bubble_detector(self): """Get or initialize bubble detector (singleton or thread-local based on settings). Will consume a preloaded detector if available for current settings. 
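        Pool-key sketch (mirrors the lookup in the body; these are the config
        keys actually read below):

            key = (ocr_settings.get('detector_type', 'rtdetr_onnx'),
                   ocr_settings.get('rtdetr_model_url')
                   or ocr_settings.get('bubble_model_path') or '')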
""" if getattr(self, 'use_singleton_bubble_detector', False) or (hasattr(self, 'use_singleton_models') and self.use_singleton_models): # Use singleton instance (preferred) if self.bubble_detector is None: self.bubble_detector = self._get_singleton_bubble_detector() return self.bubble_detector else: # Use thread-local instance (original behavior for parallel processing) if not hasattr(self, '_thread_local') or getattr(self, '_thread_local', None) is None: self._thread_local = threading.local() if not hasattr(self._thread_local, 'bubble_detector') or self._thread_local.bubble_detector is None: from bubble_detector import BubbleDetector # Try to check out a preloaded spare for the current detector settings try: ocr_settings = self.main_gui.config.get('manga_settings', {}).get('ocr', {}) if hasattr(self, 'main_gui') else {} det_type = ocr_settings.get('detector_type', 'rtdetr_onnx') model_id = ocr_settings.get('rtdetr_model_url') or ocr_settings.get('bubble_model_path') or '' key = (det_type, model_id) with MangaTranslator._detector_pool_lock: rec = MangaTranslator._detector_pool.get(key) if rec and isinstance(rec, dict): spares = rec.get('spares') or [] # Initialize checked_out list if it doesn't exist if 'checked_out' not in rec: rec['checked_out'] = [] checked_out = rec['checked_out'] # Look for an available spare (not checked out) if spares: for spare in spares: if spare not in checked_out and spare: # Check out this spare instance checked_out.append(spare) self._thread_local.bubble_detector = spare # Store references for later return self._checked_out_bubble_detector = spare self._bubble_detector_pool_key = key self._log(f"πŸ€– Checked out bubble detector from pool ({len(checked_out)}/{len(spares)} in use)", "info") break except Exception: pass # If still not set, create a fresh detector and store it for future use if not hasattr(self._thread_local, 'bubble_detector') or self._thread_local.bubble_detector is None: self._thread_local.bubble_detector = BubbleDetector() self._log("πŸ€– Created thread-local bubble detector", "debug") # Store this new detector in the pool for future reuse try: with MangaTranslator._detector_pool_lock: if key not in MangaTranslator._detector_pool: MangaTranslator._detector_pool[key] = {'spares': [], 'checked_out': []} # Add this new detector to spares and immediately check it out rec = MangaTranslator._detector_pool[key] if 'spares' not in rec: rec['spares'] = [] if 'checked_out' not in rec: rec['checked_out'] = [] rec['spares'].append(self._thread_local.bubble_detector) rec['checked_out'].append(self._thread_local.bubble_detector) # Store references for later return self._checked_out_bubble_detector = self._thread_local.bubble_detector self._bubble_detector_pool_key = key except Exception: pass return self._thread_local.bubble_detector def _get_thread_local_inpainter(self, local_method: str, model_path: str): """Get or create a LocalInpainter (singleton or thread-local based on settings). Loads the requested model if needed. 
""" if hasattr(self, 'use_singleton_models') and self.use_singleton_models: # Use singleton instance if self.local_inpainter is None: self._initialize_singleton_local_inpainter() return self.local_inpainter # Use thread-local instance (original behavior for parallel processing) # Ensure thread-local storage exists and has a dict tl = getattr(self, '_thread_local', None) if tl is None: self._thread_local = threading.local() tl = self._thread_local if not hasattr(tl, 'local_inpainters') or getattr(tl, 'local_inpainters', None) is None: tl.local_inpainters = {} key = (local_method or 'anime', model_path or '') if key not in tl.local_inpainters or tl.local_inpainters[key] is None: # First, try to use a preloaded spare instance from the shared pool try: rec = MangaTranslator._inpaint_pool.get(key) if rec and isinstance(rec, dict): spares = rec.get('spares') or [] if spares: tl.local_inpainters[key] = spares.pop(0) self._log("🎨 Using preloaded local inpainting instance", "info") return tl.local_inpainters[key] # If there's a fully loaded shared instance but no spares, use it as a last resort if rec.get('loaded') and rec.get('inpainter') is not None: tl.local_inpainters[key] = rec.get('inpainter') self._log("🎨 Using shared preloaded inpainting instance", "info") return tl.local_inpainters[key] except Exception: pass # No preloaded instance available: create and load thread-local instance try: from local_inpainter import LocalInpainter # Use a per-thread config path to avoid concurrent JSON writes try: import tempfile thread_cfg = os.path.join(tempfile.gettempdir(), f"gl_inpainter_{threading.get_ident()}.json") except Exception: thread_cfg = "config_thread_local.json" inp = LocalInpainter(config_path=thread_cfg) # Apply tiling settings tiling_settings = self.manga_settings.get('tiling', {}) if hasattr(self, 'manga_settings') else {} inp.tiling_enabled = tiling_settings.get('enabled', False) inp.tile_size = tiling_settings.get('tile_size', 512) inp.tile_overlap = tiling_settings.get('tile_overlap', 64) # Ensure model is available resolved_model_path = model_path if not resolved_model_path or not os.path.exists(resolved_model_path): try: resolved_model_path = inp.download_jit_model(local_method) except Exception as e: self._log(f"⚠️ JIT model download failed for {local_method}: {e}", "warning") resolved_model_path = None # Load model for this thread's instance if resolved_model_path and os.path.exists(resolved_model_path): try: self._log(f"πŸ“₯ Loading {local_method} inpainting model (thread-local)", "info") inp.load_model_with_retry(local_method, resolved_model_path, force_reload=False) except Exception as e: self._log(f"⚠️ Thread-local inpainter load error: {e}", "warning") else: self._log("⚠️ No model path available for thread-local inpainter", "warning") # Re-check thread-local and publish ONLY if model loaded successfully tl2 = getattr(self, '_thread_local', None) if tl2 is None: self._thread_local = threading.local() tl2 = self._thread_local if not hasattr(tl2, 'local_inpainters') or getattr(tl2, 'local_inpainters', None) is None: tl2.local_inpainters = {} if getattr(inp, 'model_loaded', False): tl2.local_inpainters[key] = inp # Store this loaded instance info in the pool for future reuse try: with MangaTranslator._inpaint_pool_lock: if key not in MangaTranslator._inpaint_pool: MangaTranslator._inpaint_pool[key] = {'inpainter': None, 'loaded': False, 'event': threading.Event(), 'spares': []} # Mark that we have a loaded instance available MangaTranslator._inpaint_pool[key]['loaded'] = True 
MangaTranslator._inpaint_pool[key]['inpainter'] = inp # Store reference if MangaTranslator._inpaint_pool[key].get('event'): MangaTranslator._inpaint_pool[key]['event'].set() except Exception: pass else: # Ensure future calls will attempt a fresh init instead of using a half-initialized instance tl2.local_inpainters[key] = None except Exception as e: self._log(f"❌ Failed to create thread-local inpainter: {e}", "error") try: tl3 = getattr(self, '_thread_local', None) if tl3 is None: self._thread_local = threading.local() tl3 = self._thread_local if not hasattr(tl3, 'local_inpainters') or getattr(tl3, 'local_inpainters', None) is None: tl3.local_inpainters = {} tl3.local_inpainters[key] = None except Exception: pass return getattr(self._thread_local, 'local_inpainters', {}).get(key) def translate_regions(self, regions: List[TextRegion], image_path: str) -> List[TextRegion]: """Translate all text regions with API delay""" self._log(f"\nπŸ“ Translating {len(regions)} text regions...") # Check stop before even starting if self._check_stop(): self._log(f"\n⏹️ Translation stopped before processing any regions", "warning") return regions # Check if parallel processing OR batch translation is enabled parallel_enabled = self.manga_settings.get('advanced', {}).get('parallel_processing', False) batch_enabled = getattr(self, 'batch_mode', False) max_workers = self.manga_settings.get('advanced', {}).get('max_workers', 4) # Batch translation (parallel API calls) should work independently of parallel processing if batch_enabled: max_workers = getattr(self, 'batch_size', max_workers) self._log(f"πŸ“¦ Using BATCH TRANSLATION with {max_workers} concurrent API calls") return self._translate_regions_parallel(regions, image_path, max_workers) elif parallel_enabled and len(regions) > 1: self._log(f"πŸš€ Using PARALLEL processing with {max_workers} workers") return self._translate_regions_parallel(regions, image_path, max_workers) else: # SEQUENTIAL CODE for i, region in enumerate(regions): if self._check_stop(): self._log(f"\n⏹️ Translation stopped by user after {i}/{len(regions)} regions", "warning") break if region.text.strip(): self._log(f"\n[{i+1}/{len(regions)}] Original: {region.text}") # Get context for translation context = self.translation_context[-5:] if self.contextual_enabled else None # Translate with image context translated = self.translate_text( region.text, context, image_path=image_path, region=region ) region.translated_text = translated self._log(f"Translated: {translated}") # SAVE TO HISTORY HERE if self.history_manager and self.contextual_enabled and translated: try: self.history_manager.append_to_history( user_content=region.text, assistant_content=translated, hist_limit=self.translation_history_limit, reset_on_limit=not self.rolling_history_enabled, rolling_window=self.rolling_history_enabled ) self._log(f"πŸ“š Saved to history (exchange {i+1})") except Exception as e: self._log(f"⚠️ Failed to save history: {e}", "warning") # Apply API delay if i < len(regions) - 1: # Don't delay after last translation self._log(f"⏳ Waiting {self.api_delay}s before next translation...") # Check stop flag every 0.1 seconds during delay for _ in range(int(self.api_delay * 10)): if self._check_stop(): self._log(f"\n⏹️ Translation stopped during delay", "warning") return regions time.sleep(0.1) return regions # parallel processing: def _wait_for_api_slot(self, min_interval=None, jitter_max=0.25): """Global, thread-safe front-edge rate limiter for API calls. 
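    # Dispatch sketch for translate_regions (worked example, assuming
    # batch_mode=True with batch_size=6 and advanced.max_workers=4):
    #
    #     batch_enabled          -> max_workers = batch_size = 6
    #                               -> _translate_regions_parallel(regions, path, 6)
    #     parallel_processing and len(regions) > 1
    #                            -> _translate_regions_parallel(regions, path, 4)
    #     otherwise              -> sequential loop with api_delay between calls
    #
    # Only the sequential path feeds translation_context/history; the parallel
    # paths deliberately skip per-region context to avoid race conditions.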
    # parallel processing:
    def _wait_for_api_slot(self, min_interval=None, jitter_max=0.25):
        """Global, thread-safe front-edge rate limiter for API calls.

        Ensures parallel requests are spaced out before dispatch, avoiding tail latency.
        """
        import time
        import random
        import threading

        if min_interval is None:
            try:
                min_interval = float(getattr(self, "api_delay", 0.0))
            except Exception:
                min_interval = 0.0
        if min_interval < 0:
            min_interval = 0.0

        # Lazy init shared state
        if not hasattr(self, "_api_rl_lock"):
            self._api_rl_lock = threading.Lock()
            self._api_next_allowed = 0.0  # monotonic seconds

        while True:
            now = time.monotonic()
            with self._api_rl_lock:
                # If we're allowed now, book the next slot and proceed
                if now >= self._api_next_allowed:
                    jitter = random.uniform(0.0, max(jitter_max, 0.0)) if jitter_max else 0.0
                    self._api_next_allowed = now + min_interval + jitter
                    return
                # Otherwise compute the wait time (don't hold the lock while sleeping)
                wait = self._api_next_allowed - now
            # Sleep outside the lock in short increments so stop flags can be honored
            if wait > 0:
                try:
                    if self._check_stop():
                        return
                except Exception:
                    pass
                time.sleep(min(wait, 0.05))

    def _translate_regions_parallel(self, regions: List[TextRegion], image_path: str, max_workers: int = None) -> List[TextRegion]:
        """Translate regions using parallel processing"""
        # Get max_workers from settings if not provided
        if max_workers is None:
            max_workers = self.manga_settings.get('advanced', {}).get('max_workers', 4)

        # Override with the API batch size when batch mode is enabled β€” these are API calls.
        try:
            if getattr(self, 'batch_mode', False):
                bs = int(getattr(self, 'batch_size', 0) or int(os.getenv('BATCH_SIZE', '0')))
                if bs and bs > 0:
                    max_workers = bs
        except Exception:
            pass

        # Bound to the number of regions
        max_workers = max(1, min(max_workers, len(regions)))

        # Thread-safe storage for results
        results_lock = threading.Lock()
        translated_regions = {}
        failed_indices = []

        # Filter out empty regions
        valid_regions = [(i, region) for i, region in enumerate(regions) if region.text.strip()]
        if not valid_regions:
            return regions

        # Create a thread pool
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all translation tasks
            future_to_data = {}
            for i, region in valid_regions:
                # Check for a stop signal before submitting
                if self._check_stop():
                    self._log(f"\n⏹️ Translation stopped before submitting region {i+1}", "warning")
                    break
                # Submit translation task
                future = executor.submit(
                    self._translate_single_region_parallel,
                    region, i, len(valid_regions), image_path
                )
                future_to_data[future] = (i, region)

            # Process completed translations
            completed = 0
            for future in as_completed(future_to_data):
                i, region = future_to_data[future]

                # Check for a stop signal
                if self._check_stop():
                    self._log(f"\n⏹️ Translation stopped at {completed}/{len(valid_regions)} completed", "warning")
                    # Cancel remaining futures
                    for f in future_to_data:
                        f.cancel()
                    break

                try:
                    translated_text = future.result()
                    if translated_text:
                        with results_lock:
                            translated_regions[i] = translated_text
                        completed += 1
                        self._log(f"βœ… [{completed}/{len(valid_regions)}] Completed region {i+1}")
                    else:
                        with results_lock:
                            failed_indices.append(i)
                        self._log(f"❌ [{completed}/{len(valid_regions)}] Failed region {i+1}", "error")
                except Exception as e:
                    with results_lock:
                        failed_indices.append(i)
                    self._log(f"❌ Error in region {i+1}: {str(e)}", "error")

        # Apply translations back to regions
        for i, region in enumerate(regions):
            if i in translated_regions:
                region.translated_text = translated_regions[i]

        # Report summary
        success_count = len(translated_regions)
        fail_count = len(failed_indices)
        self._log(f"\nπŸ“Š Parallel translation complete: {success_count} succeeded, {fail_count} failed")

        return regions
""" try: bd = self._get_thread_bubble_detector() except Exception: return False, ocr_settings.get('detector_type', 'rtdetr_onnx') det = ocr_settings.get('detector_type', 'rtdetr_onnx') try: if det == 'rtdetr_onnx': return bool(getattr(bd, 'rtdetr_onnx_loaded', False)), det elif det == 'rtdetr': return bool(getattr(bd, 'rtdetr_loaded', False)), det elif det == 'yolo': return bool(getattr(bd, 'model_loaded', False)), det else: # Auto or unknown – consider any ready model as loaded ready = bool(getattr(bd, 'rtdetr_loaded', False) or getattr(bd, 'rtdetr_onnx_loaded', False) or getattr(bd, 'model_loaded', False)) return ready, det except Exception: return False, det def _is_local_inpainter_loaded(self) -> Tuple[bool, Optional[str]]: """Check if a local inpainter model is already loaded for current settings. Returns (loaded, local_method) or (False, None). This respects UI flags: skip_inpainting / use_cloud_inpainting. """ try: # If skipping or using cloud, this does not apply if getattr(self, 'skip_inpainting', False) or getattr(self, 'use_cloud_inpainting', False): return False, None except Exception: pass inpaint_cfg = self.manga_settings.get('inpainting', {}) if hasattr(self, 'manga_settings') else {} local_method = inpaint_cfg.get('local_method', 'anime') try: model_path = self.main_gui.config.get(f'manga_{local_method}_model_path', '') if hasattr(self, 'main_gui') else '' except Exception: model_path = '' # Singleton path if getattr(self, 'use_singleton_models', False): inp = getattr(MangaTranslator, '_singleton_local_inpainter', None) return (bool(getattr(inp, 'model_loaded', False)), local_method) # Thread-local/pooled path inp = getattr(self, 'local_inpainter', None) if inp is not None and getattr(inp, 'model_loaded', False): return True, local_method try: key = (local_method, model_path or '') rec = MangaTranslator._inpaint_pool.get(key) # Consider the shared 'inpainter' loaded or any spare that is model_loaded if rec: if rec.get('loaded') and rec.get('inpainter') is not None and getattr(rec['inpainter'], 'model_loaded', False): return True, local_method for spare in rec.get('spares') or []: if getattr(spare, 'model_loaded', False): return True, local_method except Exception: pass return False, local_method def _log_model_status(self): """Emit concise status lines for already-loaded heavy models to avoid confusing 'loading' logs.""" try: ocr_settings = self.manga_settings.get('ocr', {}) if hasattr(self, 'manga_settings') else {} if ocr_settings.get('bubble_detection_enabled', False): loaded, det = self._is_bubble_detector_loaded(ocr_settings) det_name = 'YOLO' if det == 'yolo' else ('RT-DETR' if det == 'rtdetr' else 'RTEDR_onnx') if loaded: self._log("πŸ€– Using bubble detector (already loaded)", "info") else: self._log("πŸ€– Bubble detector will load on first use", "debug") except Exception: pass try: loaded, local_method = self._is_local_inpainter_loaded() if local_method: label = local_method.upper() if loaded: self._log("🎨 Using local inpainter (already loaded)", "info") else: self._log("🎨 Local inpainter will load on first use", "debug") except Exception: pass def process_image(self, image_path: str, output_path: Optional[str] = None, batch_index: int = None, batch_total: int = None) -> Dict[str, Any]: """Process a single manga image through the full pipeline""" # Ensure local references exist for cleanup in finally image = None inpainted = None final_image = None mask = None mask_viz = None pil_image = None heatmap = None # Set batch tracking if provided if batch_index is not 
    def process_image(self, image_path: str, output_path: Optional[str] = None,
                      batch_index: int = None, batch_total: int = None) -> Dict[str, Any]:
        """Process a single manga image through the full pipeline"""
        # Ensure local references exist for cleanup in finally
        image = None
        inpainted = None
        final_image = None
        mask = None
        mask_viz = None
        pil_image = None
        heatmap = None

        # Set batch tracking if provided
        if batch_index is not None and batch_total is not None:
            self.batch_current = batch_index
            self.batch_size = batch_total
            self.batch_mode = True

        # Simplified header for batch mode
        if not self.batch_mode:
            self._log(f"\n{'='*60}")
            self._log(f"πŸ“· STARTING MANGA TRANSLATION PIPELINE")
            self._log(f"πŸ“ Input: {image_path}")
            self._log(f"πŸ“ Output: {output_path or 'Auto-generated'}")
            self._log(f"{'='*60}\n")
        else:
            self._log(f"\n[{batch_index}/{batch_total}] Processing: {os.path.basename(image_path)}")

        # Before heavy work, report model status to avoid confusing 'loading' logs later
        try:
            self._log_model_status()
        except Exception:
            pass

        result = {
            'success': False,
            'input_path': image_path,
            'output_path': output_path,
            'regions': [],
            'errors': [],
            'interrupted': False,
            'format_info': {}
        }

        try:
            # RAM cap gating before heavy processing
            try:
                self._block_if_over_cap("processing image")
            except Exception:
                pass

            # Determine the output directory from output_path
            if output_path:
                output_dir = os.path.dirname(output_path)
            else:
                # If no output path specified, use the default
                output_dir = os.path.join(os.path.dirname(image_path), "translated_images")

            # Ensure the output directory exists
            os.makedirs(output_dir, exist_ok=True)

            # Initialize HistoryManager with the output directory
            if self.contextual_enabled and not self.history_manager_initialized:
                # Only initialize if we're in a new output directory
                if output_dir != getattr(self, 'history_output_dir', None):
                    try:
                        self.history_manager = HistoryManager(output_dir)
                        self.history_manager_initialized = True
                        self.history_output_dir = output_dir
                        self._log(f"πŸ“š Initialized HistoryManager in output directory: {output_dir}")
                    except Exception as e:
                        self._log(f"⚠️ Failed to initialize history manager: {str(e)}", "warning")
                        self.history_manager = None

            # Check for a stop signal
            if self._check_stop():
                result['interrupted'] = True
                self._log("⏹️ Translation stopped before processing", "warning")
                return result

            # Format detection if enabled
            if self.manga_settings.get('advanced', {}).get('format_detection', False):
                self._log("πŸ” Analyzing image format...")
                img = Image.open(image_path)
                width, height = img.size
                aspect_ratio = height / width

                # Detect format type
                format_info = {
                    'width': width,
                    'height': height,
                    'aspect_ratio': aspect_ratio,
                    'is_webtoon': aspect_ratio > 3.0,
                    'is_spread': width > height * 1.3,
                    'format': 'unknown'
                }

                if format_info['is_webtoon']:
                    format_info['format'] = 'webtoon'
                    self._log("πŸ“± Detected WEBTOON format - vertical scroll manga")
                elif format_info['is_spread']:
                    format_info['format'] = 'spread'
                    self._log("πŸ“– Detected SPREAD format - two-page layout")
                else:
                    format_info['format'] = 'single_page'
                    self._log("πŸ“„ Detected SINGLE PAGE format")

                result['format_info'] = format_info

                # Handle webtoon mode if detected and enabled
                webtoon_mode = self.manga_settings.get('advanced', {}).get('webtoon_mode', 'auto')
                if format_info['is_webtoon'] and webtoon_mode != 'disabled':
                    if webtoon_mode == 'auto' or webtoon_mode == 'force':
                        self._log("πŸ”„ Webtoon mode active - will process in chunks for better OCR")
                        # Process the webtoon in chunks
                        return self._process_webtoon_chunks(image_path, output_path, result)

            # Step 1: Detect text regions using Google Cloud Vision
            self._log(f"πŸ“ [STEP 1] Text Detection Phase")
            regions = self.detect_text_regions(image_path)

            if not regions:
                error_msg = "No text regions detected by Cloud Vision"
                self._log(f"⚠️ {error_msg}", "warning")
                result['errors'].append(error_msg)
                # Still save the original image as "translated" if no text was found
                if output_path:
                    import shutil
                    shutil.copy2(image_path, output_path)
                    result['output_path'] = output_path
                    result['success'] = True
                return result

            self._log(f"\nβœ… Detection complete: {len(regions)} regions found")

            # Save debug outputs only if 'Save intermediate images' is enabled
            if self.manga_settings.get('advanced', {}).get('save_intermediate', False):
                self._save_debug_image(image_path, regions, debug_base_dir=output_dir)

            # Step 2: Translation & Inpainting (concurrent)
            self._log(f"\nπŸ“ [STEP 2] Translation & Inpainting Phase (concurrent)")

            # Load the image once (used by the inpainting task); keep a PIL fallback for Unicode paths
            import cv2
            self._log(f"πŸ–ΌοΈ Loading image with OpenCV...")
            try:
                image = cv2.imread(image_path)
                if image is None:
                    self._log(f"   Using PIL to handle Unicode path...", "info")
                    from PIL import Image as PILImage
                    import numpy as np
                    pil_image = PILImage.open(image_path)
                    image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
                    self._log(f"   βœ… Successfully loaded with PIL", "info")
            except Exception as e:
                error_msg = f"Failed to load image: {image_path} - {str(e)}"
                self._log(f"❌ {error_msg}", "error")
                result['errors'].append(error_msg)
                return result

            self._log(f"   Image dimensions: {image.shape[1]}x{image.shape[0]}")

            # Save intermediate original image if enabled
            if self.manga_settings.get('advanced', {}).get('save_intermediate', False):
                self._save_intermediate_image(image_path, image, "original", debug_base_dir=output_dir)

            # Check if we should continue before kicking off tasks
            if self._check_stop():
                result['interrupted'] = True
                self._log("⏹️ Translation stopped before concurrent phase", "warning")
                return result

            # Helper tasks
            def _task_translate():
                try:
                    if self.full_page_context_enabled:
                        # Full page context translation mode
                        self._log(f"\nπŸ“„ Using FULL PAGE CONTEXT mode")
                        self._log("   This mode sends all text together for more consistent translations", "info")
                        if self._check_stop():
                            return False
                        translations = self.translate_full_page_context(regions, image_path)
                        if translations:
                            translated_count = sum(
                                1 for r in regions
                                if getattr(r, 'translated_text', None) and r.translated_text and r.translated_text != r.text
                            )
                            self._log(f"\nπŸ“Š Full page context translation complete: {translated_count}/{len(regions)} regions translated")
                            return True
                        else:
                            self._log("❌ Full page context translation failed", "error")
                            result['errors'].append("Full page context translation failed")
                            return False
                    else:
                        # Individual translation mode with parallel processing support
                        self._log(f"\nπŸ“ Using INDIVIDUAL translation mode")
                        if self.manga_settings.get('advanced', {}).get('parallel_processing', False):
                            self._log("⚑ Parallel processing ENABLED")
                            _ = self._translate_regions_parallel(regions, image_path)
                        else:
                            _ = self.translate_regions(regions, image_path)
                        return True
                except Exception as te:
                    self._log(f"❌ Translation task error: {te}", "error")
                    return False

            def _task_inpaint():
                try:
                    if getattr(self, 'skip_inpainting', False):
                        self._log(f"🎨 Skipping inpainting (preserving original art)", "info")
                        return image.copy()
                    self._log(f"🎭 Creating text mask...")
                    try:
                        self._block_if_over_cap("mask creation")
                    except Exception:
                        pass
                    mask_local = self.create_text_mask(image, regions)
                    # Save mask and overlay only if 'Save intermediate images' is enabled
                    if self.manga_settings.get('advanced', {}).get('save_intermediate', False):
                        try:
                            debug_dir = os.path.join(output_dir, 'debug')
                            os.makedirs(debug_dir, exist_ok=True)
                            base_name = os.path.splitext(os.path.basename(image_path))[0]
                            mask_path = os.path.join(debug_dir, f"{base_name}_mask.png")
                            cv2.imwrite(mask_path, mask_local)
                            mask_percentage = ((mask_local > 0).sum() / mask_local.size) * 100
                            self._log(f"   🎭 DEBUG: Saved mask to {mask_path}", "info")
                            self._log(f"   πŸ“Š Mask coverage: {mask_percentage:.1f}% of image", "info")
                            # Save mask overlay visualization
                            mask_viz_local = image.copy()
                            mask_viz_local[mask_local > 0] = [0, 0, 255]
                            viz_path = os.path.join(debug_dir, f"{base_name}_mask_overlay.png")
                            cv2.imwrite(viz_path, mask_viz_local)
                            self._log(f"   🎭 DEBUG: Saved mask overlay to {viz_path}", "info")
                        except Exception as e:
                            self._log(f"   ❌ Failed to save mask debug: {str(e)}", "error")
                        # Also save intermediate copies
                        try:
                            self._save_intermediate_image(image_path, mask_local, "mask", debug_base_dir=output_dir)
                        except Exception:
                            pass
                    self._log(f"🎨 Inpainting to remove original text")
                    try:
                        self._block_if_over_cap("inpainting")
                    except Exception:
                        pass
                    inpainted_local = self.inpaint_regions(image, mask_local)
                    if self.manga_settings.get('advanced', {}).get('save_intermediate', False):
                        try:
                            self._save_intermediate_image(image_path, inpainted_local, "inpainted", debug_base_dir=output_dir)
                        except Exception:
                            pass
                    return inpainted_local
                except Exception as ie:
                    self._log(f"❌ Inpainting task error: {ie}", "error")
                    return image.copy()

            # Gate on the advanced setting (enabled by default)
            adv = self.manga_settings.get('advanced', {})
            run_concurrent = adv.get('concurrent_inpaint_translate', True)
            if run_concurrent:
                self._log("πŸ”€ Running translation and inpainting concurrently", "info")
                with ThreadPoolExecutor(max_workers=2) as _executor:
                    fut_translate = _executor.submit(_task_translate)
                    fut_inpaint = _executor.submit(_task_inpaint)
                    # Wait for completion
                    try:
                        translate_ok = fut_translate.result()
                    except Exception:
                        translate_ok = False
                    try:
                        inpainted = fut_inpaint.result()
                    except Exception:
                        inpainted = image.copy()
            else:
                self._log("β†ͺ️ Concurrent mode disabled β€” running sequentially", "info")
                translate_ok = _task_translate()
                inpainted = _task_inpaint()

            # After the concurrent phase, validate the translation
            if self._check_stop():
                result['interrupted'] = True
                self._log("⏹️ Translation cancelled before rendering", "warning")
                result['regions'] = [r.to_dict() for r in regions]
                return result

            if not any(getattr(region, 'translated_text', None) for region in regions):
                result['interrupted'] = True
                self._log("⏹️ No regions were translated - translation was interrupted", "warning")
                result['regions'] = [r.to_dict() for r in regions]
                return result

            # Render translated text
            self._log(f"✍️ Rendering translated text...")
            self._log(f"   Using enhanced renderer with custom settings", "info")
            final_image = self.render_translated_text(inpainted, regions)

            # Save output
            try:
                if not output_path:
                    base, ext = os.path.splitext(image_path)
                    output_path = f"{base}_translated{ext}"
                success = cv2.imwrite(output_path, final_image)
                if not success:
                    self._log(f"   Using PIL to save with Unicode path...", "info")
                    from PIL import Image as PILImage
                    rgb_image = cv2.cvtColor(final_image, cv2.COLOR_BGR2RGB)
                    pil_image = PILImage.fromarray(rgb_image)
                    pil_image.save(output_path)
                    self._log(f"   βœ… Successfully saved with PIL", "info")
                result['output_path'] = output_path
                self._log(f"\nπŸ’Ύ Saved output to: {output_path}")
            except Exception as e:
                error_msg = f"Failed to save output image: {str(e)}"
                self._log(f"❌ {error_msg}", "error")
                result['errors'].append(error_msg)
                result['success'] = False
                return result

            # Update result
            result['regions'] = [r.to_dict() for r in regions]
            if not result.get('interrupted', False):
                result['success'] = True
                self._log(f"\nβœ… TRANSLATION PIPELINE COMPLETE", "success")
            else:
                self._log(f"\n⚠️ TRANSLATION INTERRUPTED - Partial output saved", "warning")
            self._log(f"{'='*60}\n")

        except Exception as e:
            error_msg = f"Error processing image: {str(e)}\n{traceback.format_exc()}"
            self._log(f"\n❌ PIPELINE ERROR:", "error")
            self._log(f"   {str(e)}", "error")
            self._log(f"   Type: {type(e).__name__}", "error")
            self._log(traceback.format_exc(), "error")
            result['errors'].append(error_msg)
        finally:
            # Per-image memory cleanup to reduce RAM growth across pages
            try:
                # Clear self-held large attributes
                try:
                    self.current_image = None
                    self.current_mask = None
                    self.final_image = None
                    self.text_regions = []
                    self.translated_regions = []
                except Exception:
                    pass

                # Drop references to large local objects.
                # NOTE: deleting entries from locals() is a no-op in CPython,
                # so rebind the names to None instead.
                image = None
                inpainted = None
                final_image = None
                mask = None
                mask_viz = None
                pil_image = None
                heatmap = None

                # Reset caches for the next image (non-destructive to loaded models)
                try:
                    self.reset_for_new_image()
                except Exception:
                    pass

                # Encourage release of native resources
                try:
                    import cv2 as _cv2
                    try:
                        _cv2.destroyAllWindows()
                    except Exception:
                        pass
                except Exception:
                    pass

                # Free CUDA memory if torch is available
                try:
                    import torch
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
                except Exception:
                    pass

                # Release thread-local heavy objects to curb RAM growth across runs
                try:
                    self._cleanup_thread_locals()
                except Exception:
                    pass

                # Deep cleanup control - respects user settings and parallel processing
                try:
                    # Check if auto cleanup is enabled in settings (disabled by default)
                    auto_cleanup_enabled = False
                    try:
                        if hasattr(self, 'manga_settings'):
                            auto_cleanup_enabled = self.manga_settings.get('advanced', {}).get('auto_cleanup_models', False)
                    except Exception:
                        pass

                    if not auto_cleanup_enabled:
                        # User has disabled automatic cleanup
                        self._log("πŸ”‘ Auto cleanup disabled - models will remain in RAM", "debug")
                    else:
                        # Determine whether we should clean up now
                        should_cleanup_now = True

                        # Check if we're in batch mode
                        is_last_in_batch = False
                        try:
                            if getattr(self, 'batch_mode', False):
                                bc = getattr(self, 'batch_current', None)
                                bt = getattr(self, 'batch_size', None)
                                if bc is not None and bt is not None:
                                    is_last_in_batch = (bc >= bt)
                                # In batch mode, only clean up at the end
                                should_cleanup_now = is_last_in_batch
                        except Exception:
                            pass

                        # For parallel panel translation, cleanup is handled differently
                        # (it's handled in manga_integration.py after all panels complete)
                        is_parallel_panel = False
                        try:
                            if hasattr(self, 'manga_settings'):
                                is_parallel_panel = self.manga_settings.get('advanced', {}).get('parallel_panel_translation', False)
                        except Exception:
                            pass
                        if is_parallel_panel:
                            # Don't clean up here - let manga_integration handle it after all panels
                            self._log("🎯 Deferring cleanup until all parallel panels complete", "debug")
                            should_cleanup_now = False

                        if should_cleanup_now:
                            # Perform the cleanup
                            self._deep_cleanup_models()
                            # Also clear the HF cache for RT-DETR (best-effort)
                            if is_last_in_batch or not getattr(self, 'batch_mode', False):
                                try:
                                    self._clear_hf_cache()
                                except Exception:
                                    pass
                except Exception:
                    pass

                # Force a garbage collection cycle
                try:
                    import gc
                    gc.collect()
                except Exception:
                    pass

                # Aggressively trim the process working set (Windows) or libc heap (Linux)
                try:
                    self._trim_working_set()
                except Exception:
                    pass
            except Exception:
                # Never let cleanup fail the pipeline
                pass

        return result
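    # Usage sketch for process_image (paths shown are hypothetical):
    #
    #     result = translator.process_image("page_001.png", "out/page_001.png",
    #                                       batch_index=1, batch_total=10)
    #     if result['success']:
    #         print(result['output_path'], len(result['regions']))
    #     elif result['interrupted']:
    #         ...  # a partial output may still have been saved
    #     else:
    #         print(result['errors'])
    #
    # The result dict always carries input_path/output_path/regions/errors/
    # interrupted/format_info, so callers can branch without try/except.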
batch""" self.history_manager = None self.history_manager_initialized = False self.history_output_dir = None self.translation_context = [] self._log("πŸ“š Reset history manager for new batch", "debug") def cleanup_all_models(self): """Public method to force cleanup of all models - call this after translation! This ensures all models (YOLO, RT-DETR, inpainters, OCR) are unloaded from RAM. """ self._log("🧹 Forcing cleanup of all models to free RAM...", "info") # Call the comprehensive cleanup self._deep_cleanup_models() # Also cleanup thread locals try: self._cleanup_thread_locals() except Exception: pass # Clear HF cache try: self._clear_hf_cache() except Exception: pass # Trim working set try: self._trim_working_set() except Exception: pass self._log("βœ… All models cleaned up - RAM freed!", "info") def clear_internal_state(self): """Clear all internal state and cached data to free memory. This is called when the translator instance is being reset. Ensures OCR manager, inpainters, and bubble detector are also cleaned. """ try: # Clear image data self.current_image = None self.current_mask = None self.final_image = None # Clear text regions if hasattr(self, 'text_regions'): self.text_regions = [] if hasattr(self, 'translated_regions'): self.translated_regions = [] # Clear ALL caches (including text caches) # THREAD-SAFE: Use lock for parallel panel translation if hasattr(self, 'cache'): self.cache.clear() if hasattr(self, 'ocr_roi_cache'): with self._cache_lock: self.ocr_roi_cache.clear() self._current_image_hash = None # Clear history and context if hasattr(self, 'translation_context'): self.translation_context = [] if hasattr(self, 'history_manager'): self.history_manager = None self.history_manager_initialized = False self.history_output_dir = None # IMPORTANT: Properly unload OCR manager if hasattr(self, 'ocr_manager') and self.ocr_manager: try: ocr = self.ocr_manager if hasattr(ocr, 'providers'): for provider_name, provider in ocr.providers.items(): # Clear all model references if hasattr(provider, 'model'): provider.model = None if hasattr(provider, 'processor'): provider.processor = None if hasattr(provider, 'tokenizer'): provider.tokenizer = None if hasattr(provider, 'reader'): provider.reader = None if hasattr(provider, 'client'): provider.client = None if hasattr(provider, 'is_loaded'): provider.is_loaded = False ocr.providers.clear() self.ocr_manager = None self._log(" βœ“ OCR manager cleared", "debug") except Exception as e: self._log(f" Warning: OCR cleanup failed: {e}", "debug") # IMPORTANT: Handle local inpainter cleanup carefully # DO NOT unload if it's a shared/checked-out instance from the pool if hasattr(self, 'local_inpainter') and self.local_inpainter: try: # Only unload if this is NOT a checked-out or shared instance is_from_pool = hasattr(self, '_checked_out_inpainter') or hasattr(self, '_inpainter_pool_key') if not is_from_pool and hasattr(self.local_inpainter, 'unload'): self.local_inpainter.unload() self._log(" βœ“ Local inpainter unloaded", "debug") else: self._log(" βœ“ Local inpainter reference cleared (pool instance preserved)", "debug") self.local_inpainter = None except Exception as e: self._log(f" Warning: Inpainter cleanup failed: {e}", "debug") # Also clear hybrid and generic inpainter references if hasattr(self, 'hybrid_inpainter'): if self.hybrid_inpainter and hasattr(self.hybrid_inpainter, 'unload'): try: self.hybrid_inpainter.unload() except Exception: pass self.hybrid_inpainter = None if hasattr(self, 'inpainter'): if self.inpainter and 
hasattr(self.inpainter, 'unload'): try: self.inpainter.unload() except Exception: pass self.inpainter = None # IMPORTANT: Handle bubble detector cleanup carefully # DO NOT unload if it's a singleton or from a preloaded pool if hasattr(self, 'bubble_detector') and self.bubble_detector: try: is_singleton = getattr(self, 'use_singleton_bubble_detector', False) # Check if it's from thread-local which might have gotten it from the pool is_from_pool = hasattr(self, '_thread_local') and hasattr(self._thread_local, 'bubble_detector') if not is_singleton and not is_from_pool: if hasattr(self.bubble_detector, 'unload'): self.bubble_detector.unload(release_shared=True) self._log(" βœ“ Bubble detector unloaded", "debug") else: self._log(" βœ“ Bubble detector reference cleared (pool/singleton instance preserved)", "debug") # In all cases, clear our instance reference self.bubble_detector = None except Exception as e: self._log(f" Warning: Bubble detector cleanup failed: {e}", "debug") # Clear any file handles or temp data if hasattr(self, '_thread_local'): try: self._cleanup_thread_locals() except Exception: pass # Clear processing flags self.is_processing = False self.cancel_requested = False self._log("🧹 Internal state and all components cleared", "debug") except Exception as e: self._log(f"⚠️ Warning: Failed to clear internal state: {e}", "warning") def _process_webtoon_chunks(self, image_path: str, output_path: str, result: Dict) -> Dict: """Process webtoon in chunks for better OCR""" import cv2 import numpy as np from PIL import Image as PILImage try: self._log("πŸ“± Processing webtoon in chunks for better OCR", "info") # Load the image image = cv2.imread(image_path) if image is None: pil_image = PILImage.open(image_path) image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR) height, width = image.shape[:2] # Get chunk settings from config chunk_height = self.manga_settings.get('preprocessing', {}).get('chunk_height', 1000) chunk_overlap = self.manga_settings.get('preprocessing', {}).get('chunk_overlap', 100) self._log(f" Image dimensions: {width}x{height}", "info") self._log(f" Chunk height: {chunk_height}px, Overlap: {chunk_overlap}px", "info") # Calculate number of chunks needed effective_chunk_height = chunk_height - chunk_overlap num_chunks = max(1, (height - chunk_overlap) // effective_chunk_height + 1) self._log(f" Will process in {num_chunks} chunks", "info") # Process each chunk all_regions = [] chunk_offsets = [] for i in range(num_chunks): # Calculate chunk boundaries start_y = i * effective_chunk_height end_y = min(start_y + chunk_height, height) # Make sure we don't miss the bottom part if i == num_chunks - 1: end_y = height self._log(f"\n πŸ“„ Processing chunk {i+1}/{num_chunks} (y: {start_y}-{end_y})", "info") # Extract chunk chunk = image[start_y:end_y, 0:width] # Save chunk temporarily for OCR import tempfile with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp: chunk_path = tmp.name cv2.imwrite(chunk_path, chunk) try: # Detect text in this chunk chunk_regions = self.detect_text_regions(chunk_path) # Adjust region coordinates to full image space for region in chunk_regions: # Adjust bounding box x, y, w, h = region.bounding_box region.bounding_box = (x, y + start_y, w, h) # Adjust vertices if present if hasattr(region, 'vertices') and region.vertices: adjusted_vertices = [] for vx, vy in region.vertices: adjusted_vertices.append((vx, vy + start_y)) region.vertices = adjusted_vertices # Mark which chunk this came from (for deduplication) region.chunk_index = i 
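    # Cleanup-order sketch (illustrative): clear_internal_state() drops this
    # instance's references but preserves pooled/singleton models, while
    # cleanup_all_models() force-unloads everything. A plausible teardown:
    #
    #     translator.clear_internal_state()        # per-instance reset, pool intact
    #     # ... only after ALL panels/batches are done:
    #     translator.cleanup_all_models()          # deep unload, HF cache clear, trim
    #     MangaTranslator.cleanup_singletons(force=True)  # app shutdown only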
    def _process_webtoon_chunks(self, image_path: str, output_path: str, result: Dict) -> Dict:
        """Process a webtoon in chunks for better OCR"""
        import cv2
        import numpy as np
        from PIL import Image as PILImage

        try:
            self._log("πŸ“± Processing webtoon in chunks for better OCR", "info")

            # Load the image
            image = cv2.imread(image_path)
            if image is None:
                pil_image = PILImage.open(image_path)
                image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)

            height, width = image.shape[:2]

            # Get chunk settings from config
            chunk_height = self.manga_settings.get('preprocessing', {}).get('chunk_height', 1000)
            chunk_overlap = self.manga_settings.get('preprocessing', {}).get('chunk_overlap', 100)

            self._log(f"   Image dimensions: {width}x{height}", "info")
            self._log(f"   Chunk height: {chunk_height}px, Overlap: {chunk_overlap}px", "info")

            # Calculate the number of chunks needed
            effective_chunk_height = chunk_height - chunk_overlap
            num_chunks = max(1, (height - chunk_overlap) // effective_chunk_height + 1)

            self._log(f"   Will process in {num_chunks} chunks", "info")

            # Process each chunk
            all_regions = []
            chunk_offsets = []

            for i in range(num_chunks):
                # Calculate chunk boundaries
                start_y = i * effective_chunk_height
                end_y = min(start_y + chunk_height, height)

                # Make sure we don't miss the bottom part
                if i == num_chunks - 1:
                    end_y = height

                self._log(f"\n   πŸ“„ Processing chunk {i+1}/{num_chunks} (y: {start_y}-{end_y})", "info")

                # Extract the chunk
                chunk = image[start_y:end_y, 0:width]

                # Save the chunk temporarily for OCR
                import tempfile
                with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
                    chunk_path = tmp.name
                    cv2.imwrite(chunk_path, chunk)

                try:
                    # Detect text in this chunk
                    chunk_regions = self.detect_text_regions(chunk_path)

                    # Adjust region coordinates to full image space
                    for region in chunk_regions:
                        # Adjust the bounding box
                        x, y, w, h = region.bounding_box
                        region.bounding_box = (x, y + start_y, w, h)

                        # Adjust vertices if present
                        if hasattr(region, 'vertices') and region.vertices:
                            adjusted_vertices = []
                            for vx, vy in region.vertices:
                                adjusted_vertices.append((vx, vy + start_y))
                            region.vertices = adjusted_vertices

                        # Mark which chunk this came from (for deduplication)
                        region.chunk_index = i
                        region.chunk_y_range = (start_y, end_y)

                    all_regions.extend(chunk_regions)
                    chunk_offsets.append(start_y)

                    self._log(f"   Found {len(chunk_regions)} text regions in chunk {i+1}", "info")
                finally:
                    # Clean up the temp file (os is imported at module level)
                    if os.path.exists(chunk_path):
                        os.remove(chunk_path)

            # Remove duplicate regions from overlapping areas
            self._log(f"\n   πŸ” Deduplicating regions from overlaps...", "info")
            unique_regions = self._deduplicate_chunk_regions(all_regions, chunk_overlap)
            self._log(f"   Total regions: {len(all_regions)} β†’ {len(unique_regions)} after deduplication", "info")

            if not unique_regions:
                self._log("⚠️ No text regions detected in webtoon", "warning")
                result['errors'].append("No text regions detected")
                return result

            # Now process the regions as normal
            self._log(f"\nπŸ“ Translating {len(unique_regions)} unique regions", "info")

            # Translate regions
            if self.full_page_context_enabled:
                translations = self.translate_full_page_context(unique_regions, image_path)
                for region in unique_regions:
                    if region.text in translations:
                        region.translated_text = translations[region.text]
            else:
                unique_regions = self.translate_regions(unique_regions, image_path)

            # Create the mask and inpaint
            self._log(f"\n🎨 Creating mask and inpainting...", "info")
            mask = self.create_text_mask(image, unique_regions)
            if self.skip_inpainting:
                inpainted = image.copy()
            else:
                inpainted = self.inpaint_regions(image, mask)

            # Render translated text
            self._log(f"✍️ Rendering translated text...", "info")
            final_image = self.render_translated_text(inpainted, unique_regions)

            # Save output
            if not output_path:
                base, ext = os.path.splitext(image_path)
                output_path = f"{base}_translated{ext}"
            cv2.imwrite(output_path, final_image)

            result['output_path'] = output_path
            result['regions'] = [r.to_dict() for r in unique_regions]
            result['success'] = True
            result['format_info']['chunks_processed'] = num_chunks

            self._log(f"\nβœ… Webtoon processing complete: {output_path}", "success")
            return result

        except Exception as e:
            error_msg = f"Error processing webtoon chunks: {str(e)}"
            self._log(f"❌ {error_msg}", "error")
            result['errors'].append(error_msg)
            return result
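    # Chunking math sketch for a hypothetical 800x12000 webtoon with the
    # default chunk_height=1000 and chunk_overlap=100:
    #
    #     effective_chunk_height = 1000 - 100 = 900
    #     num_chunks = max(1, (12000 - 100) // 900 + 1) = 14
    #     chunk i spans y = i*900 .. min(i*900 + 1000, 12000)
    #     -> 0..1000, 900..1900, ..., last chunk clamped to end at 12000
    #
    # Each consecutive pair shares a 100 px band, which is why the
    # deduplication pass below is needed.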
    def _deduplicate_chunk_regions(self, regions: List, overlap_height: int) -> List:
        """Remove duplicate regions from overlapping chunk areas"""
        if not regions:
            return regions

        # Sort regions by y position
        regions.sort(key=lambda r: r.bounding_box[1])

        unique_regions = []
        used_indices = set()

        for i, region1 in enumerate(regions):
            if i in used_indices:
                continue

            # Check if this region is in an overlap zone
            x1, y1, w1, h1 = region1.bounding_box
            chunk_idx = region1.chunk_index if hasattr(region1, 'chunk_index') else 0
            chunk_y_start, chunk_y_end = region1.chunk_y_range if hasattr(region1, 'chunk_y_range') else (0, float('inf'))

            # Check if the region is near a chunk boundary (in the overlap zone)
            in_overlap_zone = (y1 < chunk_y_start + overlap_height) and chunk_idx > 0

            if in_overlap_zone:
                # Look for a duplicate among the previous chunk's regions
                found_duplicate = False
                for j, region2 in enumerate(regions):
                    if j >= i or j in used_indices:
                        continue
                    if hasattr(region2, 'chunk_index') and region2.chunk_index == chunk_idx - 1:
                        x2, y2, w2, h2 = region2.bounding_box
                        # Check if the regions are the same (similar position and size)
                        if (abs(x1 - x2) < 20 and abs(y1 - y2) < 20 and
                                abs(w1 - w2) < 20 and abs(h1 - h2) < 20):
                            # Check text similarity
                            if region1.text == region2.text:
                                # This is a duplicate
                                found_duplicate = True
                                used_indices.add(i)
                                self._log(f"   Removed duplicate: '{region1.text[:30]}...'", "debug")
                                break
                if not found_duplicate:
                    unique_regions.append(region1)
                    used_indices.add(i)
            else:
                # Not in an overlap zone, keep it
                unique_regions.append(region1)
                used_indices.add(i)

        return unique_regions

    def _save_intermediate_image(self, original_path: str, image, stage: str, debug_base_dir: str = None):
        """Save intermediate processing stages under translated_images/debug or the provided base dir"""
        if debug_base_dir is None:
            translated_dir = os.path.join(os.path.dirname(original_path), 'translated_images')
            debug_dir = os.path.join(translated_dir, 'debug')
        else:
            debug_dir = os.path.join(debug_base_dir, 'debug')
        os.makedirs(debug_dir, exist_ok=True)
        base_name = os.path.splitext(os.path.basename(original_path))[0]
        output_path = os.path.join(debug_dir, f"{base_name}_{stage}.png")
        cv2.imwrite(output_path, image)
        self._log(f"   πŸ’Ύ Saved {stage} image: {output_path}")
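    # Path sketch for _save_intermediate_image (file names are hypothetical):
    #
    #     _save_intermediate_image("/pages/ch01/page_001.png", img, "mask")
    #     # -> /pages/ch01/translated_images/debug/page_001_mask.png
    #
    #     _save_intermediate_image("/pages/ch01/page_001.png", img, "mask",
    #                              debug_base_dir="/out")
    #     # -> /out/debug/page_001_mask.png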