Spaces:

Shirochi
/

Glossarion

Running

App Files Files Community

Shirochi committed on Oct 5

Commit

b5b56ea

verified ·

1 Parent(s): e54ad44

Upload 2 files

Browse files

Files changed (2) hide show

chapter_extraction_manager.py +406 -0
chapter_extraction_worker.py +158 -0

chapter_extraction_manager.py ADDED Viewed

	@@ -0,0 +1,406 @@

+#!/usr/bin/env python3
+"""
+Chapter Extraction Manager - Manages chapter extraction in subprocess to prevent GUI freezing
+"""
+import subprocess
+import sys
+import os
+import json
+import threading
+import queue
+import time
+from pathlib import Path
+class ChapterExtractionManager:
+    """
+    Manages chapter extraction in a separate process to prevent GUI freezing
+    Similar to GlossaryManager but for chapter extraction
+    """
+    def __init__(self, log_callback=None):
+        """
+        Initialize the extraction manager
+        Args:
+            log_callback: Function to call with log messages (for GUI integration)
+        """
+        self.log_callback = log_callback
+        self.process = None
+        self.output_queue = queue.Queue()
+        self.error_queue = queue.Queue()
+        self.result = None
+        self.is_running = False
+        self.stop_requested = False
+    def extract_chapters_async(self, epub_path, output_dir, extraction_mode="smart",
+                              progress_callback=None, completion_callback=None):
+        """
+        Start chapter extraction in a subprocess
+        Args:
+            epub_path: Path to EPUB file
+            output_dir: Output directory for extracted content
+            extraction_mode: Extraction mode (smart, comprehensive, full, enhanced)
+            progress_callback: Function to call with progress updates
+            completion_callback: Function to call when extraction completes
+        """
+        if self.is_running:
+            self._log("⚠️ Chapter extraction already in progress")
+            return False
+        self.is_running = True
+        self.stop_requested = False
+        self.result = None
+        # Start extraction in a thread that manages the subprocess
+        thread = threading.Thread(
+            target=self._run_extraction_subprocess,
+            args=(epub_path, output_dir, extraction_mode, progress_callback, completion_callback),
+            daemon=True
+        )
+        thread.start()
+        return True
+    def _run_extraction_subprocess(self, epub_path, output_dir, extraction_mode,
+                                   progress_callback, completion_callback):
+        """
+        Run the extraction subprocess and handle its output
+        """
+        try:
+            # Build command differently for frozen vs dev mode
+            if getattr(sys, 'frozen', False):
+                # In a frozen one-file build, sys.executable is our GUI .exe, not Python.
+                # Use an internal worker-mode flag handled by translator_gui.py to run the worker.
+                cmd = [
+                    sys.executable,
+                    '--run-chapter-extraction',
+                    epub_path,
+                    output_dir,
+                    extraction_mode
+                ]
+            else:
+                # In dev mode, invoke the worker script with the Python interpreter
+                base_dir = Path(__file__).parent
+                worker_script = base_dir / "chapter_extraction_worker.py"
+                cmd = [
+                    sys.executable,
+                    str(worker_script),
+                    epub_path,
+                    output_dir,
+                    extraction_mode
+                ]
+            # Set environment to force UTF-8 encoding
+            env = os.environ.copy()
+            env['PYTHONIOENCODING'] = 'utf-8'
+            env['PYTHONLEGACYWINDOWSSTDIO'] = '0'  # Use new Windows console API
+            # Set default worker count if not already set
+            env.setdefault('EXTRACTION_WORKERS', '2')
+            self._log(f"🚀 Starting chapter extraction subprocess...")
+            self._log(f"📚 EPUB: {os.path.basename(epub_path)}")
+            self._log(f"📂 Output: {output_dir}")
+            self._log(f"⚙️ Mode: {extraction_mode}")
+            # Start the subprocess with UTF-8 encoding
+            self.process = subprocess.Popen(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                encoding='utf-8',
+                errors='replace',  # Replace invalid chars instead of failing
+                bufsize=1,
+                universal_newlines=True,
+                env=env  # Pass the environment with UTF-8 settings
+            )
+            # Read output in real-time
+            while True:
+                if self.stop_requested:
+                    self._terminate_process()
+                    break
+                # Check if process is still running
+                if self.process.poll() is not None:
+                    break
+                # Read stdout line by line with error handling
+                try:
+                    line = self.process.stdout.readline()
+                    if not line:
+                        continue
+                    line = line.strip()
+                    if not line:
+                        continue
+                except UnicodeDecodeError as e:
+                    self._log(f"⚠️ Encoding error reading output: {e}")
+                    continue
+                # Skip all processing if stop is requested to suppress logs
+                if self.stop_requested:
+                    continue
+                # Parse output based on prefix
+                if line.startswith("[PROGRESS]"):
+                    # Progress update
+                    message = line[10:].strip()
+                    if progress_callback:
+                        progress_callback(message)
+                    self._log(f"📊 {message}")
+                elif line.startswith("[INFO]"):
+                    # Information message
+                    message = line[6:].strip()
+                    self._log(f"ℹ️ {message}")
+                elif line.startswith("[ERROR]"):
+                    # Error message
+                    message = line[7:].strip()
+                    self._log(f"❌ {message}")
+                    self.error_queue.put(message)
+                elif line.startswith("[RESULT]"):
+                    # Final result as JSON
+                    try:
+                        json_str = line[8:].strip()
+                        self.result = json.loads(json_str)
+                        if self.result.get("success"):
+                            self._log(f"✅ Extraction completed successfully!")
+                            self._log(f"📚 Extracted {self.result.get('chapters', 0)} chapters")
+                        else:
+                            error = self.result.get("error", "Unknown error")
+                            self._log(f"❌ Extraction failed: {error}")
+                    except json.JSONDecodeError as e:
+                        self._log(f"⚠️ Failed to parse result: {e}")
+                elif line.startswith("["):
+                    # Other prefixed messages - skip
+                    pass
+                else:
+                    # Regular output - only log if not too verbose
+                    if not any(skip in line for skip in ["📁     Searching for", "📁     Found", "📁   ✓", "📁   ✗"]):
+                        self._log(line)
+            # Get any remaining output - but only process if not stopped
+            if not self.stop_requested:
+                remaining_output, remaining_error = self.process.communicate(timeout=1)
+                # Process any remaining output
+                if remaining_output:
+                    for line in remaining_output.strip().split('\n'):
+                        if line and not line.startswith("["):
+                            self._log(line)
+                # Check for errors
+                if remaining_error:
+                    for line in remaining_error.strip().split('\n'):
+                        if line:
+                            self._log(f"⚠️ {line}")
+                # Check final status
+                if self.process.returncode != 0:
+                    self._log(f"⚠️ Process exited with code {self.process.returncode}")
+            else:
+                # If stopped, just clean up without processing output
+                try:
+                    self.process.communicate(timeout=0.1)
+                except subprocess.TimeoutExpired:
+                    pass  # Ignore timeout when cleaning up
+        except subprocess.TimeoutExpired:
+            if not self.stop_requested:
+                self._log("⚠️ Subprocess communication timeout")
+            self._terminate_process()
+        except Exception as e:
+            # Only log errors if not stopping (unless it's a critical error)
+            if not self.stop_requested or "Subprocess error" in str(e):
+                self._log(f"❌ Subprocess error: {e}")
+            self.result = {
+                "success": False,
+                "error": str(e) if not self.stop_requested else "Extraction stopped by user"
+            }
+        finally:
+            self.is_running = False
+            # Store process reference before clearing it in case termination is needed
+            process_ref = self.process
+            self.process = None
+            # If process is still running, try to clean it up
+            if process_ref and process_ref.poll() is None:
+                try:
+                    process_ref.terminate()
+                    time.sleep(0.1)  # Brief wait
+                    if process_ref.poll() is None:
+                        process_ref.kill()
+                except Exception:
+                    pass  # Ignore cleanup errors in finally block
+            # Ensure result is never None
+            if self.result is None:
+                if self.stop_requested:
+                    self.result = {
+                        "success": False,
+                        "error": "Extraction stopped by user"
+                    }
+                else:
+                    self.result = {
+                        "success": False,
+                        "error": "Extraction process ended unexpectedly"
+                    }
+            # Call completion callback
+            if completion_callback:
+                completion_callback(self.result)
+    def stop_extraction(self):
+        """Stop the extraction process"""
+        if not self.is_running:
+            return False
+        # Set stop flag first to suppress subsequent logs
+        self.stop_requested = True
+        self._log("🛑 Stopping chapter extraction...")
+        # Store process reference to avoid race condition
+        process_ref = self.process
+        # Give it a moment to stop gracefully
+        time.sleep(0.5)
+        # Force terminate if still running and process still exists
+        if process_ref:
+            self._terminate_process_ref(process_ref)
+        return True
+    def _terminate_process(self):
+        """Terminate the subprocess using current process reference"""
+        if self.process:
+            self._terminate_process_ref(self.process)
+    def _terminate_process_ref(self, process_ref):
+        """Terminate a specific process reference"""
+        if not process_ref:
+            return
+        try:
+            # Check if process is still alive before attempting termination
+            if process_ref.poll() is None:
+                process_ref.terminate()
+                # Give it a moment to terminate
+                time.sleep(0.5)
+                # Force kill if still running
+                if process_ref.poll() is None:
+                    process_ref.kill()
+                    time.sleep(0.1)  # Brief wait after kill
+                # Only log termination if not stopping (user already knows they stopped it)
+                if not self.stop_requested:
+                    self._log("✅ Process terminated")
+            else:
+                # Only log if not stopping
+                if not self.stop_requested:
+                    self._log("✅ Process already terminated")
+        except Exception as e:
+            # Always log termination errors as they might indicate a problem
+            self._log(f"⚠️ Error terminating process: {e}")
+    def _log(self, message):
+        """Log a message using the callback if available"""
+        # Suppress logs when stop is requested (except for stop/termination messages)
+        if self.stop_requested and not any(keyword in message for keyword in ["🛑", "✅ Process terminated", "❌ Subprocess error"]):
+            return
+        if self.log_callback:
+            self.log_callback(message)
+        else:
+            print(message)
+    def is_extraction_running(self):
+        """Check if extraction is currently running"""
+        return self.is_running
+    def get_result(self):
+        """Get the extraction result if available"""
+        return self.result
+# Example usage
+if __name__ == "__main__":
+    import tkinter as tk
+    from tkinter import filedialog
+    def test_extraction():
+        """Test the extraction manager"""
+        # Create a simple GUI for testing
+        root = tk.Tk()
+        root.title("Chapter Extraction Test")
+        root.geometry("800x600")
+        # Text widget for logs
+        text = tk.Text(root, wrap=tk.WORD)
+        text.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)
+        # Log callback
+        def log_message(msg):
+            text.insert(tk.END, msg + "\n")
+            text.see(tk.END)
+            root.update_idletasks()
+        # Progress callback
+        def progress_update(msg):
+            log_message(f"📊 Progress: {msg}")
+        # Completion callback
+        def extraction_complete(result):
+            if result and result.get("success"):
+                log_message(f"✅ Extraction completed!")
+                log_message(f"   Chapters: {result.get('chapters', 0)}")
+            else:
+                log_message(f"❌ Extraction failed!")
+        # Create manager
+        manager = ChapterExtractionManager(log_callback=log_message)
+        # File selection
+        epub_path = filedialog.askopenfilename(
+            title="Select EPUB file",
+            filetypes=[("EPUB files", "*.epub"), ("All files", "*.*")]
+        )
+        if epub_path:
+            output_dir = os.path.splitext(os.path.basename(epub_path))[0]
+            # Start extraction
+            manager.extract_chapters_async(
+                epub_path,
+                output_dir,
+                extraction_mode="smart",
+                progress_callback=progress_update,
+                completion_callback=extraction_complete
+            )
+        # Button to stop
+        stop_btn = tk.Button(
+            root,
+            text="Stop Extraction",
+            command=lambda: manager.stop_extraction()
+        )
+        stop_btn.pack(pady=5)
+        root.mainloop()
+    # Run test
+    test_extraction()

chapter_extraction_worker.py ADDED Viewed

	@@ -0,0 +1,158 @@

+#!/usr/bin/env python3
+"""
+Chapter Extraction Worker - Runs chapter extraction in a separate process to prevent GUI freezing
+"""
+import sys
+import os
+import io
+# Force UTF-8 encoding for stdout/stderr on Windows
+# NOTE: this runs at import time and replaces sys.stdout/sys.stderr for the whole
+# process; it relies on both streams exposing a binary `.buffer`.
+# errors='replace' substitutes characters that cannot be encoded instead of raising.
+sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
+sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')
+import json
+import zipfile
+import time
+import traceback
+from pathlib import Path
+# Put this script's own directory first on the import path
+# (presumably so the sibling TransateKRtoEN module resolves - confirm against the project layout)
+sys.path.insert(0, str(Path(__file__).parent))
+def run_chapter_extraction(epub_path, output_dir, extraction_mode="smart", progress_callback=None):
+    """
+    Run chapter extraction in this worker process
+    Args:
+        epub_path: Path to EPUB file
+        output_dir: Output directory for extracted content
+        extraction_mode: Extraction mode (smart, comprehensive, full, enhanced)
+        progress_callback: Callback function for progress updates (uses print for IPC)
+    Returns:
+        dict: Extraction results including chapters and metadata
+    """
+    try:
+        # Import here to avoid loading heavy modules until needed
+        from TransateKRtoEN import ChapterExtractor
+        # Create progress callback that prints to stdout for IPC
+        def worker_progress_callback(message):
+            # Use special prefix for progress messages
+            print(f"[PROGRESS] {message}", flush=True)
+        # Create extractor with progress callback
+        extractor = ChapterExtractor(progress_callback=worker_progress_callback)
+        # Set extraction mode
+        os.environ["EXTRACTION_MODE"] = extraction_mode
+        # Open EPUB and extract chapters
+        print(f"[INFO] Starting extraction of: {epub_path}", flush=True)
+        print(f"[INFO] Output directory: {output_dir}", flush=True)
+        print(f"[INFO] Extraction mode: {extraction_mode}", flush=True)
+        with zipfile.ZipFile(epub_path, 'r') as zf:
+            # Extract metadata first
+            metadata = extractor._extract_epub_metadata(zf)
+            print(f"[INFO] Extracted metadata: {list(metadata.keys())}", flush=True)
+            # Extract chapters
+            chapters = extractor.extract_chapters(zf, output_dir)
+            print(f"[INFO] Extracted {len(chapters)} chapters", flush=True)
+            # The extract_chapters method already handles OPF sorting internally
+            # Just log if OPF was used
+            opf_path = os.path.join(output_dir, 'content.opf')
+            if os.path.exists(opf_path):
+                print(f"[INFO] OPF file available for chapter ordering", flush=True)
+            # CRITICAL: Save the full chapters with body content!
+            # This is what the main process needs to load
+            chapters_full_path = os.path.join(output_dir, "chapters_full.json")
+            try:
+                with open(chapters_full_path, 'w', encoding='utf-8') as f:
+                    json.dump(chapters, f, ensure_ascii=False)
+                print(f"[INFO] Saved full chapters data to: {chapters_full_path}", flush=True)
+            except Exception as e:
+                print(f"[WARNING] Could not save full chapters: {e}", flush=True)
+                # Fall back to saving individual files
+                for chapter in chapters:
+                    try:
+                        chapter_file = f"chapter_{chapter['num']:04d}_{chapter.get('filename', 'content').replace('/', '_')}.html"
+                        chapter_path = os.path.join(output_dir, chapter_file)
+                        with open(chapter_path, 'w', encoding='utf-8') as f:
+                            f.write(chapter.get('body', ''))
+                        print(f"[INFO] Saved chapter {chapter['num']} to {chapter_file}", flush=True)
+                    except Exception as ce:
+                        print(f"[WARNING] Could not save chapter {chapter.get('num')}: {ce}", flush=True)
+            # Return results as JSON for IPC
+            result = {
+                "success": True,
+                "chapters": len(chapters),
+                "metadata": metadata,
+                "chapter_info": [
+                    {
+                        "num": ch.get("num"),
+                        "title": ch.get("title"),
+                        "has_images": ch.get("has_images", False),
+                        "file_size": ch.get("file_size", 0),
+                        "content_hash": ch.get("content_hash", "")
+                    }
+                    for ch in chapters
+                ]
+            }
+            # Output result as JSON
+            print(f"[RESULT] {json.dumps(result)}", flush=True)
+            return result
+    except Exception as e:
+        # Send error information
+        error_info = {
+            "success": False,
+            "error": str(e),
+            "traceback": traceback.format_exc()
+        }
+        print(f"[ERROR] {str(e)}", flush=True)
+        print(f"[RESULT] {json.dumps(error_info)}", flush=True)
+        return error_info
+def main():
+    """Main entry point for worker process"""
+    # Parse command line arguments
+    if len(sys.argv) < 3:
+        print("[ERROR] Usage: chapter_extraction_worker.py <epub_path> <output_dir> [extraction_mode]", flush=True)
+        sys.exit(1)
+    epub_path = sys.argv[1]
+    output_dir = sys.argv[2]
+    extraction_mode = sys.argv[3] if len(sys.argv) > 3 else "smart"
+    # Validate inputs
+    if not os.path.exists(epub_path):
+        print(f"[ERROR] EPUB file not found: {epub_path}", flush=True)
+        sys.exit(1)
+    # Create output directory if needed
+    os.makedirs(output_dir, exist_ok=True)
+    # Run extraction
+    result = run_chapter_extraction(epub_path, output_dir, extraction_mode)
+    # Exit with appropriate code
+    sys.exit(0 if result.get("success", False) else 1)
+if __name__ == "__main__":
+    # Ensure freeze support for Windows frozen exe
+    try:
+        import multiprocessing
+        multiprocessing.freeze_support()
+    except Exception:
+        pass
+    main()