Shirochi committed on
Commit
b5b56ea
·
verified ·
1 Parent(s): e54ad44

Upload 2 files

Browse files
chapter_extraction_manager.py ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Chapter Extraction Manager - Manages chapter extraction in subprocess to prevent GUI freezing
4
+ """
5
+
6
+ import subprocess
7
+ import sys
8
+ import os
9
+ import json
10
+ import threading
11
+ import queue
12
+ import time
13
+ from pathlib import Path
14
+
15
+
16
class ChapterExtractionManager:
    """
    Manages chapter extraction in a separate process to prevent GUI freezing
    Similar to GlossaryManager but for chapter extraction

    The worker talks to this class over its stdout, one message per line,
    using the prefixes ``[PROGRESS]``, ``[INFO]``, ``[ERROR]`` and
    ``[RESULT]`` (the latter followed by a JSON object). Lines with any other
    ``[...]`` prefix are ignored; unprefixed lines are logged as-is unless
    they match one of the known noisy markers.
    """

    # Substrings of chatty worker output that are deliberately kept out of the log
    _VERBOSE_MARKERS = ("📁 Searching for", "📁 Found", "📁 ✓", "📁 ✗")

    # Messages that must still reach the log after a stop was requested
    _STOP_VISIBLE_KEYWORDS = (
        "🛑",
        "✅ Process terminated",
        "❌ Subprocess error",
        "⚠️ Error terminating process",
    )

    def __init__(self, log_callback=None):
        """
        Initialize the extraction manager

        Args:
            log_callback: Function to call with log messages (for GUI integration).
                When omitted, messages are printed to stdout.
        """
        self.log_callback = log_callback
        self.process = None
        self.output_queue = queue.Queue()
        self.error_queue = queue.Queue()
        self.result = None
        self.is_running = False
        self.stop_requested = False

    def extract_chapters_async(self, epub_path, output_dir, extraction_mode="smart",
                               progress_callback=None, completion_callback=None):
        """
        Start chapter extraction in a subprocess

        Args:
            epub_path: Path to EPUB file
            output_dir: Output directory for extracted content
            extraction_mode: Extraction mode (smart, comprehensive, full, enhanced)
            progress_callback: Function to call with progress updates
            completion_callback: Function to call when extraction completes

        Returns:
            bool: True if a new extraction was started, False if one is
            already in progress.

        Note:
            Both callbacks (and the log callback) are invoked from the
            background thread that supervises the subprocess, not from the
            caller's thread.
        """
        if self.is_running:
            self._log("⚠️ Chapter extraction already in progress")
            return False

        self.is_running = True
        self.stop_requested = False
        self.result = None

        # Start extraction in a thread that manages the subprocess
        thread = threading.Thread(
            target=self._run_extraction_subprocess,
            args=(epub_path, output_dir, extraction_mode, progress_callback, completion_callback),
            daemon=True
        )
        thread.start()

        return True

    def _build_command(self, epub_path, output_dir, extraction_mode):
        """Build the worker command line for frozen vs dev mode."""
        if getattr(sys, 'frozen', False):
            # In a frozen one-file build, sys.executable is our GUI .exe, not Python.
            # Use an internal worker-mode flag handled by translator_gui.py to run the worker.
            return [
                sys.executable,
                '--run-chapter-extraction',
                epub_path,
                output_dir,
                extraction_mode
            ]

        # In dev mode, invoke the worker script with the Python interpreter
        worker_script = Path(__file__).parent / "chapter_extraction_worker.py"
        return [
            sys.executable,
            str(worker_script),
            epub_path,
            output_dir,
            extraction_mode
        ]

    @staticmethod
    def _build_environment():
        """Return a copy of the current environment prepared for the worker."""
        env = os.environ.copy()
        # Force UTF-8 encoding on the worker's standard streams
        env['PYTHONIOENCODING'] = 'utf-8'
        # PYTHONLEGACYWINDOWSSTDIO enables the legacy console I/O whenever it is
        # set to ANY non-empty string (including '0'), so the only way to ask
        # for the new Windows console API is to make sure it is not set at all.
        env.pop('PYTHONLEGACYWINDOWSSTDIO', None)
        # Set default worker count if not already set
        env.setdefault('EXTRACTION_WORKERS', '2')
        return env

    @staticmethod
    def _drain_stream(stream, sink):
        """Append every line of ``stream`` to ``sink`` until EOF."""
        try:
            for line in stream:
                sink.append(line)
        except (OSError, ValueError):
            # The pipe was closed underneath us during shutdown; nothing left to read.
            pass

    def _handle_result_payload(self, payload):
        """Parse the JSON that follows a ``[RESULT]`` prefix and store it."""
        try:
            parsed = json.loads(payload)
        except json.JSONDecodeError as e:
            self._log(f"⚠️ Failed to parse result: {e}")
            return

        if not isinstance(parsed, dict):
            # Guard against valid JSON that is not an object; the rest of the
            # class (and the completion callback) relies on dict access.
            self._log("⚠️ Failed to parse result: expected a JSON object")
            return

        self.result = parsed
        if parsed.get("success"):
            self._log("✅ Extraction completed successfully!")
            self._log(f"📚 Extracted {parsed.get('chapters', 0)} chapters")
        else:
            error = parsed.get("error", "Unknown error")
            self._log(f"❌ Extraction failed: {error}")

    def _handle_output_line(self, line, progress_callback):
        """Dispatch one stripped, non-empty worker stdout line by its prefix."""
        if line.startswith("[PROGRESS]"):
            message = line[10:].strip()
            if progress_callback:
                progress_callback(message)
            self._log(f"📊 {message}")

        elif line.startswith("[INFO]"):
            message = line[6:].strip()
            self._log(f"ℹ️ {message}")

        elif line.startswith("[ERROR]"):
            message = line[7:].strip()
            self._log(f"❌ {message}")
            self.error_queue.put(message)

        elif line.startswith("[RESULT]"):
            self._handle_result_payload(line[8:].strip())

        elif line.startswith("["):
            # Other prefixed messages - skip
            pass

        elif not any(marker in line for marker in self._VERBOSE_MARKERS):
            # Regular output - only log if not too verbose
            self._log(line)

    def _run_extraction_subprocess(self, epub_path, output_dir, extraction_mode,
                                   progress_callback, completion_callback):
        """
        Run the extraction subprocess and handle its output

        Runs on the background thread started by extract_chapters_async().
        stdout is consumed line by line until EOF so that nothing the worker
        printed before exiting (in particular the final ``[RESULT]`` line) is
        lost; stderr is drained concurrently on a helper thread so the worker
        can never block on a full stderr pipe. ``self.result`` is always a
        dict by the time ``completion_callback`` is invoked.
        """
        stderr_lines = []
        stderr_thread = None
        try:
            cmd = self._build_command(epub_path, output_dir, extraction_mode)
            env = self._build_environment()

            self._log("🚀 Starting chapter extraction subprocess...")
            self._log(f"📚 EPUB: {os.path.basename(epub_path)}")
            self._log(f"📂 Output: {output_dir}")
            self._log(f"⚙️ Mode: {extraction_mode}")

            # Start the subprocess with UTF-8 encoding. errors='replace' means
            # decoding never raises, so no UnicodeDecodeError handling is needed.
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                encoding='utf-8',
                errors='replace',
                bufsize=1,
                env=env
            )
            self.process = process

            stderr_thread = threading.Thread(
                target=self._drain_stream,
                args=(process.stderr, stderr_lines),
                daemon=True
            )
            stderr_thread.start()

            # Read output in real time until EOF. A stop request is noticed
            # either here (on the next line) or through EOF once
            # stop_extraction() has terminated the worker.
            if not self.stop_requested:
                for raw_line in process.stdout:
                    if self.stop_requested:
                        break
                    line = raw_line.strip()
                    if line:
                        self._handle_output_line(line, progress_callback)

            if self.stop_requested:
                # If stopped, just clean up without processing output
                self._terminate_process()
            else:
                # stdout hit EOF, so the worker is exiting; the timeout only
                # guards against a worker that closed stdout but keeps running.
                process.wait(timeout=5)
                stderr_thread.join(timeout=1)

                for err_line in list(stderr_lines):
                    err_line = err_line.rstrip()
                    if err_line:
                        self._log(f"⚠️ {err_line}")

                if process.returncode != 0:
                    self._log(f"⚠️ Process exited with code {process.returncode}")

        except subprocess.TimeoutExpired:
            if not self.stop_requested:
                self._log("⚠️ Subprocess communication timeout")
            self._terminate_process()

        except Exception as e:
            # Only log errors if not stopping (unless it's a critical error)
            if not self.stop_requested or "Subprocess error" in str(e):
                self._log(f"❌ Subprocess error: {e}")
            self.result = {
                "success": False,
                "error": str(e) if not self.stop_requested else "Extraction stopped by user"
            }

        finally:
            self.is_running = False
            # Store process reference before clearing it in case termination is needed
            process_ref = self.process
            self.process = None

            if process_ref is not None:
                # If process is still running, try to clean it up
                if process_ref.poll() is None:
                    try:
                        process_ref.terminate()
                        try:
                            process_ref.wait(timeout=0.1)
                        except subprocess.TimeoutExpired:
                            process_ref.kill()
                    except OSError:
                        pass  # Best-effort cleanup; the process may already be gone

                # Release the pipe handles. stderr is only closed once its
                # reader thread is done, to avoid closing a stream another
                # thread is still blocked on.
                try:
                    process_ref.stdout.close()
                except OSError:
                    pass
                if stderr_thread is not None:
                    stderr_thread.join(timeout=0.5)
                    if not stderr_thread.is_alive():
                        try:
                            process_ref.stderr.close()
                        except OSError:
                            pass

            # Ensure result is never None
            if self.result is None:
                if self.stop_requested:
                    self.result = {
                        "success": False,
                        "error": "Extraction stopped by user"
                    }
                else:
                    self.result = {
                        "success": False,
                        "error": "Extraction process ended unexpectedly"
                    }

            # Call completion callback
            if completion_callback:
                completion_callback(self.result)

    def stop_extraction(self):
        """
        Stop the extraction process

        Returns:
            bool: False if no extraction is running, True once the stop has
            been requested (and the worker, if already started, terminated).
        """
        if not self.is_running:
            return False

        # Set stop flag first to suppress subsequent logs
        self.stop_requested = True
        self._log("🛑 Stopping chapter extraction...")

        # Store process reference to avoid race condition with the reader thread
        process_ref = self.process

        if process_ref is not None:
            # Give the reader thread a moment to shut the worker down itself,
            # returning early as soon as the process is gone.
            try:
                process_ref.wait(timeout=0.5)
            except subprocess.TimeoutExpired:
                pass
            # Force terminate if still running
            self._terminate_process_ref(process_ref)

        return True

    def _terminate_process(self):
        """Terminate the subprocess using current process reference"""
        if self.process:
            self._terminate_process_ref(self.process)

    def _terminate_process_ref(self, process_ref):
        """Terminate a specific process reference, escalating to kill if needed"""
        if not process_ref:
            return

        try:
            # Check if process is still alive before attempting termination
            if process_ref.poll() is None:
                process_ref.terminate()
                try:
                    # Give it a moment to terminate
                    process_ref.wait(timeout=0.5)
                except subprocess.TimeoutExpired:
                    # Force kill if still running
                    process_ref.kill()
                    try:
                        process_ref.wait(timeout=0.1)
                    except subprocess.TimeoutExpired:
                        pass

                # Only log termination if not stopping (user already knows they stopped it)
                if not self.stop_requested:
                    self._log("✅ Process terminated")
            elif not self.stop_requested:
                self._log("✅ Process already terminated")
        except Exception as e:
            # Always log termination errors as they might indicate a problem
            self._log(f"⚠️ Error terminating process: {e}")

    def _log(self, message):
        """Log a message using the callback if available, else print it"""
        # Suppress logs when stop is requested (except for stop/termination messages)
        if self.stop_requested and not any(
            keyword in message for keyword in self._STOP_VISIBLE_KEYWORDS
        ):
            return

        if self.log_callback:
            self.log_callback(message)
        else:
            print(message)

    def is_extraction_running(self):
        """Check if extraction is currently running"""
        return self.is_running

    def get_result(self):
        """Get the extraction result if available (None before the first run finishes)"""
        return self.result
337
+
338
+
339
+ # Example usage
340
if __name__ == "__main__":
    import tkinter as tk
    from tkinter import filedialog

    def test_extraction():
        """Manual smoke test: pick an EPUB and watch the extraction log in a Tk window."""

        # Create a simple GUI for testing
        root = tk.Tk()
        root.title("Chapter Extraction Test")
        root.geometry("800x600")

        # Text widget for logs
        text = tk.Text(root, wrap=tk.WORD)
        text.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)

        # The manager invokes its callbacks from a background thread, and Tk
        # widgets must only be touched from the thread running the main loop.
        # Callbacks therefore only enqueue text; the main loop drains the queue.
        pending_messages = queue.Queue()

        # Log callback (safe to call from any thread)
        def log_message(msg):
            pending_messages.put(msg)

        def drain_pending_messages():
            wrote_any = False
            while True:
                try:
                    msg = pending_messages.get_nowait()
                except queue.Empty:
                    break
                text.insert(tk.END, msg + "\n")
                wrote_any = True
            if wrote_any:
                text.see(tk.END)
            # Re-arm the poll so later messages are picked up too
            root.after(100, drain_pending_messages)

        # Progress callback
        def progress_update(msg):
            log_message(f"📊 Progress: {msg}")

        # Completion callback
        def extraction_complete(result):
            if result and result.get("success"):
                log_message("✅ Extraction completed!")
                log_message(f"   Chapters: {result.get('chapters', 0)}")
            else:
                log_message("❌ Extraction failed!")

        # Create manager
        manager = ChapterExtractionManager(log_callback=log_message)

        # File selection
        epub_path = filedialog.askopenfilename(
            title="Select EPUB file",
            filetypes=[("EPUB files", "*.epub"), ("All files", "*.*")]
        )

        if epub_path:
            output_dir = os.path.splitext(os.path.basename(epub_path))[0]

            # Start extraction
            manager.extract_chapters_async(
                epub_path,
                output_dir,
                extraction_mode="smart",
                progress_callback=progress_update,
                completion_callback=extraction_complete
            )

        # Button to stop
        stop_btn = tk.Button(
            root,
            text="Stop Extraction",
            command=manager.stop_extraction
        )
        stop_btn.pack(pady=5)

        # Do not leave a worker process behind when the window is closed
        def on_close():
            manager.stop_extraction()
            root.destroy()

        root.protocol("WM_DELETE_WINDOW", on_close)

        root.after(100, drain_pending_messages)
        root.mainloop()

    # Run test
    test_extraction()
chapter_extraction_worker.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Chapter Extraction Worker - Runs chapter extraction in a separate process to prevent GUI freezing
4
+ """
5
+
6
+ import sys
7
+ import os
8
+ import io
9
+
10
+ # Force UTF-8 encoding for stdout/stderr on Windows
11
+ sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
12
+ sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')
13
+ import json
14
+ import zipfile
15
+ import time
16
+ import traceback
17
+ from pathlib import Path
18
+
19
+ # Add parent directory to path for imports
20
+ sys.path.insert(0, str(Path(__file__).parent))
21
+
22
+ def run_chapter_extraction(epub_path, output_dir, extraction_mode="smart", progress_callback=None):
23
+ """
24
+ Run chapter extraction in this worker process
25
+
26
+ Args:
27
+ epub_path: Path to EPUB file
28
+ output_dir: Output directory for extracted content
29
+ extraction_mode: Extraction mode (smart, comprehensive, full, enhanced)
30
+ progress_callback: Callback function for progress updates (uses print for IPC)
31
+
32
+ Returns:
33
+ dict: Extraction results including chapters and metadata
34
+ """
35
+ try:
36
+ # Import here to avoid loading heavy modules until needed
37
+ from TransateKRtoEN import ChapterExtractor
38
+
39
+ # Create progress callback that prints to stdout for IPC
40
+ def worker_progress_callback(message):
41
+ # Use special prefix for progress messages
42
+ print(f"[PROGRESS] {message}", flush=True)
43
+
44
+ # Create extractor with progress callback
45
+ extractor = ChapterExtractor(progress_callback=worker_progress_callback)
46
+
47
+ # Set extraction mode
48
+ os.environ["EXTRACTION_MODE"] = extraction_mode
49
+
50
+ # Open EPUB and extract chapters
51
+ print(f"[INFO] Starting extraction of: {epub_path}", flush=True)
52
+ print(f"[INFO] Output directory: {output_dir}", flush=True)
53
+ print(f"[INFO] Extraction mode: {extraction_mode}", flush=True)
54
+
55
+ with zipfile.ZipFile(epub_path, 'r') as zf:
56
+ # Extract metadata first
57
+ metadata = extractor._extract_epub_metadata(zf)
58
+ print(f"[INFO] Extracted metadata: {list(metadata.keys())}", flush=True)
59
+
60
+ # Extract chapters
61
+ chapters = extractor.extract_chapters(zf, output_dir)
62
+
63
+ print(f"[INFO] Extracted {len(chapters)} chapters", flush=True)
64
+
65
+ # The extract_chapters method already handles OPF sorting internally
66
+ # Just log if OPF was used
67
+ opf_path = os.path.join(output_dir, 'content.opf')
68
+ if os.path.exists(opf_path):
69
+ print(f"[INFO] OPF file available for chapter ordering", flush=True)
70
+
71
+ # CRITICAL: Save the full chapters with body content!
72
+ # This is what the main process needs to load
73
+ chapters_full_path = os.path.join(output_dir, "chapters_full.json")
74
+ try:
75
+ with open(chapters_full_path, 'w', encoding='utf-8') as f:
76
+ json.dump(chapters, f, ensure_ascii=False)
77
+ print(f"[INFO] Saved full chapters data to: {chapters_full_path}", flush=True)
78
+ except Exception as e:
79
+ print(f"[WARNING] Could not save full chapters: {e}", flush=True)
80
+ # Fall back to saving individual files
81
+ for chapter in chapters:
82
+ try:
83
+ chapter_file = f"chapter_{chapter['num']:04d}_{chapter.get('filename', 'content').replace('/', '_')}.html"
84
+ chapter_path = os.path.join(output_dir, chapter_file)
85
+ with open(chapter_path, 'w', encoding='utf-8') as f:
86
+ f.write(chapter.get('body', ''))
87
+ print(f"[INFO] Saved chapter {chapter['num']} to {chapter_file}", flush=True)
88
+ except Exception as ce:
89
+ print(f"[WARNING] Could not save chapter {chapter.get('num')}: {ce}", flush=True)
90
+
91
+ # Return results as JSON for IPC
92
+ result = {
93
+ "success": True,
94
+ "chapters": len(chapters),
95
+ "metadata": metadata,
96
+ "chapter_info": [
97
+ {
98
+ "num": ch.get("num"),
99
+ "title": ch.get("title"),
100
+ "has_images": ch.get("has_images", False),
101
+ "file_size": ch.get("file_size", 0),
102
+ "content_hash": ch.get("content_hash", "")
103
+ }
104
+ for ch in chapters
105
+ ]
106
+ }
107
+
108
+ # Output result as JSON
109
+ print(f"[RESULT] {json.dumps(result)}", flush=True)
110
+ return result
111
+
112
+ except Exception as e:
113
+ # Send error information
114
+ error_info = {
115
+ "success": False,
116
+ "error": str(e),
117
+ "traceback": traceback.format_exc()
118
+ }
119
+ print(f"[ERROR] {str(e)}", flush=True)
120
+ print(f"[RESULT] {json.dumps(error_info)}", flush=True)
121
+ return error_info
122
+
123
+
124
def main():
    """Command-line entry point: validate arguments, run the extraction, exit.

    Expects ``<epub_path> <output_dir> [extraction_mode]`` in ``sys.argv`` and
    always terminates via ``sys.exit``: status 0 when the extraction reports
    success, 1 for bad usage, a missing EPUB, or a failed extraction.
    """
    args = sys.argv[1:]

    # Both positional arguments are mandatory; the mode is optional.
    if len(args) < 2:
        print("[ERROR] Usage: chapter_extraction_worker.py <epub_path> <output_dir> [extraction_mode]", flush=True)
        sys.exit(1)

    epub_path, output_dir = args[0], args[1]
    if len(args) > 2:
        extraction_mode = args[2]
    else:
        extraction_mode = "smart"

    # Refuse to continue without an input file.
    if not os.path.exists(epub_path):
        print(f"[ERROR] EPUB file not found: {epub_path}", flush=True)
        sys.exit(1)

    # The output directory may or may not exist yet.
    os.makedirs(output_dir, exist_ok=True)

    outcome = run_chapter_extraction(epub_path, output_dir, extraction_mode)

    # Mirror the reported success flag in the process exit status.
    if outcome.get("success", False):
        sys.exit(0)
    sys.exit(1)
149
+
150
+
151
if __name__ == "__main__":
    # Frozen Windows executables need freeze_support() to be called first.
    # It is strictly best effort here: any failure while importing or
    # calling it is ignored and the worker runs regardless.
    try:
        from multiprocessing import freeze_support
        freeze_support()
    except Exception:
        pass
    main()