import re

import tiktoken
from bs4 import BeautifulSoup


class ChapterSplitter:
    """Split large chapters into smaller chunks while preserving structure."""

    def __init__(self, model_name="gpt-3.5-turbo", target_tokens=80000, compression_factor=1.0):
        """
        Initialize the splitter with a token counter.

        target_tokens: target size for each chunk (leaving room for the system prompt and history)
        compression_factor: expected compression ratio from source to target language (0.7-1.0)
        """
        try:
            self.enc = tiktoken.encoding_for_model(model_name)
        except KeyError:
            # Unknown model name: fall back to a general-purpose encoding
            self.enc = tiktoken.get_encoding("cl100k_base")
        self.target_tokens = target_tokens
        self.compression_factor = compression_factor

    def count_tokens(self, text):
        """Count tokens in text, falling back to a rough estimate on encoding errors."""
        try:
            return len(self.enc.encode(text))
        except Exception:
            # Fallback estimation: roughly four characters per token
            return len(text) // 4

    def split_chapter(self, chapter_html, max_tokens=None):
        """
        Split a chapter into smaller chunks if it exceeds the token limit.

        Returns: list of (chunk_html, chunk_index, total_chunks) tuples.
        """
        if max_tokens is None:
            max_tokens = self.target_tokens

        # Apply the compression factor to the output token limit.
        # If compression_factor is 0.7 and max_tokens is 4096,
        # we expect the output to be about 4096 * 0.7 = 2867 tokens.
        effective_max_tokens = int(max_tokens * self.compression_factor)

        # First check whether splitting is needed at all
        total_tokens = self.count_tokens(chapter_html)
        if total_tokens <= effective_max_tokens:
            return [(chapter_html, 1, 1)]  # No split needed

        # Parse the HTML
        soup = BeautifulSoup(chapter_html, 'html.parser')

        # Try to find natural break points
        chunks = []
        current_chunk = []
        current_tokens = 0

        # Get all direct children of <body>, or all top-level elements
        if soup.body:
            elements = list(soup.body.children)
        else:
            elements = list(soup.children)

        for element in elements:
            # Skip whitespace-only text nodes
            if isinstance(element, str) and element.strip() == '':
                continue

            element_html = str(element)
            element_tokens = self.count_tokens(element_html)

            # If a single element is too large, try to split it further
            if element_tokens > effective_max_tokens:
                # Flush the accumulated chunk first so the output stays in document order
                if current_chunk:
                    chunks.append(self._create_chunk_html(current_chunk))
                    current_chunk = []
                    current_tokens = 0
                sub_chunks = self._split_large_element(element, effective_max_tokens)
                chunks.extend(sub_chunks)
            else:
                # Check whether adding this element would exceed the limit
                if current_tokens + element_tokens > effective_max_tokens and current_chunk:
                    # Save the current chunk and start a new one
                    chunks.append(self._create_chunk_html(current_chunk))
                    current_chunk = [element_html]
                    current_tokens = element_tokens
                else:
                    current_chunk.append(element_html)
                    current_tokens += element_tokens

        # Don't forget the last chunk
        if current_chunk:
            chunks.append(self._create_chunk_html(current_chunk))

        # Return chunks with metadata
        total_chunks = len(chunks)
        return [(chunk, i + 1, total_chunks) for i, chunk in enumerate(chunks)]

    def _split_large_element(self, element, max_tokens):
        """Split a single large element (like a long paragraph) into smaller chunks."""
        chunks = []

        if getattr(element, 'name', None) == 'p' or not hasattr(element, 'children'):
            # For paragraphs or bare text nodes, split by sentences
            text = element.get_text() if hasattr(element, 'get_text') else str(element)
            sentences = re.split(r'(?<=[.!?])\s+', text)

            current_chunk = []
            current_tokens = 0

            for sentence in sentences:
                sentence_tokens = self.count_tokens(sentence)
                # Use an 80% margin so each piece stays comfortably under the limit
                if current_tokens + sentence_tokens > max_tokens * 0.8 and current_chunk:
                    # Emit a paragraph containing the sentences collected so far
                    chunk_text = ' '.join(current_chunk)
                    chunks.append(f"<p>{chunk_text}</p>")
                    current_chunk = [sentence]
                    current_tokens = sentence_tokens
                else:
                    current_chunk.append(sentence)
                    current_tokens += sentence_tokens

            if current_chunk:
                chunk_text = ' '.join(current_chunk)
                chunks.append(f"<p>{chunk_text}</p>")
        else:
            # For other elements, try to split by their children
            children = list(element.children)
            current_chunk = []
            current_tokens = 0

            for child in children:
                child_html = str(child)
                child_tokens = self.count_tokens(child_html)

                if current_tokens + child_tokens > max_tokens * 0.8 and current_chunk:
                    # Re-wrap the collected children in the parent element type
                    wrapper = BeautifulSoup(f"<{element.name}></{element.name}>", 'html.parser')
                    wrapper_elem = wrapper.find(element.name)
                    for item in current_chunk:
                        wrapper_elem.append(BeautifulSoup(item, 'html.parser'))
                    chunks.append(str(wrapper))
                    current_chunk = [child_html]
                    current_tokens = child_tokens
                else:
                    current_chunk.append(child_html)
                    current_tokens += child_tokens

            if current_chunk:
                wrapper = BeautifulSoup(f"<{element.name}></{element.name}>", 'html.parser')
                wrapper_elem = wrapper.find(element.name)
                for item in current_chunk:
                    wrapper_elem.append(BeautifulSoup(item, 'html.parser'))
                chunks.append(str(wrapper))

        return chunks

    def _create_chunk_html(self, elements):
        """Create an HTML chunk from a list of element strings."""
        # Join the elements as-is and let the translation step handle any wrapping;
        # no <body> tags are added here.
        return '\n'.join(elements)

    def merge_translated_chunks(self, translated_chunks):
        """
        Merge translated chunks back together.

        translated_chunks: list of (translated_html, chunk_index, total_chunks) tuples.
        """
        # Sort by chunk index to ensure correct order
        sorted_chunks = sorted(translated_chunks, key=lambda x: x[1])

        # Extract just the HTML content
        html_parts = [chunk[0] for chunk in sorted_chunks]

        # Simple concatenation; the chunks should preserve their own structure
        merged = '\n'.join(html_parts)

        # Clean up duplicate <body> tags if the chunks came back fully wrapped
        soup = BeautifulSoup(merged, 'html.parser')
        bodies = soup.find_all('body')
        if len(bodies) > 1:
            # Keep the first body and move all content from the others into it
            main_body = bodies[0]
            for extra_body in bodies[1:]:
                for child in list(extra_body.children):
                    main_body.append(child)
                extra_body.decompose()
            return str(soup)

        return merged
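

# ---------------------------------------------------------------------------
# Minimal usage sketch showing the intended split -> translate -> merge round
# trip. The sample chapter, the tiny token limit (chosen only to force a split
# on a short document), and the print output are illustrative assumptions, not
# part of the class API. Assumes tiktoken can load an encoding (it may download
# one on first use) and beautifulsoup4 is installed.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_chapter = (
        "<html><body>"
        "<h1>Chapter 1</h1>"
        "<p>First paragraph of the chapter. It has a few sentences. "
        "Each one adds some tokens.</p>"
        "<p>Second paragraph, which would normally be much longer.</p>"
        "</body></html>"
    )

    splitter = ChapterSplitter(target_tokens=30, compression_factor=1.0)

    # Split the chapter; each entry is (chunk_html, chunk_index, total_chunks)
    chunks = splitter.split_chapter(sample_chapter)
    print(f"Produced {len(chunks)} chunk(s)")
    for chunk_html, idx, total in chunks:
        print(f"--- chunk {idx}/{total} ({splitter.count_tokens(chunk_html)} tokens) ---")
        print(chunk_html)

    # Pretend each chunk was translated unchanged, then merge them back together
    translated = [(chunk_html, idx, total) for chunk_html, idx, total in chunks]
    merged = splitter.merge_translated_chunks(translated)
    print("--- merged ---")
    print(merged)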