snake11235 committed
Commit c56fd22 · 1 Parent(s): 2308352

feat: initial project setup with multi-LLM OCR to CSV converter


Add Gradio-based application for converting handwritten or printed text from PDFs/images to CSV format using multiple LLM backends (ChatGPT 5.2, Gemini 3 Pro, olmOCR-2-7B-1025-FP8).

- Add core application with image/PDF upload and processing pipeline
- Add support for OpenAI Vision API with configurable model selection
- Add support for Google Gemini API for vision tasks
- Add local olmOCR model integration with Qwen2.5-VL backend
- Add Dockerfile and docker-compose setup for containerized deployment

Files changed (6)
  1. .gitignore +1 -0
  2. .gradio/certificate.pem +31 -0
  3. Dockerfile +28 -0
  4. app.py +305 -0
  5. docker-compose.yml +14 -0
  6. requirements.txt +9 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ .env
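Ignoring .env keeps the API keys out of version control; docker-compose.yml below loads the same file back in at runtime via env_file.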
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+ -----BEGIN CERTIFICATE-----
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+ -----END CERTIFICATE-----
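The committed file is the well-known ISRG Root X1 root certificate (the subject "Internet Security Research Group" / "ISRG Root X1" is readable in the base64 payload); the .gradio/ directory is presumably a local artifact of running Gradio with share=True rather than hand-written project code.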
Dockerfile ADDED
@@ -0,0 +1,28 @@
+ FROM python:3.11-slim
+
+ ENV PYTHONUNBUFFERED=1 \
+     PIP_NO_CACHE_DIR=1
+
+ WORKDIR /app
+
+ # System dependencies (if olmocr / rendering requires them, extend here)
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+
+ COPY requirements.txt ./
+ RUN pip install --upgrade pip && pip install -r requirements.txt
+
+ COPY . .
+
+ EXPOSE 7860
+
+ # Environment variables expected (documented for convenience)
+ # - OPENAI_API_KEY: API key for ChatGPT 5.2 backend
+ # - WORDS2DOC_OPENAI_MODEL: Optional, OpenAI model name (default: gpt-5-nano-2025-08-07)
+ # - GEMINI_API_KEY: API key for Gemini backend
+ # - WORDS2DOC_GEMINI_MODEL: Optional, Gemini model name (default: gemini-1.5-flash)
+
+ ENV GRADIO_SERVER_NAME="0.0.0.0"
+
+ CMD ["gradio", "app.py"]
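The variables documented above are expected in a .env file next to the compose file (ignored by .gitignore, loaded via env_file). A minimal sketch, with placeholder values:

    OPENAI_API_KEY=sk-...
    GEMINI_API_KEY=...
    # Optional model overrides (defaults shown, matching app.py)
    WORDS2DOC_OPENAI_MODEL=gpt-5-nano-2025-08-07
    WORDS2DOC_GEMINI_MODEL=gemini-1.5-flash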
app.py ADDED
@@ -0,0 +1,305 @@
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ import base64
+ from io import BytesIO
+ from typing import Tuple, Optional
+
+ import gradio as gr
+ from PIL import Image
+
+ from olmocr.data.renderpdf import render_pdf_to_base64png
+
+ # Optional imports for cloud LLMs
+ try:
+     from openai import OpenAI
+ except ImportError:  # pragma: no cover
+     OpenAI = None  # type: ignore
+
+ try:
+     import google.generativeai as genai
+ except ImportError:  # pragma: no cover
+     genai = None  # type: ignore
+
+ import torch
+ from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+
+
+ APP_TITLE = "words2doc"
+ APP_DESCRIPTION = "Upload a PDF or image with (handwritten) text and convert it to CSV using different LLM backends."
+
+ MODEL_CHATGPT = "ChatGPT 5.2"
+ MODEL_GEMINI = "Gemini 3 Pro"
+ MODEL_OLMOCR = "olmOCR-2-7B-1025-FP8"
+
+
+ # -------- Utility helpers -------- #
+
+
+ def _load_image_from_upload(path: str) -> Image.Image:
+     """Load an image from a path (for image uploads)."""
+     return Image.open(path).convert("RGB")
+
+
+ def _pdf_to_pil_image(path: str, page: int = 1, target_longest_image_dim: int = 1288) -> Image.Image:
+     """Render a single PDF page to PIL Image via olmocr's helper."""
+     image_base64 = render_pdf_to_base64png(path, page, target_longest_image_dim=target_longest_image_dim)
+     return Image.open(BytesIO(base64.b64decode(image_base64)))
+
+
+ def _image_from_any_file(file_path: str) -> Image.Image:
+     """Accept either PDF or image and always return a PIL Image (first page for PDFs)."""
+     lower = file_path.lower()
+     if lower.endswith(".pdf"):
+         return _pdf_to_pil_image(file_path)
+     return _load_image_from_upload(file_path)
+
+
+ def _write_csv_to_temp_file(csv_text: str) -> str:
+     """Write CSV text to a temporary file and return the path."""
+     import tempfile
+
+     fd, path = tempfile.mkstemp(suffix=".csv", prefix="words2doc_")
+     with os.fdopen(fd, "w", encoding="utf-8") as f:
+         f.write(csv_text)
+     return path
+
+
+ # -------- Backends -------- #
+ # Function to encode the image
+ def _encode_image(image_path):
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode("utf-8")
+
+ def _run_openai_vision(image: Image.Image, prompt: str) -> str:
+     if OpenAI is None:
+         raise RuntimeError("openai package is not installed. Please install it to use ChatGPT 5.2 backend.")
+
+     api_key = os.getenv("OPENAI_API_KEY")
+     if not api_key:
+         raise RuntimeError("OPENAI_API_KEY environment variable is not set.")
+
+     client = OpenAI(api_key=api_key)
+
+     buffered = BytesIO()
+     image.save(buffered, format="JPEG")
+     img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
+
+     model_name = os.getenv("WORDS2DOC_OPENAI_MODEL", "gpt-5-nano-2025-08-07")
+     _log(f"Using OpenAI model: {model_name}")
+     _log(f"Input image size: {image.size}")
+     response = client.responses.create(
+         model=model_name,
+         input=[
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "input_text", "text": prompt},
+                     {
+                         "type": "input_image",
+                         "image_url": f"data:image/jpeg;base64,{img_b64}",
+                     },
+                 ],
+             }
+         ],
+         max_output_tokens=2048,
+     )
+     _log("OpenAI vision response received")
+     _log_debug(f"Response length: {len(response.output_text)} characters")
+     _log_debug(f"First 200 chars: {response.output_text[:200]}...")
+     return response.output_text
+
+
+ def _run_gemini_vision(image: Image.Image, prompt: str) -> str:
+     if genai is None:
+         raise RuntimeError("google-generativeai package is not installed. Please install it to use Gemini backend.")
+
+     api_key = os.getenv("GEMINI_API_KEY")
+     if not api_key:
+         raise RuntimeError("GEMINI_API_KEY environment variable is not set.")
+
+     genai.configure(api_key=api_key)
+     model_name = os.getenv("WORDS2DOC_GEMINI_MODEL", "gemini-1.5-flash")
+     model = genai.GenerativeModel(model_name)
+
+     # Gemini expects a PIL Image directly
+     response = model.generate_content([prompt, image])
+     return response.text or ""
+
+
+ _olmocr_model: Optional[Qwen2_5_VLForConditionalGeneration] = None
+ _olmocr_processor: Optional[AutoProcessor] = None
+
+
+ def _ensure_olmocr_loaded() -> Tuple[Qwen2_5_VLForConditionalGeneration, AutoProcessor]:
+     global _olmocr_model, _olmocr_processor
+
+     if _olmocr_model is None or _olmocr_processor is None:
+         _olmocr_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+             "allenai/olmOCR-2-7B-1025-FP8", device_map="auto"
+         ).eval()
+         _olmocr_processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+
+     return _olmocr_model, _olmocr_processor
+
+
+ def _run_olmocr(image: Image.Image, prompt: str) -> str:
+     model, processor = _ensure_olmocr_loaded()
+
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model.to(device)
+
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "text", "text": prompt},
+                 {"type": "image", "image": image},
+             ],
+         }
+     ]
+
+     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+     inputs = processor(
+         text=[text],
+         images=[image],
+         padding=True,
+         return_tensors="pt",
+     )
+     inputs = {key: value.to(device) for (key, value) in inputs.items()}
+
+     output = model.generate(
+         **inputs,
+         temperature=0.1,
+         max_new_tokens=1024,
+         num_return_sequences=1,
+         do_sample=True,
+     )
+
+     prompt_length = inputs["input_ids"].shape[1]
+     new_tokens = output[:, prompt_length:]
+
+     text_output = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
+     return text_output[0] if text_output else ""
+
+
+ # -------- Main processing function -------- #
+
+
+ def process_document(file_obj, model_choice: str, prompt: str):
+     if file_obj is None:
+         return "No file uploaded.", None
+
+     file_path = getattr(file_obj, "name", None) or file_obj
+     image = _image_from_any_file(file_path)
+
+     if not prompt.strip():
+         prompt = (
+             "You are an OCR-to-CSV assistant. Read the table or structured text in the image and output a valid "
+             "CSV representation. Use commas as separators and include a header row if appropriate."
+         )
+
+     if model_choice == MODEL_CHATGPT:
+         csv_text = _run_openai_vision(image, prompt)
+     elif model_choice == MODEL_GEMINI:
+         csv_text = _run_gemini_vision(image, prompt)
+     elif model_choice == MODEL_OLMOCR:
+         csv_text = _run_olmocr(image, prompt)
+     else:
+         csv_text = f"Unknown model choice: {model_choice}"
+
+     csv_file_path = _write_csv_to_temp_file(csv_text)
+     return csv_text, csv_file_path
+
+ def _log(message: str):
+     print(f"[WORDS2CSV] {message}")
+
+ def _log_debug(message: str):
+     if os.getenv("WORDS2CSV_DEBUG"):
+         print(f"[WORDS2CSV-DEBUG] {message}")
+
+ # -------- Gradio UI -------- #
+
+
+ def build_interface() -> gr.Blocks:
+     with gr.Blocks(title=APP_TITLE) as demo:
+         gr.Markdown(f"# {APP_TITLE}")
+         gr.Markdown(APP_DESCRIPTION)
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 file_input = gr.File(
+                     label="Upload PDF or image",
+                     file_types=[".pdf", ".png", ".jpg", ".jpeg", ".webp"],
+                 )
+
+                 model_selector = gr.Dropdown(
+                     label="LLM backend",
+                     choices=[MODEL_CHATGPT, MODEL_GEMINI, MODEL_OLMOCR],
+                     value=MODEL_CHATGPT,
+                 )
+
+                 prompt_editor = gr.Textbox(
+                     label="Prompt editor",
+                     value=(
+                         "You are an OCR and vocabulary extractor.\n"
+                         "You are given a photo of a vocabulary book page with words in original language and their translations.\n"
+
+                         "Your task:\n"
+                         "- Read the text on the page.\n"
+                         "- First, detect the language of the words.\n"
+                         "- Identify all words and their corresponding translations.\n"
+                         "- Do NOT include dates, page numbers, headings, or example sentences.\n"
+                         "- Do NOT repeat the same word twice.\n"
+                         "- If there are duplicates, keep only one row.\n"
+                         "\n"
+                         "Output format (VERY IMPORTANT):\n"
+                         "- Output ONLY CSV rows.\n"
+                         "- NO explanations, NO extra text, NO quotes.\n"
+                         "- Each line must be: <word>,<translation>\n"
+                         "- Use a comma as separator.\n"
+                         "- No header row.\n"
+                         "- Example:\n"
+                         "word1,translation1\n"
+                         "word2,translation2\n"
+                         "word3,translation3\n"
+                         "\n"
+                         "Now output ONLY the CSV rows for the attached image."
+                     ),
+                     lines=6,
+                     placeholder=(
+                         "Describe how the CSV should be structured. If left empty, a default OCR-to-CSV prompt is used."
+                     ),
+                 )
+
+                 run_button = gr.Button("Run", variant="primary")
+
+             with gr.Column(scale=1):
+                 csv_output = gr.Textbox(
+                     label="CSV output (preview)",
+                     lines=20,
+                     buttons=["copy"],
+                 )
+                 csv_file = gr.File(label="Download CSV file", interactive=False)
+
+         run_button.click(
+             fn=process_document,
+             inputs=[file_input, model_selector, prompt_editor],
+             outputs=[csv_output, csv_file],
+         )
+
+     return demo
+
+
+ demo = build_interface()
+
+
+ if __name__ == "__main__":
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=int(os.getenv("PORT", "7860")),
+         share=True,
+     )
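Because process_document accepts a plain file path and falls back to the built-in OCR-to-CSV prompt when the prompt is empty, the pipeline can also be exercised without the UI. A minimal sketch; page.jpg is a hypothetical input file:

    from app import process_document, MODEL_OLMOCR

    # An empty prompt selects the default OCR-to-CSV prompt inside process_document.
    csv_text, csv_path = process_document("page.jpg", MODEL_OLMOCR, "")
    print(csv_text)   # CSV preview string
    print(csv_path)   # temporary .csv written by _write_csv_to_temp_file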
docker-compose.yml ADDED
@@ -0,0 +1,14 @@
+ services:
+   app:
+     container_name: words2csv
+     build: .
+     ports:
+       - "7860:7860"
+     volumes:
+       - .:/app
+     env_file:
+       - .env
+     environment:
+       - PYTHONUNBUFFERED=1
+       - GRADIO_SERVER_NAME=0.0.0.0
+     command: gradio app.py
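With the image built and a populated .env beside the compose file, the stack starts the usual way; since . is bind-mounted into /app and the entrypoint is the gradio CLI, host-side code edits should be picked up by Gradio's reload mode:

    docker compose up --build
    # UI served on http://localhost:7860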
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ gradio>=6.1.0
+ openai>=1.40.0
+ google-generativeai>=0.7.0
+ olmocr>=0.1.0
+ torch>=2.2.0
+ transformers>=4.42.0
+ pillow>=10.3.0
+ python-dotenv>=1.0.0
+
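For a bare-metal run without Docker (assuming Python 3.11 to match the image, and, for the olmOCR backend, a GPU with enough memory for the 7B model):

    pip install -r requirements.txt
    python app.py    # runs the __main__ block; PORT overrides the default 7860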