snake11235 committed
Commit c56fd22 · 1 Parent(s): 2308352

feat: initial project setup with multi-LLM OCR to CSV converter


Add Gradio-based application for converting handwritten or printed text from PDFs/images to CSV format using multiple LLM backends (ChatGPT 5.2, Gemini 3 Pro, olmOCR-2-7B-1025-FP8).

- Add core application with image/PDF upload and processing pipeline
- Add support for OpenAI Vision API with configurable model selection
- Add support for Google Gemini API for vision tasks
- Add local olmOCR model integration with Qwen2.5-VL backend
- Add Dockerfile and docker-compose setup for containerized deployment

Files changed (6)
  1. .gitignore +1 -0
  2. .gradio/certificate.pem +31 -0
  3. Dockerfile +28 -0
  4. app.py +305 -0
  5. docker-compose.yml +14 -0
  6. requirements.txt +9 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ .env
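Ignoring .env keeps the API keys out of version control; docker-compose.yml below loads the same file back in at runtime via env_file.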
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+ -----BEGIN CERTIFICATE-----
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+ -----END CERTIFICATE-----
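The committed file is the well-known ISRG Root X1 root certificate (the subject "Internet Security Research Group" / "ISRG Root X1" is readable in the base64 payload); the .gradio/ directory is presumably a local artifact of running Gradio with share=True rather than hand-written project code.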
Dockerfile ADDED
@@ -0,0 +1,28 @@
+ FROM python:3.11-slim
+
+ ENV PYTHONUNBUFFERED=1 \
+     PIP_NO_CACHE_DIR=1
+
+ WORKDIR /app
+
+ # System dependencies (if olmocr / rendering requires them, extend here)
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+
+ COPY requirements.txt ./
+ RUN pip install --upgrade pip && pip install -r requirements.txt
+
+ COPY . .
+
+ EXPOSE 7860
+
+ # Environment variables expected (documented for convenience)
+ # - OPENAI_API_KEY: API key for ChatGPT 5.2 backend
+ # - WORDS2DOC_OPENAI_MODEL: Optional, OpenAI model name (default: gpt-5-nano-2025-08-07)
+ # - GEMINI_API_KEY: API key for Gemini backend
+ # - WORDS2DOC_GEMINI_MODEL: Optional, Gemini model name (default: gemini-1.5-flash)
+
+ ENV GRADIO_SERVER_NAME="0.0.0.0"
+
+ CMD ["gradio", "app.py"]
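The variables documented above are expected in a .env file next to the compose file (ignored by .gitignore, loaded via env_file). A minimal sketch, with placeholder values:

    OPENAI_API_KEY=sk-...
    GEMINI_API_KEY=...
    # Optional model overrides (defaults shown, matching app.py)
    WORDS2DOC_OPENAI_MODEL=gpt-5-nano-2025-08-07
    WORDS2DOC_GEMINI_MODEL=gemini-1.5-flash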
app.py ADDED
@@ -0,0 +1,305 @@
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ import base64
+ from io import BytesIO
+ from typing import Tuple, Optional
+
+ import gradio as gr
+ from PIL import Image
+
+ from olmocr.data.renderpdf import render_pdf_to_base64png
+
+ # Optional imports for cloud LLMs
+ try:
+     from openai import OpenAI
+ except ImportError:  # pragma: no cover
+     OpenAI = None  # type: ignore
+
+ try:
+     import google.generativeai as genai
+ except ImportError:  # pragma: no cover
+     genai = None  # type: ignore
+
+ import torch
+ from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+
+
+ APP_TITLE = "words2doc"
+ APP_DESCRIPTION = "Upload a PDF or image with (handwritten) text and convert it to CSV using different LLM backends."
+
+ MODEL_CHATGPT = "ChatGPT 5.2"
+ MODEL_GEMINI = "Gemini 3 Pro"
+ MODEL_OLMOCR = "olmOCR-2-7B-1025-FP8"
+
+
+ # -------- Utility helpers -------- #
+
+
+ def _load_image_from_upload(path: str) -> Image.Image:
+     """Load an image from a path (for image uploads)."""
+     return Image.open(path).convert("RGB")
+
+
+ def _pdf_to_pil_image(path: str, page: int = 1, target_longest_image_dim: int = 1288) -> Image.Image:
+     """Render a single PDF page to PIL Image via olmocr's helper."""
+     image_base64 = render_pdf_to_base64png(path, page, target_longest_image_dim=target_longest_image_dim)
+     return Image.open(BytesIO(base64.b64decode(image_base64)))
+
+
+ def _image_from_any_file(file_path: str) -> Image.Image:
+     """Accept either PDF or image and always return a PIL Image (first page for PDFs)."""
+     lower = file_path.lower()
+     if lower.endswith(".pdf"):
+         return _pdf_to_pil_image(file_path)
+     return _load_image_from_upload(file_path)
+
+
+ def _write_csv_to_temp_file(csv_text: str) -> str:
+     """Write CSV text to a temporary file and return the path."""
+     import tempfile
+
+     fd, path = tempfile.mkstemp(suffix=".csv", prefix="words2doc_")
+     with os.fdopen(fd, "w", encoding="utf-8") as f:
+         f.write(csv_text)
+     return path
+
+
+ # -------- Backends -------- #
+ # Function to encode the image
+ def _encode_image(image_path):
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode("utf-8")
+
+ def _run_openai_vision(image: Image.Image, prompt: str) -> str:
+     if OpenAI is None:
+         raise RuntimeError("openai package is not installed. Please install it to use ChatGPT 5.2 backend.")
+
+     api_key = os.getenv("OPENAI_API_KEY")
+     if not api_key:
+         raise RuntimeError("OPENAI_API_KEY environment variable is not set.")
+
+     client = OpenAI(api_key=api_key)
+
+     buffered = BytesIO()
+     image.save(buffered, format="JPEG")
+     img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
+
+     model_name = os.getenv("WORDS2DOC_OPENAI_MODEL", "gpt-5-nano-2025-08-07")
+     _log(f"Using OpenAI model: {model_name}")
+     _log(f"Input image size: {image.size}")
+     response = client.responses.create(
+         model=model_name,
+         input=[
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "input_text", "text": prompt},
+                     {
+                         "type": "input_image",
+                         "image_url": f"data:image/jpeg;base64,{img_b64}",
+                     },
+                 ],
+             }
+         ],
+         max_output_tokens=2048,
+     )
+     _log("OpenAI vision response received")
+     _log_debug(f"Response length: {len(response.output_text)} characters")
+     _log_debug(f"First 200 chars: {response.output_text[:200]}...")
+     return response.output_text
+
+
+ def _run_gemini_vision(image: Image.Image, prompt: str) -> str:
+     if genai is None:
+         raise RuntimeError("google-generativeai package is not installed. Please install it to use Gemini backend.")
+
+     api_key = os.getenv("GEMINI_API_KEY")
+     if not api_key:
+         raise RuntimeError("GEMINI_API_KEY environment variable is not set.")
+
+     genai.configure(api_key=api_key)
+     model_name = os.getenv("WORDS2DOC_GEMINI_MODEL", "gemini-1.5-flash")
+     model = genai.GenerativeModel(model_name)
+
+     # Gemini expects a PIL Image directly
+     response = model.generate_content([prompt, image])
+     return response.text or ""
+
+
+ _olmocr_model: Optional[Qwen2_5_VLForConditionalGeneration] = None
+ _olmocr_processor: Optional[AutoProcessor] = None
+
+
+ def _ensure_olmocr_loaded() -> Tuple[Qwen2_5_VLForConditionalGeneration, AutoProcessor]:
+     global _olmocr_model, _olmocr_processor
+
+     if _olmocr_model is None or _olmocr_processor is None:
+         _olmocr_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+             "allenai/olmOCR-2-7B-1025-FP8", device_map="auto"
+         ).eval()
+         _olmocr_processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+
+     return _olmocr_model, _olmocr_processor
+
+
+ def _run_olmocr(image: Image.Image, prompt: str) -> str:
+     model, processor = _ensure_olmocr_loaded()
+
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model.to(device)
+
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "text", "text": prompt},
+                 {"type": "image", "image": image},
+             ],
+         }
+     ]
+
+     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+     inputs = processor(
+         text=[text],
+         images=[image],
+         padding=True,
+         return_tensors="pt",
+     )
+     inputs = {key: value.to(device) for (key, value) in inputs.items()}
+
+     output = model.generate(
+         **inputs,
+         temperature=0.1,
+         max_new_tokens=1024,
+         num_return_sequences=1,
+         do_sample=True,
+     )
+
+     prompt_length = inputs["input_ids"].shape[1]
+     new_tokens = output[:, prompt_length:]
+
+     text_output = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
+     return text_output[0] if text_output else ""
+
+
+ # -------- Main processing function -------- #
+
+
+ def process_document(file_obj, model_choice: str, prompt: str):
+     if file_obj is None:
+         return "No file uploaded.", None
+
+     file_path = getattr(file_obj, "name", None) or file_obj
+     image = _image_from_any_file(file_path)
+
+     if not prompt.strip():
+         prompt = (
+             "You are an OCR-to-CSV assistant. Read the table or structured text in the image and output a valid "
+             "CSV representation. Use commas as separators and include a header row if appropriate."
+         )
+
+     if model_choice == MODEL_CHATGPT:
+         csv_text = _run_openai_vision(image, prompt)
+     elif model_choice == MODEL_GEMINI:
+         csv_text = _run_gemini_vision(image, prompt)
+     elif model_choice == MODEL_OLMOCR:
+         csv_text = _run_olmocr(image, prompt)
+     else:
+         csv_text = f"Unknown model choice: {model_choice}"
+
+     csv_file_path = _write_csv_to_temp_file(csv_text)
+     return csv_text, csv_file_path
+
+ def _log(message: str):
+     print(f"[WORDS2CSV] {message}")
+
+ def _log_debug(message: str):
+     if os.getenv("WORDS2CSV_DEBUG"):
+         print(f"[WORDS2CSV-DEBUG] {message}")
+
+ # -------- Gradio UI -------- #
+
+
+ def build_interface() -> gr.Blocks:
+     with gr.Blocks(title=APP_TITLE) as demo:
+         gr.Markdown(f"# {APP_TITLE}")
+         gr.Markdown(APP_DESCRIPTION)
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 file_input = gr.File(
+                     label="Upload PDF or image",
+                     file_types=[".pdf", ".png", ".jpg", ".jpeg", ".webp"],
+                 )
+
+                 model_selector = gr.Dropdown(
+                     label="LLM backend",
+                     choices=[MODEL_CHATGPT, MODEL_GEMINI, MODEL_OLMOCR],
+                     value=MODEL_CHATGPT,
+                 )
+
+                 prompt_editor = gr.Textbox(
+                     label="Prompt editor",
+                     value=(
+                         "You are an OCR and vocabulary extractor.\n"
+                         "You are given a photo of a vocabulary book page with words in original language and their translations.\n"
+
+                         "Your task:\n"
+                         "- Read the text on the page.\n"
+                         "- First, detect the language of the words.\n"
+                         "- Identify all words and their corresponding translations.\n"
+                         "- Do NOT include dates, page numbers, headings, or example sentences.\n"
+                         "- Do NOT repeat the same word twice.\n"
+                         "- If there are duplicates, keep only one row.\n"
+                         "\n"
+                         "Output format (VERY IMPORTANT):\n"
+                         "- Output ONLY CSV rows.\n"
+                         "- NO explanations, NO extra text, NO quotes.\n"
+                         "- Each line must be: <word>,<translation>\n"
+                         "- Use a comma as separator.\n"
+                         "- No header row.\n"
+                         "- Example:\n"
+                         "word1,translation1\n"
+                         "word2,translation2\n"
+                         "word3,translation3\n"
+                         "\n"
+                         "Now output ONLY the CSV rows for the attached image."
+                     ),
+                     lines=6,
+                     placeholder=(
+                         "Describe how the CSV should be structured. If left empty, a default OCR-to-CSV prompt is used."
+                     ),
+                 )
+
+                 run_button = gr.Button("Run", variant="primary")
+
+             with gr.Column(scale=1):
+                 csv_output = gr.Textbox(
+                     label="CSV output (preview)",
+                     lines=20,
+                     buttons=["copy"],
+                 )
+                 csv_file = gr.File(label="Download CSV file", interactive=False)
+
+         run_button.click(
+             fn=process_document,
+             inputs=[file_input, model_selector, prompt_editor],
+             outputs=[csv_output, csv_file],
+         )
+
+     return demo
+
+
+ demo = build_interface()
+
+
+ if __name__ == "__main__":
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=int(os.getenv("PORT", "7860")),
+         share=True,
+     )
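Because process_document accepts a plain file path and falls back to the built-in OCR-to-CSV prompt when the prompt is empty, the pipeline can also be exercised without the UI. A minimal sketch; page.jpg is a hypothetical input file:

    from app import process_document, MODEL_OLMOCR

    # An empty prompt selects the default OCR-to-CSV prompt inside process_document.
    csv_text, csv_path = process_document("page.jpg", MODEL_OLMOCR, "")
    print(csv_text)   # CSV preview string
    print(csv_path)   # temporary .csv written by _write_csv_to_temp_file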
docker-compose.yml ADDED
@@ -0,0 +1,14 @@
+ services:
+   app:
+     container_name: words2csv
+     build: .
+     ports:
+       - "7860:7860"
+     volumes:
+       - .:/app
+     env_file:
+       - .env
+     environment:
+       - PYTHONUNBUFFERED=1
+       - GRADIO_SERVER_NAME=0.0.0.0
+     command: gradio app.py
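With the image built and a populated .env beside the compose file, the stack starts the usual way; since . is bind-mounted into /app and the entrypoint is the gradio CLI, host-side code edits should be picked up by Gradio's reload mode:

    docker compose up --build
    # UI served on http://localhost:7860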
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ gradio>=6.1.0
+ openai>=1.40.0
+ google-generativeai>=0.7.0
+ olmocr>=0.1.0
+ torch>=2.2.0
+ transformers>=4.42.0
+ pillow>=10.3.0
+ python-dotenv>=1.0.0
+
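For a bare-metal run without Docker (assuming Python 3.11 to match the image, and, for the olmOCR backend, a GPU with enough memory for the 7B model):

    pip install -r requirements.txt
    python app.py    # runs the __main__ block; PORT overrides the default 7860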