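"""Gradio app: upload a PDF or image with (handwritten) text and convert it to CSV
using different LLM vision backends (OpenAI, Gemini, or olmOCR)."""
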
import os
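# Load environment variables (e.g. backend API keys) from a local .env file
# before the LLM backend modules are imported.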
from dotenv import load_dotenv
load_dotenv()
import base64
from io import BytesIO
import gradio as gr
from PIL import Image
from olmocr.data.renderpdf import render_pdf_to_base64png
from openai_backend import _run_openai_vision
from common import MODELS_MAP, MODEL_GEMINI, MODEL_OLMOCR
from gemini_backend import _run_gemini_vision
from logging_helper import log as _log, log_debug as _log_debug, get_latest_model_log as _get_latest_model_log
from olm_ocr import _run_olmocr
APP_TITLE = "words2doc"
APP_DESCRIPTION = "Upload a PDF or image with (handwritten) text and convert it to CSV using different LLM backends."
# -------- Utility helpers -------- #
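# Preprocessing pipeline: uploads are normalized to a single PIL image (first page for PDFs),
# converted to grayscale, and downscaled to 1024 px wide before being sent to the selected backend.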
def _load_image_from_upload(path: str) -> Image.Image:
    """Load an image from a path (for image uploads)."""
    return Image.open(path).convert("RGB")

def _pdf_to_pil_image(path: str, page: int = 1, target_longest_image_dim: int = 1288) -> Image.Image:
    """Render a single PDF page to a PIL Image via olmocr's helper."""
    image_base64 = render_pdf_to_base64png(path, page, target_longest_image_dim=target_longest_image_dim)
    return Image.open(BytesIO(base64.b64decode(image_base64)))

def _image_from_any_file(file_path: str) -> Image.Image:
    """Accept either PDF or image and always return a PIL Image (first page for PDFs)."""
    lower = file_path.lower()
    if lower.endswith(".pdf"):
        return _pdf_to_pil_image(file_path)
    return _load_image_from_upload(file_path)
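
# Illustrative usage (hypothetical file names): _image_from_any_file("scan.pdf") renders page 1 of the PDF,
# while _image_from_any_file("photo.jpg") returns the loaded image directly.
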
def _convert_to_grayscale(image: Image.Image) -> Image.Image:
    """Convert an image to grayscale."""
    return image.convert("L")

def _downscale_image(image: Image.Image, target_width: int = 1024) -> Image.Image:
    """Downscale image to target width, preserving aspect ratio."""
    if image.width > target_width:
        ratio = target_width / float(image.width)
        new_height = int(float(image.height) * ratio)
        return image.resize((target_width, new_height), Image.Resampling.LANCZOS)
    return image

def _write_csv_to_temp_file(csv_text: str) -> str:
    """Write CSV text to a temporary file and return the path."""
    import tempfile

    fd, path = tempfile.mkstemp(suffix=".csv", prefix="words2doc_")
    with os.fdopen(fd, "w", encoding="utf-8") as f:
        f.write(csv_text)
    return path
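
# The returned path is handed to a gr.File output, which Gradio exposes as a downloadable CSV.
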
# -------- Backends -------- #
def _encode_image(image_path: str) -> str:
    """Base64-encode an image file from disk (note: not referenced elsewhere in this module)."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

# -------- Main processing function -------- #
def process_document(file_obj, model_choice: str, prompt: str):
    """Run the selected backend on the uploaded file; return (csv_text, csv_file_path, latest_log)."""
    if file_obj is None:
        return "No file uploaded.", None, ""
    file_path = getattr(file_obj, "name", None) or file_obj
    image = _image_from_any_file(file_path)
    _log("Converting image to grayscale")
    image = _convert_to_grayscale(image)
    _log("Downscaling image to 1024 width")
    image = _downscale_image(image, 1024)
    if not prompt.strip():
        prompt = (
            "You are an OCR-to-CSV assistant. Read the table or structured text in the image and output a valid "
            "CSV representation. Use commas as separators and include a header row if appropriate."
        )
    _log_debug(f"Using model: {model_choice}")
    # Dispatch to the backend configured for the selected model in MODELS_MAP;
    # unknown model names fall through to the error message instead of raising KeyError.
    backend = MODELS_MAP.get(model_choice, {}).get("backend")
    if backend == "openai":
        csv_text = _run_openai_vision(image, prompt, model_choice)
    elif backend == "gemini":
        csv_text = _run_gemini_vision(image, prompt, model_choice)
    elif backend == "olmocr":
        csv_text = _run_olmocr(image, prompt)
    else:
        csv_text = f"Unknown model choice: {model_choice}"
    csv_file_path = _write_csv_to_temp_file(csv_text)
    latest_log = _get_latest_model_log() or ""
    return csv_text, csv_file_path, latest_log

# -------- Gradio UI -------- #
def build_interface() -> gr.Blocks:
    """Assemble the Gradio Blocks UI and wire the Run button to process_document."""
    with gr.Blocks(title=APP_TITLE) as demo:
        gr.Markdown(f"# {APP_TITLE}")
        gr.Markdown(APP_DESCRIPTION)
        with gr.Row():
            # Left column: inputs (file upload, example, model selection, prompt).
            with gr.Column(scale=1):
                file_input = gr.File(
                    label="Upload PDF or image",
                    file_types=[".pdf", ".png", ".jpg", ".jpeg", ".webp"],
                )
                image_example_preview = gr.Image(
                    label="Example image preview",
                    value="static/vocab.jpg",  # adjust to real relative path
                    interactive=False,
                )
                gr.Examples(
                    examples=[["vocab.jpg", "vocab.jpg"]],
                    inputs=[file_input, image_example_preview],
                    label="Example image",
                )
                model_selector = gr.Dropdown(
                    label="LLM backend",
                    choices=list(MODELS_MAP.keys()),
                    value=MODEL_GEMINI,
                )
                prompt_editor = gr.Textbox(
                    label="Prompt editor",
                    value=(
                        "You are an OCR and vocabulary extractor.\n"
                        "You are given a photo of a vocabulary book page with words in the original language and their translations.\n"
                        "Your task:\n"
                        "- Read the text on the page.\n"
                        "- First, detect the language of the words.\n"
                        "- Identify all words and their corresponding translations.\n"
                        "- Do NOT include dates, page numbers, headings, or example sentences.\n"
                        "- Do NOT repeat the same word twice.\n"
                        "- If there are duplicates, keep only one row.\n"
                        "\n"
                        "Output format (VERY IMPORTANT):\n"
                        "- Output ONLY CSV rows.\n"
                        "- NO explanations, NO extra text, NO quotes.\n"
                        "- Each line must be: <word>,<translation>\n"
                        "- Use a comma as separator.\n"
                        "- No header row.\n"
                        "- Example:\n"
                        "word1,translation1\n"
                        "word2,translation2\n"
                        "word3,translation3\n"
                        "\n"
                        "Now output ONLY the CSV rows for the attached image."
                    ),
                    lines=6,
                    placeholder=(
                        "Describe how the CSV should be structured. If left empty, a default OCR-to-CSV prompt is used."
                    ),
                )
                run_button = gr.Button("Run", variant="primary")
            # Right column: outputs (CSV preview and downloadable file).
            with gr.Column(scale=1):
                csv_output = gr.Textbox(
                    label="CSV output (preview)",
                    lines=20,
                    buttons=["copy"],
                )
                csv_file = gr.File(label="Download CSV file", interactive=False)
        with gr.Row():
            logs_output = gr.Textbox(
                label="Logs",
                lines=4,
            )
        run_button.click(
            fn=process_document,
            inputs=[file_input, model_selector, prompt_editor],
            outputs=[csv_output, csv_file, logs_output],
        )
    return demo

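
# Build the Blocks app at import time so platforms that import this module
# (e.g. the Hugging Face Spaces Gradio runtime) can pick up `demo` directly.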
demo = build_interface()
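
# Local entry point: binds to all interfaces, honors the PORT environment variable
# (default 7860), and opens a public Gradio share link.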
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.getenv("PORT", "7860")),
        share=True,
    )