neovalle committed
Commit 66771c3 · verified · Parent: e517122

Create app.py

Files changed (1)
  1. app.py +214 -0
app.py ADDED
@@ -0,0 +1,214 @@
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from threading import Thread
from datetime import datetime
import pandas as pd

# ---------- Config ----------

# Small, free chat models that run on CPU in a basic Space (pick one if you like)
DEFAULT_MODELS = [
    "google/gemma-2-2b-it",
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "Qwen/Qwen2.5-1.5B-Instruct",
]

# Cache for loaded models to avoid reloading on each call
_MODEL_CACHE = {}

def _load_model(model_id: str):
    """Load tokenizer and model (cached)."""
    if model_id in _MODEL_CACHE:
        return _MODEL_CACHE[model_id]

    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    # Batched generation with decoder-only models needs a pad token and left padding
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    tok.padding_side = "left"

    # Use bfloat16 on GPU; fall back to float32 on CPU
    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=dtype,
        low_cpu_mem_usage=True,
        device_map="auto",  # requires the accelerate package
    )

    _MODEL_CACHE[model_id] = (tok, model)
    return tok, model

def _format_prompt(tokenizer, system_prompt: str, user_prompt: str) -> str:
    """
    Use the model's chat template if available; otherwise
    create a simple system+user concatenation.
    """
    sys = system_prompt.strip() if system_prompt else ""
    usr = user_prompt.strip()

    if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template:
        messages = []
        if sys:
            messages.append({"role": "system", "content": sys})
        messages.append({"role": "user", "content": usr})
        try:
            return tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
            )
        except Exception:
            # Some chat templates (e.g. Gemma's) reject a system role;
            # fold the system prompt into the user turn instead.
            merged = f"{sys}\n\n{usr}" if sys else usr
            return tokenizer.apply_chat_template(
                [{"role": "user", "content": merged}],
                tokenize=False,
                add_generation_prompt=True,
            )
    # Fallback: a lightweight instruction format
    prompt = ""
    if sys:
        prompt += f"<<SYS>>\n{sys}\n<</SYS>>\n\n"
    prompt += f"<<USER>>\n{usr}\n<</USER>>\n<<ASSISTANT>>\n"
    return prompt

def generate_batch(
    model_id: str,
    system_prompt: str,
    prompts_multiline: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    repetition_penalty: float,
):
    """Generate for multiple user prompts (one per line)."""
    tok, model = _load_model(model_id)
    device = model.device

    # Split lines, drop empties
    prompts = [p.strip() for p in prompts_multiline.splitlines() if p.strip()]
    if not prompts:
        return pd.DataFrame([{"user_prompt": "", "response": "", "tokens_out": 0}])

    # Prepare inputs
    formatted = [_format_prompt(tok, system_prompt, p) for p in prompts]
    inputs = tok(
        formatted,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=(temperature > 0.0),
            temperature=temperature if temperature > 0 else None,
            top_p=top_p,
            top_k=top_k if top_k > 0 else None,
            repetition_penalty=repetition_penalty,
            eos_token_id=tok.eos_token_id,
            pad_token_id=tok.eos_token_id,
        )

    # Strip the prompt text to keep only the generated continuation
    gen_texts = []
    for i in range(outputs.size(0)):
        # Safest across tokenizers: decode the full sequence and the prompt,
        # then remove the prompt prefix from the decoded text.
        full = tok.decode(outputs[i], skip_special_tokens=True)
        prompt_only = tok.decode(inputs["input_ids"][i], skip_special_tokens=True)
        resp = full[len(prompt_only):].strip()
        gen_texts.append(resp)

    df = pd.DataFrame(
        {
            "user_prompt": prompts,
            "response": gen_texts,
            "tokens_out": [len(tok.encode(t)) for t in gen_texts],
        }
    )
    return df

def to_csv(df: pd.DataFrame):
    ts = datetime.utcnow().strftime("%Y%m%d-%H%M%S")
    path = f"/tmp/batch_{ts}.csv"
    df.to_csv(path, index=False)
    return path

# ---------- UI ----------

with gr.Blocks(title="Multi-Prompt Chat (System Prompt Control)") as demo:
    gr.Markdown(
        """
# 🧪 Multi-Prompt Chat for HF Space
Pick a small free model, set a **system prompt**, and enter **multiple user prompts** (one per line).
Click **Generate** to get batched responses as a table (downloadable as CSV).
"""
    )

    with gr.Row():
        with gr.Column(scale=1):
            model_id = gr.Dropdown(
                choices=DEFAULT_MODELS,
                value=DEFAULT_MODELS[0],
                label="Model",
                info="Free, small instruction-tuned models that run on CPU in a basic Space.",
            )
            system_prompt = gr.Textbox(
                label="System prompt",
                placeholder="e.g., You are an ecolinguistics-aware assistant that prefers concise, actionable answers.",
                lines=5,
            )
            prompts_multiline = gr.Textbox(
                label="User prompts (one per line)",
                placeholder="Write one query per line.\nExample:\nExplain transformers in simple terms\nGive 3 eco-friendly tips for students\nSummarise the benefits of multilingual models",
                lines=10,
            )

            with gr.Accordion("Generation settings", open=False):
                max_new_tokens = gr.Slider(16, 1024, value=256, step=1, label="max_new_tokens")
                temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="temperature")
                top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p")
                top_k = gr.Slider(0, 200, value=40, step=1, label="top_k (0 to disable)")
                repetition_penalty = gr.Slider(1.0, 2.0, value=1.1, step=0.01, label="repetition_penalty")

            run_btn = gr.Button("Generate", variant="primary")
            csv_btn = gr.Button("Download CSV")

        with gr.Column(scale=1):
            out_df = gr.Dataframe(
                headers=["user_prompt", "response", "tokens_out"],
                datatype=["str", "str", "number"],
                label="Results",
                wrap=True,
                interactive=False,
                row_count=(0, "dynamic"),
            )
            out_file = gr.File(label="CSV file", visible=False)

    def _generate(model_id, system_prompt, prompts_multiline, max_new_tokens, temperature, top_p, top_k, repetition_penalty):
        df = generate_batch(
            model_id=model_id,
            system_prompt=system_prompt,
            prompts_multiline=prompts_multiline,
            max_new_tokens=int(max_new_tokens),
            temperature=float(temperature),
            top_p=float(top_p),
            top_k=int(top_k),
            repetition_penalty=float(repetition_penalty),
        )
        return df

    def _download(df):
        # Gradio may pass the table back as a DataFrame or a list of rows; normalise to a DataFrame
        if isinstance(df, list):
            df = pd.DataFrame(df, columns=["user_prompt", "response", "tokens_out"])
        else:
            df = pd.DataFrame(df)
        path = to_csv(df)
        # gr.update() works on both Gradio 3.x and 4.x (Component.update was removed in 4.x)
        return gr.update(value=path, visible=True)

    run_btn.click(
        _generate,
        inputs=[model_id, system_prompt, prompts_multiline, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
        outputs=out_df,
        api_name="generate_batch",
    )

    csv_btn.click(_download, inputs=out_df, outputs=out_file, api_name="download_csv")

if __name__ == "__main__":
    demo.launch()
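
Because both click handlers register named endpoints (`api_name="generate_batch"` and `api_name="download_csv"`), the batch generation can also be driven programmatically once the Space is running. Below is a minimal sketch using `gradio_client`; the Space id `neovalle/multi-prompt-chat` is a placeholder, and it assumes the Space's `requirements.txt` lists `gradio`, `transformers`, `torch`, `accelerate` (needed for `device_map="auto"`), and `pandas`.

# Sketch: call the Space's named endpoint from Python with gradio_client.
# The Space id below is hypothetical -- replace it with the real one.
from gradio_client import Client

client = Client("neovalle/multi-prompt-chat")  # placeholder Space id

result = client.predict(
    "google/gemma-2-2b-it",          # model_id
    "You are a concise assistant.",  # system_prompt
    "Explain transformers in simple terms\nGive 3 eco-friendly tips for students",  # user prompts, one per line
    256,   # max_new_tokens
    0.7,   # temperature
    0.9,   # top_p
    40,    # top_k
    1.1,   # repetition_penalty
    api_name="/generate_batch",
)
print(result)  # table data: one row per input prompt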