Update app.py
app.py CHANGED
@@ -8,6 +8,15 @@ model_name = 'abinayam/gpt-2-tamil'
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(model_name)
 
+# System prompt
+system_prompt = """You are an expert Tamil language model specializing in spelling and grammar correction. Your task is to:
+1. Correct any spelling errors in the given text.
+2. Fix grammatical mistakes, including proper application of sandhi rules.
+3. Ensure the corrected text maintains the original meaning and context.
+4. Provide the corrected version of the entire input text.
+
+Remember to preserve the structure and intent of the original text while making necessary corrections."""
+
 # Common error corrections
 common_errors = {
     'பழங்கல்': 'பழங்கள்',
@@ -35,15 +44,29 @@ def correct_text(input_text):
     # Preprocess the input text
     preprocessed_text = preprocess_text(input_text)
 
-    #
-
+    # Prepare the full prompt with system prompt and input text
+    full_prompt = f"{system_prompt}\n\nInput: {preprocessed_text}\n\nCorrected:"
+
+    # Tokenize the full prompt
+    input_ids = tokenizer.encode(full_prompt, return_tensors='pt')
 
     # Generate corrected text
     with torch.no_grad():
-        output = model.generate(
+        output = model.generate(
+            input_ids,
+            max_length=len(input_ids[0]) + 100,  # Adjust based on expected output length
+            num_return_sequences=1,
+            temperature=0.7,
+            do_sample=True,
+            top_k=50,
+            top_p=0.95
+        )
 
     # Decode the generated text
-
+    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
+
+    # Extract the corrected text (everything after "Corrected:")
+    corrected_text = generated_text.split("Corrected:")[-1].strip()
 
     # Postprocess the corrected text
     final_text = postprocess_text(corrected_text)
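For quick local testing, the generation path added in this commit can be exercised outside the Space roughly as follows. This is a minimal sketch, not the Space's actual entry point: it assumes only torch and transformers are installed, it omits the preprocess_text/postprocess_text helpers and the common_errors table shown in the diff context, and the wrapper name correct and the sample Tamil sentence are placeholders.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = 'abinayam/gpt-2-tamil'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# System prompt as added in this commit
system_prompt = """You are an expert Tamil language model specializing in spelling and grammar correction. Your task is to:
1. Correct any spelling errors in the given text.
2. Fix grammatical mistakes, including proper application of sandhi rules.
3. Ensure the corrected text maintains the original meaning and context.
4. Provide the corrected version of the entire input text.

Remember to preserve the structure and intent of the original text while making necessary corrections."""

def correct(text):
    # Build the prompt the same way app.py does: system prompt, input, "Corrected:" cue
    full_prompt = f"{system_prompt}\n\nInput: {text}\n\nCorrected:"
    input_ids = tokenizer.encode(full_prompt, return_tensors='pt')
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=len(input_ids[0]) + 100,  # total length: prompt + ~100 new tokens
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            top_k=50,
            top_p=0.95,
        )
    generated = tokenizer.decode(output[0], skip_special_tokens=True)
    # Everything after the final "Corrected:" cue is the model's proposed fix
    return generated.split("Corrected:")[-1].strip()

print(correct('நான் புத்தகம் படிக்கிறேன்'))  # placeholder input sentence

Note that max_length bounds the total sequence (prompt plus continuation), so the len(input_ids[0]) + 100 pattern leaves roughly 100 tokens for the correction regardless of input size; since a standard GPT-2 context is 1024 tokens, a long input combined with this system prompt can approach that limit.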