Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
| import torch | |
| import re | |
# Load the pretrained Tamil GPT-2 checkpoint and its tokenizer once at
# startup so every request reuses the same in-memory model.
model_name = 'abinayam/gpt-2-tamil'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Instruction text prepended to every request; it steers the language
# model toward spelling/grammar correction rather than free generation.
system_prompt = """You are an expert Tamil language model specializing in spelling and grammar correction. Your task is to:
1. Correct any spelling errors in the given text.
2. Fix grammatical mistakes, including proper application of sandhi rules.
3. Ensure the corrected text maintains the original meaning and context.
4. Provide the corrected version of the entire input text.
Remember to preserve the structure and intent of the original text while making necessary corrections."""

# Deterministic misspelling -> correction lookup, applied to the input
# before the model ever sees it.
common_errors = {
    'பழங்கல்': 'பழங்கள்',
    # Add more common spelling errors here
}
def apply_sandhi_rules(text):
    """Apply Tamil sandhi doubling after the dative suffix கு/க்கு.

    When a word ending in கு/க்கு is followed by a word starting with a
    hard consonant (ப, த, க, ச), the matching consonant plus pulli is
    appended to the first word, e.g.
    'பள்ளிக்கு போ' -> 'பள்ளிக்குப் போ' and 'வீட்டுக்கு தா' -> 'வீட்டுக்குத் தா'.

    Bug fix: the original replacement was r'\\1ப் \\2', which inserted ப்
    regardless of the following consonant, yielding wrong forms such as
    'க்குப் த'. The doubled consonant must mirror the consonant that
    follows, so the replacement now uses the second capture group.

    Args:
        text: Tamil text to normalize.
    Returns:
        Text with the sandhi rule applied; unchanged if no match.
    """
    # \2் duplicates the following consonant with a pulli (virama) onto
    # the suffix; \s+ collapses to a single separating space.
    text = re.sub(r'(கு|க்கு)\s+(ப|த|க|ச)', r'\1\2் \2', text)
    # Add more sandhi rules as needed
    return text
def preprocess_text(text):
    """Replace every known misspelling from ``common_errors`` in *text*."""
    for wrong, right in common_errors.items():
        text = text.replace(wrong, right)
    return text
def postprocess_text(text):
    """Run rule-based sandhi cleanup on the model's output text."""
    return apply_sandhi_rules(text)
def correct_text(input_text):
    """Correct Tamil spelling and grammar in *input_text*.

    Pipeline: dictionary-based pre-correction, GPT-2 generation guided by
    the system prompt, then rule-based sandhi post-processing.
    """
    cleaned = preprocess_text(input_text)

    # Assemble the full prompt: instructions, the user's text, and the
    # "Corrected:" marker the model is expected to continue from.
    prompt = f"{system_prompt}\n\nInput: {cleaned}\n\nCorrected:"
    prompt_ids = tokenizer.encode(prompt, return_tensors='pt')

    # Sample a single continuation; inference only, so no gradients.
    with torch.no_grad():
        generated = model.generate(
            prompt_ids,
            max_length=len(prompt_ids[0]) + 100,  # Adjust based on expected output length
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            top_k=50,
            top_p=0.95,
        )

    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    # Everything after the last "Corrected:" marker is the model's answer.
    answer = decoded.split("Corrected:")[-1].strip()
    return postprocess_text(answer)
# Build the Gradio UI: a multi-line Tamil text box in, corrected text out,
# with a few example sentences demonstrating typical errors.
iface = gr.Interface(
    fn=correct_text,
    inputs=gr.Textbox(lines=5, placeholder="Enter Tamil text here..."),
    outputs=gr.Textbox(label="Corrected Text"),
    title="Tamil Spell Corrector and Grammar Checker",
    description="This app uses the 'abinayam/gpt-2-tamil' model along with custom rules to correct spelling and grammar in Tamil text.",
    examples=[
        ["நான் நேற்று கடைக்கு போனேன். அங்கே நிறைய பழங்கல் வாங்கினேன்."],
        ["நான் பள்ளிகு செல்கிறேன்."],
        ["அவன் வீட்டுகு வந்தான்."],
    ],
)

# Start the web server (blocks until shut down).
iface.launch()