Update app.py
app.py CHANGED
@@ -8,6 +8,15 @@ model_name = 'abinayam/gpt-2-tamil'
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(model_name)
 
+# System prompt
+system_prompt = """You are an expert Tamil language model specializing in spelling and grammar correction. Your task is to:
+1. Correct any spelling errors in the given text.
+2. Fix grammatical mistakes, including proper application of sandhi rules.
+3. Ensure the corrected text maintains the original meaning and context.
+4. Provide the corrected version of the entire input text.
+
+Remember to preserve the structure and intent of the original text while making necessary corrections."""
+
 # Common error corrections
 common_errors = {
     'பழங்கல்': 'பழங்கள்',
@@ -35,15 +44,29 @@ def correct_text(input_text):
     # Preprocess the input text
     preprocessed_text = preprocess_text(input_text)
 
-    #
-
+    # Prepare the full prompt with system prompt and input text
+    full_prompt = f"{system_prompt}\n\nInput: {preprocessed_text}\n\nCorrected:"
+
+    # Tokenize the full prompt
+    input_ids = tokenizer.encode(full_prompt, return_tensors='pt')
 
     # Generate corrected text
     with torch.no_grad():
-        output = model.generate(
+        output = model.generate(
+            input_ids,
+            max_length=len(input_ids[0]) + 100,  # Adjust based on expected output length
+            num_return_sequences=1,
+            temperature=0.7,
+            do_sample=True,
+            top_k=50,
+            top_p=0.95
+        )
 
     # Decode the generated text
-
+    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
+
+    # Extract the corrected text (everything after "Corrected:")
+    corrected_text = generated_text.split("Corrected:")[-1].strip()
 
     # Postprocess the corrected text
     final_text = postprocess_text(corrected_text)
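For quick local testing, the generation path added in this commit can be exercised outside the Space roughly as follows. This is a minimal sketch, not the Space's actual entry point: it assumes only torch and transformers are installed, it omits the preprocess_text/postprocess_text helpers and the common_errors table shown in the diff context, and the wrapper name correct and the sample Tamil sentence are placeholders.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = 'abinayam/gpt-2-tamil'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# System prompt as added in this commit
system_prompt = """You are an expert Tamil language model specializing in spelling and grammar correction. Your task is to:
1. Correct any spelling errors in the given text.
2. Fix grammatical mistakes, including proper application of sandhi rules.
3. Ensure the corrected text maintains the original meaning and context.
4. Provide the corrected version of the entire input text.

Remember to preserve the structure and intent of the original text while making necessary corrections."""

def correct(text):
    # Build the prompt the same way app.py does: system prompt, input, "Corrected:" cue
    full_prompt = f"{system_prompt}\n\nInput: {text}\n\nCorrected:"
    input_ids = tokenizer.encode(full_prompt, return_tensors='pt')
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=len(input_ids[0]) + 100,  # total length: prompt + ~100 new tokens
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            top_k=50,
            top_p=0.95,
        )
    generated = tokenizer.decode(output[0], skip_special_tokens=True)
    # Everything after the final "Corrected:" cue is the model's proposed fix
    return generated.split("Corrected:")[-1].strip()

print(correct('நான் புத்தகம் படிக்கிறேன்'))  # placeholder input sentence

Note that max_length bounds the total sequence (prompt plus continuation), so the len(input_ids[0]) + 100 pattern leaves roughly 100 tokens for the correction regardless of input size; since a standard GPT-2 context is 1024 tokens, a long input combined with this system prompt can approach that limit.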