Spaces:

koyu008
/

Toxic_Comment_Classifier

Sleeping

App Files Files Community

koyu008 commited on Jun 27

Commit

2b470ab

verified ·

1 Parent(s): 8b0dba6

Update app.py

Browse files

Files changed (1) hide show

app.py +92 -103

app.py CHANGED Viewed

@@ -1,103 +1,92 @@
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
-from langdetect import detect
-import torch
-import torch.nn as nn
-from transformers import (
-    DistilBertTokenizer, DistilBertModel,
-    AutoTokenizer, AutoModel
-)
-# ==== Model Classes ====
-class ToxicBERT(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
-        self.dropout = nn.Dropout(0.3)
-        self.classifier = nn.Linear(self.bert.config.hidden_size, 6)
-    def forward(self, input_ids, attention_mask):
-        output = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0]
-        return self.classifier(self.dropout(output))
-class HinglishToxicClassifier(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.bert = AutoModel.from_pretrained("xlm-roberta-base")
-        hidden_size = self.bert.config.hidden_size
-        self.pool = lambda hidden: torch.cat([
-            hidden.mean(dim=1),
-            hidden.max(dim=1).values
-        ], dim=1)
-        self.bottleneck = nn.Sequential(
-            nn.Linear(2 * hidden_size, hidden_size),
-            nn.ReLU(),
-            nn.Dropout(0.2)
-        )
-        self.classifier = nn.Linear(hidden_size, 2)
-    def forward(self, input_ids, attention_mask):
-        hidden = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
-        pooled = self.pool(hidden)
-        x = self.bottleneck(pooled)
-        return self.classifier(x)
-# ==== Load Tokenizers ====
-english_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
-hinglish_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
-# ==== Load Models from Hugging Face Hub ====
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-english_model = ToxicBERT()
-eng_url = "https://huggingface.co/koyu008/English_Toxic_Classifier/resolve/main/bert_toxic_classifier.pt"
-english_model.load_state_dict(torch.hub.load_state_dict_from_url(eng_url, map_location=device))
-english_model.eval().to(device)
-hinglish_model = HinglishToxicClassifier()
-hin_url = "https://huggingface.co/koyu008/HInglish_comment_classifier/resolve/main/best_hinglish_model.pt"
-hinglish_model.load_state_dict(torch.hub.load_state_dict_from_url(hin_url, map_location=device))
-hinglish_model.eval().to(device)
-# ==== FastAPI setup ====
-app = FastAPI()
-class InputText(BaseModel):
-    text: str
-@app.post("/predict")
-def predict(input: InputText):
-    text = input.text.strip()
-    if not text:
-        raise HTTPException(status_code=400, detail="Input text cannot be empty")
-    # Language detection
-    try:
-        lang = detect(text)
-    except:
-        lang = "und"
-    if lang == "en":
-        model = english_model
-        tokenizer = english_tokenizer
-        labels = ["toxic", "severe toxic", "obscene", "threat", "insult", "identity hate"]
-    else:
-        model = hinglish_model
-        tokenizer = hinglish_tokenizer
-        labels = ["not toxic", "toxic"]
-    # Tokenization
-    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
-    with torch.no_grad():
-        outputs = model(**inputs)
-        probs = torch.softmax(outputs, dim=1).squeeze().tolist()
-    response = {
-        "language": "english" if lang == "en" else "hinglish",
-        "prediction": {label: float(round(prob, 4)) for label, prob in zip(labels, probs)}
-    }
-    return response

+from fastapi import FastAPI, Request
+from pydantic import BaseModel
+from langdetect import detect
+import torch
+import torch.nn as nn
+from transformers import DistilBertModel, AutoModel, AutoTokenizer, DistilBertTokenizer
+from huggingface_hub import snapshot_download
+import os
+app = FastAPI()
+# Use local cache folder for downloaded models
+os.environ["TRANSFORMERS_CACHE"] = "/app/.hf_cache"
+os.makedirs("/app/.hf_cache", exist_ok=True)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# -------------------------------
+# Model Classes
+# -------------------------------
+class ToxicBERT(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.bert = DistilBertModel.from_pretrained(snapshot_download("koyu008/English_Toxic_Classifier"))
+        self.dropout = nn.Dropout(0.3)
+        self.classifier = nn.Linear(self.bert.config.hidden_size, 6)
+    def forward(self, input_ids, attention_mask):
+        output = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0]
+        return self.classifier(self.dropout(output))
+class HinglishToxicClassifier(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.bert = AutoModel.from_pretrained(snapshot_download("koyu008/Hinglish_comment_classifier"))
+        hidden_size = self.bert.config.hidden_size
+        self.pool = lambda hidden: torch.cat([
+            hidden.mean(dim=1),
+            hidden.max(dim=1).values
+        ], dim=1)
+        self.bottleneck = nn.Sequential(
+            nn.Linear(2 * hidden_size, hidden_size),
+            nn.ReLU(),
+            nn.Dropout(0.2)
+        )
+        self.classifier = nn.Linear(hidden_size, 2)
+    def forward(self, input_ids, attention_mask):
+        hidden = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
+        pooled = self.pool(hidden)
+        x = self.bottleneck(pooled)
+        return self.classifier(x)
+# -------------------------------
+# Load Models and Tokenizers
+# -------------------------------
+english_model = ToxicBERT().to(device)
+english_model.load_state_dict(torch.load("bert_toxic_classifier.pt", map_location=device))
+english_model.eval()
+english_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
+hinglish_model = HinglishToxicClassifier().to(device)
+hinglish_model.load_state_dict(torch.load("best_hinglish_model.pt", map_location=device))
+hinglish_model.eval()
+hinglish_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
+# -------------------------------
+# Request & Inference
+# -------------------------------
+class InputText(BaseModel):
+    text: str
+@app.post("/predict")
+async def predict(input: InputText):
+    text = input.text
+    lang = detect(text)
+    if lang == "en":
+        inputs = english_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
+        with torch.no_grad():
+            logits = english_model(**inputs)
+        probs = torch.softmax(logits, dim=1).cpu().numpy().tolist()[0]
+        return {"language": "english", "classes": ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"], "probabilities": probs}
+    else:
+        inputs = hinglish_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
+        with torch.no_grad():
+            logits = hinglish_model(**inputs)
+        probs = torch.softmax(logits, dim=1).cpu().numpy().tolist()[0]
+        return {"language": "hinglish", "classes": ["toxic", "non-toxic"], "probabilities": probs}