yagnik12 committed
Commit 13a1290 · verified · 1 Parent(s): 6c5caaa

Update ai_text_detector_valid_final.py

Files changed (1)
  1. ai_text_detector_valid_final.py +56 -52
ai_text_detector_valid_final.py CHANGED
@@ -28,21 +28,39 @@ model_3 = AutoModelForSequenceClassification.from_pretrained("answerdotai/Modern
  model_3.load_state_dict(torch.hub.load_state_dict_from_url(model3_path, map_location=device))
  model_3.to(device).eval()

- label_mapping = { ... } # keep as is
+ label_mapping = {
+     0: '13B', 1: '30B', 2: '65B', 3: '7B', 4: 'GLM130B', 5: 'bloom_7b',
+     6: 'bloomz', 7: 'cohere', 8: 'davinci', 9: 'dolly', 10: 'dolly-v2-12b',
+     11: 'flan_t5_base', 12: 'flan_t5_large', 13: 'flan_t5_small',
+     14: 'flan_t5_xl', 15: 'flan_t5_xxl', 16: 'gemma-7b-it', 17: 'gemma2-9b-it',
+     18: 'gpt-3.5-turbo', 19: 'gpt-35', 20: 'gpt4', 21: 'gpt4o',
+     22: 'gpt_j', 23: 'gpt_neox', 24: 'human', 25: 'llama3-70b', 26: 'llama3-8b',
+     27: 'mixtral-8x7b', 28: 'opt_1.3b', 29: 'opt_125m', 30: 'opt_13b',
+     31: 'opt_2.7b', 32: 'opt_30b', 33: 'opt_350m', 34: 'opt_6.7b',
+     35: 'opt_iml_30b', 36: 'opt_iml_max_1.3b', 37: 't0_11b', 38: 't0_3b',
+     39: 'text-davinci-002', 40: 'text-davinci-003'
+ }
+
+ # def clean_text(text: str) -> str:
+ #     text = re.sub(r"\s{2,}", " ", text)
+ #     text = re.sub(r"\s+([,.;:?!])", r"\1", text)
+ #     return text

- # ---------------------------
- # Text Cleaning
- # ---------------------------
  def clean_text(text: str) -> str:
+     # Normalize non-breaking spaces to normal space
      text = text.replace("\xa0", " ").replace("\u200b", "")
+
+     # Collapse multiple spaces
      text = re.sub(r"\s{2,}", " ", text)
+
+     # Remove space before punctuation
      text = re.sub(r"\s+([,.;:?!])", r"\1", text)
+
+     # Trim leading/trailing spaces
      return text.strip()

- # ---------------------------
- # SzegedAI Detector
- # ---------------------------
  def classify_szegedai(text: str):
+     """ModernBERT ensemble detector (replaces SzegedAI Space call)."""
      cleaned_text = clean_text(text)
      if not cleaned_text.strip():
          return {"error": "Empty text"}
@@ -60,7 +78,7 @@ def classify_szegedai(text: str):

      probs = probs[0]
      ai_probs = probs.clone()
-     ai_probs[24] = 0 # "human"
+     ai_probs[24] = 0 # "human" label index
      ai_total_prob = ai_probs.sum().item() * 100
      human_prob = 100 - ai_total_prob

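
Index 24 in label_mapping above is 'human', which is what the clarified comment points at. A self-contained sketch of how zeroing that index turns the 41-way softmax into AI vs. human percentages; the random tensor is only a stand-in for model_3's output, which sits above this hunk and is unchanged by the commit:

import torch

# Dummy stand-in for the softmaxed [1, 41] output of model_3.
probs = torch.softmax(torch.randn(1, 41), dim=-1)

probs = probs[0]
ai_probs = probs.clone()
ai_probs[24] = 0                             # drop the "human" class (label_mapping[24])
ai_total_prob = ai_probs.sum().item() * 100  # probability mass left on the 40 AI labels
human_prob = 100 - ai_total_prob             # equals probs[24] * 100, since the row sums to 1

assert abs(human_prob - probs[24].item() * 100) < 1e-3
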
 
@@ -74,9 +92,10 @@ def classify_szegedai(text: str):
      }

  # ---------------------------
- # HuggingFace Detectors
+ # Your Other Detectors
  # ---------------------------
  MODELS = {
+     # "DeBERTa Detector": "distilbert-base-uncased-finetuned-sst-2-english",
      "MonkeyDAnh": "MonkeyDAnh/my-awesome-ai-detector-roberta-base-v4-human-vs-machine-finetune",
      # "Andreas122001": "andreas122001/roberta-academic-detector",
  }
@@ -94,9 +113,10 @@ def run_hf_model(model_id, text):
          return {"error": str(e)}

  # ---------------------------
- # Verdict Logic
+ # Main Detector
  # ---------------------------
  def verdict(ai_prob):
+     """Return a human-readable verdict based on AI probability"""
      if ai_prob < 20:
          return "Most likely human-written."
      elif 20 <= ai_prob < 40:
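
run_hf_model appears in the hunk header and is what MODELS feeds, but its body is untouched by this commit and not visible here. Purely as a hedged sketch (not the repo's actual implementation), a helper of roughly this shape would produce the "AI Probability" / "Human Probability" keys the new detect_text expects, assuming a transformers text-classification pipeline and label names that can be mapped to machine vs. human:

from transformers import pipeline

def run_hf_model(model_id, text):
    # Hypothetical sketch only; the repo's real helper is not shown in this diff.
    try:
        clf = pipeline("text-classification", model=model_id)
        pred = clf(text, truncation=True)[0]  # e.g. {"label": "...", "score": 0.97}
        # Assumption: labels containing these tags mark AI text; adjust to the
        # model's actual id2label mapping.
        is_ai = any(tag in pred["label"].lower() for tag in ("machine", "generated", "fake"))
        ai_prob = pred["score"] if is_ai else 1 - pred["score"]
        return {
            "AI Probability": round(ai_prob * 100, 2),
            "Human Probability": round((1 - ai_prob) * 100, 2),
        }
    except Exception as e:
        return {"error": str(e)}
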
@@ -105,61 +125,45 @@ def verdict(ai_prob):
          return "Unclear – could be either human or AI-assisted."
      elif 60 <= ai_prob < 80:
          return "Possibly AI-generated, or a human using AI assistance."
-     else:
+     else: # ai_prob >= 80
          return "Likely AI-generated or heavily AI-assisted."

- # ---------------------------
- # Weighted Final Score
- # ---------------------------
- def compute_final_score(results: dict) -> dict:
-     weighted_ai_probs = []
-     weighted_human_probs = []
-     weights = []
+ def detect_text(text):
+     results = {}
+     # HuggingFace transformer models
+     for name, model_id in MODELS.items():
+         results[name] = run_hf_model(model_id, text)

-     for model, scores in results.items():
-         if model == "Final Score" or "AI Probability" not in scores:
-             continue
+     # SzegedAI ModernBERT ensemble
+     results["SzegedAI Detector"] = classify_szegedai(text)

-         ai_prob = scores.get("AI Probability", 0.0)
-         human_prob = scores.get("Human Probability", 0.0)
+     # ---------------------------
+     # Final Verdict (Hybrid Rule)
+     # ---------------------------
+     ai_probs = []
+     strong_ai_detector = None

-         weight = 0.5 if model == "SzegedAI Detector" else 1.0
+     for k, v in results.items():
+         if "AI Probability" in v:
+             ai_probs.append(v["AI Probability"])
+             if v["AI Probability"] > 90: # strong AI flag
+                 strong_ai_detector = v

-         weighted_ai_probs.append(ai_prob * weight)
-         weighted_human_probs.append(human_prob * weight)
-         weights.append(weight)
+     avg_ai = np.mean(ai_probs) if ai_probs else 0

-     if not weights:
-         avg_ai_prob = 0
-         avg_human_prob = 100
+     if strong_ai_detector:
+         final_verdict = verdict(strong_ai_detector["AI Probability"])
+         if "Identified LLM" in strong_ai_detector:
+             final_verdict += f" (Identified: {strong_ai_detector['Identified LLM']})"
      else:
-         avg_ai_prob = sum(weighted_ai_probs) / sum(weights)
-         avg_human_prob = sum(weighted_human_probs) / sum(weights)
-
-     verdict_text = verdict(avg_ai_prob)
+         final_verdict = verdict(avg_ai)

      results["Final Score"] = {
-         "Human Probability": round(avg_human_prob, 2),
-         "AI Probability": round(avg_ai_prob, 2),
-         "Verdict": verdict_text
+         "Verdict": final_verdict,
+         # "Average AI Probability": round(avg_ai, 2)
      }
      return results

- # ---------------------------
- # Main Detector
- # ---------------------------
- def detect_text(text):
-     results = {}
-     for name, model_id in MODELS.items():
-         results[name] = run_hf_model(model_id, text)
-
-     results["SzegedAI Detector"] = classify_szegedai(text)
-
-     # compute weighted final score
-     results = compute_final_score(results)
-
-     return results
-
  if __name__ == "__main__":
      sample = "This is a test sentence written by AI or human."
      print(detect_text(sample))
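
The Final Score block now uses a hybrid rule instead of the removed weighted average: any single detector reporting more than 90% AI probability overrides the mean (np is assumed to be numpy imported earlier in the file, since the hunk adds no import). A standalone sketch of that rule on made-up detector outputs, not real model results:

import numpy as np

def hybrid_verdict(results, threshold=90):
    # Mirrors the logic added to detect_text above: collect per-detector
    # AI probabilities and let one very confident detector override the mean.
    ai_probs = [v["AI Probability"] for v in results.values() if "AI Probability" in v]
    strong = next((v for v in results.values()
                   if v.get("AI Probability", 0) > threshold), None)
    avg_ai = np.mean(ai_probs) if ai_probs else 0
    return (strong["AI Probability"] if strong else avg_ai), bool(strong)

# Mock detector outputs (illustrative numbers only):
mock = {
    "MonkeyDAnh": {"AI Probability": 95.0, "Human Probability": 5.0},
    "SzegedAI Detector": {"AI Probability": 40.0, "Human Probability": 60.0},
}
score, overridden = hybrid_verdict(mock)
print(score, overridden)  # 95.0 True; the strong detector wins over the 67.5 mean
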
 