import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import re
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# ---------------------------
# ModernBERT Models (SzegedAI)
# ---------------------------
# Ensemble weights: one local checkpoint plus two hosted on the Hugging Face Hub
model1_path = "modernbert.bin"
model2_path = "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12"
model3_path = "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed22"
def load_modernbert(state_dict):
    """Build a ModernBERT classifier head and load one set of fine-tuned weights."""
    model = AutoModelForSequenceClassification.from_pretrained("answerdotai/ModernBERT-base", num_labels=41)
    model.load_state_dict(state_dict)
    return model.to(device).eval()

tokenizer_modernbert = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
model_1 = load_modernbert(torch.load(model1_path, map_location=device))
model_2 = load_modernbert(torch.hub.load_state_dict_from_url(model2_path, map_location=device))
model_3 = load_modernbert(torch.hub.load_state_dict_from_url(model3_path, map_location=device))
label_mapping = {
0: '13B', 1: '30B', 2: '65B', 3: '7B', 4: 'GLM130B', 5: 'bloom_7b',
6: 'bloomz', 7: 'cohere', 8: 'davinci', 9: 'dolly', 10: 'dolly-v2-12b',
11: 'flan_t5_base', 12: 'flan_t5_large', 13: 'flan_t5_small',
14: 'flan_t5_xl', 15: 'flan_t5_xxl', 16: 'gemma-7b-it', 17: 'gemma2-9b-it',
18: 'gpt-3.5-turbo', 19: 'gpt-35', 20: 'gpt4', 21: 'gpt4o',
22: 'gpt_j', 23: 'gpt_neox', 24: 'human', 25: 'llama3-70b', 26: 'llama3-8b',
27: 'mixtral-8x7b', 28: 'opt_1.3b', 29: 'opt_125m', 30: 'opt_13b',
31: 'opt_2.7b', 32: 'opt_30b', 33: 'opt_350m', 34: 'opt_6.7b',
35: 'opt_iml_30b', 36: 'opt_iml_max_1.3b', 37: 't0_11b', 38: 't0_3b',
39: 'text-davinci-002', 40: 'text-davinci-003'
}
def clean_text(text: str) -> str:
"""Normalize text for ModernBERT"""
text = text.replace("\xa0", " ").replace("\u200b", "")
text = re.sub(r"\s{2,}", " ", text)
text = re.sub(r"\s+([,.;:?!])", r"\1", text)
return text.strip()
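# Illustration of clean_text (hypothetical input, not from the app):
#   clean_text("Hello\xa0,  world !") -> "Hello, world!"
# The NBSP becomes a space, whitespace runs collapse to one space, and the
# space before punctuation is dropped.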
def classify_szegedai(text: str):
"""
ModernBERT ensemble detector with:
- Human label boost
- Short text handling (<30 words ignored)
"""
cleaned_text = clean_text(text)
if not cleaned_text.strip():
return {"error": "Empty text"}
word_count = len(cleaned_text.split())
    if word_count < 30:
        # Too few words for a reliable prediction; ask for a longer sample
        return {"error": "Please enter at least 30 words."}
inputs = tokenizer_modernbert(cleaned_text, return_tensors="pt", truncation=True, padding=True).to(device)
with torch.no_grad():
logits_1 = model_1(**inputs).logits
logits_2 = model_2(**inputs).logits
logits_3 = model_3(**inputs).logits
probs1 = torch.softmax(logits_1, dim=1)
probs2 = torch.softmax(logits_2, dim=1)
probs3 = torch.softmax(logits_3, dim=1)
    human_index = 24  # index of the 'human' class in label_mapping
    for p in (probs1, probs2, probs3):
        p[:, human_index] *= 2.0         # Boost the human label
        p /= p.sum(dim=1, keepdim=True)  # Re-normalize in place so each row sums to 1
    probs = (probs1 + probs2 + probs3) / 3
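    # Worked example of the boost: if a model assigns p_human = 0.30, doubling
    # gives 0.60 against 0.70 of unboosted AI mass, so after re-normalizing the
    # human share is 0.60 / 1.30 ~ 0.46; in general the boost maps p to 2p / (1 + p).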
human_prob = probs[0][human_index].item() * 100
ai_prob = 100 - human_prob
return {"Human Probability": round(human_prob, 2), "AI Probability": round(ai_prob, 2)}
# ---------------------------
# HuggingFace other models
# ---------------------------
MODELS = {
"MonkeyDAnh": "MonkeyDAnh/my-awesome-ai-detector-roberta-base-v4-human-vs-machine-finetune",
}
def run_hf_model(model_id, text):
    """Run one off-the-shelf HF detector; assumes label 0 = human and label 1 = AI."""
try:
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id)
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
with torch.no_grad():
logits = model(**inputs).logits
probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
return {"Human Probability": float(probs[0]*100), "AI Probability": float(probs[1]*100)}
except Exception as e:
return {"error": str(e)}
# ---------------------------
# Verdict logic
# ---------------------------
def verdict(ai_prob):
if ai_prob < 20:
return "Most likely human-written."
elif 20 <= ai_prob < 40:
return "Possibly human-written with minimal AI assistance."
elif 40 <= ai_prob < 60:
return "Unclear – could be either human or AI-assisted."
elif 60 <= ai_prob < 80:
return "Possibly AI-generated, or a human using AI assistance."
else:
return "Likely AI-generated or heavily AI-assisted."
def detect_text(text):
results = {}
# Run other HuggingFace detectors
for name, model_id in MODELS.items():
results[name] = run_hf_model(model_id, text)
# Run ModernBERT ensemble
results["SzegedAI Detector"] = classify_szegedai(text)
# Compute average AI probability
ai_probs = []
strong_ai_detector = None
for v in results.values():
if "AI Probability" in v:
ai_probs.append(v["AI Probability"])
if v["AI Probability"] > 90: # strong AI flag
strong_ai_detector = v
avg_ai = np.mean(ai_probs) if ai_probs else 0
if strong_ai_detector:
final_verdict = verdict(strong_ai_detector["AI Probability"])
else:
final_verdict = verdict(avg_ai)
results["Final Score"] = {"Verdict": final_verdict}
return results
# ---------------------------
# Test Example
# ---------------------------
if __name__ == "__main__":
sample = "This is a test sentence written by a human."
    print(detect_text(sample))
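    # Note: the sample is under 30 words, so the SzegedAI detector returns its
    # short-text error and the verdict comes from the remaining detectors, e.g.:
    # {"MonkeyDAnh": {"Human Probability": ..., "AI Probability": ...},
    #  "SzegedAI Detector": {"error": "Please enter at least 30 words."},
    #  "Final Score": {"Verdict": ...}}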