Spaces:
Runtime error
Reduced the number of parameters
app.py CHANGED
@@ -13,8 +13,13 @@ dataset = load_dataset("mwitiderrick/swahili")
 # Print dataset columns for verification
 print(f"Dataset columns: {dataset['train'].column_names}")
 
+# Select a subset of the dataset (e.g., first 100,000 rows)
+subset_size = 100000 # Adjust the size as needed
+subset_dataset = dataset["train"].select(range(min(subset_size, len(dataset["train"]))))
+print(f"Using a subset of {len(subset_dataset)} rows for training.")
+
 # Initialize the tokenizer and model
-model_name = "gpt2" # Use
+model_name = "gpt2" # Use GPT-2 for text generation
 tokenizer = GPT2Tokenizer.from_pretrained(model_name)
 model = GPT2LMHeadModel.from_pretrained(model_name)
 
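Note on the subset step: select(range(n)) keeps the first n rows in dataset order, so this trains on the leading 100,000 examples rather than a random sample. If a random sample is preferred, shuffling before selecting is one option; a minimal sketch (the seed is arbitrary and not part of this commit):

from datasets import load_dataset

dataset = load_dataset("mwitiderrick/swahili")

subset_size = 100000
# Shuffle first so the subset is a random sample rather than the leading rows on disk
subset_dataset = (
    dataset["train"]
    .shuffle(seed=42)
    .select(range(min(subset_size, len(dataset["train"]))))
)
print(f"Using a subset of {len(subset_dataset)} rows for training.")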
@@ -24,21 +29,21 @@ tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
 
 # Preprocess the dataset
 def preprocess_function(examples):
+    # Tokenize and format the dataset
     encodings = tokenizer(
-        examples['text'],
+        examples['text'], # Use 'text' column from your dataset
         truncation=True,
-        padding='max_length',
+        padding='max_length', # Ensure consistent length
         max_length=512
     )
-    encodings['labels'] = encodings['input_ids']
+    encodings['labels'] = encodings['input_ids'] # Use input_ids directly as labels
     return encodings
 
 # Tokenize the dataset
 try:
-    tokenized_datasets =
+    tokenized_datasets = subset_dataset.map(
         preprocess_function,
-        batched=True
-        batch_size=1000 # Adjust batch size for efficiency
+        batched=True
     )
 except Exception as e:
     print(f"Error during tokenization: {e}")
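Note on the map call: Dataset.map already defaults to batch_size=1000, so removing the explicit argument should not change behaviour. Separately, because labels are a straight copy of input_ids under max_length padding, the loss is also computed on pad positions; a common variant (not part of this commit, and assuming the usual -100 ignore index of the language-modeling loss) masks them out:

def preprocess_function(examples):
    encodings = tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=512
    )
    # Copy input_ids into labels, but replace pad positions with -100 so they are ignored by the loss
    encodings['labels'] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in ids]
        for ids in encodings['input_ids']
    ]
    return encodings

If the pad token is reused from the EOS token (common with GPT-2), this also masks genuine end-of-text tokens; whether that matters depends on how generation is expected to terminate.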
@@ -46,21 +51,20 @@ except Exception as e:
 # Define training arguments
 training_args = TrainingArguments(
     output_dir='./results',
-    per_device_train_batch_size=
+    per_device_train_batch_size=4,
     num_train_epochs=1,
     logging_dir='./logs',
-    logging_steps=500,
-    evaluation_strategy="steps",
-    save_steps=
-    save_total_limit=2,
-    gradient_accumulation_steps=8, # Accumulate gradients to simulate larger batch size
+    logging_steps=500, # Log every 500 steps
+    evaluation_strategy="steps", # Use evaluation strategy
+    save_steps=10_000, # Save checkpoint every 10,000 steps
+    save_total_limit=2, # Keep only the last 2 checkpoints
 )
 
 # Define Trainer
 trainer = Trainer(
     model=model,
     args=training_args,
-    train_dataset=tokenized_datasets
+    train_dataset=tokenized_datasets,
     tokenizer=tokenizer,
 )
 
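Note on the Trainer block: with evaluation_strategy="steps" the Trainer expects an eval_dataset; if none is supplied elsewhere in app.py, the run will error either when the Trainer is constructed or at the first evaluation step, depending on the transformers version. A sketch of one way to provide one via a small held-out split (the split size and seed are illustrative, not part of this commit):

# Hold out a small evaluation split so that step-based evaluation has data to run on
split = tokenized_datasets.train_test_split(test_size=0.05, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
    tokenizer=tokenizer,
)

trainer.train()  # assuming training is launched further down in app.py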