{ "_description": "Configuration for Dynamic-K Mixture-of-Experts Model", "_model_type": "DynamicMOELM", "model": { "vocab_size": 10000, "num_layers": 4, "context_length": 256, "d_model": 512, "d_ff": 2048, "num_heads": 8, "theta": 10000.0 }, "moe": { "num_experts": 4, "confidence_threshold": 0.8 }, "loss_weights": { "balance_loss_weight": 0.01, "entropy_loss_weight": 0.001 }, "optimizer": { "learning_rate": 3e-4, "beta1": 0.9, "beta2": 0.95, "eps": 1e-8, "weight_decay": 0.1, "max_grad_norm": 1.0 }, "scheduler": { "warmup_steps": 2000, "max_steps": 20000 }, "training": { "batch_size": 4, "grad_accum_steps": 1, "eval_interval": 500, "log_interval": 100, "save_interval": 2000, "eval_steps": 10 }, "paths": { "train_data_path": "data/train.txt", "val_data_path": "data/test.txt", "checkpoint_dir": "checkpoints_dynamic_moe", "resume_from": null }, "system": { "device": "cuda" }, "logging": { "use_wandb": true, "wandb_project": "dynamic-moe-phase2", "wandb_run_name": null } }