{ "_description": "Configuration for Dynamic-K Mixture-of-Experts Model", "_model_type": "DynamicMOELM", "model": { "vocab_size": 10000, "num_layers": 4, "context_length": 256, "d_model": 512, "d_ff": 2048, "num_heads": 8, "theta": 10000.0 }, "moe": { "num_experts": 4, "confidence_threshold": 0.8 }, "loss_weights": { "balance_loss_weight": 0.01, "entropy_loss_weight": 0.001 }, "optimizer": { "learning_rate": 3e-4, "beta1": 0.9, "beta2": 0.95, "eps": 1e-8, "weight_decay": 0.1, "max_grad_norm": 1.0 }, "scheduler": { "warmup_steps": 2000, "max_steps": 20000 }, "training": { "batch_size": 4, "grad_accum_steps": 1, "eval_interval": 500, "log_interval": 100, "save_interval": 2000, "eval_steps": 10 }, "paths": { "train_data_path": "data/train.txt", "val_data_path": "data/test.txt", "checkpoint_dir": "checkpoints_dynamic_moe", "resume_from": null }, "system": { "device": "cuda" }, "logging": { "use_wandb": true, "wandb_project": "dynamic-moe-phase2", "wandb_run_name": null } }