| { | |
| "root_path": "/home/zli", | |
| "available_corpus": { | |
| "cc3m": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "image" | |
| }, | |
| "webvid_10m": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "video" | |
| }, | |
| "smol_test": { | |
| "anno_path": "/root/IV2/InternVideo2/multi_modality/data_test/smol_test.json", | |
| "data_root": "/root/IV2/InternVideo2/multi_modality/data_test/", | |
| "media_type": "video" | |
| }, | |
| "slim_kinetics": { | |
| "anno_path": "/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json", | |
| "data_root": "/home/zli/kinetics-dataset/k600/train/train", | |
| "media_type": "video" | |
| }, | |
| "slim_kinetics_act_val": { | |
| "anno_path": "/home/zli/kinetics-dataset/k600/test/kinetics-test.json", | |
| "data_root": "/home/zli/kinetics-dataset/k600/test/", | |
| "media_type": "video", | |
| "is_act_rec": true | |
| } | |
| }, | |
| "VisionEncoders": {}, | |
| "TextEncoders": { | |
| "bert": { | |
| "name": "bert_base", | |
| "pretrained": "bert-base-uncased", | |
| "config": "configs/config_bert.json", | |
| "d_model": 768, | |
| "fusion_layer": 9 | |
| }, | |
| "bert_large": { | |
| "name": "bert_large", | |
| "pretrained": "bert-large-uncased", | |
| "config": "configs/config_bert_large.json", | |
| "d_model": 1024, | |
| "fusion_layer": 19 | |
| }, | |
| "med_bert": { | |
| "name": "med_bert_base", | |
| "pretrained": "bert-base-uncased", | |
| "config": "configs/med_config.json", | |
| "d_model": 768 | |
| }, | |
| "med_bert_large": { | |
| "name": "med_bert_large", | |
| "pretrained": "bert-base-uncased", | |
| "config": "configs/med_large_config.json", | |
| "d_model": 768 | |
| } | |
| }, | |
| "train_corpus": "slim_kinetics", | |
| "train_file": { | |
| "anno_path": "/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json", | |
| "data_root": "/home/zli/kinetics-dataset/k600/train/train", | |
| "media_type": "video" | |
| }, | |
| "test_file": { | |
| "act_val": { | |
| "anno_path": "/home/zli/kinetics-dataset/k600/test/kinetics-test.json", | |
| "data_root": "/home/zli/kinetics-dataset/k600/test/", | |
| "media_type": "video", | |
| "is_act_rec": true | |
| } | |
| }, | |
| "test_types": [ | |
| "act_val" | |
| ], | |
| "num_workers": 2, | |
| "stop_key": null, | |
| "num_frames": 8, | |
| "num_frames_test": 8, | |
| "batch_size": 16, | |
| "batch_size_test": 16, | |
| "max_txt_l": 32, | |
| "size_t": 224, | |
| "inputs": { | |
| "image_res": 224, | |
| "video_input": { | |
| "num_frames": 8, | |
| "sample_type": "all", | |
| "num_frames_test": 8, | |
| "sample_type_test": "all", | |
| "random_aug": false | |
| }, | |
| "max_txt_l": { | |
| "image": 32, | |
| "video": 32 | |
| }, | |
| "batch_size": { | |
| "image": 16, | |
| "video": 16 | |
| }, | |
| "batch_size_test": { | |
| "image": 16, | |
| "video": 16 | |
| } | |
| }, | |
| "model": { | |
| "model_cls": "InternVideo2_CLIP_small", | |
| "vision_encoder": { | |
| "name": "internvideo2", | |
| "in_chans": 3, | |
| "patch_size": 14, | |
| "img_size": 224, | |
| "qkv_bias": false, | |
| "drop_path_rate": 0.0, | |
| "head_drop_path_rate": 0.0, | |
| "embed_dim": 768, | |
| "num_heads": 12, | |
| "mlp_ratio": 4, | |
| "init_values": 0.1, | |
| "qk_normalization": true, | |
| "depth": 12, | |
| "use_flash_attn": true, | |
| "use_fused_rmsnorm": true, | |
| "use_fused_mlp": true, | |
| "fused_mlp_heuristic": 1, | |
| "drop_cls_token": false, | |
| "attn_pool_num_heads": 16, | |
| "clip_embed_dim": 768, | |
| "layerscale_no_force_fp32": true, | |
| "num_frames": 8, | |
| "tubelet_size": 1, | |
| "sep_pos_embed": false, | |
| "use_checkpoint": false, | |
| "checkpoint_num": 0, | |
| "align_dim": 512 | |
| }, | |
| "streaming_vision_encoder": { | |
| "in_chans": 3, | |
| "patch_size": 14, | |
| "img_size": 224, | |
| "vit_qkv_bias": true, | |
| "vit_drop_path_rate": 0.05, | |
| "student_embed_dim": 384, | |
| "student_depth": 4, | |
| "student_num_heads": 6, | |
| "vit_mlp_ratio": 3.0, | |
| "vit_init_values": null, | |
| "vit_qk_normalization": false, | |
| "vit_sep_pos_embed": true, | |
| "vit_norm_layer_type": "rmsnorm", | |
| "rnn_type": "lstm", | |
| "rnn_hidden_size": 1024, | |
| "rnn_num_layers": 1, | |
| "fc_hidden_layers": [], | |
| "teacher_clip_embed_dim": 768, | |
| "student_num_frames_processed_by_vit": 1, | |
| "student_tubelet_size_for_vit": 1 | |
| }, | |
| "text_encoder": { | |
| "name": "mobileclip_b" | |
| }, | |
| "temp": 0.01, | |
| "temp_min": 0.01, | |
| "freeze_vision": true, | |
| "open_vision_clip_projector": false, | |
| "freeze_text": true, | |
| "open_text_projection": false, | |
| "open_text_lora": false, | |
| "vision_ckpt_path": "/home/zli/IV2/models/stage1/B14/B14_dist_1B_stage2/pytorch_model.bin", | |
| "load_vision_ckpt_from_internvideo2_stage2": false, | |
| "text_ckpt_path": "/home/zli/IV2/models/mobileclip_blt.pt", | |
| "extra_ckpt_path": "/home/zli/IV2/models/clip/B14/pytorch_model.bin" | |
| }, | |
| "criterion": { | |
| "loss_weight": { | |
| "vtc": 1.0 | |
| } | |
| }, | |
| "optimizer": { | |
| "opt": "adamW", | |
| "lr": 1e-05, | |
| "opt_betas": [ | |
| 0.9, | |
| 0.98 | |
| ], | |
| "weight_decay": 0.01, | |
| "max_grad_norm": 0.7, | |
| "different_lr": { | |
| "enable": false, | |
| "module_names": [], | |
| "lr": 1e-05 | |
| } | |
| }, | |
| "scheduler": { | |
| "sched": "cosine", | |
| "epochs": 1, | |
| "min_lr_multi": 0.01, | |
| "warmup_epochs": 0.05 | |
| }, | |
| "evaluate": false, | |
| "deep_fusion": false, | |
| "evaluation": { | |
| "eval_frame_ensemble": "concat", | |
| "eval_x_only": false, | |
| "k_test": 128, | |
| "eval_offload": true | |
| }, | |
| "use_half_precision": true, | |
| "use_bf16": true, | |
| "gradient_checkpointing": true, | |
| "wandb": { | |
| "enable": true, | |
| "entity": "qingy2019-conker-mobile-inc-", | |
| "project": "window_iv2" | |
| }, | |
| "dist_url": "env://", | |
| "device": "cuda", | |
| "mode": "pt", | |
| "output_dir": "scripts/pretraining/clip/B14/B14", | |
| "resume": true, | |
| "debug": false, | |
| "log_freq": 1, | |
| "seed": 42, | |
| "save_latest": false, | |
| "save_iter": 5000, | |
| "eval_freq_steps": 1000, | |
| "eval_video_repo_id": "qingy2024/backflip_train", | |
| "eval_video_filename": "1.mp4", | |
| "eval_plot_output_dir": "scripts/pretraining/clip/B14/cosine_sim_graphs", | |
| "auto_resume": true, | |
| "pretrained_path": "", | |
| "deepspeed": { | |
| "enable": true, | |
| "stage": 1 | |
| }, | |
| "rank": 0, | |
| "world_size": 1, | |
| "gpu": 0, | |
| "distributed": true, | |
| "dist_backend": "nccl", | |
| "deepspeed_config": "scripts/pretraining/clip/B14/B14/deepspeed_config.json" | |
| } |