{
  "root_path": "/home/zli",
  "available_corpus": {
    "cc3m": {
      "anno_path": "your_path",
      "data_root": "",
      "media_type": "image"
    },
    "webvid_10m": {
      "anno_path": "your_path",
      "data_root": "",
      "media_type": "video"
    },
    "smol_test": {
      "anno_path": "/root/IV2/InternVideo2/multi_modality/data_test/smol_test.json",
      "data_root": "/root/IV2/InternVideo2/multi_modality/data_test/",
      "media_type": "video"
    },
    "slim_kinetics": {
      "anno_path": "/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json",
      "data_root": "/home/zli/kinetics-dataset/k600/train/train",
      "media_type": "video"
    },
    "slim_kinetics_act_val": {
      "anno_path": "/home/zli/kinetics-dataset/k600/test/kinetics-test.json",
      "data_root": "/home/zli/kinetics-dataset/k600/test/",
      "media_type": "video",
      "is_act_rec": true
    }
  },
  "VisionEncoders": {},
  "TextEncoders": {
    "bert": {
      "name": "bert_base",
      "pretrained": "bert-base-uncased",
      "config": "configs/config_bert.json",
      "d_model": 768,
      "fusion_layer": 9
    },
    "bert_large": {
      "name": "bert_large",
      "pretrained": "bert-large-uncased",
      "config": "configs/config_bert_large.json",
      "d_model": 1024,
      "fusion_layer": 19
    },
    "med_bert": {
      "name": "med_bert_base",
      "pretrained": "bert-base-uncased",
      "config": "configs/med_config.json",
      "d_model": 768
    },
    "med_bert_large": {
      "name": "med_bert_large",
      "pretrained": "bert-base-uncased",
      "config": "configs/med_large_config.json",
      "d_model": 768
    }
  },
  "train_corpus": "slim_kinetics",
  "train_file": {
    "anno_path": "/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json",
    "data_root": "/home/zli/kinetics-dataset/k600/train/train",
    "media_type": "video"
  },
  "test_file": {
    "act_val": {
      "anno_path": "/home/zli/kinetics-dataset/k600/test/kinetics-test.json",
      "data_root": "/home/zli/kinetics-dataset/k600/test/",
      "media_type": "video",
      "is_act_rec": true
    }
  },
  "test_types": ["act_val"],
  "num_workers": 2,
  "stop_key": null,
  "num_frames": 8,
  "num_frames_test": 8,
  "batch_size": 16,
  "batch_size_test": 16,
  "max_txt_l": 32,
  "size_t": 224,
  "inputs": {
    "image_res": 224,
    "video_input": {
      "num_frames": 8,
      "sample_type": "all",
      "num_frames_test": 8,
      "sample_type_test": "all",
      "random_aug": false
    },
    "max_txt_l": {
      "image": 32,
      "video": 32
    },
    "batch_size": {
      "image": 16,
      "video": 16
    },
    "batch_size_test": {
      "image": 16,
      "video": 16
    }
  },
  "model": {
    "model_cls": "InternVideo2_CLIP_small",
    "vision_encoder": {
      "name": "internvideo2",
      "in_chans": 3,
      "patch_size": 14,
      "img_size": 224,
      "qkv_bias": false,
      "drop_path_rate": 0.0,
      "head_drop_path_rate": 0.0,
      "embed_dim": 768,
      "num_heads": 12,
      "mlp_ratio": 4,
      "init_values": 0.1,
      "qk_normalization": true,
      "depth": 12,
      "use_flash_attn": true,
      "use_fused_rmsnorm": true,
      "use_fused_mlp": true,
      "fused_mlp_heuristic": 1,
      "drop_cls_token": false,
      "attn_pool_num_heads": 16,
      "clip_embed_dim": 768,
      "layerscale_no_force_fp32": true,
      "num_frames": 8,
      "tubelet_size": 1,
      "sep_pos_embed": false,
      "use_checkpoint": false,
      "checkpoint_num": 0,
      "align_dim": 512
    },
    "streaming_vision_encoder": {
      "in_chans": 3,
      "patch_size": 14,
      "img_size": 224,
      "vit_qkv_bias": true,
      "vit_drop_path_rate": 0.05,
      "student_embed_dim": 384,
      "student_depth": 4,
      "student_num_heads": 6,
      "vit_mlp_ratio": 3.0,
      "vit_init_values": null,
      "vit_qk_normalization": false,
      "vit_sep_pos_embed": true,
      "vit_norm_layer_type": "rmsnorm",
      "rnn_type": "lstm",
      "rnn_hidden_size": 1024,
      "rnn_num_layers": 1,
      "fc_hidden_layers": [],
      "teacher_clip_embed_dim": 768,
      "student_num_frames_processed_by_vit": 1,
      "student_tubelet_size_for_vit": 1
    },
    "text_encoder": {
      "name": "mobileclip_b"
    },
    "temp": 0.01,
    "temp_min": 0.01,
    "freeze_vision": true,
    "open_vision_clip_projector": false,
    "freeze_text": true,
    "open_text_projection": false,
    "open_text_lora": false,
    "vision_ckpt_path": "/home/zli/IV2/models/stage1/B14/B14_dist_1B_stage2/pytorch_model.bin",
    "load_vision_ckpt_from_internvideo2_stage2": false,
    "text_ckpt_path": "/home/zli/IV2/models/mobileclip_blt.pt",
    "extra_ckpt_path": "/home/zli/IV2/models/clip/B14/pytorch_model.bin"
  },
  "criterion": {
    "loss_weight": {
      "vtc": 1.0
    }
  },
  "optimizer": {
    "opt": "adamW",
    "lr": 1e-05,
    "opt_betas": [0.9, 0.98],
    "weight_decay": 0.01,
    "max_grad_norm": 0.7,
    "different_lr": {
      "enable": false,
      "module_names": [],
      "lr": 1e-05
    }
  },
  "scheduler": {
    "sched": "cosine",
    "epochs": 1,
    "min_lr_multi": 0.01,
    "warmup_epochs": 0.05
  },
  "evaluate": false,
  "deep_fusion": false,
  "evaluation": {
    "eval_frame_ensemble": "concat",
    "eval_x_only": false,
    "k_test": 128,
    "eval_offload": true
  },
  "use_half_precision": true,
  "use_bf16": true,
  "gradient_checkpointing": true,
  "wandb": {
    "enable": true,
    "entity": "qingy2019-conker-mobile-inc-",
    "project": "window_iv2"
  },
  "dist_url": "env://",
  "device": "cuda",
  "mode": "pt",
  "output_dir": "scripts/pretraining/clip/B14/B14",
  "resume": true,
  "debug": false,
  "log_freq": 1,
  "seed": 42,
  "save_latest": false,
  "save_iter": 5000,
  "eval_freq_steps": 1000,
  "eval_video_repo_id": "qingy2024/backflip_train",
  "eval_video_filename": "1.mp4",
  "eval_plot_output_dir": "scripts/pretraining/clip/B14/cosine_sim_graphs",
  "auto_resume": true,
  "pretrained_path": "",
  "deepspeed": {
    "enable": true,
    "stage": 1
  },
  "rank": 0,
  "world_size": 1,
  "gpu": 0,
  "distributed": true,
  "dist_backend": "nccl",
  "deepspeed_config": "scripts/pretraining/clip/B14/B14/deepspeed_config.json"
}