qingy2024's picture
Upload folder using huggingface_hub
4d7702e verified
{
"root_path": "/home/zli",
"available_corpus": {
"cc3m": {
"anno_path": "your_path",
"data_root": "",
"media_type": "image"
},
"webvid_10m": {
"anno_path": "your_path",
"data_root": "",
"media_type": "video"
},
"smol_test": {
"anno_path": "/root/IV2/InternVideo2/multi_modality/data_test/smol_test.json",
"data_root": "/root/IV2/InternVideo2/multi_modality/data_test/",
"media_type": "video"
},
"slim_kinetics": {
"anno_path": "/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json",
"data_root": "/home/zli/kinetics-dataset/k600/train/train",
"media_type": "video"
},
"slim_kinetics_act_val": {
"anno_path": "/home/zli/kinetics-dataset/k600/test/kinetics-test.json",
"data_root": "/home/zli/kinetics-dataset/k600/test/",
"media_type": "video",
"is_act_rec": true
}
},
"VisionEncoders": {},
"TextEncoders": {
"bert": {
"name": "bert_base",
"pretrained": "bert-base-uncased",
"config": "configs/config_bert.json",
"d_model": 768,
"fusion_layer": 9
},
"bert_large": {
"name": "bert_large",
"pretrained": "bert-large-uncased",
"config": "configs/config_bert_large.json",
"d_model": 1024,
"fusion_layer": 19
},
"med_bert": {
"name": "med_bert_base",
"pretrained": "bert-base-uncased",
"config": "configs/med_config.json",
"d_model": 768
},
"med_bert_large": {
"name": "med_bert_large",
"pretrained": "bert-base-uncased",
"config": "configs/med_large_config.json",
"d_model": 768
}
},
"train_corpus": "slim_kinetics",
"train_file": {
"anno_path": "/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json",
"data_root": "/home/zli/kinetics-dataset/k600/train/train",
"media_type": "video"
},
"test_file": {
"act_val": {
"anno_path": "/home/zli/kinetics-dataset/k600/test/kinetics-test.json",
"data_root": "/home/zli/kinetics-dataset/k600/test/",
"media_type": "video",
"is_act_rec": true
}
},
"test_types": [
"act_val"
],
"num_workers": 2,
"stop_key": null,
"num_frames": 8,
"num_frames_test": 8,
"batch_size": 16,
"batch_size_test": 16,
"max_txt_l": 32,
"size_t": 224,
"inputs": {
"image_res": 224,
"video_input": {
"num_frames": 8,
"sample_type": "all",
"num_frames_test": 8,
"sample_type_test": "all",
"random_aug": false
},
"max_txt_l": {
"image": 32,
"video": 32
},
"batch_size": {
"image": 16,
"video": 16
},
"batch_size_test": {
"image": 16,
"video": 16
}
},
"model": {
"model_cls": "InternVideo2_CLIP_small",
"vision_encoder": {
"name": "internvideo2",
"in_chans": 3,
"patch_size": 14,
"img_size": 224,
"qkv_bias": false,
"drop_path_rate": 0.0,
"head_drop_path_rate": 0.0,
"embed_dim": 768,
"num_heads": 12,
"mlp_ratio": 4,
"init_values": 0.1,
"qk_normalization": true,
"depth": 12,
"use_flash_attn": true,
"use_fused_rmsnorm": true,
"use_fused_mlp": true,
"fused_mlp_heuristic": 1,
"drop_cls_token": false,
"attn_pool_num_heads": 16,
"clip_embed_dim": 768,
"layerscale_no_force_fp32": true,
"num_frames": 8,
"tubelet_size": 1,
"sep_pos_embed": false,
"use_checkpoint": false,
"checkpoint_num": 0,
"align_dim": 512
},
"streaming_vision_encoder": {
"in_chans": 3,
"patch_size": 14,
"img_size": 224,
"vit_qkv_bias": true,
"vit_drop_path_rate": 0.05,
"student_embed_dim": 384,
"student_depth": 4,
"student_num_heads": 6,
"vit_mlp_ratio": 3.0,
"vit_init_values": null,
"vit_qk_normalization": false,
"vit_sep_pos_embed": true,
"vit_norm_layer_type": "rmsnorm",
"rnn_type": "lstm",
"rnn_hidden_size": 1024,
"rnn_num_layers": 1,
"fc_hidden_layers": [],
"teacher_clip_embed_dim": 768,
"student_num_frames_processed_by_vit": 1,
"student_tubelet_size_for_vit": 1
},
"text_encoder": {
"name": "mobileclip_b"
},
"temp": 0.01,
"temp_min": 0.01,
"freeze_vision": true,
"open_vision_clip_projector": false,
"freeze_text": true,
"open_text_projection": false,
"open_text_lora": false,
"vision_ckpt_path": "/home/zli/IV2/models/stage1/B14/B14_dist_1B_stage2/pytorch_model.bin",
"load_vision_ckpt_from_internvideo2_stage2": false,
"text_ckpt_path": "/home/zli/IV2/models/mobileclip_blt.pt",
"extra_ckpt_path": "/home/zli/IV2/models/clip/B14/pytorch_model.bin"
},
"criterion": {
"loss_weight": {
"vtc": 1.0
}
},
"optimizer": {
"opt": "adamW",
"lr": 1e-05,
"opt_betas": [
0.9,
0.98
],
"weight_decay": 0.01,
"max_grad_norm": 0.7,
"different_lr": {
"enable": false,
"module_names": [],
"lr": 1e-05
}
},
"scheduler": {
"sched": "cosine",
"epochs": 1,
"min_lr_multi": 0.01,
"warmup_epochs": 0.05
},
"evaluate": false,
"deep_fusion": false,
"evaluation": {
"eval_frame_ensemble": "concat",
"eval_x_only": false,
"k_test": 128,
"eval_offload": true
},
"use_half_precision": true,
"use_bf16": true,
"gradient_checkpointing": true,
"wandb": {
"enable": true,
"entity": "qingy2019-conker-mobile-inc-",
"project": "window_iv2"
},
"dist_url": "env://",
"device": "cuda",
"mode": "pt",
"output_dir": "scripts/pretraining/clip/B14/B14",
"resume": true,
"debug": false,
"log_freq": 1,
"seed": 42,
"save_latest": false,
"save_iter": 5000,
"eval_freq_steps": 1000,
"eval_video_repo_id": "qingy2024/backflip_train",
"eval_video_filename": "1.mp4",
"eval_plot_output_dir": "scripts/pretraining/clip/B14/cosine_sim_graphs",
"auto_resume": true,
"pretrained_path": "",
"deepspeed": {
"enable": true,
"stage": 1
},
"rank": 0,
"world_size": 1,
"gpu": 0,
"distributed": true,
"dist_backend": "nccl",
"deepspeed_config": "scripts/pretraining/clip/B14/B14/deepspeed_config.json"
}