---
# Flat hyperparameter mapping: one key per line, keys in sorted order.
# NOTE(review): the !!python/tuple tag below is PyYAML-specific and is rejected
# by yaml.safe_load; it is kept so the value still loads as a tuple.

# General options (a-c)
align_f0: false
align_loss_weight: 1.0
asc_loss_weight: 0.02
attention_mechanism: graves
augment_sr: false
base_model: null
bit_depth: 9
causal_convs: false
causal_decoder: false
clap_dims: 512
compat_dcnar_f0_std_cond: false
conv_stack_dilation: !!python/tuple [1, 3, 9, 27]
convbn_bias: false
cudnn_deterministic: false

# dcnar_* options
dcnar_1d_discrim: false
dcnar_aligner_kernel: 5
dcnar_aligner_type: null
dcnar_allow_trivial_speaker_table: true
dcnar_batch_size: 24
dcnar_conformer: false
dcnar_conformer_attn_chunk_size: null
dcnar_conformer_attn_dim_head: 64
dcnar_conformer_attn_ff_mult: 4
dcnar_conformer_attn_win_size: null
dcnar_conv_weight_grouping: 1
dcnar_df0_loss_weight: 0.5
dcnar_dim_lrg: 512
dcnar_dim_sml: 256
dcnar_dim_style: 32
dcnar_discrim_tanh: false
dcnar_dtw_loss_weight: 1
dcnar_dur_loss_weight: 0.1
dcnar_dur_pred_scale: linear
dcnar_f0_cond_mel_decoding: false
dcnar_f0_cond_mel_decoding_teacher_forcing: true
dcnar_f0_loss_weight: 0.5
dcnar_gan_dims: 64
dcnar_global_style: true
dcnar_hard_gumbel_tones: false
dcnar_hubert_downsample: 1
dcnar_inpaint_vae: false
dcnar_inpaint_vae_kld_loss_weight: 0
dcnar_inpaint_vae_latent_dim: 32
dcnar_inpaint_vae_warmup_steps: 5000
dcnar_inpaint_vae_weight_step_size: 0.0002
dcnar_local_f0: false
dcnar_local_intensity: false
dcnar_local_style: false
dcnar_lr: 0.0001
dcnar_mel_adv: false
dcnar_mel_loss_weight: 10.0
dcnar_mixed_sr_loss: false
dcnar_n_terminal_tones: 0
dcnar_ph_f0_loss_weight: 1.0
dcnar_ph_hubert_loss_weight: 1.0
dcnar_ph_intensity_loss_weight: 1.0
dcnar_pitch_adv: false
dcnar_prosody_adv: false
dcnar_prosody_stats_cond: false
dcnar_pstat_weight_f0_mean: 10
dcnar_pstat_weight_f0_std: 100
dcnar_pstat_weight_intensity_mean: 10
dcnar_pstat_weight_intensity_std: 0
dcnar_pstat_weight_phdur_mean: 1
dcnar_pstat_weight_phdur_std: 1
dcnar_reverb_label: false
dcnar_sampler: default
dcnar_sr_label: false
dcnar_terminal_tone_usl_weight: 0
dcnar_terminal_tone_weight: 0
dcnar_upsampling: gaussian
dcnar_use_log_f0_frames: false
# dcnar_* options (continued)
dcnar_use_toucan_utt_embs: false
dcnar_usl_mfcc: false
dcnar_usl_mfcc_deltas: false
dcnar_usl_mfcc_dim: 12
dcnar_usl_mfcc_var_dec: false
dcnar_usl_slim: false
dcnar_usl_slim_dim: 16
dcnar_usl_with_f0: false
dcnar_utt_dur_loss_weight: 0
dcnar_vc_local_hubert: false
dcnar_vc_mode: nn
dcnar_vc_text_predict: false
dcnar_vuv_loss_weight: 0.5

# dcvoc_* options
dcvoc_causal: false
dcvoc_causal_lookahead: 3
dcvoc_channel_downsample_mode: interleave
dcvoc_convs_per_scale: 8
dcvoc_disc_duplicates: 1
dcvoc_disc_mpwd: true
dcvoc_disc_mrsd: false
dcvoc_disc_pdd: true
dcvoc_disc_phase_aug: false
dcvoc_discriminator_bound: 1.01
dcvoc_groups_init: 8
dcvoc_halfres_conv: true
dcvoc_hidden_init: 1024
dcvoc_hop: 8
dcvoc_kernel: 7
dcvoc_mel_bneck: 256
dcvoc_smpwd_hidden_max: 1024
dcvoc_smpwd_periods: [2, 3, 5, 7, 9, 11, 13]
dcvoc_upsample_method: linear

# General options (d-e)
denoise: false
dfd_clip_stft: 1.0e-09
dfd_ramdisk_path: /mnt/ramdisk
ema_coeff: 0.99995
emo_embedded_speaker_id: false
emotion_adv: false
enable_eos_bos_chars: true
encoder_type: voice_encoder

# eval_* options
eval_crosslang: false
eval_langs: dataset
eval_max_ref_samples: 192
eval_max_repeats: 1
eval_max_runs: 10
eval_max_sentences: 192
eval_mbnet_name: null
eval_models_dir: saved_models
eval_n_plots: 2
eval_n_wavs: 4
eval_reference: train
eval_syn_batch_size: 64
eval_text_source: default
eval_ve_name: universal/ve_v2
eval_voc_max_frames: 2000
eval_voc_name: null

# General options (f-h)
f0_mode: praat
flatten_lstm_params: true
fmax: 16000
fmin: 0
frames_per_framegroup: 10
freeze_mel_head: false
gmvae_ema_lr: 0.0001
gmvae_latent_dim: 16
gmvae_num_components: 0
gpt_masked_loss: false
gpt_prod_max_text: 200
gpt_speaker_ref_type: same_speaker
gpt_transformer_type: gpt2-medium
hifigan_channels: 256

# hooli_* / hooligan_* options
hooli_enc_dims: 256
hooli_filter_size: 257
hooli_inv_no_uv: false
hooli_inv_pitch_diff_reg_weight: 0
hooli_inv_pitch_shift_reg_weight: 0
hooli_nfft: 16
hooli_osc_freq_cutoff: 0.15
hooli_safe_step: true
hooli_tv_fir: false
hooli_wn_dims: 64
hooligan_discriminators: univnet
hooligan_istft: true
# General options (h-p)
hop_size: 320
input_pos_emb: handled_internally_by_backbone
is_lora: false
language_embed_size: 16
legacy_gpt_hidden_size: 1024
lfcc_nfilts: 128
llama_config_name: Llama_520M
lora_alpha: 64
lora_dropout: 0.05
lora_r: 32
lossynet_bsize: 25
lossynet_clip_stft: 1.0e-09
lossynet_lr: 0.001
lossynet_n_out_classes: 2
lowest_sr: 8000
max_LR: 0.001
max_conditioning_inputs: 2
max_decoder_frames: 2000
max_f0_freq: 600
max_speech_tokens: 604
max_text_tokens: 402
max_total_tokens: 8196
mel_pad_difference: 1
mel_power: 1.0
mel_type: db
min_LR: 1.0e-06
min_f0_freq: 75
mpbert_n_freeze: 0
mpbert_tokenizer: null
mpbert_type: transformer
mu_law: true
n_cqcc_bins: 96
n_cqt_bins: 84
n_fft: 2048
n_gpt_channels: 1024
n_reverbs: 256
n_spk_cond_samples: 2
n_state_per_symbol: 1
n_transformer_heads: 16
n_transformer_layers: 30
normalize_loudness: false
normalized_mels: true
num_ceps: 29
# NOTE(review): key is spelled "num_diacritcs" (sic); left unchanged because
# renaming a key would break any consumer that reads it by this name.
num_diacritcs: 512
num_freq: 1025
num_heads: 4
num_mels: 256
num_style_tokens: 0
num_tones: 16
onehot_language: false
onehot_speaker: false
pf_word_boundaries: false
phonemizer_backend: espeak
preemphasis: 0.97
preemphasize_voc_target: false
prenet_type: original
project_conditioning: false
prosody_embed_size: 0
# A list containing one two-element list.
r_schedule:
  - [1, -1]

# rvc_* options
rvc_emb_channels: 768
rvc_enc_spk_input: false
rvc_f0_up: 0
rvc_f0_voc: true
rvc_filter_channels: 768
rvc_gin_channels: 256
rvc_hidden_channels: 192
rvc_inter_channels: 192
rvc_kernel_size: 3
rvc_mel_bins: 80
rvc_n_heads: 2
rvc_n_layers: 6
rvc_p_dropout: 0
# Quoted on purpose: this value is the string '1', not the integer 1.
rvc_resblock: '1'
rvc_resblock_dilation_sizes:
  - [1, 3, 5]
  - [1, 3, 5]
  - [1, 3, 5]
rvc_resblock_kernel_sizes: [3, 7, 11]
rvc_seg_enc_size_frames: 370
rvc_seg_enc_size_samples: 118400
rvc_seg_voc_size_frames: 40
rvc_seg_voc_size_samples: 12800
rvc_speaker_enc: table
rvc_speaker_enc_type: V1
rvc_speaker_pitch: null
rvc_spec_channels: 513
rvc_spk_embed_dim: 109
rvc_stft_filter_len: 1024
rvc_stft_win_len: 1024
rvc_train_kl_weight: 1.0
rvc_train_mel_weight: 45
rvc_upsample_initial_channel: 512
rvc_upsample_kernel_sizes: [20, 16, 4, 4]
rvc_upsample_rates: [10, 8, 2, 2]
rvc_use_f0: true

# General options (s)
sample_rate: 32000
scheduler_max_total_steps: 200000
seed: 0
self_conditioning: false
separate_stopnet: false
singing_dim: 4
speaker_embed_size: 256
speech_cond_prompt_len: 250
speech_token_type: tortoise
speech_tokens_dict_size: 6563
speed_scale: 0.1
start_speech_token: 6561
start_text_token: 255
stepwise_sigmoid_noise: 2.0
stft_magnitude_min: 0.0001
stop_speech_token: 6562
stop_text_token: 0
stop_threshold: 0.25
style_embed_size: 256
supports_cfg: false
symbol_type: tortoise/data/gpt2_medium.json

# syn_* options
syn_ar_f0_predict: true
syn_batch_frames: 16000
syn_batch_size: 32
syn_mel_scale: 1
syn_predict_f0: true
syn_sampler: binnedlength
syn_symmetric_mel: false
syn_train_max_frames: 700
syn_train_min_duration: 1

# taco* options
taco1_postnet: true
taco_decoder_att_rnn_dim: 1024
taco_decoder_prenet_dim: 256
taco_decoder_rnn_dim: 1024
taco_disjoint_conditioning: true
taco_encoder_dim: 512
taco_grad_clip: 1
taco_loss_masking: true
taco_lr: 0.0001
taco_weight_decay: 1.0e-06

# General options (t-u)
target_loudness: -18
text_loss_weight: 0.1
# Quoted to make explicit that this is the string "none", not a YAML null.
text_preproc: 'none'
text_tokens_dict_size: 50276
ti_vocoder: false
toucan_utt_emb_dim: 704
trim_silence: true
# NOTE(review): !!python/tuple is PyYAML-specific and is rejected by
# yaml.safe_load; it is kept so the value still loads as a tuple.
upsample_factors: !!python/tuple [5, 8, 8]
upsample_rate: null
upsamplenet_dropout: false
upsamplenet_lr: 1.0e-05

# use_* switches
use_adv_speaker_classifier: false
use_clap_embeds: false
use_diacritic: false
use_emotion_table: false
use_lamb_optimizer: false
use_language_table: false
use_monotonic_alignment: false
use_mpbert: false
use_one_cycle_lr: false
use_perceiver_resampler: false
use_pf: false
use_ph_durations: false
use_singing_labels: false
use_snr_labels: false
use_speaker_table: false
use_speech_codes_as_input: true
use_sv2tts: false
use_tb: false
use_tone: false
use_tpgst: false
use_wandb: false

# General options (v)
vad_algo: webrtc
vad_margin: 0.1
validate_sr: true
validate_wav_len: true
vc_mel2f0: false
vc_soft_gt_pitch: false
vc_soft_units: true
ve_final_relu: false
ve_hidden_size: 768
ve_lr: 0.0001
ve_min_samples: 20
# ve_* options (continued)
ve_partial_frames: 128
ve_spk_batch_size: 128
ve_utt_batch_size: 10

# voc_* / vocoder_* options
voc_future_horizon: 11
voc_lvc: false
voc_lvc_dims: 8
voc_noise_fir: true
voc_subscale: 0
voc_train_max_duration: 30
voc_train_min_duration: 1.5
voc_voiced_logits_scale: 0
vocoder_bsize: 16
vocoder_fc_dims: 512
vocoder_hidden_size: 512
vocoder_input_length: 16000
vocoder_input_pad: 0
vocoder_lr: 0.0001
vocoder_mode: MOL

# General options (w)
wandb_watch_model: false
webrtc_mode: 2
weight_init: false
win_size: 2048