Hugging Face Space (runtime status: Sleeping)
import torch
import gradio as gr
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice, load_voices

# Single shared TTS engine. kv_cache + half precision + DeepSpeed speed up
# autoregressive inference; presumably requires a CUDA GPU — TODO confirm.
tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)

# NOTE(review): despite the name, this holds gender labels, not languages.
# Renaming would break the Dropdown wiring below, so the name is kept.
languages = ['Male', 'Female']

# Maps a gender label to the Tortoise preset voices offered for it.
voices = {'Male': ['deniro', 'freeman'], 'Female': ['emma', 'angie']}
def inference(text, Gender, voice, Emotion, Preset):
    """Generate speech for *text* and stream it to the Gradio Audio output.

    Parameters:
        text: the sentence(s) to synthesize.
        Gender: gender label from the UI (unused here; it only drives the
            voice dropdown choices).
        voice: Tortoise voice name, passed to ``load_voice``.
        Emotion: one of "Angry"/"Sad"/"Happy"/"Scared" (or None) — selects
            an emotion prompt prepended to the text.
        Preset: Tortoise quality preset ("ultra_fast" ... "high_quality").

    Yields:
        ``(24000, numpy_array)`` tuples — the cumulative audio so far at a
        24 kHz sample rate, suitable for a streaming ``gr.Audio`` output.
    """
    # Emotion is steered via Tortoise's bracketed prompt convention; the
    # bracketed phrase conditions delivery but is not spoken aloud.
    emotion_prompts = {
        "Angry": "[I am so angry]",
        "Sad": "[I am so sad]",
        "Happy": "[I am so happy]",
        "Scared": "[I am so scared]",
    }
    # BUG FIX: prepend the emotion prompt BEFORE building the text list.
    # The original built ``texts = [text]`` first and then rebound ``text``,
    # so the emotion prompt never reached the synthesis loop.
    if Emotion in emotion_prompts:
        text = emotion_prompts[Emotion] + text
    texts = [text]

    voice_samples, conditioning_latents = load_voice(voice)
    audio_frames = []
    for segment in texts:
        for audio_frame in tts.tts_with_preset(
            segment,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset=Preset,
            k=1,
        ):
            # ``audio_frame`` is a torch tensor; detach and move to CPU.
            # (The original round-tripped through numpy and back via
            # torch.from_numpy, which is a no-op on the values.)
            audio_frames.append(audio_frame.cpu().detach())
            # Yield the cumulative waveform after each new frame so the
            # streaming Audio component can start playing early.
            complete_audio = torch.cat(audio_frames, dim=0)
            yield (24000, complete_audio.numpy())
def rs_change(rs):
    """Refresh the voice dropdown when the gender selection changes.

    Returns a Gradio update that swaps in the voices for the selected
    gender and pre-selects the first one (or nothing if the list is empty).
    """
    options = voices[rs]
    default = options[0] if options else None
    return gr.update(choices=options, value=default)
title = "Tortoise TTS"

# Build the Gradio UI: text input, gender/voice pickers, emotion and
# quality presets, and a streaming audio output driven by ``inference``.
with gr.Blocks() as app:
    text = gr.Textbox(lines=4, label="Text:")

    # Gender dropdown; changing it repopulates the voice dropdown below.
    rs = gr.Dropdown(choices=languages, value='Male', label="Gender")
    rs_hw = gr.Dropdown(choices=voices['Male'], interactive=True, label="Voice")
    rs.change(fn=rs_change, inputs=[rs], outputs=[rs_hw])

    Emotion = gr.Radio(
        ["Angry", "Sad", "Happy", "Scared"],
        type="value",
        label="Emotion",
    )
    Preset = gr.Radio(
        ["ultra_fast", "fast", "standard", "high_quality"],
        type="value",
        value="ultra_fast",
        label="Preset",
    )

    # ``inference`` is a generator, so the Audio component streams chunks
    # as they are yielded.
    output_audio = gr.Audio(label="Streaming audio:", streaming=True, autoplay=True)

    btn = gr.Button("Generate")
    btn.click(inference, inputs=[text, rs, rs_hw, Emotion, Preset], outputs=[output_audio])

app.launch()