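"""Gradio demo for DCASE automatic audio captioning.

Captions a microphone recording or an uploaded audio file with the
dcase24t6 baseline pipeline and returns the top candidate caption.
"""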
# Earlier CLAP-based version of this demo, kept commented out for reference:
#
# import gradio as gr
# from msclap import CLAP
#
# clap_model = CLAP(version="clapcap", use_cuda=False)
#
# def clap_inference(mic=None, file=None):
#     if mic is not None:
#         audio = mic
#     elif file is not None:
#         audio = file
#     else:
#         return "You must either provide a mic recording or a file"
#     # Generate captions for the recording
#     captions = clap_model.generate_caption(
#         [audio],
#         resample=True,
#         beam_size=5,
#         entry_length=67,
#         temperature=0.01,
#     )
#     return captions[0]
import gradio as gr
import librosa
import torch

from dcase24t6.nn.hub import baseline_pipeline

# Load the DCASE baseline captioning pipeline once at startup.
model = baseline_pipeline()


def dcase_inference(mic=None, file=None):
    if mic is not None:
        # The microphone component is configured with type="filepath",
        # so it returns a path: load it the same way as an uploaded file.
        audio, sr = librosa.load(mic, sr=None)
        audio = torch.from_numpy(audio)
        gr.Info(f"Microphone recording loaded (sr={sr} Hz)")
    elif file is not None:
        audio, sr = librosa.load(file, sr=None)
        audio = torch.from_numpy(audio)
        gr.Info(f"File loaded: {file} (sr={sr} Hz)")
    else:
        return "You must either provide a mic recording or a file"

    # Generate a caption for the recording and return the top candidate.
    item = {"audio": audio, "sr": sr}
    outputs = model(item)
    candidate = outputs["candidates"][0]
    return candidate
def create_app():
    with gr.Blocks() as demo:
        gr.Markdown(
            """
            # DCASE demo for automatic audio captioning
            """
        )
        gr.Interface(
            fn=dcase_inference,
            inputs=[
                gr.Audio(sources="microphone", type="filepath"),
                gr.Audio(sources="upload", type="filepath"),
            ],
            outputs="text",
        )
    return demo
def main():
    app = create_app()
    app.launch(debug=True)


if __name__ == "__main__":
    main()
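# Local sanity check (sketch): caption a file directly without launching the UI.
# "example.wav" is a placeholder path; any audio file readable by librosa works.
#
#   print(dcase_inference(file="example.wav"))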