
# import gradio as gr
# from msclap import CLAP

# clap_model = CLAP(version = 'clapcap', use_cuda=False)

# def clap_inference(mic=None, file=None):

#     if mic is not None:
#         audio = mic
#     elif file is not None:
#         audio = file
#     else:
#         return "You must either provide a mic recording or a file"

#     # Generate captions for the recording
#     captions = clap_model.generate_caption([audio], 
#                                            resample=True, 
#                                            beam_size=5, 
#                                            entry_length=67, 
#                                            temperature=0.01)

#     return captions[0]

import gradio as gr
from dcase24t6.nn.hub import baseline_pipeline
import librosa
import torch 

# Instantiate the DCASE Task 6 baseline captioning pipeline once at module
# import, so every inference request reuses the same loaded model.
# NOTE(review): presumably loads pretrained weights on first call — confirm.
model = baseline_pipeline()

def dcase_inference(mic=None, file=None):
    """Generate a caption for an audio clip with the DCASE baseline model.

    Args:
        mic: Filepath to a microphone recording (gradio ``type="filepath"``).
        file: Filepath to an uploaded audio file.

    Returns:
        The model's top candidate caption string, or an error message when
        neither input was provided.
    """
    # Both gradio Audio components are declared with type="filepath", so each
    # argument is a path string. The previous mic branch passed the raw path
    # (with a hard-coded sr of 48000) straight into the model instead of
    # loading the waveform; both inputs must go through the same decode path.
    path = mic if mic is not None else file
    if path is None:
        return "You must either provide a mic recording or a file"

    # sr=None keeps the recording's native sample rate; the pipeline receives
    # it alongside the waveform via the "sr" key.
    audio, sr = librosa.load(path, sr=None)
    audio = torch.from_numpy(audio)

    # Generate captions for the recording and return the best candidate.
    item = {"audio": audio, "sr": sr}
    outputs = model(item)
    return outputs["candidates"][0]

def create_app():
    """Construct and return the gradio Blocks demo for audio captioning."""
    with gr.Blocks() as blocks_app:
        # Page title rendered above the captioning interface.
        gr.Markdown(
            """
            # DCASE demo for automatic audio captioning
            """
        )
        # Two filepath-typed audio inputs: live microphone and file upload,
        # both routed into the same inference function.
        gr.Interface(
            fn=dcase_inference,
            outputs="text",
            inputs=[
                gr.Audio(sources="microphone", type="filepath"),
                gr.Audio(sources="upload", type="filepath"),
            ],
        )
    return blocks_app

def main():
    """Entry point: build the demo UI and serve it with debug output."""
    create_app().launch(debug=True)

    
if __name__ == "__main__":
    main()