Spaces:
Runtime error
Runtime error
| import os | |
| import pdb | |
| import shutil | |
| import pandas as pd | |
| from datasets import Dataset, load_dataset | |
# Folder handed to the "audiofolder" loader below; it is also reused as the
# output root for the per-split metadata files written later in this script.
audio_dir = "./data/Patient_sil_trim_16k_normed_5_snr_40/"

# Unused alternative kept for reference: explicit per-split CSV files.
# split_files = {"train": "data/Patient_sil_trim_16k_normed_5_snr_40/train.csv",
#                "test": "data/Patient_sil_trim_16k_normed_5_snr_40/test.csv",
#                "dev": "data/Patient_sil_trim_16k_normed_5_snr_40/dev.csv"}

# Load everything under audio_dir as a single "train" split; the
# train/dev/test partition is produced afterwards by train_dev_test_split.
# NOTE: the interactive pdb breakpoint that used to follow this line was
# removed -- it stops the script and fails when no terminal is attached.
src_dataset = load_dataset("audiofolder", data_dir=audio_dir, split="train")
def train_dev_test_split(
    dataset: Dataset, dev_rate=0.1, test_rate=0.1, seed=1, metadata_output=False, root_dir=None
):
    """Split ``dataset`` into train, dev and test subsets.

    The test subset is carved off first via ``dataset.train_test_split``.
    The dev subset is then taken from the remainder and contains
    ``int(len(dataset) * dev_rate)`` examples, i.e. ``dev_rate`` is relative
    to the full dataset, not to what is left after removing the test subset.
    If the remainder is not larger than the requested dev size, the whole
    remainder becomes the dev subset and the train subset is empty.

    Args:
        dataset: Dataset to split.
        dev_rate: Fraction of the full dataset to use for the dev subset.
        test_rate: Passed as ``test_size`` to the first split.
        seed: Seed passed to both splits so the partition is reproducible.
        metadata_output: When true, write each subset to
            ``<root_dir>/<split>/metadata.csv`` (split is ``train``, ``dev``
            or ``test``), creating the split directories if needed.
        root_dir: Existing directory under which the split directories are
            created. Only used when ``metadata_output`` is true.

    Returns:
        A ``(train, dev, test)`` tuple of datasets.

    Raises:
        FileNotFoundError: If ``metadata_output`` is true and ``root_dir`` is
            ``None`` or is not an existing directory.
    """
    # Validate the output location before doing any splitting work.
    if metadata_output and (root_dir is None or not os.path.isdir(root_dir)):
        raise FileNotFoundError(f"root_dir does not exist: {root_dir!r}")

    first_split = dataset.train_test_split(test_size=test_rate, seed=seed)
    test = first_split["test"]
    train_dev = first_split["train"]

    # Absolute number of dev examples, measured against the full dataset.
    dev_count = int(len(dataset) * dev_rate)
    if len(train_dev) <= dev_count:
        # Nothing would be left for training: everything remaining is dev.
        train = Dataset.from_dict({"audio": [], "transcription": []})
        dev = train_dev
    else:
        second_split = train_dev.train_test_split(test_size=dev_count, seed=seed)
        train = second_split["train"]
        dev = second_split["test"]

    print(f"Train Size: {len(train)}")
    print(f"Dev Size: {len(dev)}")
    print(f"Test Size: {len(test)}")

    if metadata_output:
        for split_name, split in (("train", train), ("dev", dev), ("test", test)):
            split_dir = os.path.join(root_dir, split_name)
            os.makedirs(split_dir, exist_ok=True)
            pd.DataFrame(split).to_csv(
                os.path.join(split_dir, "metadata.csv"), index=False
            )

    return train, dev, test
# Partition the loaded dataset (test_rate=0.1, dev_rate=0.1, fixed seed) and
# write one metadata.csv per split underneath audio_dir.
# NOTE: the trailing interactive pdb breakpoint was removed so the script can
# run to completion without a terminal attached.
train, dev, test = train_dev_test_split(
    src_dataset,
    dev_rate=0.1,
    test_rate=0.1,
    seed=1,
    metadata_output=True,
    root_dir=audio_dir,
)