本次准确识别5种大类,共计50种小类的音频。每个音频文件时长5秒,格式为wav。
数据集来自freesound.org公开项目,从中手动提取。训练集共计1600个,测试集400个。
import os import random import numpy as np import pandas as pd import librosa import librosa.display import matplotlib.pyplot as plt import seaborn as sn from sklearn import model_selection from sklearn import preprocessing import IPython.display as ipd 1234567891011
# Directory layout of the ESC-50-style dataset.
base_dir = "."
esc_dir = os.path.join(base_dir, "ESC-50-master")
meta_file = os.path.join(esc_dir, "esc50.csv")
audio_dir = os.path.join(esc_dir, "audio/audio/")

# Metadata table: one row per clip, columns are filename and class id.
meta_data = pd.read_csv(meta_file, header=None, names=["filename", "target"])
meta_data

# Report the table dimensions (rows, columns).
data_size = meta_data.shape
print(data_size)
(2000, 2)
# load a wave data
def load_wave_data(audio_dir, file_name):
    """Load one wav clip from *audio_dir* and return (samples, sample_rate).

    librosa resamples the clip to 44.1 kHz on load.
    """
    file_path = os.path.join(audio_dir, file_name)
    wave, sample_rate = librosa.load(file_path, sr=44100)
    return wave, sample_rate
# change wave data to mel-stft
def calculate_melsp(x, n_fft=1024, hop_length=128):
    """Convert a waveform to a log-scaled (dB) mel spectrogram.

    Parameters: x (1-D waveform), n_fft (STFT window size),
    hop_length (STFT hop). Returns a (128, frames) array.
    """
    # Power spectrogram from the magnitude STFT.
    stft = np.abs(librosa.stft(x, n_fft=n_fft, hop_length=hop_length))**2
    # BUG FIX: the original applied power_to_db *before* the mel filterbank,
    # feeding dB (partly negative) values into melspectrogram. The librosa
    # convention is mel filterbank on the power spectrogram first, then dB.
    # Output shape is unchanged: (n_mels, frames).
    melsp = librosa.feature.melspectrogram(S=stft, n_mels=128)
    return librosa.power_to_db(melsp)
# display wave in plots def show_wave(x): plt.plot(x) plt.show() 1234
# display wave in heatmap def show_melsp(melsp, fs): librosa.display.specshow(melsp, sr=fs) plt.colorbar() plt.show() 12345
# Example: load one clip, compute its mel spectrogram, and inspect it.
x, fs = load_wave_data(audio_dir, meta_data.loc[0, "filename"])
melsp = calculate_melsp(x)
# BUG FIX: the scraped source lost the backslashes of "\n" in the format
# string; restored so the three fields print on separate lines.
print("wave size:{0}\nmelsp size:{1}\nsamping rate:{2}".format(x.shape, melsp.shape, fs))
show_wave(x)
show_melsp(melsp, fs)
wave size:(220500,)
melsp size:(128, 1723)
samping rate:44100
# Inline playback of the example clip in the notebook.
ipd.Audio(data=x, rate=fs)
# data augmentation: add white noise
def add_white_noise(x, rate=0.002):
    """Return *x* plus Gaussian white noise scaled by *rate*."""
    noise = np.random.randn(len(x))
    return x + rate * noise
# Demo: white-noise augmentation on the example clip.
x_wn = add_white_noise(x)
melsp = calculate_melsp(x_wn)
# BUG FIX: restored "\n" lost during scraping.
print("wave size:{0}\nmelsp size:{1}\nsamping rate:{2}".format(x_wn.shape, melsp.shape, fs))
show_wave(x_wn)
show_melsp(melsp, fs)
wave size:(220500,)
melsp size:(128, 1723)
samping rate:44100
# Inline playback of the white-noise-augmented clip.
ipd.Audio(data=x_wn, rate=fs)
# data augmentation: shift sound in timeframe
def shift_sound(x, rate=2):
    """Circularly shift *x* forward by len(x)//rate samples (wraps around)."""
    offset = int(len(x) // rate)
    return np.roll(x, offset)
# Demo: time-shift augmentation on the example clip.
x_ss = shift_sound(x)
melsp = calculate_melsp(x_ss)
# BUG FIX: restored "\n" lost during scraping.
print("wave size:{0}\nmelsp size:{1}\nsamping rate:{2}".format(x_ss.shape, melsp.shape, fs))
show_wave(x_ss)
show_melsp(melsp, fs)
melsp size:(128, 1723)
samping rate:44100
# Inline playback of the time-shifted clip.
ipd.Audio(data=x_ss, rate=fs)
# data augmentation: stretch sound
def stretch_sound(x, rate=1.1):
    """Time-stretch *x* by *rate*, then crop/pad back to the original length.

    rate > 1 speeds the clip up (shorter output, zero-padded); rate < 1
    slows it down (longer output, truncated).
    """
    input_length = len(x)
    # ROBUSTNESS: pass rate by keyword — librosa >= 0.10 made it
    # keyword-only; older versions accept the keyword form too.
    x = librosa.effects.time_stretch(x, rate=rate)
    if len(x) > input_length:
        return x[:input_length]
    return np.pad(x, (0, max(0, input_length - len(x))), "constant")
# Demo: time-stretch augmentation on the example clip.
x_st = stretch_sound(x)
melsp = calculate_melsp(x_st)
# BUG FIX: restored "\n" lost during scraping.
print("wave size:{0}\nmelsp size:{1}\nsamping rate:{2}".format(x_st.shape, melsp.shape, fs))
show_wave(x_st)
show_melsp(melsp, fs)
wave size:(220500,)
melsp size:(128, 1723)
samping rate:44100
# Inline playback of the time-stretched clip.
ipd.Audio(data=x_st, rate=fs)
# Split filenames and targets into stratified 80/20 train/test sets.
x = list(meta_data.loc[:, "filename"])
y = list(meta_data.loc[:, "target"])
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=0.2, stratify=y)
# BUG FIX: restored "\n" lost during scraping.
print("x train:{0}\ny train:{1}\nx test:{2}\ny test:{3}".format(len(x_train), len(y_train), len(x_test), len(y_test)))
x train:1600
y train:1600
x test:400
y test:400
# Fixed mel-spectrogram dimensions used throughout:
# 128 mel bins by 1723 time frames (5 s at 44.1 kHz, hop 128).
freq = 128
time = 1723
# save wave data in npz, with augmentation
def save_np_data(filename, x, y, aug=None, rates=None):
    """Compute mel spectrograms for every clip in *x* and save them to *filename*.

    x: list of wav filenames; y: matching class ids.
    aug: optional augmentation callable taking (x=wave, rate=...);
    rates: per-sample augmentation rates (required when aug is given).
    Saves arrays "x" (len(x), freq, time) and "y" (len(y),) into the npz.
    Relies on module-level freq, time and audio_dir.
    """
    # IDIOM: allocate the target shape directly instead of zeros(...).reshape(...).
    np_data = np.zeros((len(x), freq, time))
    np_targets = np.zeros(len(y))
    for i in range(len(y)):
        _x, fs = load_wave_data(audio_dir, x[i])
        if aug is not None:
            _x = aug(x=_x, rate=rates[i])
        _x = calculate_melsp(_x)
        np_data[i] = _x
        np_targets[i] = y[i]
    np.savez(filename, x=np_data, y=np_targets)
# save raw training dataset if not os.path.exists("esc_melsp_all_train_raw.npz"): save_np_data("esc_melsp_all_train_raw.npz", x_train, y_train) 123
# save test dataset if not os.path.exists("esc_melsp_all_test.npz"): save_np_data("esc_melsp_all_test.npz", x_test, y_test) 123
# save training dataset with white noise if not os.path.exists("esc_melsp_train_white_noise.npz"): rates = np.random.randint(1,50,len(x_train))/10000 save_np_data("esc_melsp_train_white_noise.npz", x_train, y_train, aug=add_white_noise, rates=rates) 1234
# save training dataset with sound shift if not os.path.exists("esc_melsp_train_shift_sound.npz"): rates = np.random.choice(np.arange(2,6),len(y_train)) save_np_data("esc_melsp_train_shift_sound.npz", x_train, y_train, aug=shift_sound, rates=rates) 1234
# save training dataset with stretch if not os.path.exists("esc_melsp_train_stretch_sound.npz"): rates = np.random.choice(np.arange(80,120),len(y_train))/100 save_np_data("esc_melsp_train_stretch_sound.npz", x_train, y_train, aug=stretch_sound, rates=rates) 1234
# save training dataset with combination of white noise and shift or stretch if not os.path.exists("esc_melsp_train_combination.npz"): np_data = np.zeros(freq*time*len(x_train)).reshape(len(x_train), freq, time) np_targets = np.zeros(len(y_train)) for i in range(len(y_train)): x, fs = load_wave_data(audio_dir, x_train[i]) x = add_white_noise(x=x, rate=np.random.randint(1,50)/1000) if np.random.choice((True,False)): x = shift_sound(x=x, rate=np.random.choice(np.arange(2,6))) else: x = stretch_sound(x=x, rate=np.random.choice(np.arange(80,120))/100) x = calculate_melsp(x) np_data[i] = x np_targets[i] = y_train[i] np.savez("esc_melsp_train_combination.npz", x=np_data, y=np_targets) 123456789101112131415
import keras from keras.models import Model from keras.layers import Input, Dense, Dropout, Activation from keras.layers import Conv2D, GlobalAveragePooling2D from keras.layers import BatchNormalization, Add from keras.callbacks import EarlyStopping, ModelCheckpoint,ReduceLROnPlateau from keras.models import load_model import warnings warnings.filterwarnings("ignore") 123456789
# Archives produced above; only the raw training set feeds this run.
train_files = ["esc_melsp_all_train_raw.npz"]
test_file = "esc_melsp_all_test.npz"
# Sample counts taken from the earlier train/test split.
train_num = len(x_train)
test_num = len(x_test)
print(train_num)
print(test_num)
1600
400
# Allocate arrays for the concatenated training archives.
# IDIOM: build the target shape directly instead of zeros(...).reshape(...).
x_train = np.zeros((train_num * len(train_files), freq, time))
y_train = np.zeros(train_num * len(train_files))

# Fill them archive by archive; each occupies one train_num-sized slice.
for i in range(len(train_files)):
    data = np.load(train_files[i])
    x_train[i * train_num:(i + 1) * train_num] = data["x"]
    y_train[i * train_num:(i + 1) * train_num] = data["y"]

# Load the held-out test set.
test_data = np.load(test_file)
x_test = test_data["x"]
y_test = test_data["y"]

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
(1600, 128, 1723)
(1600,)
(400, 128, 1723)
(400,)
# One-hot encode the targets for the 50 classes.
classes = 50
y_train = keras.utils.to_categorical(y_train, classes)
y_test = keras.utils.to_categorical(y_test, classes)

# Add the channel axis expected by Conv2D: (N, freq, time, 1).
# BUG FIX: the original hard-coded train_num*1; using len(train_files)
# keeps the reshape correct when more training archives are loaded
# (identical result for the current single-archive list).
x_train = x_train.reshape(train_num * len(train_files), freq, time, 1)
x_test = x_test.reshape(test_num, freq, time, 1)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
(1600, 128, 1723, 1)
(1600, 50)
(400, 128, 1723, 1)
(400, 50)
# BUG FIX: restored "\n" lost during scraping.
print("x train:{0}\ny train:{1}\nx test:{2}\ny test:{3}".format(x_train.shape, y_train.shape, x_test.shape, y_test.shape))
x train:(1600, 128, 1723, 1)
y train:(1600, 50)
x test:(400, 128, 1723, 1)
y test:(400, 50)

Define convolutional neural network
def cba(inputs, filters, kernel_size, strides):
    """Conv-BatchNorm-Activation block: Conv2D ('same' padding) -> BN -> ReLU."""
    out = Conv2D(filters, kernel_size=kernel_size, strides=strides, padding='same')(inputs)
    out = BatchNormalization()(out)
    out = Activation("relu")(out)
    return out
# Define the CNN: four parallel branches with 1-D kernels of increasing
# extent (8/16/32/64), alternating time-axis and frequency-axis convolutions,
# merged by element-wise addition.
inputs = Input(shape=(x_train.shape[1:]))

branches = []
for size in (8, 16, 32, 64):
    b = cba(inputs, filters=32, kernel_size=(1, size), strides=(1, 2))
    b = cba(b, filters=32, kernel_size=(size, 1), strides=(2, 1))
    b = cba(b, filters=64, kernel_size=(1, size), strides=(1, 2))
    b = cba(b, filters=64, kernel_size=(size, 1), strides=(2, 1))
    branches.append(b)

x = Add()(branches)

x = cba(x, filters=128, kernel_size=(1, 16), strides=(1, 2))
x = cba(x, filters=128, kernel_size=(16, 1), strides=(2, 1))

x = GlobalAveragePooling2D()(x)
x = Dense(classes)(x)
x = Activation("softmax")(x)

model = Model(inputs, x)
model.summary()
# Optimization: Adam with AMSGrad, small LR with decay.
# BUG FIX: keras.optimizers.adam is a legacy lowercase alias that is absent
# in most Keras versions; the optimizer class is Adam.
opt = keras.optimizers.Adam(lr=0.0001, decay=1e-6, amsgrad=True)

# Train with categorical cross-entropy, tracking accuracy.
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
# directory for model checkpoints model_dir = "./models" if not os.path.exists(model_dir): os.mkdir(model_dir) # early stopping and model checkpoint# early es_cb = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True,verbose=1, mode='min') chkpt = os.path.join(model_dir, 'esc50_.{epoch:02d}_{val_loss:.4f}_{val_acc:.4f}.hdf5') cp_cb = ModelCheckpoint(filepath = chkpt, monitor='val_loss', verbose=1, save_best_only=True, mode='min') reduce_lr = ReduceLROnPlateau(monitor="val_loss",factor=0.6,patience=3,verbose=1,mode="min") 12345678910 Train CNN model with between class dataset
# between class data generator
class MixupGenerator():
    """Infinite batch generator implementing mixup (between-class learning).

    Each yielded batch mixes two disjoint halves of a shuffled 2*batch_size
    slice with per-sample weights drawn from Beta(alpha, alpha).
    """

    def __init__(self, x_train, y_train, batch_size=16, alpha=0.2, shuffle=True):
        self.x_train = x_train
        self.y_train = y_train
        self.batch_size = batch_size
        self.alpha = alpha
        self.shuffle = shuffle
        self.sample_num = len(x_train)

    def __call__(self):
        # Loop forever: reshuffle, walk the epoch in 2*batch_size slices.
        while True:
            order = self.__get_exploration_order()
            steps = int(len(order) // (self.batch_size * 2))
            for step in range(steps):
                ids = order[step * self.batch_size * 2:(step + 1) * self.batch_size * 2]
                yield self.__data_generation(ids)

    def __get_exploration_order(self):
        # Index permutation for one pass over the data.
        order = np.arange(self.sample_num)
        if self.shuffle:
            np.random.shuffle(order)
        return order

    def __data_generation(self, batch_ids):
        # Unpacking also asserts rank-4 inputs / rank-2 one-hot targets.
        _, height, width, channels = self.x_train.shape
        _, n_classes = self.y_train.shape
        half = self.batch_size
        x1 = self.x_train[batch_ids[:half]]
        x2 = self.x_train[batch_ids[half:]]
        y1 = self.y_train[batch_ids[:half]]
        y2 = self.y_train[batch_ids[half:]]
        # Per-sample mixing coefficients, broadcast over image and label axes.
        lam = np.random.beta(self.alpha, self.alpha, half)
        x_lam = lam.reshape(half, 1, 1, 1)
        y_lam = lam.reshape(half, 1)
        mixed_x = x1 * x_lam + x2 * (1 - x_lam)
        mixed_y = y1 * y_lam + y2 * (1 - y_lam)
        return mixed_x, mixed_y
1234567891011121314151617181920212223242526272829303132333435363738394041424344# train model batch_size = 16 epochs = 1000 training_generator = MixupGenerator(x_train, y_train)() model.fit_generator(generator=training_generator, steps_per_epoch=x_train.shape[0] // batch_size, validation_data=(x_test, y_test), epochs=epochs, verbose=1, shuffle=True, callbacks=[es_cb, cp_cb,reduce_lr]) 123456789101112
# Evaluate on the test set and report [loss, accuracy].
# NOTE(review): the original ran model.evaluate twice back to back with the
# same arguments; a single pass produces the identical printed result.
evaluation = model.evaluate(x_test, y_test)
print(evaluation)
320/320 [==============================] - 48s 151ms/step
[1.4442142248153687, 0.675]
# Mel-spectrogram dimensions (must match the training features).
freq = 128
time = 1723

# Directory holding the clips to predict on.
predict_audio_dir = os.path.join(esc_dir, "audio/test/")

# Path of the prediction metadata csv.
predict_file = os.path.join(esc_dir, "test.csv")

# NOTE(review): this reads the hard-coded "test.csv" from the current
# directory and ignores predict_file defined just above — confirm whether
# the csv actually lives in the CWD or should be read via predict_file.
predict_data = pd.read_csv("test.csv", header=None, names=["filename"])
predict_data

# Number of clips to score.
predict_data.shape[0]
400
# Filenames of the clips to score.
predict = predict_data["filename"].tolist()
# save wave data in npz, with augmentation (prediction variant, no labels)
def save_np_data(filename, x, aug=None, rates=None):
    """Compute mel spectrograms for the prediction clips in *x* and save to *filename*.

    Redefines the training-time save_np_data with a label-free signature.
    Relies on module-level freq, time and predict_audio_dir.
    """
    # IDIOM: allocate the target shape directly instead of zeros(...).reshape(...).
    np_data = np.zeros((len(x), freq, time))
    # BUG FIX: the original iterated range(len(predict)) — a module-level
    # global — instead of the *x* parameter; they happen to be the same
    # object at the call site below, but the loop must depend only on the
    # argument to be correct for any caller.
    for i in range(len(x)):
        _x, fs = load_wave_data(predict_audio_dir, x[i])
        if aug is not None:
            _x = aug(x=_x, rate=rates[i])
        _x = calculate_melsp(_x)
        np_data[i] = _x
    np.savez(filename, x=np_data)
# Build the prediction feature archive once, then point predict_file at it.
if not os.path.exists("esc_melsp_predict_raw.npz"):
    save_np_data("esc_melsp_predict_raw.npz", predict)

predict_file = "esc_melsp_predict_raw.npz"
# Load the precomputed prediction features.
predict = np.load(predict_file)
x_predict = predict["x"]

# Add the channel axis expected by the CNN: (N, freq, time, 1).
x_predict = x_predict.reshape(predict_data.shape[0], freq, time, 1)

# Ensemble: sum the softmax outputs of three checkpointed models.
pred = None
for model_path in ["models/esc50_.14_1.3803_0.7312.hdf5",
                   "models/esc50_.18_1.2065_0.7000.hdf5",
                   "models/esc50_.20_1.1664_0.7594.hdf5"]:
    model = load_model(model_path)
    if pred is None:
        pred = model.predict(x_predict)
    else:
        pred += model.predict(x_predict)

print(pred.shape)

# Argmax over the summed scores gives the ensemble class id per clip.
res = np.argmax(pred, axis=1)
print(res[:5])

# Write the submission file: filename, predicted class id (no header/index).
import pandas as pd
df = pd.DataFrame({"img_path": predict_data["filename"], "tags": res})
df.to_csv("submit.csv", index=None, header=None)
下次见!!!
相关知识
深度学习+大规模计算+大数据=人工智能,谁掌握先进的算力?
基于Pytorch实现的声音分类
鸟类分类、鸟类声音相关深度学习数据集大合集
深度学习的艺术:从理论到实践
PyTorch深度学习:猫狗情感识别
科学家利用人工智能开发跨物种翻译,实现人类与动物对话
基于深度学习的猫狗图片分类研究(数据集+实验代码+4000字实验报告)
声音分类及其实战(一)
毕业设计:基于深度学习的动物叫声识别系统
动物声音识别
网址: 【人工智能项目】深度学习实现50种环境声音分类 https://m.mcbbbk.com/newsview189492.html
上一篇: 鸟类及动物声音辨识方法原理进展境 |
下一篇: 天信互通 |