前提
Pythonでディープラーニングでの音声分類を行っています。
ネット(https://qiita.com/cvusk/items/61cdbce80785eaf28349) にあるコードを参考にしたので、数字が異なるためエラーになっていると思いますが、どこを変更したらいいかわかりません。
初心者であるため、改善方法と説明を詳しく教えていただきたいです。よろしくお願いいたします。
学習データとテストデータに分けるところまでできて、kerasでの機械学習で手詰まっています。
実現したいこと
正解か正解でないかの二種類のみでの分類を行う。学習データとテストデータに分けるところまでできています。
発生している問題・エラーメッセージ
Traceback (most recent call last): File ~\Downloads\music-cnn\music_keras.py:39 in <module> x_1 = cba(inputs, filters=32, kernel_size=(1,8), strides=(1,2)) File ~\Downloads\music-cnn\music_keras.py:30 in cba x = Conv2D(filters, kernel_size=kernel_size, strides=strides, padding='same')(inputs) File ~\anaconda3\lib\site-packages\keras\utils\traceback_utils.py:70 in error_handler raise e.with_traceback(filtered_tb) from None File ~\anaconda3\lib\site-packages\keras\engine\input_spec.py:250 in assert_input_compatibility raise ValueError( ValueError: Input 0 of layer "conv2d_236" is incompatible with the layer: expected min_ndim=4, found ndim=1. Full shape received: (None,)
該当のソースコード
misic_data.py
1import os 2import random 3import numpy as np 4import pandas as pd 5import librosa 6import librosa.display 7import matplotlib.pyplot as plt 8import seaborn as sn 9from sklearn import model_selection 10from sklearn import preprocessing 11import IPython.display as ipd 12import librosa 13import librosa.display 14 15# define directories 16meta_file = "C:/Users/hosei/Downloads/music-cnn/data-THE_REVELATION.csv" 17audio_dir = "C:/Users/hosei/Downloads/music-cnn/music/output_THE_REVELATION" 18 19# load metadata 20meta_data = pd.read_csv(meta_file) 21 22# get data size 23data_size = meta_data.shape 24print(data_size) 25 26# arrange target label and its name 27class_dict = {} 28for i in range(data_size[0]): 29 if meta_data.loc[i,"target"] not in class_dict.keys(): 30 class_dict[meta_data.loc[i,"target"]] = meta_data.loc[i,"category"] 31 32# load a wave data 33def load_wave_data(audio_dir, file_name): 34 file_path = os.path.join(audio_dir, file_name) 35 x, fs = librosa.load(file_path, sr=44100) 36 return x,fs 37 38# change wave data to mel-stft 39def calculate_melsp(x, n_fft=1024, hop_length=128): 40 stft = np.abs(librosa.stft(x, n_fft=n_fft, hop_length=hop_length))**2 41 log_stft = librosa.power_to_db(stft) 42 melsp = librosa.feature.melspectrogram(S=log_stft,n_mels=128) 43 return melsp 44 45# display wave in plots 46def show_wave(x): 47 plt.plot(x) 48 plt.show() 49 50# display wave in heatmap 51def show_melsp(melsp, fs): 52 librosa.display.specshow(melsp, sr=fs) 53 plt.colorbar() 54 plt.show() 55 56# example data 57x, fs = load_wave_data(audio_dir, meta_data.loc[0,"filename"]) 58#print(x) 59melsp = calculate_melsp(x) 60print("wave size:{0}\nmelsp size:{1}\nsamping rate:{2}".format(x.shape, melsp.shape, fs)) 61show_wave(x) 62show_melsp(melsp, fs) 63 64# data augmentation: add white noise 65def add_white_noise(x, rate=0.002): 66 return x + rate*np.random.randn(len(x)) 67 68# data augmentation: shift sound in timeframe 69def shift_sound(x, rate=2): 70 return np.roll(x, int(len(x)//rate)) 71 72# data augmentation: stretch sound 73def stretch_sound(x, rate=1.1): 74 input_length = len(x) 75 x = librosa.effects.time_stretch(x, rate) 76 if len(x)>input_length: 77 return x[:input_length] 78 else: 79 return np.pad(x, (0, max(0, input_length - len(x))), "constant") 80 81 82 83# change wave data to mel-stft 84def calculate_melsp(x, n_fft=1024, hop_length=128): 85 stft = np.abs(librosa.stft(x, n_fft=n_fft, hop_length=hop_length))**2 86 log_stft = librosa.power_to_db(stft) 87 melsp = librosa.feature.melspectrogram(S=log_stft,n_mels=128) 88 return melsp 89 90# get training dataset and target dataset 91x = list(meta_data.loc[:,"filename"]) 92y = list(meta_data.loc[:, "target"]) 93 94x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=0.25, stratify=y) 95print("x train:{0}\ny train:{1}\nx test:{2}\ny test:{3}".format(len(x_train), 96 len(y_train), 97 len(x_test), 98 len(y_test))) 99 100# showing the classes are equally splitted 101a = np.zeros(50) 102for c in y_test: 103 a[c] += 1 104print(a) 105 106freq = 128 107time = 1723 108 109# save wave data in npz, with augmentation 110def save_np_data(filename, x, y, aug=None, rates=None): 111 np_data = np.zeros(freq*time*len(x)).reshape(len(x), freq, time) 112 np_targets = np.zeros(len(y)) 113 for i in range(len(y)): 114 _x, fs = load_wave_data(audio_dir, x[i]) 115 if aug is not None: 116 _x = aug(x=_x, rate=rates[i]) 117 _x = calculate_melsp(_x) 118 np_data[i] = _x 119 np_targets[i] = y[i] 120 np.savez(filename, x=np_data, y=np_targets) 121 122# save test dataset 123if not os.path.exists("esc_melsp_test.npz"): 124 save_np_data("esc_melsp_test.npz", x_test, y_test) 125 126# save raw training dataset 127if not os.path.exists("esc_melsp_train_raw.npz"): 128 save_np_data("esc_melsp_train_raw.npz", x_train, y_train) 129 130# save training dataset with white noise 131if not os.path.exists("esc_melsp_train_wn.npz"): 132 rates = np.random.randint(1,50,len(x_train))/10000 133 save_np_data("esc_melsp_train_wn.npz", x_train, y_train, aug=add_white_noise, rates=rates) 134 135# save training dataset with sound shift 136if not os.path.exists("esc_melsp_train_ss.npz"): 137 rates = np.random.choice(np.arange(2,6),len(y_train)) 138 save_np_data("esc_melsp_train_ss.npz", x_train, y_train, aug=shift_sound, rates=rates) 139 140# save training dataset with stretch 141if not os.path.exists("esc_melsp_train_st.npz"): 142 rates = np.random.choice(np.arange(80,120),len(y_train))/100 143 save_np_data("esc_melsp_train_st.npz", x_train, y_train, aug=stretch_sound, rates=rates) 144 145# save training dataset with combination of white noise and shift or stretch 146if not os.path.exists("esc_melsp_train_com.npz"): 147 np_data = np.zeros(freq*time*len(x_train)).reshape(len(x_train), freq, time) 148 np_targets = np.zeros(len(y_train)) 149 for i in range(len(y_train)): 150 x, fs = load_wave_data(audio_dir, x_train[i]) 151 x = add_white_noise(x=x, rate=np.random.randint(1,50)/1000) 152 if np.random.choice((True,False)): 153 x = shift_sound(x=x, rate=np.random.choice(np.arange(2,6))) 154 else: 155 x = stretch_sound(x=x, rate=np.random.choice(np.arange(80,120))/100) 156 x = calculate_melsp(x) 157 np_data[i] = x 158 np_targets[i] = y_train[i] 159 np.savez("esc_melsp_train_com.npz", x=np_data, y=np_targets) 160
該当のソースコード
music_keras.py
1import music_data 2import numpy as np 3import keras 4from keras.models import Model 5from keras.layers import Input, Dense, Dropout, Activation 6from keras.layers import Conv2D, GlobalAveragePooling2D 7from keras.layers import BatchNormalization, Add 8from keras.callbacks import EarlyStopping, ModelCheckpoint 9 10# redefine target data into one hot vector 11classes = 50 12X_train = np.array(music_data.x_train) 13X_test = np.array(music_data.x_test) 14Y_train = np.array(music_data.y_train) 15Y_test = np.array(music_data.y_test) 16 17y_train = keras.utils.to_categorical(Y_train, classes) 18y_test = keras.utils.to_categorical(Y_test, classes) 19 20def cba(inputs, filters, kernel_size, strides): 21 x = Conv2D(filters, kernel_size=kernel_size, strides=strides, padding='same')(inputs) 22 x = BatchNormalization()(x) 23 x = Activation("relu")(x) 24 return x 25 26# define CNN 27inputs = Input(shape=(X_train.shape[1:])) 28 29x_1 = cba(inputs, filters=32, kernel_size=(1,8), strides=(1,2)) 30x_1 = cba(x_1, filters=32, kernel_size=(8,1), strides=(2,1)) 31x_1 = cba(x_1, filters=64, kernel_size=(1,8), strides=(1,2)) 32x_1 = cba(x_1, filters=64, kernel_size=(8,1), strides=(2,1)) 33 34x_2 = cba(inputs, filters=32, kernel_size=(1,16), strides=(1,2)) 35x_2 = cba(x_2, filters=32, kernel_size=(16,1), strides=(2,1)) 36x_2 = cba(x_2, filters=64, kernel_size=(1,16), strides=(1,2)) 37x_2 = cba(x_2, filters=64, kernel_size=(16,1), strides=(2,1)) 38 39x_3 = cba(inputs, filters=32, kernel_size=(1,32), strides=(1,2)) 40x_3 = cba(x_3, filters=32, kernel_size=(32,1), strides=(2,1)) 41x_3 = cba(x_3, filters=64, kernel_size=(1,32), strides=(1,2)) 42x_3 = cba(x_3, filters=64, kernel_size=(32,1), strides=(2,1)) 43 44x_4 = cba(inputs, filters=32, kernel_size=(1,64), strides=(1,2)) 45x_4 = cba(x_4, filters=32, kernel_size=(64,1), strides=(2,1)) 46x_4 = cba(x_4, filters=64, kernel_size=(1,64), strides=(1,2)) 47x_4 = cba(x_4, filters=64, kernel_size=(64,1), strides=(2,1)) 48 49x = Add()([x_1, x_2, x_3, x_4]) 50 51x = cba(x, filters=128, kernel_size=(1,16), strides=(1,2)) 52x = cba(x, filters=128, kernel_size=(16,1), strides=(2,1)) 53 54x = GlobalAveragePooling2D()(x) 55x = Dense(classes)(x) 56x = Activation("softmax")(x) 57 58model = Model(inputs, x) 59 60# initiate Adam optimizer 61opt = keras.optimizers.adam(lr=0.00001, decay=1e-6, amsgrad=True) 62 63# Let's train the model using Adam with amsgrad 64model.compile(loss='categorical_crossentropy', 65 optimizer=opt, 66 metrics=['accuracy']) 67 68model.summary()
試したこと
https://teratail.com/questions/291260
この質問を見て改善しようとしましたが、なぜinput_shape=x.shape[1:]の部分をinput_shape=(256,256,3)にしているのかわかりませんでした。(256,256,3)にしてみたら、コード自体はエラーもなく動いたのですが、自分のやっているものでもこの数字にしていいのかわかりません。