Pythonを用いて自前のデータセットで異常検知を行いたい

#前提・実現したいこと
GitHub WeaklyAnomalyDetection
上記のサイト様を参考に自前のデータセットを用いて異常検知を行いたいと考えております。

#発生している問題・エラーメッセージ

Python
1---------------------------------------------------------------------------
2ValueError                                Traceback (most recent call last)
3<ipython-input-9-adac0b839533> in <module>
4     17 
5     18 #trainデータからランダムに50個抽出
6---> 19 number = np.random.choice(np.arange(0,x_train_sum.shape[0]),50,replace=False)
7     20 
8     21 x, y = [], []
9
10mtrand.pyx in numpy.random.mtrand.RandomState.choice()
11
12ValueError: 'a' cannot be empty unless no samples are taken

#コード
自前のデータセットのコード

import matplotlib.pyplot as plt
import os
import cv2
import random
import numpy as np

DATADIR_train = '/Users/username/desktop/弱教師あり学習/b_c_dataset/png/train'
DATADIR_test = '/Users/username/desktop/弱教師あり学習/b_c_dataset/png/test'
CATEGORIES = ["bell", "call"]
train_data = []
test_data = []

def load_bc_data():
    random.shuffle(train_data)  # データをシャッフル
    x_train = []  # 画像データ
    y_train = []  # ラベル情
    for class_num, category in enumerate(CATEGORIES):
        path = os.path.join(DATADIR_train, category)
        for image_name in os.listdir(path):
            try:
                img_array = cv2.imread(os.path.join(path, image_name),)  # 画像読み込み
                img_resize_array = cv2.resize(img_array, (583, 438))  # 画像のリサイズ
                training_data.append([img_resize_array, class_num])  # 画像データ、ラベル情報を追加
            except Exception as e:
                pass

    for class_num, category in enumerate(CATEGORIES):
        path = os.path.join(DATADIR_test, category)
        for image_name in os.listdir(path):
            try:
                img_array = cv2.imread(os.path.join(path, image_name),)  # 画像読み込み
                img_resize_array = cv2.resize(img_array, (583, 438))  # 画像のリサイズ
                test_data.append([img_resize_array, class_num])  # 画像データ、ラベル情報を追加
            except Exception as e:
                pass

    random.shuffle(test_data)  # データをシャッフル
    x_test = []  # 画像データ
    y_test = []  # ラベル情報

    # データセット作成(train)
    for feature, label in train_data:
        x_train.append(feature)
        y_train.append(label)

    # データセット作成(test)
    for feature, label in test_data:
        x_test.append(feature)
        y_test.append(label)

    # numpy配列に変換
    x_train = np.array(x_train)
    y_train = np.array(y_train)

    x_test =np.array(x_test)
    y_test =np.array(y_test)

    return (x_train, y_train), (x_test, y_test)

異常検知のコード

Python
1import matplotlib.pyplot as plt
2import os
3import cv2
4import random
5from b_c_dataset import B_C_Dataset2
6
7import numpy as np
8from keras.utils import to_categorical
9from keras.preprocessing.image import ImageDataGenerator
10
11bell = 0#bellは0
12call = 1#callは1
13
14# dataset
15(x_train, y_train), (x_test, y_test) = B_C_Dataset2.load_bc_data()
16
17x_train = x_train.reshape(x_train.shape[0], 583, 438, 3)
18x_test = x_test.reshape(x_test.shape[0], 583, 438, 3)
19
20x_train = x_train.astype('float32') / 255
21x_test = x_test.astype('float32') / 255
22
23
24#学習データ（以下のコードからエラーが発生します）
25x_train_sum, x_train_s, x_train_b, x_test_s, x_test_b, = [], [], [], [], []
26y_train_sum = []
27
28#以下を修正いたしました
29for i in range(len(x_train)):
30    if y_train[i] == bell:
31        x_train_b.append(x_train[i]) #正解がbellの時はt_train_bに格納
32    elif y_train[i] == call:
33        x_train_s.append(x_train[i]) #正解がcellの時はt_train_sに格納
34
35    #正解がbellである場合もcellである場合も
36    #常にt_train_sumやy_train_sumにデータを格納
37    x_train_sum.append(x_train[i])
38    y_train_sum.append(y_train[i])
39
40x_train_sum = np.array(x_train_sum)
41x_train_b = np.array(x_train_b)
42x_train_s = np.array(x_train_s)
43        
44#trainデータからランダムに50個抽出
45number = np.random.choice(np.arange(0,x_train_sum.shape[0]),50,replace=False)
46
47x, y = [], []
48
49for i in number:
50    x.append(x_train_sum[i])
51    y.append(y_train_sum[i])
52    
53x_train_sum = np.array(x)
54y_train_sum = np.array(y)

元のサイト様のコードはまだ続きがありますが、上記の場所でエラーが発生しているため省略させていただきます。

#試していること
エラーメッセージである

ValueError: 'a' cannot be empty unless no samples are taken

について現在、調査を行っておりますが全く何もわかっていない状況です。しかし、異常検知のコードの途中にある

#trainデータからランダムに50個抽出
number = np.random.choice(np.arange(0,x_train_sum.shape[0]),50,replace=False)

の「trainデータからランダムに50個抽出」は元のコードでは、fashion_mnistデータセットを使用しているため、サンプル数を4000個に設定されていました。そこを私は自前のデータセットを用いたために上記のようにサンプル数を50個にしたことがこのエラーの原因ではないのか、と考えております。
また、自前のデータセットの詳細を以下に記載いたします。
B_C_Dataset2
トレーニングデータ：bell(画像)120枚、call(画像)60枚
テストデータ：bell(画像)28枚、call(画像)12枚
となっております。

#補足
使っているPCはmacOS Catalina バージョン10.15.5
Pythonのバージョンは3.6.5です
jupyter notebookを使用しています。

行動規範の内容に同意します

回答1件

ベストアンサー

まず、エラーの意味はnp.random.choiceの第一引数、つまり、np.arange(0,x_train_sum.shape[0])の部分がemptyなのでサンプルが抽出できませんと言っています。

np.arrangeがemptyになるということはx_train_sum.shape[0]が0、つまり、x_train_sumにデータが入っていないようですね。

確かにその上にあるデータを作っている部分を拝見すると、
(コメントは私が追記しました)

python
1for i in range(len(x_train)):
2    if y_train[i] == bell:
3        x_train_b.append(x_train[i]) #正解がbellの時はt_train_bに格納
4    elif y_train[i] == call:
5        x_train_s.append(x_train[i]) #正解がcellの時はt_train_sに格納
6    else:
7        #それ以外の時にt_train_sumに格納というコードになっているのですが、
8        #正解がbellでもcellでもないっていうデータはあるんでしたっけ？
9        x_train_sum.append(x_train[i])
10        y_train_sum.append(y_train[i])

となっているので、確かにx_train_sumには何も格納されないように思います。

おそらくですが、やりたかったことはこうではないでしょうか？

python
1for i in range(len(x_train)):
2    if y_train[i] == bell:
3        x_train_b.append(x_train[i]) #正解がbellの時はt_train_bに格納
4    elif y_train[i] == call:
5        x_train_s.append(x_train[i]) #正解がcellの時はt_train_sに格納
6    
7    #正解がbellである場合もcellである場合も
8    #常にt_train_sumやy_train_sumにデータを格納
9    x_train_sum.append(x_train[i])
10    y_train_sum.append(y_train[i])

投稿2020/07/14 21:37

TetsuyaZama

総合スコア216

maguro2020

2020/07/15 00:36

ご回答いただきありがとうございます。TetsuyaZama様。 TetsuyaZama様にご指摘いただいた点を自分のコードに反映させたのですが、質問の時と同じエラーが発生してしまいました。やはりこの場合、自分の作成した自前のデータセットが良くないということでしょうか？自分の想像となってしまい申し訳ないのですが、自分の作成したつもりのデータセットの中身は、 bellとcallのスペクトログラムの画像をbellは0、callは1としてラベル付を行いtrain_dataとtest_dataに分けたデータセットを作成したつもりです。

行動規範の内容に同意します