KMeansでクラスター数を決めてから、クラスの構成比の抽出件数を計算するソースコードの作成に苦慮しております

KMeansでクラスター数を決めてから、クラスの構成比の抽出件数を計算するソースコードの作成に苦慮しております。

アンダーサンプリングを行う中で、少数派のデータ件数に合うように多数派データからランダムにデータを抽出しております。

多数派のデータを対象にクラスター分析を行い、特徴量が類似するデータ同士で群を作成し、この群単位にランダムにデータを抽出することが目的です。

area_1	area_2	area_3	area_4	area_5	area_6	area_7	area_8	area_9	area_10	area_11	area_12
13	371.1	436	0.119266055	0	50.26	14	27.8551532	34.9	35.3	35.9	38.99721448
21	235	436	0.123853211	0.333333333	59.5	22.1	20.16806723	34.5	34.9	35	34.28571429
14	142.5	394	0.137055838	0.204545455	110.04	30.8	12.72264631	36.4	37.3	39.3	35.62340967
27	671.5	432	0.118055556	0.225	146.52	55.5	12.28501229	37.7	38.1	39.6	45.45454545
35	272.8	462	0.119047619	0	125.44	51.2	9.566326531	36.1	38.7	39.2	30.6122449
84	630.9	478	0.117154812	0	200.5	75	5.985037406	35.2	37.7	40.1	29.92518703
14	355.1	422	0.127962085	0.055555556	60.18	23.8	19.94017946	34.5	34.6	35.4	33.89830508
49	203.2	498	0.102409639	0.558139535	149.85	37	12.01201201	38	38.9	40.5	44.44444444
14	871.1	446	0.114349776	0	107.88	37.7	11.12347052	35.8	37.1	37.2	32.25806452
14	511.6	398	0.133165829	0.323943662	88.8	43.2	20.27027027	35.7	35.2	37	48.64864865
13	579.9	446	0.121076233	0	82.94	26.4	14.46829033	37.1	37.3	37.7	31.83023873
19	69.8	430	0.125581395	0.2	73.6	16	19.02173913	34.3	36.3	36.8	38.04347826
64	593.9	422	0.127962085	0	132	49.5	10.60606061	36.2	37.5	40	35
147	577.6	462	0.116883117	0.139534884	258.64	97.6	5.412929168	35.4	38.3	42.4	33.01886792
13	225.2	436	0.123853211	0.153846154	135.66	45.6	11.79419136	35.3	35.5	35.7	44.81792717
14	296.2	454	0.123348018	0	92.82	33.8	19.39237233	35.2	34.5	35.7	50.42016807
13	260.4	388	0.131443299	0.261904762	93.86	26	17.04666525	34.9	35.2	36.1	44.32132964
35	262.7	422	0.127962085	0.318181818	72.6	30	16.52892562	35.1	35.4	36.3	33.05785124
20	343	428	0.130841121	0	184.8	62.4	12.98701299	35	35.7	38.5	62.33766234
30	487.1	434	0.126728111	0.32	145.78	51.8	9.603512142	36.1	37	39.4	35.53299492
41	264.1	516	0.110465116	0.173076923	38.28	16.5	36.57262278	35	34.8	34.8	40.22988506
28	59	430	0.120930233	0.1875	132.33	52.8	9.068238495	35.8	37.9	40.1	29.92518703
29	86.4	454	0.123348018	0.384615385	87.63	32.2	15.97626384	37.2	37.5	38.1	36.74540682
14	290.5	440	0.122727273	0.225	184.95	49.5	9.732360097	37.9	38.9	41.1	43.79562044
20	204.7	434	0.124423963	0.24137931	82.94	22	14.46829033	35.9	36.3	37.7	31.83023873
21	847.3	410	0.131707317	0.205128205	75.2	24	15.95744681	35.8	36.6	37.6	31.91489362
28	351.2	428	0.123831776	0	107.3	31.9	14.91146319	37	34.5	37	43.24324324
14	127.3	480	0.1125	0	128.86	51	9.312432097	34.9	37	37.9	31.66226913
34	192.1	432	0.125	0	178.2	72	6.734006734	35.2	38	39.6	30.3030303
22	359.1	478	0.112970711	0.142857143	117.6	45	11.9047619	37.2	37.5	39.2	35.71428571
21	573.6	406	0.13546798	0.47826087	55.84	25.6	28.65329513	36.8	34.7	34.9	45.84527221
15	527.4	434	0.119815668	0.254901961	55.8	16.5	21.50537634	35.5	36.8	37.2	32.25806452
21	443.5	510	0.109803922	0.176470588	184.5	72	9.756097561	38.3	37	41	43.90243902
189	80.3	478	0.115062762	0.333333333	141.78	51	12.69572577	37.6	38.7	41.7	43.16546763
19	509.9	478	0.117154812	0.083333333	169.33	57.4	10.63013051	37.8	38.9	41.3	43.58353511
61	722.7	416	0.129807692	0	181.28	66	7.722859665	34.7	39	41.2	33.98058252
14	782	442	0.126696833	0	145.41	51.8	8.252527336	35.8	37	39.3	30.53435115
14	289.2	462	0.116883117	0.19047619	130.35	33	9.20598389	35.2	37.8	39.5	30.37974684
29	296.4	444	0.123873874	0.193548387	54.45	21	33.05785124	35.8	35	36.3	49.58677686
14	17.8	440	0.125	0.066666667	117.74	31.9	10.19194836	35.4	38	40.6	29.55665025

上記が hiyoko_2.csv の内容でございます。

# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.lines as mlines
import codecs
from sklearn.cluster import KMeans

# Load the data set.
cust_df = pd.read_csv("hiyoko_2.csv", sep=",")
print(cust_df)

# KMeans is fed a plain float matrix.  DataFrame.as_matrix() was removed in
# pandas 1.0 and the np.float alias in NumPy 1.24, so use .values / float.
cust_array = cust_df.values.astype(float)

# Run the cluster analysis (three clusters are assumed for this example).
pred = KMeans(n_clusters=3).fit_predict(cust_array)
print(pred)

# Add the cluster label of every row to the DataFrame.
cust_df['cluster_id'] = pred
print(cust_df)

# Save the labelled data as UTF-8 without the index column.  The original
# also opened 'hiyoko.csv' through codecs.open() into a handle that was never
# used or closed; to_csv() creates the file itself, so that handle is dropped.
cust_df.to_csv('hiyoko.csv', index=False, encoding='utf-8')

# Number of samples that fell into each cluster.
print(cust_df['cluster_id'].value_counts())

##############################

#結果

#  1番目のクラス　サンプル数　17個
#  2番目のクラス　サンプル数　14個　
#  3番目のクラス　サンプル数　9個　　（実際に行うと多少ずれがあることもございます）


#　次に、上記の合計40個のサンプルを、合計サンプル数以下の任意に指定した個数まで減らしたいのです

#　今回は例えば10個の集まりにしたいと思います

#  計算方法　10/40 = 0.25


#  1番目のクラス　サンプル数　17個  * 0.25 = 4.25  約　4
#  2番目のクラス　サンプル数　14個　* 0.25 = 3.5   約　4
#  3番目のクラス　サンプル数　9個   * 0.25 = 2.25  約　2   合計サンプル数　10個の集まりになりました 

#  上記の一連の計算を行うソースコードの作成に苦慮しております



############################################ネット上にありました関連ソースコード（参考）

# Under Samplingの関数（X:サンプリング対象のデータ num:ターゲット件数 label:少数派のラベル）

def under_sampling_func(X, num, label):
    """Draw roughly ``num`` rows from ``X``, stratified by KMeans cluster.

    ``X`` is clustered with KMeans (scikit-learn's default cluster count and
    a fixed ``random_state``).  Every cluster receives a share of ``num``
    proportional to its size, and that many rows are drawn at random from
    inside the cluster.

    Args:
        X: pandas DataFrame of features accepted by ``KMeans.fit``.  A
            ``Cluster`` column holding the assigned cluster id is added to
            it in place.
        num: Target number of rows to draw in total.
        label: Value written to the ``Class`` column of every returned row.

    Returns:
        A new DataFrame containing the sampled rows, including the
        ``Cluster`` and ``Class`` columns.  Per-cluster counts are rounded
        independently, so the row count can differ slightly from ``num``.
        Rows are drawn with replacement and may therefore repeat.
    """
    # KMeans clustering
    from sklearn.cluster import KMeans

    # Leave out a 'Cluster' column from an earlier call so the previous
    # labels are not fed back in as a feature.
    features = X.drop(columns=['Cluster'], errors='ignore')
    km = KMeans(random_state=201707)
    km.fit(features)  # the original passed an undefined ``Y`` here
    X['Cluster'] = km.predict(features)

    # Per-cluster sample size = cluster share of the data * target count.
    cluster_sizes = X.groupby('Cluster').size()
    ratio = cluster_sizes.values / cluster_sizes.values.sum()
    samp_num = np.round(ratio * num, 0).astype(np.int32)

    # Sample inside each cluster that actually occurs.  The original looped
    # over a hard-coded range of 8 ids and sampled from the whole frame.
    sampled_parts = []
    for cluster_id, n_rows in zip(cluster_sizes.index, samp_num):
        members = X[X['Cluster'] == cluster_id]
        sampled_parts.append(members.sample(int(n_rows), replace=True))

    sampled = pd.concat(sampled_parts)
    sampled['Class'] = label
    return sampled

############################################

# 各クラスタのデータの平均値

# Mean of every column for each cluster.  The cluster ids are taken from the
# data instead of being hard-coded as 0, 1, 2.
cluster_ids = sorted(cust_df['cluster_id'].unique())
for cluster_id in cluster_ids:
    print(cust_df[cust_df['cluster_id'] == cluster_id].mean())


# Visualisation (stacked bar chart).
import matplotlib.pyplot as plt

# One column per cluster holding that cluster's column means.
clusterinfo = pd.DataFrame()
for cluster_id in cluster_ids:
    clusterinfo['cluster' + str(cluster_id)] = (
        cust_df[cust_df['cluster_id'] == cluster_id].mean()
    )
# The mean of the label column itself is not a feature, so drop that row.
clusterinfo = clusterinfo.drop('cluster_id')

my_plot = clusterinfo.T.plot(kind='bar', stacked=True, title="Mean Value of some Clusters")
my_plot.set_xticklabels(my_plot.xaxis.get_majorticklabels(), rotation=0)

# 'upper right' was misspelled as 'uppper right', which is not a valid
# legend location.
plt.legend(loc='upper right',
           bbox_to_anchor=(1.05, 0.5, 0.5, 10),
           borderaxespad=0.,)
plt.show()

# The same chart once more, this time with the default legend placement.
my_plot = clusterinfo.T.plot(kind='bar', stacked=True, title="Mean Value of some Clusters")
my_plot.set_xticklabels(my_plot.xaxis.get_majorticklabels(), rotation=0)
plt.show()

上記が未完成のソースコードでございます。

ソースコードの中段にコメントで書いたとおり、サンプル数の分布を求めるところまでは出来たのですが、
それ以降の手書きの計算方法をソースコードで表現することに苦慮しております。

最終的な目的は、ランダムに抽出するソースコードの作成ですが、そこにたどり着く前に苦慮しております。

丸投げをするつもりはございません。
最後に、今回は以下のサイトを参照させて頂きました。

https://qiita.com/ryouta0506/items/619d9ac0d80f8c0aed92

考え方のヒント、参考になる方法等の助言でもありがたく思います。

先輩方の御教示、よろしくお願いいたします。

行動規範の内容に同意します

回答1件

ベストアンサー

ソースコード中のコメントを読む限り、クラスタ毎の要素（行）数の比率を保ったまま、総数を指定してリサンプルしたい（いわゆるアンダーサンプリングとは異なる）と解釈しました。

クラスタ毎の要素はランダムで抽出しています。

Python
1import random
2
3# クラスタ毎の要素（行）数の比率を保ったままリサンプル
def resample(df, resample_cnt):
    """Randomly pick ``resample_cnt`` rows, keeping the cluster proportions.

    Each cluster's quota is its exact proportional share rounded down; the
    rows still missing are handed out one by one to the clusters with the
    largest fractional remainder.  The quotas therefore always add up to
    ``resample_cnt`` (plain rounding can overshoot or undershoot).

    Args:
        df: pandas DataFrame with a ``cluster_id`` column.
        resample_cnt: Total number of rows wanted.  If it exceeds the number
            of rows in ``df``, every row is returned.

    Returns:
        DataFrame with the selected rows, grouped by cluster in descending
        order of cluster size and shuffled inside each cluster.

    Raises:
        ValueError: If ``resample_cnt`` is negative.
    """
    import random

    resample_cnt = int(resample_cnt)
    if resample_cnt < 0:
        raise ValueError("resample_cnt must not be negative")

    # Number of rows in each cluster (largest cluster first).
    cluster_cnts = df['cluster_id'].value_counts()
    print(cluster_cnts)

    # Total number of rows that carry a cluster id.
    total_src_cnt = int(cluster_cnts.sum())
    print(total_src_cnt)
    if total_src_cnt == 0:
        return df.iloc[0:0]

    # Integer arithmetic keeps the proportional shares exact:
    # share = scaled // total, fractional remainder = scaled % total.
    scaled = [int(cnt) * resample_cnt for cnt in cluster_cnts]
    dst_cnts = [s // total_src_cnt for s in scaled]
    shortfall = resample_cnt - sum(dst_cnts)
    by_remainder = sorted(
        range(len(scaled)),
        key=lambda k: scaled[k] % total_src_cnt,
        reverse=True,
    )
    for k in by_remainder[:shortfall]:
        dst_cnts[k] += 1
    print(dst_cnts)

    # Collect row *positions* (not index labels) so that the final iloc
    # lookup is correct for any index, not only the default 0..n-1 one.
    cluster_col = df['cluster_id'].values
    dst_positions = []
    for cluster_id, take in zip(cluster_cnts.index.tolist(), dst_cnts):
        positions = (cluster_col == cluster_id).nonzero()[0].tolist()

        # Shuffle, then keep as many rows as this cluster's quota allows.
        random.shuffle(positions)
        positions = positions[:take]
        print(positions)
        dst_positions += positions

    print(dst_positions)
    return df.iloc[dst_positions]
37
# (earlier steps omitted)
# Add the cluster label of every row to the DataFrame.
cust_df['cluster_id'] = pred
print(cust_df)

#  Cluster 1: 17 samples
#  Cluster 2: 14 samples
#  Cluster 3:  9 samples  (the counts can vary slightly between runs)

#  Goal: shrink the 40 samples above to an arbitrary target size that does
#  not exceed the total number of samples.

#  Example target: 10 samples

#  Scale factor: 10 / 40 = 0.25

#  Cluster 1: 17 * 0.25 = 4.25 -> about 4
#  Cluster 2: 14 * 0.25 = 3.5  -> about 4
#  Cluster 3:  9 * 0.25 = 2.25 -> about 2   (10 samples in total)

# Resample down to 10 rows.
dst_df = resample(cust_df, 10)
print(dst_df)