質問編集履歴

ソースを変更しました。求めている内容を記載しました。

2020/05/13 01:30

投稿

zenji0705

スコア69

test CHANGED Viewed

File without changes

test CHANGED Viewed

@@ -10,6 +10,18 @@
+※ソースを記載しました。データイメージとソースの記述内容が異なっていますが、読み替えていただけると助かります。
+やりたいことは、愚直にfor文で分類分けしているロジックを、もっと効率化する方法がないか知ることです。
+現状だと数万件のデータが来たらかなり遅い処理になると予想しているので改善したいのです。
+大変恐縮ですが、アドバイスいただけると助かります。
 ###データイメージ
@@ -52,7 +64,7 @@
-###ソース(稚拙なロジックで申し訳ないです)
+###ソース
 ```python3
@@ -66,107 +78,281 @@
     """
-    #
+    # dos
-    chk = re.match(r"^みかん*", value)
+    chk = re.match(r"^IPBLOCK-*", value)
-    if chk:
+    if chk:
-        return '1'
+        return '4'
+    # bot manager
-    chk = re.match(r"^ピーマン*", value)
+    chk = re.match(r"^39", value)
     if chk:
         return '2'
+    chk = re.match(r"^BOT-", value)
-        ・・・・省略
+    if chk:
+        return '2'
+    # client reputation
+    chk = re.match(r"^REP_", value)
+    if chk:
+        return '3'
+    # API Protection
+    chk = re.match(r"^API_", value)
+    if chk:
+        return '6'
+    # UA filter
+    chk = re.match(r"^6", value)
+    if chk:
+        return '7'
+    # no match
+    return '1'
 def log_parser():
-　　・・・・・省略
-    pd_result[[11,12,13]] = pd_result[11].str.split('|', expand=True)
-    for i, row in pd_result.iterrows():
-        #####
-        ### row[12]が品物、row[13]がグループを想定してください
-        #####
-        log_kind = 0 # init
-        if row[12] is None and row[13] is None :
-            log_kind = 5 # そのほか
-            pd_result.at[i, 14] = log_kind
-            continue
-        # 品物
-        ruleID = row[12].split(':')
-        ruleID_dir = []
-        if ruleID is not None:
-            for rID in ruleID:
-                log_code = logDistribution(str(rID))
-                ruleID_dir += [log_code]
-        # Risk Group Name
-        groupName = row[13].split(':')
-        groupName_dir = []
-        if ruleID is not None:
-            for rName in groupName:
-                log_code = logDistribution(str(rName))
-                groupName_dir += [log_code]
-        # ruleID_dirが[6,6,7]　groupName_dir[6,5]なら
-        # 5と6と7のレコードが必要
-        Aggregation_dir = [[]]
-        if ruleID_dir is not None:
-            for i , ruleID_elem in ruleID_dir:
-                if i == 0:
-                    Aggregation_dir[0,0] = ruleID_elem
-                    Aggregation_dir[0,1] = ruleID[i]
-                    Aggregation_dir[0,2] = ''
-                else :
-                    # todo すでにAggregation_dirに登録されてる分類コードなら:で連結する
+    test_file = 'C:\tmp/sample_apache_log.txt'
+    reader = pd.read_csv(test_file, sep='\n', chunksize=50,header=None)
+    for row in reader:
+        regex = '([(\d.)]+) - - [(.*?)] "(\S+?)(?: +(.*?) +(\S*?))?" (\d+) (\d+) "(.*?)" "(.*?)" "(.*?)" "(.*?)"'
+        pd_result=row[0].str.extract(regex, expand=True)
+        #
+        print(pd_result.at[0,0])
+        print(pd_result.at[0,1])
+        print(pd_result.at[0,2])
+        print(pd_result.at[0,3])
+        print(pd_result.at[0,4])
+        print(pd_result.at[0,5])
+        print(pd_result.at[0,6])
+        print(pd_result.at[0,7])
+        print(pd_result.at[0,8])
+        print(pd_result.at[0,9])
+        print(pd_result.at[0,10])     #分類分けデータ
+        #　policyID,ruleID,riskName
+        pd_result[[11,12,13]] = pd_result[10].str.split('|', expand=True)
+        # 複数カラムにまたがる処理を愚直にforでやってる（todo:改善したい）
+        for i, row in pd_result.iterrows():
+            log_kind = 0 # init
+            if row[12] == [''] and row[13] == [''] :
+                log_kind = 5 # Access_log
+                print (row[10] + 'is :' +log_kind)
+                #pd_result.at[i, 14] = log_kind # ★★要デバッグ
+                continue
+            # RuleID
+            ruleID = row[12].split(':')
+            ruleID_dir = []
+            if ruleID == ['']:
+                pass
+            else :
+                for rID in ruleID:
+                    log_code = logDistribution(str(rID))
+                    ruleID_dir += [log_code]
+            # Risk Group Name
+            riskName = row[13].split(':')
+            riskName_dir = []
+            if riskName == ['']:
+                pass
+            else:
+                for rName in riskName:
+                    log_code = logDistribution(str(rName))
+                    riskName_dir += [log_code]
+            # ログの分割
+            Aggregation_dir = []
+            if not ruleID_dir:
+                pass
+            else:
+                for i in range(len(ruleID_dir)):
+                    tmp_dir = [ruleID_dir[i],ruleID[i],'',row[11]]
+                    if i == 0:
+                        Aggregation_dir.append(tmp_dir)
+                    else :
+                        # すでにAggregation_dirに登録されてる分類コードなら:で連結する
+                        merge_flg = False
+                        for j in range(len(Aggregation_dir)):
+                            if Aggregation_dir[j][0] == tmp_dir[0]:
+                                if not Aggregation_dir[j][1]:
+                                    Aggregation_dir[j][1] = tmp_dir[1]
+                                    merge_flg = True
+                                else:
+                                    Aggregation_dir[j][1] = Aggregation_dir[j][1] + ':' + tmp_dir[1]
+                                    merge_flg = True
+                        if merge_flg == False :
+                            Aggregation_dir.append(tmp_dir)
+            if not riskName_dir:
+                pass
+            else:
+                for i in range(len(riskName_dir)):
+                    tmp_dir = [riskName_dir[i],'',riskName[i],row[11]]
+                    if i == 0 and not Aggregation_dir:
+                        Aggregation_dir.append(tmp_dir)
+                    else :
+                        # すでにAggregation_dirに登録されてる分類コードなら:で連結する
+                        merge_flg = False
+                        for j in range(len(Aggregation_dir)):
+                            if Aggregation_dir[j][0] == tmp_dir[0]:
+                                if not Aggregation_dir[j][2]:
+                                    Aggregation_dir[j][2] = tmp_dir[2]
+                                    merge_flg = True
+                                else:
+                                    Aggregation_dir[j][2] = Aggregation_dir[j][2] + ':' + tmp_dir[2]
+                                    merge_flg = True
+                        if merge_flg == False :
+                            Aggregation_dir.append(tmp_dir)
+            print(Aggregation_dir)
+            # Aggregation_dirで増えたレコードをもとのpandasに追加
+if __name__ == "__main__":
+    # ログをファイルから読み込み
+    log_parser()