質問編集履歴

1

追記しました

2020/04/25 17:14

投稿

villas
villas

スコア14

test CHANGED
File without changes
test CHANGED
@@ -37,3 +37,1333 @@
37
37
  tmpファイルは作成されていてその中にコメントは入っています。
38
38
 
39
39
  他にも気になる部分があればお尋ねください
40
+
41
+
42
+
43
+ ### 追記
44
+
45
+  編集できる事については初めて知りました。ありがとうございます。
46
+
47
+
48
+
49
+ エラーが起こった部分は
50
+
51
+ ```
52
+
53
+ cmnd = ['./CatFiles.sh', tmpHeadFile, tmpMainFile, headlessFile, pediLogFileName]
54
+
55
+ ```
56
+
57
+ だと思います
58
+
59
+
60
+
61
+ コードは以下の通りです
62
+
63
+ ```
64
+
65
+ # coding: UTF-8
66
+
67
+
68
+
69
+ import requests
70
+
71
+
72
+
73
+ from bs4 import BeautifulSoup
74
+
75
+
76
+
77
+ import re
78
+
79
+
80
+
81
+ from time import sleep
82
+
83
+
84
+
85
+ from pprint import pprint
86
+
87
+
88
+
89
+ import os.path
90
+
91
+
92
+
93
+ from datetime import datetime, timedelta, timezone
94
+
95
+
96
+
97
+ import subprocess
98
+
99
+
100
+
101
+ import sys
102
+
103
+
104
+
105
+ import shutil
106
+
107
+
108
+
109
+ from functools import partial
110
+
111
+
112
+
113
+
114
+
115
+
116
+
117
# Number of BBS responses shown on a single page (used for paging math below).
RES_IN_SINGLEPAGE = 30

# Relative directory under which per-article log files are written.
LOG_STORE_DIRECTORY = 'logs'

# Seconds to sleep between successive page fetches (politeness delay).
SCRAPING_INTERVAL_TIME = 3

# URL prefix of nicopedi article pages; target URLs must start with this.
NICOPEDI_URL_HEAD_A = "https://dic.nicovideo.jp/a/"
138
+
139
+
140
+
141
+
142
+
143
+
144
+
145
+
146
+
147
+
148
+
149
def CheckCreateDirectory(location, dirName):
    """Ensure that ``location/dirName`` exists and return its relative path.

    Parameters
    ----------
    location : str
        Parent directory (e.g. ``'.'``).
    dirName : str
        Name of the directory to create under *location*.

    Returns
    -------
    str
        The path ``location + '/' + dirName``.
    """
    relativePath = location + '/' + dirName
    # makedirs(..., exist_ok=True) replaces the original
    # `if not exists: mkdir` pair, which had a check-then-create race
    # (another process could create the directory in between).
    os.makedirs(relativePath, exist_ok=True)
    return relativePath
186
+
187
+
188
+
189
+
190
+
191
+
192
+
193
+
194
+
195
+
196
+
197
def GetSearchTargetURLs(baseURL, latestId):
    """Build the list of BBS page URLs that still need to be fetched.

    Reads the article's pager block to learn the highest response number,
    then generates one ``.../b/a/<article>/<n>-`` URL per 30-response page,
    starting from the page that contains response ``latestId + 1``.

    Returns a list of URL strings, or None when the article has no
    responses (or no pager block) to scrape.
    """
    response = requests.get(baseURL)
    soup = BeautifulSoup(response.content, "html.parser")

    # No pager block means there is nothing to collect for this article.
    if not soup.find('div', class_='st-pg_contents'):
        print_red('Nothing any response in this article.', is_bold=True)
        return None

    # Remove the first navigation anchor so it does not pollute the pager text.
    navi = soup.find('a', class_='navi')
    if navi:
        navi.decompose()

    pagerText = soup.select("div.st-pg_contents")[0].getText().strip()

    # Keep only the numeric fragments of each pager line.
    numbers = [int(digits)
               for digits in (re.sub(r'\D', '', line)
                              for line in pagerText.split('\n'))
               if digits != '']

    if not numbers:
        print('Nothing any response to get.')
        return None

    # Last pager entry is the highest response number; map it to a page count.
    lastPage = int((numbers[-1] - 1) / RES_IN_SINGLEPAGE) + 1
    firstPage = latestId // RES_IN_SINGLEPAGE

    # The BBS lives under /b/a/ instead of /a/.
    bbsBase = baseURL.replace('/a/', '/b/a/')

    print(firstPage * RES_IN_SINGLEPAGE, 'To', lastPage * RES_IN_SINGLEPAGE)

    pageUrls = [bbsBase + '/' + str(page * RES_IN_SINGLEPAGE + 1) + '-'
                for page in range(firstPage, lastPage)]

    # Rough lower bound on runtime from the per-page politeness delay.
    totalSec = len(pageUrls) * SCRAPING_INTERVAL_TIME
    totalMin = totalSec // 60
    totalHrs = totalMin // 60
    print('Minimum estimation time =',
          str(totalSec) + 's / ',
          str(totalMin) + 'm / ',
          str(totalHrs) + 'h')

    return pageUrls
502
+
503
+
504
+
505
+
506
+
507
+
508
+
509
+
510
+
511
def GetAllResInPage(tgtUrl):
    """Fetch one BBS page and extract every response on it.

    Parameters
    ----------
    tgtUrl : str
        URL of a single BBS page (``.../b/a/<article>/<n>-``).

    Returns
    -------
    tuple[int, list[str], list[str]]
        ``(resCount, formattedHead, formattedBody)`` — the number of
        responses found, one one-line header string per response
        ("<resNo> <name> <info>"), and one body string per response.
    """
    page = requests.get(tgtUrl)
    soup = BeautifulSoup(page.content, "html.parser")

    resheads = soup.find_all("dt", class_="st-bbs_reshead")
    resbodys = soup.find_all("dd", class_="st-bbs_resbody")

    formattedHead = []
    formattedBody = []

    # Hoisted out of the loop; collapses runs of spaces left by the markup.
    multiSpace = re.compile(r' +')

    for rhead in resheads:
        # The original re-parsed str(rhead) with a fresh BeautifulSoup per
        # response; a Tag can be searched directly, avoiding that re-parse.
        bbs_resNo = rhead.find('span', class_='st-bbs_resNo').getText()
        bbs_name = rhead.find('span', class_='st-bbs_name').getText()
        bbs_resInfo = rhead.find('div', class_='st-bbs_resInfo').getText()

        # Flatten the multi-line info block into one space-separated line.
        bbs_resInfo = bbs_resInfo.strip().replace('\n', ' ')
        bbs_resInfo = multiSpace.sub(' ', bbs_resInfo)

        formattedHead.append(' '.join([bbs_resNo, bbs_name, bbs_resInfo]))

    for rbody in resbodys:
        # Turn <br> tags into real newlines before stripping the remaining
        # markup, so line breaks inside a response survive getText().
        b = str(rbody).replace("<br>", "\n").replace("<br/>", "\n")
        b = BeautifulSoup(b, "html.parser").getText()
        formattedBody.append(b.strip())

    # One <dd> body per response, so the body count is the response count.
    return len(formattedBody), formattedHead, formattedBody
730
+
731
+
732
+
733
+
734
+
735
+
736
+
737
+
738
+
739
+
740
+
741
def TeeOutput(text, file):
    """Write *text* plus a trailing newline to *file*.

    Named "tee" because it originally echoed to stdout as well; that echo
    is disabled (see the commented-out print in the history).
    """
    # print(text + '\n', end="")
    file.write(text + '\n')
754
+
755
+
756
+
757
+
758
+
759
+
760
+
761
+
762
+
763
+
764
+
765
def GetLatestID(fName):
    """Return the latest saved response id recorded in a log file.

    The first line of a header written by this script is
    ``<pageTitle> <timestamp> <latestId>``; the third whitespace-separated
    field is the id.

    Parameters
    ----------
    fName : str
        Path to an existing log file.

    Returns
    -------
    int
        The latest response id.

    Raises
    ------
    OSError
        If the file cannot be read (after printing the original diagnostic).
    ValueError / IndexError
        If the first line does not have an integer third field.
    """
    # Read the first line directly instead of shelling out to `head -1`;
    # portable, and no subprocess per call.
    try:
        with open(fName) as reader:
            firstLine = reader.readline()
    except OSError:
        # The original bare `except` printed this and then fell through to
        # a NameError on the undefined result; re-raise instead.
        print("Error.")
        raise
    return int(firstLine.split()[2])
806
+
807
+
808
+
809
+
810
+
811
+
812
+
813
+
814
+
815
+
816
+
817
def print_colored(code, text, is_bold=False):
    """Print *text* wrapped in ANSI SGR escape sequences.

    code: SGR color code as a string (e.g. '31' for red).
    is_bold: when True, prepend the bold attribute ('1;') to the code.
    """
    attrs = '1;%s' % code if is_bold else code
    print('\033[%sm%s\033[0m' % (attrs, text))
834
+
835
+
836
+
837
+
838
+
839
+
840
+
841
def print_red(text, is_bold=False):
    """Print *text* in red ('31' is the ANSI SGR code for red)."""
    print_colored('31', text, is_bold=is_bold)
842
+
843
+
844
+
845
+
846
+
847
+
848
+
849
+
850
+
851
+
852
+
853
def IsValidURL(targetURL):
    """Return True when *targetURL* is under the nicopedi article prefix."""
    return targetURL.startswith(NICOPEDI_URL_HEAD_A)
862
+
863
+
864
+
865
+
866
+
867
+
868
+
869
# ---- Main -------------------------------------------------------------------
# Scrape every BBS response page of a nicopedi article (given as argv[1]) and
# append the new responses to a per-article log file under LOG_STORE_DIRECTORY.

args = sys.argv
if len(args) <= 1:
    print_red('Nothing Target URL', is_bold=True)
    sys.exit(0)
tgtArtUrl = args[1]

if not IsValidURL(tgtArtUrl):
    print_red('This is not valid URL.', is_bold=True)
    print('Target URL should be under', NICOPEDI_URL_HEAD_A)
    sys.exit(0)

logDir = CheckCreateDirectory('.', LOG_STORE_DIRECTORY)

# Resolve the article title; it becomes the log file name.
art_req = requests.get(tgtArtUrl)
art_soup = BeautifulSoup(art_req.content, 'html.parser')
# Drop the category label and the reading (yomi) so only the title remains.
art_soup.find('span', class_='st-label_title-category').decompose()
art_soup.find('div', class_='a-title-yomi').decompose()
titleTxt = art_soup.find('div', class_='a-title')
pageTitle = titleTxt.getText().strip().replace(' ', '_')

pediLogFileName = logDir + '/' + pageTitle + ".log"

# JST timestamp used to name the per-run working directory and temp files.
JST = timezone(timedelta(hours=+9), 'JST')
now = datetime.now(JST)
nowstamp = str(now.timestamp()).replace('.', '')

tmpDir = CheckCreateDirectory('.', nowstamp)
tmpMainFile = tmpDir + '/' + nowstamp + '.main' + '.tmp'

print('Output log file = [', pediLogFileName, ']')

# Resume from an existing log when present; otherwise start a fresh one.
if os.path.exists(pediLogFileName):
    print("Found log file.")
    latestId = GetLatestID(pediLogFileName)
    openMode = 'a'
    shutil.copyfile(pediLogFileName, tmpMainFile)
else:
    print("Not found log file.")
    latestId = 0
    openMode = 'w'

# A brand-new log starts with the page title line.  (The original opened and
# closed the file by hand; `with` guarantees the close.)
with open(tmpMainFile, openMode) as writer:
    if openMode == 'w':
        TeeOutput(pageTitle + '\n', writer)

targetURLs = GetSearchTargetURLs(tgtArtUrl, latestId)
if targetURLs is None:
    sys.exit(0)

print('Progress ... ', end='', flush=True)
for url in targetURLs:
    with open(tmpMainFile, 'a') as writer:
        resCount, formattedHead, formattedBody = GetAllResInPage(url)
        # On the first (possibly partial) page, skip responses already logged.
        mark = (latestId % RES_IN_SINGLEPAGE)
        for i in range(mark, resCount):
            TeeOutput(formattedHead[i], writer)
            TeeOutput(formattedBody[i], writer)
            TeeOutput("", writer)
            latestId += 1
    print(latestId, end=' ', flush=True)
    # Politeness delay between fetches; no need to wait after the last page.
    if url != targetURLs[-1]:
        sleep(SCRAPING_INTERVAL_TIME)
print()

# Write the one-line metadata header: "<title> <date/time> <latestId>".
tmpHeadFile = tmpDir + '/' + nowstamp + '.head' + '.tmp'
with open(tmpHeadFile, 'w') as writer:
    metaInfo = [pageTitle, str(now.strftime("%Y-%m-%d/%H:%M")), str(latestId)]
    TeeOutput(' '.join(metaInfo), writer)

# CatFiles.sh concatenates the header and main body into the final log file.
# NOTE(review): assumes ./CatFiles.sh exists in the CWD; its exit status is
# not checked, matching the original behavior.
headlessFile = tmpDir + '/' + 'headless' + '.tmp'
cmnd = ['./CatFiles.sh', tmpHeadFile, tmpMainFile, headlessFile, pediLogFileName]
# pprint(cmnd)
subprocess.call(cmnd)

shutil.rmtree(tmpDir)

print("Output =", pediLogFileName, '(', latestId, ')' )
1368
+
1369
+ ```