質問編集履歴

1

追記しました

2020/04/25 17:14

投稿

villas
villas

スコア14

test CHANGED
File without changes
test CHANGED
@@ -37,3 +37,1333 @@
37
37
  tmpファイルは作成されていてその中にコメントは入っています。
38
38
 
39
39
  他にも気になる部分があればお尋ねください
40
+
41
+
42
+
43
+ ### 追記
44
+
45
+  編集できる事については初めて知りました。ありがとうございます。
46
+
47
+
48
+
49
+ エラーが起こった部分は
50
+
51
+ ```
52
+
53
+ cmnd = ['./CatFiles.sh', tmpHeadFile, tmpMainFile, headlessFile, pediLogFileName]
54
+
55
+ ```
56
+
57
+ だと思います
58
+
59
+
60
+
61
+ コードは以下の通りです
62
+
63
+ ```
64
+
65
+ # coding: UTF-8
66
+
67
+
68
+
69
+ import requests
70
+
71
+
72
+
73
+ from bs4 import BeautifulSoup
74
+
75
+
76
+
77
+ import re
78
+
79
+
80
+
81
+ from time import sleep
82
+
83
+
84
+
85
+ from pprint import pprint
86
+
87
+
88
+
89
+ import os.path
90
+
91
+
92
+
93
+ from datetime import datetime, timedelta, timezone
94
+
95
+
96
+
97
+ import subprocess
98
+
99
+
100
+
101
+ import sys
102
+
103
+
104
+
105
+ import shutil
106
+
107
+
108
+
109
+ from functools import partial
110
+
111
+
112
+
113
+
114
+
115
+
116
+
117
# Number of BBS responses shown on a single page (used for paging math below).
RES_IN_SINGLEPAGE = 30

# Relative directory under which per-article log files are written.
LOG_STORE_DIRECTORY = 'logs'

# Seconds to sleep between successive page fetches (politeness delay).
SCRAPING_INTERVAL_TIME = 3

# URL prefix of nicopedi article pages; target URLs must start with this.
NICOPEDI_URL_HEAD_A = "https://dic.nicovideo.jp/a/"
138
+
139
+
140
+
141
+
142
+
143
+
144
+
145
+
146
+
147
+
148
+
149
def CheckCreateDirectory(location, dirName):
    """Ensure that ``location/dirName`` exists and return its relative path.

    Parameters
    ----------
    location : str
        Parent directory (e.g. ``'.'``).
    dirName : str
        Name of the directory to create under *location*.

    Returns
    -------
    str
        The path ``location + '/' + dirName``.
    """
    relativePath = location + '/' + dirName
    # makedirs(..., exist_ok=True) replaces the original
    # `if not exists: mkdir` pair, which had a check-then-create race
    # (another process could create the directory in between).
    os.makedirs(relativePath, exist_ok=True)
    return relativePath
186
+
187
+
188
+
189
+
190
+
191
+
192
+
193
+
194
+
195
+
196
+
197
def GetSearchTargetURLs(baseURL, latestId):
    """Build the list of BBS page URLs that still need to be fetched.

    Reads the article's pager block to learn the highest response number,
    then generates one ``.../b/a/<article>/<n>-`` URL per 30-response page,
    starting from the page that contains response ``latestId + 1``.

    Returns a list of URL strings, or None when the article has no
    responses (or no pager block) to scrape.
    """
    response = requests.get(baseURL)
    soup = BeautifulSoup(response.content, "html.parser")

    # No pager block means there is nothing to collect for this article.
    if not soup.find('div', class_='st-pg_contents'):
        print_red('Nothing any response in this article.', is_bold=True)
        return None

    # Remove the first navigation anchor so it does not pollute the pager text.
    navi = soup.find('a', class_='navi')
    if navi:
        navi.decompose()

    pagerText = soup.select("div.st-pg_contents")[0].getText().strip()

    # Keep only the numeric fragments of each pager line.
    numbers = [int(digits)
               for digits in (re.sub(r'\D', '', line)
                              for line in pagerText.split('\n'))
               if digits != '']

    if not numbers:
        print('Nothing any response to get.')
        return None

    # Last pager entry is the highest response number; map it to a page count.
    lastPage = int((numbers[-1] - 1) / RES_IN_SINGLEPAGE) + 1
    firstPage = latestId // RES_IN_SINGLEPAGE

    # The BBS lives under /b/a/ instead of /a/.
    bbsBase = baseURL.replace('/a/', '/b/a/')

    print(firstPage * RES_IN_SINGLEPAGE, 'To', lastPage * RES_IN_SINGLEPAGE)

    pageUrls = [bbsBase + '/' + str(page * RES_IN_SINGLEPAGE + 1) + '-'
                for page in range(firstPage, lastPage)]

    # Rough lower bound on runtime from the per-page politeness delay.
    totalSec = len(pageUrls) * SCRAPING_INTERVAL_TIME
    totalMin = totalSec // 60
    totalHrs = totalMin // 60
    print('Minimum estimation time =',
          str(totalSec) + 's / ',
          str(totalMin) + 'm / ',
          str(totalHrs) + 'h')

    return pageUrls
502
+
503
+
504
+
505
+
506
+
507
+
508
+
509
+
510
+
511
def GetAllResInPage(tgtUrl):
    """Fetch one BBS page and extract every response on it.

    Parameters
    ----------
    tgtUrl : str
        URL of a single BBS page (``.../b/a/<article>/<n>-``).

    Returns
    -------
    tuple[int, list[str], list[str]]
        ``(resCount, formattedHead, formattedBody)`` — the number of
        responses found, one one-line header string per response
        ("<resNo> <name> <info>"), and one body string per response.
    """
    page = requests.get(tgtUrl)
    soup = BeautifulSoup(page.content, "html.parser")

    resheads = soup.find_all("dt", class_="st-bbs_reshead")
    resbodys = soup.find_all("dd", class_="st-bbs_resbody")

    formattedHead = []
    formattedBody = []

    # Hoisted out of the loop; collapses runs of spaces left by the markup.
    multiSpace = re.compile(r' +')

    for rhead in resheads:
        # The original re-parsed str(rhead) with a fresh BeautifulSoup per
        # response; a Tag can be searched directly, avoiding that re-parse.
        bbs_resNo = rhead.find('span', class_='st-bbs_resNo').getText()
        bbs_name = rhead.find('span', class_='st-bbs_name').getText()
        bbs_resInfo = rhead.find('div', class_='st-bbs_resInfo').getText()

        # Flatten the multi-line info block into one space-separated line.
        bbs_resInfo = bbs_resInfo.strip().replace('\n', ' ')
        bbs_resInfo = multiSpace.sub(' ', bbs_resInfo)

        formattedHead.append(' '.join([bbs_resNo, bbs_name, bbs_resInfo]))

    for rbody in resbodys:
        # Turn <br> tags into real newlines before stripping the remaining
        # markup, so line breaks inside a response survive getText().
        b = str(rbody).replace("<br>", "\n").replace("<br/>", "\n")
        b = BeautifulSoup(b, "html.parser").getText()
        formattedBody.append(b.strip())

    # One <dd> body per response, so the body count is the response count.
    return len(formattedBody), formattedHead, formattedBody
730
+
731
+
732
+
733
+
734
+
735
+
736
+
737
+
738
+
739
+
740
+
741
def TeeOutput(text, file):
    """Write *text* plus a trailing newline to *file*.

    Named "tee" because it originally echoed to stdout as well; that echo
    is disabled (see the commented-out print in the history).
    """
    # print(text + '\n', end="")
    file.write(text + '\n')
754
+
755
+
756
+
757
+
758
+
759
+
760
+
761
+
762
+
763
+
764
+
765
def GetLatestID(fName):
    """Return the latest saved response id recorded in a log file.

    The first line of a header written by this script is
    ``<pageTitle> <timestamp> <latestId>``; the third whitespace-separated
    field is the id.

    Parameters
    ----------
    fName : str
        Path to an existing log file.

    Returns
    -------
    int
        The latest response id.

    Raises
    ------
    OSError
        If the file cannot be read (after printing the original diagnostic).
    ValueError / IndexError
        If the first line does not have an integer third field.
    """
    # Read the first line directly instead of shelling out to `head -1`;
    # portable, and no subprocess per call.
    try:
        with open(fName) as reader:
            firstLine = reader.readline()
    except OSError:
        # The original bare `except` printed this and then fell through to
        # a NameError on the undefined result; re-raise instead.
        print("Error.")
        raise
    return int(firstLine.split()[2])
806
+
807
+
808
+
809
+
810
+
811
+
812
+
813
+
814
+
815
+
816
+
817
def print_colored(code, text, is_bold=False):
    """Print *text* wrapped in ANSI SGR escape sequences.

    code: SGR color code as a string (e.g. '31' for red).
    is_bold: when True, prepend the bold attribute ('1;') to the code.
    """
    attrs = '1;%s' % code if is_bold else code
    print('\033[%sm%s\033[0m' % (attrs, text))
834
+
835
+
836
+
837
+
838
+
839
+
840
+
841
def print_red(text, is_bold=False):
    """Print *text* in red ('31' is the ANSI SGR code for red)."""
    print_colored('31', text, is_bold=is_bold)
842
+
843
+
844
+
845
+
846
+
847
+
848
+
849
+
850
+
851
+
852
+
853
def IsValidURL(targetURL):
    """Return True when *targetURL* is under the nicopedi article prefix."""
    return targetURL.startswith(NICOPEDI_URL_HEAD_A)
862
+
863
+
864
+
865
+
866
+
867
+
868
+
869
# ---- Main -------------------------------------------------------------------
# Scrape every BBS response page of a nicopedi article (given as argv[1]) and
# append the new responses to a per-article log file under LOG_STORE_DIRECTORY.

args = sys.argv
if len(args) <= 1:
    print_red('Nothing Target URL', is_bold=True)
    sys.exit(0)
tgtArtUrl = args[1]

if not IsValidURL(tgtArtUrl):
    print_red('This is not valid URL.', is_bold=True)
    print('Target URL should be under', NICOPEDI_URL_HEAD_A)
    sys.exit(0)

logDir = CheckCreateDirectory('.', LOG_STORE_DIRECTORY)

# Resolve the article title; it becomes the log file name.
art_req = requests.get(tgtArtUrl)
art_soup = BeautifulSoup(art_req.content, 'html.parser')
# Drop the category label and the reading (yomi) so only the title remains.
art_soup.find('span', class_='st-label_title-category').decompose()
art_soup.find('div', class_='a-title-yomi').decompose()
titleTxt = art_soup.find('div', class_='a-title')
pageTitle = titleTxt.getText().strip().replace(' ', '_')

pediLogFileName = logDir + '/' + pageTitle + ".log"

# JST timestamp used to name the per-run working directory and temp files.
JST = timezone(timedelta(hours=+9), 'JST')
now = datetime.now(JST)
nowstamp = str(now.timestamp()).replace('.', '')

tmpDir = CheckCreateDirectory('.', nowstamp)
tmpMainFile = tmpDir + '/' + nowstamp + '.main' + '.tmp'

print('Output log file = [', pediLogFileName, ']')

# Resume from an existing log when present; otherwise start a fresh one.
if os.path.exists(pediLogFileName):
    print("Found log file.")
    latestId = GetLatestID(pediLogFileName)
    openMode = 'a'
    shutil.copyfile(pediLogFileName, tmpMainFile)
else:
    print("Not found log file.")
    latestId = 0
    openMode = 'w'

# A brand-new log starts with the page title line.  (The original opened and
# closed the file by hand; `with` guarantees the close.)
with open(tmpMainFile, openMode) as writer:
    if openMode == 'w':
        TeeOutput(pageTitle + '\n', writer)

targetURLs = GetSearchTargetURLs(tgtArtUrl, latestId)
if targetURLs is None:
    sys.exit(0)

print('Progress ... ', end='', flush=True)
for url in targetURLs:
    with open(tmpMainFile, 'a') as writer:
        resCount, formattedHead, formattedBody = GetAllResInPage(url)
        # On the first (possibly partial) page, skip responses already logged.
        mark = (latestId % RES_IN_SINGLEPAGE)
        for i in range(mark, resCount):
            TeeOutput(formattedHead[i], writer)
            TeeOutput(formattedBody[i], writer)
            TeeOutput("", writer)
            latestId += 1
    print(latestId, end=' ', flush=True)
    # Politeness delay between fetches; no need to wait after the last page.
    if url != targetURLs[-1]:
        sleep(SCRAPING_INTERVAL_TIME)
print()

# Write the one-line metadata header: "<title> <date/time> <latestId>".
tmpHeadFile = tmpDir + '/' + nowstamp + '.head' + '.tmp'
with open(tmpHeadFile, 'w') as writer:
    metaInfo = [pageTitle, str(now.strftime("%Y-%m-%d/%H:%M")), str(latestId)]
    TeeOutput(' '.join(metaInfo), writer)

# CatFiles.sh concatenates the header and main body into the final log file.
# NOTE(review): assumes ./CatFiles.sh exists in the CWD; its exit status is
# not checked, matching the original behavior.
headlessFile = tmpDir + '/' + 'headless' + '.tmp'
cmnd = ['./CatFiles.sh', tmpHeadFile, tmpMainFile, headlessFile, pediLogFileName]
# pprint(cmnd)
subprocess.call(cmnd)

shutil.rmtree(tmpDir)

print("Output =", pediLogFileName, '(', latestId, ')' )
1368
+
1369
+ ```