Background and what I want to achieve
I want to save the comment section of a ニコニコ大百科 (Niconico Pedia) article, so I copied and pasted the code linked here:
https://github.com/we-yu/sb.webscraping/blob/master/nicopedy_saver.py
Result of running it
When I try to run the script, the following error occurs:
FileNotFoundError: [WinError 2] The system cannot find the file specified.
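On Windows, [WinError 2] from subprocess means that the program named at the start of the command list could not be found and started. A minimal sketch that reproduces the same message independently of the scraper (no_such_script.sh is a made-up name that does not exist):

import subprocess

# Windows cannot locate or start this shell script, so the call raises
# FileNotFoundError: [WinError 2] The system cannot find the file specified.
subprocess.call(['./no_such_script.sh'])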
Supplementary information
The command-line argument is provided.
The tmp file is created, and the scraped comments are inside it.
If there is anything else you would like me to check, please ask.
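For reference, the command-line argument here is the target article URL; the script reads it as args[1] and checks that it starts with NICOPEDI_URL_HEAD_A, so the invocation looks roughly like this (<article_name> is a placeholder):

python nicopedy_saver.py https://dic.nicovideo.jp/a/<article_name>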
Addendum
I did not know that the question could be edited; thank you for letting me know.
I believe the part where the error occurs is this line:
cmnd = ['./CatFiles.sh', tmpHeadFile, tmpMainFile, headlessFile, pediLogFileName]
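That line asks subprocess to run ./CatFiles.sh, which is a Unix shell script (GetLatestID likewise runs the Unix head command). On a standard Windows setup neither can be found and started, which is what [WinError 2] reports, even though the scraping itself succeeds and the tmp files are written. As a rough sketch only: if CatFiles.sh does nothing more than join the one-line header file and the collected comments into the final log, that step could be done directly in Python. cat_files below is a hypothetical helper, not part of the original script, and it assumes that simple concatenation is all the shell script does:

import shutil

def cat_files(out_path, *in_paths):
    # Hypothetical stand-in for ./CatFiles.sh: concatenate the input files into out_path.
    with open(out_path, 'w', encoding='utf-8') as out:
        for path in in_paths:
            with open(path, 'r', encoding='utf-8') as src:
                shutil.copyfileobj(src, out)

# For example, instead of subprocess.call(cmnd):
# cat_files(pediLogFileName, tmpHeadFile, tmpMainFile)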
The full code is as follows.
Please post the code in which the error occurred.
# coding: UTF-8
import requests
from bs4 import BeautifulSoup
import re
from time import sleep
from pprint import pprint
import os.path
from datetime import datetime, timedelta, timezone
import subprocess
import sys
import shutil
from functools import partial
RES_IN_SINGLEPAGE = 30
LOG_STORE_DIRECTORY = 'logs'
SCRAPING_INTERVAL_TIME = 3
NICOPEDI_URL_HEAD_A = "https://dic.nicovideo.jp/a/"
def CheckCreateDirectory(location, dirName) :
    relativePath = location + '/' + dirName
    if not os.path.exists(relativePath) :
        os.mkdir(relativePath)
        # print('Create',relativePath)
    return relativePath
def GetSearchTargetURLs(baseURL, latestId) :
    pageUrls = []
    tgtPage = requests.get(baseURL)
    soup = BeautifulSoup(tgtPage.content, "html.parser")
    if not soup.find('div', class_='st-pg_contents') :
        print_red('Nothing any response in this article.', is_bold=True)
        return None
    if soup.find('a', class_='navi') :
        soup.find('a', class_='navi').decompose()
    pagers = soup.select("div.st-pg_contents")
    pager = pagers[0]
    pager = pager.getText()
    splitedTxt = pager.strip()
    splitedTxts = splitedTxt.split('\n')
    txts = []
    for txt in splitedTxts :
        v = re.sub(r'\D', '', txt)
        if v == '' : continue
        txts.append(int(v))
    if len(txts) == 0 :
        print('Nothing any response to get.')
        return None
    # print(len(txts), txts[-1])
    finalPage = int((txts[-1] - 1) / RES_IN_SINGLEPAGE)
    finalPage += 1
    startPage = latestId // RES_IN_SINGLEPAGE
    baseBbsUrl = baseURL.replace('/a/', '/b/a/')
    print(startPage * RES_IN_SINGLEPAGE, 'To', finalPage * RES_IN_SINGLEPAGE)
    # pprint(txts)
    for i in range(startPage, finalPage) :
        pageNum = i * RES_IN_SINGLEPAGE + 1
        pageUrl = baseBbsUrl + '/' + str(pageNum) + '-'
        pageUrls.append(pageUrl)
    estSec = len(pageUrls) * SCRAPING_INTERVAL_TIME
    estMin = estSec // 60
    estHrs = estMin // 60
    estSec = str(estSec) + 's / '
    estMin = str(estMin) + 'm / '
    estHrs = str(estHrs) + 'h'
    print('Minimum estimation time =', estSec, estMin, estHrs)
    return pageUrls
def GetAllResInPage(tgtUrl) :
    r = requests.get(tgtUrl)
    soup = BeautifulSoup(r.content, "html.parser")
    resheads = soup.find_all("dt", class_="st-bbs_reshead")
    resbodys = soup.find_all("dd", class_="st-bbs_resbody")
    formattedHead = []
    formattedBody = []
    resCount = 0
    for rhead in resheads:
        h = rhead
        hObj = BeautifulSoup(str(h), 'html.parser')
        bbs_resNo = hObj.find('span', class_='st-bbs_resNo').getText()
        bbs_name = hObj.find('span', class_='st-bbs_name').getText()
        bbs_resInfo = hObj.find('div', class_='st-bbs_resInfo').getText()
        bbs_resInfo = bbs_resInfo.strip()
        bbs_resInfo = bbs_resInfo.strip('\n')
        bbs_resInfo = bbs_resInfo.replace('\n', ' ')
        pattern = r' +'
        bbs_resInfo = re.sub(pattern, ' ', bbs_resInfo)
        # print(bbs_resNo, bbs_name, bbs_resInfo)
        resHeaders = [bbs_resNo, bbs_name, bbs_resInfo]
        h = ' '.join(resHeaders)
        formattedHead.append(h)
    for rbody in resbodys:
        b = str(rbody)
        b = b.replace("<br>", "\n")
        b = b.replace("<br/>", "\n")
        b = BeautifulSoup(b, "html.parser").getText()
        b = b.strip()
        b = b.strip('\n')
        formattedBody.append(b)
        resCount += 1
    return resCount, formattedHead, formattedBody
def TeeOutput(text, file) :
    # print(text + '\n', end="")
    file.write(text + '\n')
    return
def GetLatestID(fName):
    try:
        cmnd = ['head', '-1', fName]
        subResult = subprocess.check_output(cmnd)
    except:
        print("Error.")
    heads = subResult.split()
    id = int(heads[2])
    return id
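# Reviewer note: 'head' in GetLatestID is also a Unix-only command, so the
# check_output call above cannot work on Windows either (the bare except then
# swallows the error and subResult ends up undefined). Reading the first line
# with plain Python file I/O, e.g.
#     with open(fName) as f: heads = f.readline().split()
# would be a portable alternative.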
def print_colored(code, text, is_bold=False):
    if is_bold:
        code = '1;%s' % code
    print('\033[%sm%s\033[0m' % (code, text))
print_red = partial(print_colored, '31')
def IsValidURL(targetURL) :
    isValid = targetURL.startswith(NICOPEDI_URL_HEAD_A)
    return isValid
# Main processing starts here -----------------------------------------------------------------
args=sys.argv
if len(args)<=1:
    print_red('Nothing Target URL',is_bold=True)
    sys.exit(0)
tgtArtUrl = args[1]
if not IsValidURL(tgtArtUrl) :
    print_red('This is not valid URL.', is_bold=True)
    print('Target URL should be under', NICOPEDI_URL_HEAD_A)
    sys.exit(0)
logDir = CheckCreateDirectory('.', LOG_STORE_DIRECTORY)
art_req = requests.get(tgtArtUrl)
art_soup = BeautifulSoup(art_req.content, 'html.parser')
art_soup.find('span', class_='st-label_title-category').decompose()
art_soup.find('div', class_='a-title-yomi').decompose()
titleTxt = art_soup.find('div', class_='a-title')
pageTitle = titleTxt.getText()
pageTitle = pageTitle.strip()
pageTitle = pageTitle.strip('\n')
pageTitle = pageTitle.replace(' ', '_')
pediLogFileName = pageTitle + ".log"
pediLogFileName = logDir + '/' + pediLogFileName
JST = timezone(timedelta(hours=+9), 'JST')
now = datetime.now(JST)
nowstamp = str(now.timestamp()).replace('.','')
tmpDir = CheckCreateDirectory('.', nowstamp)
tmpMainFile = tmpDir + '/' + nowstamp + '.main' + '.tmp'
print('Output log file = [', pediLogFileName, ']')
if os.path.exists(pediLogFileName) :
print("Found log file.")
latestId = GetLatestID(pediLogFileName)
openMode = 'a'
shutil.copyfile(pediLogFileName, tmpMainFile)
else :
print("Not found log file.")
latestId = 0
openMode = 'w'
writer = open(tmpMainFile, openMode)
if openMode == 'w' : TeeOutput(pageTitle + '\n', writer)
writer.close()
targetURLs = GetSearchTargetURLs(tgtArtUrl, latestId)
if targetURLs == None :
    sys.exit(0)
print('Progress ... ', end='', flush=True)
for url in targetURLs:
    with open(tmpMainFile, 'a') as writer:
        resCount, formattedHead, formattedBody = GetAllResInPage(url)
        mark = (latestId % RES_IN_SINGLEPAGE)
        for i in range(mark, resCount):
            TeeOutput(formattedHead[i], writer)
            TeeOutput(formattedBody[i], writer)
            TeeOutput("", writer)
            latestId += 1
    print(latestId, end=' ', flush=True)
    if url != targetURLs[-1] : sleep(SCRAPING_INTERVAL_TIME)
print()
tmpHeadFile = tmpDir + '/' + nowstamp + '.head' + '.tmp'
with open(tmpHeadFile, 'w') as writer:
    metaInfo = [pageTitle, str(now.strftime("%Y-%m-%d/%H:%M")), str(latestId)]
    metaInfoLine = ' '.join(metaInfo)
    TeeOutput(metaInfoLine, writer)
headlessFile = tmpDir + '/' + 'headless' + '.tmp'
cmnd = ['./CatFiles.sh', tmpHeadFile, tmpMainFile, headlessFile, pediLogFileName]
# pprint(cmnd)
subResult=subprocess.call(cmnd)
shutil.rmtree(tmpDir)
print("Output =", pediLogFileName, '(', latestId, ')' )
1. I appreciate the effort, but code written in this comment field cannot be read as code. You can edit your question, so please add the code to the question itself.
2. In which part of the code did the error occur?