FileNotFoundError: [WinError 2] 指定されたファイルが見つかりません。
cmnd = ['./CatFiles.sh', tmpHeadFile, tmpMainFile, headlessFile, pediLogFileName]
# coding: UTF-8 import requests from bs4 import BeautifulSoup import re from time import sleep from pprint import pprint import os.path from datetime import datetime, timedelta, timezone import subprocess import sys import shutil from functools import partial RES_IN_SINGLEPAGE = 30 LOG_STORE_DIRECTORY = 'logs' SCRAPING_INTERVAL_TIME = 3 NICOPEDI_URL_HEAD_A = "https://dic.nicovideo.jp/a/" def CheckCreateDirectory(location, dirName) : relativePath = location + '/' + dirName if not os.path.exists(relativePath) : os.mkdir(relativePath) # print('Create',relativePath) return relativePath def GetSearchTargetURLs(baseURL, latestId) : pageUrls = [] tgtPage = requests.get(baseURL) soup = BeautifulSoup(tgtPage.content, "html.parser") if not soup.find('div', class_='st-pg_contents') : print_red('Nothing any response in this article.', is_bold=True) return None if soup.find('a', class_='navi') : soup.find('a', class_='navi').decompose() pagers = soup.select("div.st-pg_contents") pager = pagers[0] pager = pager.getText() splitedTxt = pager.strip() splitedTxts = splitedTxt.split('\n') txts = [] for txt in splitedTxts : v = re.sub(r'\D', '', txt) if v == '' : continue txts.append(int(v)) if len(txts) == 0 : print('Nothing any response to get.') return None # print(len(txts), txts[-1]) finalPage = int((txts[-1] - 1) / RES_IN_SINGLEPAGE) finalPage += 1 startPage = latestId // RES_IN_SINGLEPAGE baseBbsUrl = baseURL.replace('/a/', '/b/a/') print(startPage * RES_IN_SINGLEPAGE, 'To', finalPage * RES_IN_SINGLEPAGE) # pprint(txts) for i in range(startPage, finalPage) : pageNum = i * RES_IN_SINGLEPAGE + 1 pageUrl = baseBbsUrl + '/' + str(pageNum) + '-' pageUrls.append(pageUrl) estSec = len(pageUrls) * SCRAPING_INTERVAL_TIME estMin = estSec // 60 estHrs = estMin // 60 estSec = str(estSec) + 's / ' estMin = str(estMin) + 'm / ' estHrs = str(estHrs) + 'h' print('Minimum estimation time =', estSec, estMin, estHrs) return pageUrls def GetAllResInPage(tgtUrl) : r = requests.get(tgtUrl) soup = BeautifulSoup(r.content, "html.parser") resheads = soup.find_all("dt", class_="st-bbs_reshead") resbodys = soup.find_all("dd", class_="st-bbs_resbody") formattedHead = [] formattedBody = [] resCount = 0 for rhead in resheads: h = rhead hObj = BeautifulSoup(str(h), 'html.parser') bbs_resNo = hObj.find('span', class_='st-bbs_resNo').getText() bbs_name = hObj.find('span', class_='st-bbs_name').getText() bbs_resInfo = hObj.find('div', class_='st-bbs_resInfo').getText() bbs_resInfo = bbs_resInfo.strip() bbs_resInfo = bbs_resInfo.strip('\n') bbs_resInfo = bbs_resInfo.replace('\n', ' ') pattern = r' +' bbs_resInfo = re.sub(pattern, ' ', bbs_resInfo) # print(bbs_resNo, bbs_name, bbs_resInfo) resHeaders = [bbs_resNo, bbs_name, bbs_resInfo] h = ' '.join(resHeaders) formattedHead.append(h) for rbody in resbodys: b = str(rbody) b = b.replace("<br>", "\n") b = b.replace("<br/>", "\n") b = BeautifulSoup(b, "html.parser").getText() b = b.strip() b = b.strip('\n') formattedBody.append(b) resCount += 1 return resCount, formattedHead, formattedBody def TeeOutput(text, file) : # print(text + '\n', end="") file.write(text + '\n') return def GetLatestID(fName): try: cmnd = ['head', '-1', fName] subResult = subprocess.check_output(cmnd) except: print("Error.") heads = subResult.split() id = int(heads[2]) return id def print_colored(code, text, is_bold=False): if is_bold: code = '1;%s' % code print('\033[%sm%s\033[0m' % (code, text)) print_red = partial(print_colored, '31') def IsValidURL(targetURL) : isValid = targetURL.startswith(NICOPEDI_URL_HEAD_A) return isValid # メイン処理スタート ----------------------------------------------------------------- args=sys.argv if len(args)<=1: print_red('Nothing Target URL',is_bold=True) sys.exit(0) tgtArtUrl = args[1] if not IsValidURL(tgtArtUrl) : print_red('This is not valid URL.', is_bold=True) print('Target URL should be under', NICOPEDI_URL_HEAD_A) sys.exit(0) logDir = CheckCreateDirectory('.', LOG_STORE_DIRECTORY) art_req = requests.get(tgtArtUrl) art_soup = BeautifulSoup(art_req.content, 'html.parser') art_soup.find('span', class_='st-label_title-category').decompose() art_soup.find('div', class_='a-title-yomi').decompose() titleTxt = art_soup.find('div', class_='a-title') pageTitle = titleTxt.getText() pageTitle = pageTitle.strip() pageTitle = pageTitle.strip('\n') pageTitle = pageTitle.replace(' ', '_') pediLogFileName = pageTitle + ".log" pediLogFileName = logDir + '/' + pediLogFileName JST = timezone(timedelta(hours=+9), 'JST') now = datetime.now(JST) nowstamp = str(now.timestamp()).replace('.','') tmpDir = CheckCreateDirectory('.', nowstamp) tmpMainFile = tmpDir + '/' + nowstamp + '.main' + '.tmp' print('Output log file = [', pediLogFileName, ']') if os.path.exists(pediLogFileName) : print("Found log file.") latestId = GetLatestID(pediLogFileName) openMode = 'a' shutil.copyfile(pediLogFileName, tmpMainFile) else : print("Not found log file.") latestId = 0 openMode = 'w' writer = open(tmpMainFile, openMode) if openMode == 'w' : TeeOutput(pageTitle + '\n', writer) writer.close() targetURLs = GetSearchTargetURLs(tgtArtUrl, latestId) if targetURLs == None : sys.exit(0) print('Progress ... ', end='', flush=True) for url in targetURLs: with open(tmpMainFile, 'a') as writer: resCount, formattedHead, formattedBody = GetAllResInPage(url) mark = (latestId % RES_IN_SINGLEPAGE) for i in range(mark, resCount): TeeOutput(formattedHead[i], writer) TeeOutput(formattedBody[i], writer) TeeOutput("", writer) latestId += 1 print(latestId, end=' ', flush=True) if url != targetURLs[-1] : sleep(SCRAPING_INTERVAL_TIME) print() tmpHeadFile = tmpDir + '/' + nowstamp + '.head' + '.tmp' with open(tmpHeadFile, 'w') as writer: metaInfo = [pageTitle, str(now.strftime("%Y-%m-%d/%H:%M")), str(latestId)] metaInfoLine = ' '.join(metaInfo) TeeOutput(metaInfoLine, writer) headlessFile = tmpDir + '/' + 'headless' + '.tmp' cmnd = ['./CatFiles.sh', tmpHeadFile, tmpMainFile, headlessFile, pediLogFileName] # pprint(cmnd) subResult=subprocess.call(cmnd) shutil.rmtree(tmpDir) print("Output =", pediLogFileName, '(', latestId, ')' )
# coding: UTF-8
import requests
from bs4 import BeautifulSoup
import re
from time import sleep
from pprint import pprint
import os.path
from datetime import datetime, timedelta, timezone
import subprocess
import sys
import shutil
from functools import partial
NICOPEDI_URL_HEAD_A = "https://dic.nicovideo.jp/a/"
def CheckCreateDirectory(location, dirName) :
relativePath = location + '/' + dirName
if not os.path.exists(relativePath) :
# print('Create',relativePath)
return relativePath
def GetSearchTargetURLs(baseURL, latestId) :
pageUrls = []
tgtPage = requests.get(baseURL)
soup = BeautifulSoup(tgtPage.content, "html.parser")
if not soup.find('div', class_='st-pg_contents') :
print_red('Nothing any response in this article.', is_bold=True)
return None
if soup.find('a', class_='navi') :
soup.find('a', class_='navi').decompose()
pagers = soup.select("div.st-pg_contents")
pager = pagers[0]
pager = pager.getText()
splitedTxt = pager.strip()
splitedTxts = splitedTxt.split('\n')
txts = []
for txt in splitedTxts :
v = re.sub(r'\D', '', txt)
if v == '' : continue
if len(txts) == 0 :
print('Nothing any response to get.')
return None
# print(len(txts), txts[-1])
finalPage = int((txts[-1] - 1) / RES_IN_SINGLEPAGE)
finalPage += 1
startPage = latestId // RES_IN_SINGLEPAGE
baseBbsUrl = baseURL.replace('/a/', '/b/a/')
print(startPage * RES_IN_SINGLEPAGE, 'To', finalPage * RES_IN_SINGLEPAGE)
# pprint(txts)
for i in range(startPage, finalPage) :
pageNum = i * RES_IN_SINGLEPAGE + 1
pageUrl = baseBbsUrl + '/' + str(pageNum) + '-'
estSec = len(pageUrls) * SCRAPING_INTERVAL_TIME
estMin = estSec // 60
estHrs = estMin // 60
estSec = str(estSec) + 's / '
estMin = str(estMin) + 'm / '
estHrs = str(estHrs) + 'h'
print('Minimum estimation time =', estSec, estMin, estHrs)
return pageUrls
def GetAllResInPage(tgtUrl) :
r = requests.get(tgtUrl)
soup = BeautifulSoup(r.content, "html.parser")
resheads = soup.find_all("dt", class_="st-bbs_reshead")
resbodys = soup.find_all("dd", class_="st-bbs_resbody")
formattedHead = []
formattedBody = []
resCount = 0
for rhead in resheads:
h = rhead
hObj = BeautifulSoup(str(h), 'html.parser')
bbs_resNo = hObj.find('span', class_='st-bbs_resNo').getText()
bbs_name = hObj.find('span', class_='st-bbs_name').getText()
bbs_resInfo = hObj.find('div', class_='st-bbs_resInfo').getText()
bbs_resInfo = bbs_resInfo.strip()
bbs_resInfo = bbs_resInfo.strip('\n')
bbs_resInfo = bbs_resInfo.replace('\n', ' ')
pattern = r' +'
bbs_resInfo = re.sub(pattern, ' ', bbs_resInfo)
# print(bbs_resNo, bbs_name, bbs_resInfo)
resHeaders = [bbs_resNo, bbs_name, bbs_resInfo]
h = ' '.join(resHeaders)
for rbody in resbodys:
b = str(rbody)
b = b.replace("<br>", "\n")
b = b.replace("<br/>", "\n")
b = BeautifulSoup(b, "html.parser").getText()
b = b.strip()
b = b.strip('\n')
resCount += 1
return resCount, formattedHead, formattedBody
def TeeOutput(text, file) :
# print(text + '\n', end="")
file.write(text + '\n')
def GetLatestID(fName):
cmnd = ['head', '-1', fName]
subResult = subprocess.check_output(cmnd)
heads = subResult.split()
id = int(heads[2])
return id
def print_colored(code, text, is_bold=False):
if is_bold:
code = '1;%s' % code
print('\033[%sm%s\033[0m' % (code, text))
print_red = partial(print_colored, '31')
def IsValidURL(targetURL) :
isValid = targetURL.startswith(NICOPEDI_URL_HEAD_A)
return isValid
# メイン処理スタート -----------------------------------------------------------------
if len(args)<=1:
print_red('Nothing Target URL',is_bold=True)
tgtArtUrl = args[1]
if not IsValidURL(tgtArtUrl) :
print_red('This is not valid URL.', is_bold=True)
print('Target URL should be under', NICOPEDI_URL_HEAD_A)
logDir = CheckCreateDirectory('.', LOG_STORE_DIRECTORY)
art_req = requests.get(tgtArtUrl)
art_soup = BeautifulSoup(art_req.content, 'html.parser')
art_soup.find('span', class_='st-label_title-category').decompose()
art_soup.find('div', class_='a-title-yomi').decompose()
titleTxt = art_soup.find('div', class_='a-title')
pageTitle = titleTxt.getText()
pageTitle = pageTitle.strip()
pageTitle = pageTitle.strip('\n')
pageTitle = pageTitle.replace(' ', '_')
pediLogFileName = pageTitle + ".log"
pediLogFileName = logDir + '/' + pediLogFileName
JST = timezone(timedelta(hours=+9), 'JST')
now = datetime.now(JST)
nowstamp = str(now.timestamp()).replace('.','')
tmpDir = CheckCreateDirectory('.', nowstamp)
tmpMainFile = tmpDir + '/' + nowstamp + '.main' + '.tmp'
print('Output log file = [', pediLogFileName, ']')
if os.path.exists(pediLogFileName) :
print("Found log file.")
latestId = GetLatestID(pediLogFileName)
openMode = 'a'
shutil.copyfile(pediLogFileName, tmpMainFile)
else :
print("Not found log file.")
latestId = 0
openMode = 'w'
writer = open(tmpMainFile, openMode)
if openMode == 'w' : TeeOutput(pageTitle + '\n', writer)
targetURLs = GetSearchTargetURLs(tgtArtUrl, latestId)
if targetURLs == None :
print('Progress ... ', end='', flush=True)
for url in targetURLs:
with open(tmpMainFile, 'a') as writer:
resCount, formattedHead, formattedBody = GetAllResInPage(url)
mark = (latestId % RES_IN_SINGLEPAGE)
for i in range(mark, resCount):
TeeOutput(formattedHead[i], writer)
TeeOutput(formattedBody[i], writer)
TeeOutput("", writer)
latestId += 1
print(latestId, end=' ', flush=True)
if url != targetURLs[-1] : sleep(SCRAPING_INTERVAL_TIME)
tmpHeadFile = tmpDir + '/' + nowstamp + '.head' + '.tmp'
with open(tmpHeadFile, 'w') as writer:
metaInfo = [pageTitle, str(now.strftime("%Y-%m-%d/%H:%M")), str(latestId)]
metaInfoLine = ' '.join(metaInfo)
TeeOutput(metaInfoLine, writer)
headlessFile = tmpDir + '/' + 'headless' + '.tmp'
cmnd = ['./CatFiles.sh', tmpHeadFile, tmpMainFile, headlessFile, pediLogFileName]
# pprint(cmnd)
print("Output =", pediLogFileName, '(', latestId, ')' )