実現したいこと
scrapyでスクレイピングしたデータをtext型やint型で保存できるように加工しようとしています。
詰まっていること
text型の加工(空白の削除など)はできましたが、int型の加工がうまくできず、「¥1,000」の「¥」や「,」を消したいのですが、残ったままです。
どうすればint型を加工できるでしょうか?
*講座を受講中に分からなくなったため、きれいにしたつもりではありますが、いらないコードなどが入っているかもしれません。ご容赦ください。
items.py
1import scrapy 2from itemloaders.processors import TakeFirst, MapCompose, Join 3 4#[¥]消す 5def strip_yen(element): 6 if element: 7 return element.replace('¥','') 8 return element 9#[,]消す 10def strip_comma(element): 11 if element: 12 return element.replace(',','') 13 return element 14 15#int型に変える 16def convert_integer(element): 17 if element: 18 return int(element) 19 return 0 20#[サイズ][判]を消す 21def get_size(element): 22 if element: 23 return element.split('/')[0].replace('サイズ ','').replace('判','') 24 return element 25#[ページ数][p]消す 26def get_page(element): 27 if element: 28 return element.split('/')[0].replace('ページ数 ','').replace('p','') 29 return element 30#[商品コード]消す 31def strip_isbn(element): 32 if element: 33 return element.replace('商品コード ','') 34 return element 35 36class BookItem(scrapy.Item): 37 #タイトル 38 title = scrapy.Field( 39 imput_processor = MapCompose(str.lstrip), 40 output_processor = Join(' ') 41 ) 42 #著者 43 author = scrapy.Field( 44 output_processor = TakeFirst() 45 ) 46 #値段 47 price = scrapy.Field( 48 imput_processor = MapCompose(strip_yen, strip_comma, convert_integer), 49 output_processor = TakeFirst() 50 ) 51 #出版社 52 publisher = scrapy.Field( 53 output_processor = TakeFirst() 54 ) 55 #サイズ 56 size = scrapy.Field( 57 imput_processor = MapCompose(get_size), 58 output_processor = TakeFirst() 59 ) 60 #ページ 61 page = scrapy.Field( 62 imput_processor = MapCompose(get_page, convert_integer), 63 output_processor = TakeFirst() 64 ) 65 #ISBN 66 isbn = scrapy.Field( 67 imput_processor = MapCompose(strip_isbn), 68 output_processor = TakeFirst() 69 )
computer_books.py
1import scrapy 2from scrapy.linkextractors import LinkExtractor 3from scrapy.spiders import CrawlSpider, Rule 4import logging 5from kinokuniya.items import BookItem 6from scrapy.loader import ItemLoader 7 8 9class ComputerBooksSpider(CrawlSpider): 10 name = 'computer_books' 11 allowed_domains = ['www.kinokuniya.co.jp'] 12 start_urls = ['https://www.kinokuniya.co.jp/f/dsd-101001037028005-06-'] 13 14 rules = ( 15 #書籍20冊分のURLを辿る 16 Rule(LinkExtractor(restrict_css='h3.heightLine-2 > a'), callback='parse_item', follow=False) 17 ) 18 19def parse_item(self, response): 20 logging.info(response.url) 21 22 loader = ItemLoader(item=BookItem(),response=response) 23 #タイトル 24 loader.add_css('title','h3[itemprop="name"]::text'), 25 #著者 26 loader.add_css('author','div.infobox.ml10.mt10 > ul > li > a::text'), 27 #値段 28 loader.add_css('price','span.sale_price::text'), 29 #出版社 30 loader.add_css('publisher','a[href*=publisher-key]::text'), 31 #サイズ 32 loader.add_css('size','div.infbox.dotted.ml10.mt05.pt05 > ul > li::text'), 33 #ページ 34 loader.add_css('page','div.infbox.dotted.ml10.mt05.pt05 > ul > li::text'), 35 #ISBN 36 loader.add_css('isbn','li[itemprop="identifier"]::text'), 37 38 yield loader.load_item() 39
settings.py
1# Scrapy settings for kinokuniya project 2# 3# For simplicity, this file contains only settings considered important or 4# commonly used. You can find more settings consulting the documentation: 5# 6# https://docs.scrapy.org/en/latest/topics/settings.html 7# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8# https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 10BOT_NAME = 'kinokuniya' 11 12SPIDER_MODULES = ['kinokuniya.spiders'] 13NEWSPIDER_MODULE = 'kinokuniya.spiders' 14 15 16# Crawl responsibly by identifying yourself (and your website) on the user-agent 17#USER_AGENT = 'kinokuniya (+http://www.yourdomain.com)' 18 19# Obey robots.txt rules 20ROBOTSTXT_OBEY = True 21 22# Configure maximum concurrent requests performed by Scrapy (default: 16) 23#CONCURRENT_REQUESTS = 32 24 25# Configure a delay for requests for the same website (default: 0) 26# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 27# See also autothrottle settings and docs 28DOWNLOAD_DELAY = 3 29# The download delay setting will honor only one of: 30#CONCURRENT_REQUESTS_PER_DOMAIN = 16 31#CONCURRENT_REQUESTS_PER_IP = 16 32 33# Disable cookies (enabled by default) 34#COOKIES_ENABLED = False 35 36# Disable Telnet Console (enabled by default) 37#TELNETCONSOLE_ENABLED = False 38 39# Override the default request headers: 40#DEFAULT_REQUEST_HEADERS = { 41# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 42# 'Accept-Language': 'en', 43#} 44 45# Enable or disable spider middlewares 46# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 47#SPIDER_MIDDLEWARES = { 48# 'kinokuniya.middlewares.KinokuniyaSpiderMiddleware': 543, 49#} 50 51# Enable or disable downloader middlewares 52# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 53#DOWNLOADER_MIDDLEWARES = { 54# 'kinokuniya.middlewares.KinokuniyaDownloaderMiddleware': 543, 55#} 56 57# Enable or disable extensions 58# See https://docs.scrapy.org/en/latest/topics/extensions.html 59#EXTENSIONS = { 60# 'scrapy.extensions.telnet.TelnetConsole': None, 61#} 62 63# Configure item pipelines 64# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 65ITEM_PIPELINES = { 66 'kinokuniya.pipelines.CheckItemPipeline': 100, 67} 68 69IMAGE_STORE = r'/Users/koyamataichi/Desktop/books_toscrape/kinokuniya/book_images' 70IMAGE_URLS_FIELD = 'image_urls' 71# Enable and configure the AutoThrottle extension (disabled by default) 72# See https://docs.scrapy.org/en/latest/topics/autothrottle.html 73#AUTOTHROTTLE_ENABLED = True 74# The initial download delay 75#AUTOTHROTTLE_START_DELAY = 5 76# The maximum download delay to be set in case of high latencies 77#AUTOTHROTTLE_MAX_DELAY = 60 78# The average number of requests Scrapy should be sending in parallel to 79# each remote server 80#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81# Enable showing throttling stats for every response received: 82#AUTOTHROTTLE_DEBUG = False 83 84# Enable and configure HTTP caching (disabled by default) 85# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86HTTPCACHE_ENABLED = True 87HTTPCACHE_EXPIRATION_SECS = 0 88HTTPCACHE_DIR = 'httpcache' 89#HTTPCACHE_IGNORE_HTTP_CODES = [] 90#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 92FEED_EXPORT_ENCODING='utf-8' 93 94#処理する量を一つに限定する 95CONCURRENT_REQUESTS = 1 96 97#優先順を階層の浅いところから処理する 98DEPTH_PRIORITY = 1 99 100#リクエストを受けた最後から処理する⇨最初から処理するへ変更 101SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue' 102SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue' 103
回答2件
あなたの回答
tips
プレビュー
下記のような回答は推奨されていません。
このような回答には修正を依頼しましょう。
2022/03/03 00:28
2022/03/03 01:18
2022/03/03 04:54