質問編集履歴

1

コードを追記しました。

2020/12/09 12:01

投稿

Kanadekana_nana
Kanadekana_nana

スコア8

test CHANGED
File without changes
test CHANGED
@@ -3,6 +3,172 @@
3
3
 
4
4
 
5
5
  scrapyのCrawlSpiderを用いてデータを収集した後、Itemが一つなら scrapy crawl spidername -o file.csv でcsvにできるのですが、Itemが2つ以上になったときにcsvをItemごとに生成したいです。
6
+
7
+ ```python
8
+
9
+ import logging
10
+
11
+ import scrapy
12
+
13
+ from scrapy.spiders import CrawlSpider, Rule
14
+
15
+ from scrapy.linkextractors import LinkExtractor
16
+
17
+ import re
18
+
19
+
20
+
21
+ from .. import items
22
+
23
+
24
+
25
+
26
+
27
class JtnewsSpider(CrawlSpider):
    """Crawl www.jtnews.jp: walk the reviewer roster, follow each reviewer's
    page, and yield UserItem / ReviewItem records.

    Yields:
        items.UserItem from roster pages (parse_user).
        items.ReviewItem from date-ordered review pages (parse_review).
    """

    name = 'jtnews'
    allowed_domains = ['www.jtnews.jp']
    start_urls = ['http://www.jtnews.jp/cgi-bin/revlist.cgi?PAGE_NO=1']

    # NOTE: '.' and '?' are escaped here.  The original patterns
    # (e.g. r"revlist.cgi?&?PAGE_NO=\d+$") treated '?' as "optional
    # preceding char", so the literal '?' that starts the query string
    # could never be matched and the rules did not fire.
    rules = (
        Rule(
            # Reviewer roster pages (レビュワー名簿).
            LinkExtractor(allow=r"revlist\.cgi\?&?PAGE_NO=\d+$"),
            callback="parse_user",
            follow=False,
        ),
        Rule(
            # Individual reviewer page (個人ページ) — follow only.
            LinkExtractor(allow=r"revper\.cgi\?&?REVPER_NO=\d+$"),
            follow=True,
        ),
        Rule(
            # Reviews sorted by modification date (TYPE=2).
            LinkExtractor(allow=r"revper\.cgi\?&?PAGE_NO=\d+&REVPER_NO=\d+&TYPE=2$"),
            callback="parse_review",
        ),
    )

    user_pattern = re.compile(r"REVPER_NO=(?P<user_id>\d+)")
    movie_pattern = re.compile(r"TITLE_NO=(?P<movie_id>\d+)")

    def parse_user(self, response):
        """Yield one UserItem per link in the roster table.

        On any parsing error the page is logged and re-requested with
        dont_filter=True so the dupe filter does not drop the retry.
        """
        try:
            user_table = response.css("table.hover-table")
            for link in user_table.css("a"):
                user = items.UserItem()
                user_url = link.css("a::attr(href)").get()
                user["user_id"] = int(self.user_pattern.findall(user_url)[0])
                user["name"] = link.css("a::text").get()
                yield user
        except Exception:  # narrowed from bare except: keep SystemExit/KeyboardInterrupt alive
            self.log(f"parse failed: {response.url}", level=logging.ERROR)
            yield scrapy.Request(
                response.url, callback=self.parse_user, dont_filter=True
            )

    def parse_review(self, response):
        """Yield one ReviewItem per data row of the review table.

        The third "normal-table" on the page is assumed to be the review
        list and its first row a header — TODO confirm against the site.
        Retries the page on parse failure (see parse_user).
        """
        try:
            user_table = response.css("table.normal-table")[2]
            for link in user_table.css("tr")[1:]:
                review = items.ReviewItem()
                review_url = link.css("a::attr(href)")[0].get()
                review["movie_id"] = int(self.movie_pattern.findall(review_url)[0])
                review["title"] = link.css("a::text")[0].get()
                review["point"] = link.css("td::text").get()
                yield review
        except Exception:  # narrowed from bare except
            self.log(f"parse failed: {response.url}", level=logging.ERROR)
            yield scrapy.Request(
                response.url, callback=self.parse_review, dont_filter=True
            )
134
+
135
+
136
+
137
+ ```
138
+
139
+
140
+
141
+ ```python
142
+
143
+ import scrapy
144
+
145
+
146
+
147
+
148
+
149
class ReviewItem(scrapy.Item):
    """A single movie review scraped from a reviewer's page.

    Fields:
        point    -- the score cell text.
        movie_id -- numeric id extracted from the TITLE_NO query parameter.
        title    -- the movie title link text.
    """

    point = scrapy.Field(serializer=str)
    movie_id = scrapy.Field(serializer=str)
    title = scrapy.Field(serializer=str)
156
+
157
+
158
+
159
+
160
+
161
class UserItem(scrapy.Item):
    """A reviewer entry scraped from the roster pages.

    Fields:
        user_id -- numeric id extracted from the REVPER_NO query parameter.
        name    -- the reviewer's display name (link text).
    """

    user_id = scrapy.Field(serializer=str)
    name = scrapy.Field(serializer=str)
166
+
167
+
168
+
169
+
170
+
171
+ ```
6
172
 
7
173
 
8
174