質問編集履歴

1

著作権上の問題が生じないよう、必要な箇所のみ掲載

2018/10/07 13:18

投稿

giro1975
giro1975

スコア37

test CHANGED
File without changes
test CHANGED
@@ -28,310 +28,6 @@
28
28
 
29
29
 
30
30
 
31
-
32
-
33
-
34
-
35
-
36
-
37
-
38
-
39
- ```ここに言語を入力
40
-
41
- #twitkey呼び出し
42
-
43
- import twitkey #自作の認証キー
44
-
45
- import tweepy
46
-
47
- CK = twitkey.twkey['cons_key']
48
-
49
- CS = twitkey.twkey['cons_sec']
50
-
51
- AK = twitkey.twkey['accto_key']
52
-
53
- AT = twitkey.twkey['accto_sec']
54
-
55
- ```
56
-
57
-
58
-
59
- ```ここに言語を入力
60
-
61
- #twitter データ収集用プログラム
62
-
63
- def get_twitter_api(CK,CS,AK,AT):
64
-
65
- auth = tweepy.OAuthHandler(CK,CS) #OAuthHandler
66
-
67
- auth.set_access_token(AK,AT)
68
-
69
- api = tweepy.API(auth)
70
-
71
- return api
72
-
73
-
74
-
75
- def get_tweet(user_id,count):
76
-
77
- API = get_twitter_api(CK,CS,AK,AT)
78
-
79
- data = API.user_timeline(id=user_id,count=count)
80
-
81
- tweets = []
82
-
83
- for tweet in data:
84
-
85
- tweets.append(format_text(tweet.text))
86
-
87
- return tweets,data[-1].id
88
-
89
- def get_tweet_with_id(user_id,count,next_max_id):
90
-
91
- API = get_twitter_api(CK,CS,AK,AT)
92
-
93
- data = API.user_timeline(id=user_id,count=count,max_id=next_max_id-1)
94
-
95
- tweets = []
96
-
97
- for tweet in data:
98
-
99
- tweets.append(format_text(tweet.text))
100
-
101
- if len(data) == 0:
102
-
103
- return tweets,0
104
-
105
- return tweets,data[-1].id
106
-
107
-
108
-
109
-
110
-
111
- ```
112
-
113
- ```ここに言語を入力
114
-
115
- # tweet 収集用プログラム
116
-
117
- def get_positive_tweets():
118
-
119
- pos_tweets = []
120
-
121
- positive_ids =[
122
-
123
- "positive_bot_00",
124
-
125
- "positivekk_bot",
126
-
127
- "botpositive",
128
-
129
- "positive_mot",
130
-
131
- "kami_positive",
132
-
133
- "positive_bot",
134
-
135
- "jinseiplusbot",
136
-
137
- "syuzou_genki",
138
-
139
- "genki_kotoba_m"
140
-
141
- ]
142
-
143
-
144
-
145
- for pos_id in positive_ids:
146
-
147
- tmp,max_id = get_tweet(pos_id,200)
148
-
149
- tmp = list(set(tmp))
150
-
151
- for i in range(len(tmp)):
152
-
153
- pos_tweets.append(tmp[i])
154
-
155
- while True:
156
-
157
- tmp,max_id = get_tweet_with_id(pos_id,100,max_id)
158
-
159
- tmp = list(set(tmp))
160
-
161
- for i in range(len(tmp)):
162
-
163
- pos_tweets.append(tmp[i])
164
-
165
- if max_id == 0:
166
-
167
- break
168
-
169
- return pos_tweets
170
-
171
-
172
-
173
- def get_negative_tweets():
174
-
175
- neg_tweets = []
176
-
177
- negative_ids = [
178
-
179
- "negatizibu_bot",
180
-
181
- "immydream19",
182
-
183
- "lewyDanf",
184
-
185
- "positive_act_me",
186
-
187
- "pgmtmw",
188
-
189
- "yamiki_bot",
190
-
191
- "cool_aroma",
192
-
193
- "nega_bot",
194
-
195
- "negativebot",
196
-
197
- "H4Za5",
198
-
199
- "ymibot"
200
-
201
- ]
202
-
203
-
204
-
205
- for neg_id in negative_ids:
206
-
207
- tmp,max_id = get_tweet(neg_id,200)
208
-
209
- tmp = list(set(tmp))
210
-
211
- for i in range(len(tmp)):
212
-
213
- neg_tweets.append(tmp[i])
214
-
215
- while True:
216
-
217
- tmp,max_id = get_tweet_with_id(neg_id,100,max_id)
218
-
219
- tmp = list(set(tmp))
220
-
221
- for i in range(len(tmp)):
222
-
223
- neg_tweets.append(tmp[i])
224
-
225
- if max_id == 0:
226
-
227
- break
228
-
229
- return neg_tweets
230
-
231
- ```
232
-
233
-
234
-
235
- ```ここに言語を入力
236
-
237
- #url,必要のない文字列を削除するプログラム
238
-
239
- import re
240
-
241
- def format_text(text):
242
-
243
- text = re.sub(r'https?://[\w/:%#$&?(/)~.=+\-...]+',"",'text')
244
-
245
- text = re.sub('RT',"",text)
246
-
247
- text = re.sub('お気に入り',"",text)
248
-
249
- text = re.sub('まとめ',"",text)
250
-
251
- text = re.sub(r'[!-~]','""',text) #半角記号,数字,英字
252
-
253
- text = re.sub(r'[:-@]',"",text) #全角記号
254
-
255
- text = re.sub('\n',"",text) #改行文字
256
-
257
-
258
-
259
- return text
260
-
261
- ```
262
-
263
-
264
-
265
- ```ここに言語を入力
266
-
267
- #形態素解析
268
-
269
- def tokenize(tweets):
270
-
271
- t = Tokenizer()
272
-
273
- tokennaized_tweets = []
274
-
275
-
276
-
277
- for tw in tweets:
278
-
279
- tokens = t.tokenize(tw)
280
-
281
- tmp = ""
282
-
283
- for token in tokens:
284
-
285
- noun_flag = 0
286
-
287
-
288
-
289
- partOfSpeech = token.part_of_speech.split(",")[0]
290
-
291
-
292
-
293
- if partOfSpeech == "名刺":
294
-
295
- noun_flag = 1
296
-
297
- if partOfSpeech == "動詞":
298
-
299
- noun_flag = 1
300
-
301
- if partOfSpeech == "形容詞":
302
-
303
- noun_flag = 1
304
-
305
- if partOfSpeech == "形容動詞":
306
-
307
- noun_flag = 1
308
-
309
- if partOfSpeech == "感動詞":
310
-
311
- noun_flag = 1
312
-
313
-
314
-
315
- if noun_flag == 1:
316
-
317
- tmp += token.surface + ""
318
-
319
-
320
-
321
- tmp = tmp.rfstrip(" ")
322
-
323
- tokenized_tweets.append(tmp)
324
-
325
-
326
-
327
- return tokenized_tweets
328
-
329
-
330
-
331
-
332
-
333
- ```
334
-
335
31
  ```python
336
32
 
337
33
  #ポジティブ、ネガティブなツイートを取得、そして整形 ※ここでエラー