質問編集履歴

4

修正

2019/07/22 06:30

投稿

walkwater
walkwater

スコア11

test CHANGED
File without changes
test CHANGED
@@ -4,7 +4,7 @@
4
4
 
5
5
 
6
6
 
7
- @twitter こんにちは pic....
7
+ @ twitter こんにちは pic.twitter.com/XXXXXXXXXXXXXXX
8
8
 
9
9
 
10
10
 

3

修正

2019/07/22 06:30

投稿

walkwater
walkwater

スコア11

test CHANGED
File without changes
test CHANGED
@@ -220,7 +220,7 @@
220
220
 
221
221
  #
222
222
 
223
- screen_name = "ualntewrtrn2651"
223
+ screen_name = "********"
224
224
 
225
225
  subete = 0
226
226
 

2

修正

2019/07/17 03:47

投稿

walkwater
walkwater

スコア11

test CHANGED
File without changes
test CHANGED
@@ -12,6 +12,8 @@
12
12
 
13
13
 
14
14
 
15
+ fil3で処理をするつもりです。
16
+
15
17
  ```
16
18
 
17
19
  #! /usr/bin/python3

1

修正

2019/07/17 03:41

投稿

walkwater
walkwater

スコア11

test CHANGED
File without changes
test CHANGED
@@ -12,4 +12,318 @@
12
12
 
13
13
 
14
14
 
15
+ ```
16
+
17
+ #! /usr/bin/python3
18
+
19
+ # -*- coding: utf-8 -*-
20
+
21
+ #
22
+
23
+ # get_tweet.py
24
+
25
+ #
26
+
27
+ # Dec/21/2017
28
+
29
+ # --------------------------------------------------------------------
30
+
31
+ import sys
32
+
33
+ import json
34
+
35
+ import config2
36
+
37
+ import oauth2 as oauth
38
+
39
+ import got3 as got
40
+
41
+ import re
42
+
43
+ import nltk
44
+
45
+ #
46
+
47
+ from requests_oauthlib import OAuth1Session
48
+
49
+ from define_client2 import define_client2_proc
50
+
51
+ #
52
+
53
+ CK = config2.CONSUMER_KEY
54
+
55
+ CS = config2.CONSUMER_SECRET
56
+
57
+ AT = config2.ACCESS_TOKEN
58
+
59
+ ATS = config2.ACCESS_TOKEN_SECRET
60
+
61
+ twitter = OAuth1Session(CK, CS, AT, ATS)
62
+
63
+ # --------------------------------------------------------------------
64
+
65
+ # [8]:
66
+
67
def get_tweets_proc(client, screen_name, count=200):
    """Fetch the most recent tweets of one account (REST API v1.1).

    :param client: object whose ``request(url)`` returns a
        ``(response, data)`` pair; ``response.status`` is the HTTP status
        code and ``data`` the UTF-8 encoded response body
    :param screen_name: str
    :param count: int, number of tweets requested (defaults to 200)
    :rtype : list (decoded JSON on HTTP 200, otherwise an empty list)
    """
    from urllib.parse import urlencode

    url_base = "https://api.twitter.com/1.1/statuses/user_timeline.json?"
    # Encode the query so a name containing '&', '#', spaces, ... cannot
    # corrupt the request; plain names produce the same URL as before.
    url = url_base + urlencode([("screen_name", screen_name), ("count", count)])
    array_aa = []
    response, data = client.request(url)
    if response.status == 200:
        array_aa = json.loads(data.decode('utf-8'))
        sys.stderr.write("len(array_aa) = %d\n" % len(array_aa))
    else:
        # Report under this function's own name so the log is traceable.
        sys.stderr.write("*** error *** get_tweets_proc ***\n")
        sys.stderr.write("Error: %d\n" % response.status)
    return array_aa
100
+
101
+ # --------------------------------------------------------------------
102
+
103
def filter(text):
    """Keep only the Japanese characters of ``text``.

    Hiragana, katakana, the long-vowel mark and CJK ideographs
    (U+4E00-U+9FFF) are kept in their original order; everything else
    (ASCII letters and digits, punctuation, whitespace, other scripts such
    as Korean or Hebrew) is dropped.

    :param text: str
    :rtype : str
    """
    # A single whitelist pass is enough: every character the earlier
    # delete-steps targeted lies outside this class and is dropped here too.
    return "".join(re.findall(r'[ぁ-んァ-ンー\u4e00-\u9FFF]+', text))
130
+
131
+ # -----------------------------------------------------------------------
132
+
133
def fil2(text):
    """Strip line breaks, tabs, URLs, box-drawing characters and spaces.

    Punctuation is left in place on purpose (the symbol-stripping step is
    not applied here).

    :param text: str
    :rtype : str
    """
    # '|' inside a character class is a literal, not alternation; listing
    # only the control characters keeps pipes that belong to the text.
    text = re.sub(r'[\n\r\t]', '', text)
    # http/https/ftp URLs.
    text = re.sub(r'(https?|ftp)(://[-_.!~*\'()a-zA-Z0-9;/?:\@&=+$,%#]+)', '', text)
    # Box-drawing characters plus half-width and full-width spaces.
    # NOTE(review): the second blank of the original class is presumably an
    # ideographic space (U+3000) lost in transcription -- confirm.
    text = re.sub(r'[─│━┃┌┏┐┓└┗┘┛├┝┠┣┤┥┨┫┬┯┰┳┴┷┸┻┼┿╂╋ \u3000]', '', text)
    return text
152
+
153
+ #------------------------------------------------------------------------
154
+
155
def n_gram(target, n):
    """Return every contiguous length-``n`` slice of ``target``, in order.

    :param target: str (or any sliceable sequence)
    :param n: int, size of each slice; must be at least 1
    :rtype : list (empty when ``target`` is shorter than ``n``)
    :raises ValueError: if ``n`` is smaller than 1
    """
    if n < 1:
        # n <= 0 would silently yield empty or truncated slices.
        raise ValueError("n must be a positive integer, got %r" % (n,))
    return [target[idx:idx + n] for idx in range(len(target) - n + 1)]
160
+
161
+ #------------------------------------------------------------------------
162
+
163
def jaccard_similarity_coefficient(list_a, list_b):
    """Return the Jaccard coefficient |A & B| / |A | B| of two collections.

    Duplicates are ignored: both inputs are reduced to sets first.

    :param list_a: iterable of hashable items
    :param list_b: iterable of hashable items
    :rtype : float (1.0 when both inputs are empty)
    """
    set_a = set(list_a)
    set_b = set(list_b)
    union = set_a | set_b
    if not union:
        # Two empty collections are treated as identical.
        return 1.0
    return len(set_a & set_b) / len(union)
196
+
197
+ #--------------------------------------------------------------------
198
+
199
def fil3(text):
    """Remove every ``@...: `` span (e.g. a retweet attribution) from ``text``.

    Each span runs from an ``@`` to the nearest following colon that is
    itself followed by whitespace.

    :param text: str
    :rtype : str
    """
    # Lazy '.+?' stops at the first ': ' after the '@'; a greedy '.+' would
    # swallow the tweet body up to its last ': '.
    return re.sub(r'@.+?:\s', '', text)
212
+
213
+ #-------------------------------------------------------------------------
214
+
215
+
216
+
217
# Driver: for each tweet on the account's timeline, search for tweets with the
# same text and print how similar (Jaccard over character bigrams) the hits
# posted by other accounts are.
# NOTE(review): nesting below is reconstructed from a flattened listing -- confirm.
sys.stderr.write("*** 開始 ***\n")
#
# Placeholder, as in the author's later revision; put the real account here.
screen_name = "********"
subete = 0  # tweets with non-empty text after fil2
onaji = 0   # hits posted by a different account
tigau = 0   # searches that ran out of results before five hits
nagai = 0   # tweets skipped because the text exceeds 90 characters
#
client = define_client2_proc()
#
array_aa = get_tweets_proc(client, screen_name)
#
sys.stderr.write("len(array_aa) = %d\n" % len(array_aa))
#
for unit_aa in array_aa:
    text = fil2(unit_aa['text'])
    print(text)
    print('----------------------------------------------------')
    if not text:
        print('null')
        continue
    subete = subete + 1
    tweetCriteria = (
        got.manager.TweetCriteria()
        .setQuerySearch('"%s"' % text)
        .setSince("2014-01-01")
        .setUntil("2019-04-01")
        .setMaxTweets(5)
    )
    # Run the search once and reuse the result: re-querying for every length
    # check and index access costs a network round trip each time, and a
    # shorter later result could make the index lookup fail.
    found = got.manager.TweetManager.getTweets(tweetCriteria)
    print(len(found))
    if len(text) > 90:
        print("長さオーバー")
        nagai = nagai + 1
        continue
    for i in range(5):
        if len(found) < i + 1:
            # NOTE(review): also counted when earlier hits were found -- confirm intent.
            print("該当なし")
            tigau = tigau + 1
            break
        tweet = found[i]
        if tweet.username == screen_name:
            print("本人")
            continue
        print(tweet.text)
        print(tweet.username)
        tweettext = fil3(tweet.text)
        print(tweettext)
        onaji = onaji + 1
        # NOTE(review): compares the unfiltered timeline text with the
        # filtered hit -- confirm this asymmetry is intended.
        list_a = n_gram(unit_aa['text'], 2)
        list_b = n_gram(tweettext, 2)
        jaccard = jaccard_similarity_coefficient(list_a, list_b)
        print(jaccard)
#
print("ツイート数 = %d, 同様ツイート数 = %d, 該当なし = %d, 長さオーバー = %d" % (subete, onaji, tigau, nagai))
sys.stderr.write("*** 終了 ***\n")
322
+
323
+ # --------------------------------------------------------------------
324
+
325
+ ```
326
+
327
+
328
+
15
329
  指定した文字から指定した文字までを削除するなどの方法を考えたのですが、それらのやり方も分からなかったので教えていただけると幸いです。