質問編集履歴

1

書式改善

2019/12/14 15:22

投稿

hglkmnlkygmnl
hglkmnlkygmnl

スコア6

test CHANGED
File without changes
test CHANGED
@@ -1,388 +1,6 @@
1
1
  keyerror が出てうまくDBと接続できていないのか、コードが間違っているのでしょうか?
2
2
 
3
3
 
4
-
5
- ```python
6
-
7
- # -*- coding: utf-8 -*-
8
-
9
- import os, sys, logging, time, configparser
10
-
11
- from pymongo import MongoClient, DESCENDING
12
-
13
- import pandas as pd
14
-
15
- app_home = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath('__file__')), '..'))
16
-
17
-
18
-
19
- sys.path.append(os.path.join(app_home, "lib"))
20
-
21
- from trans import Trans
22
-
23
- trans = Trans()
24
-
25
-
26
-
27
- POS_DIC = {
28
-
29
- 'BOS/EOS': 'EOS', # end of sentence
30
-
31
- '形容詞' : 'ADJ',
32
-
33
- '副詞' : 'ADV',
34
-
35
- '名詞' : 'NOUN',
36
-
37
- '動詞' : 'VERB',
38
-
39
- '助動詞' : 'AUX',
40
-
41
- '助詞' : 'PART',
42
-
43
- '連体詞' : 'ADJ', # Japanese-specific POS
44
-
45
- '感動詞' : 'INTJ',
46
-
47
- '接続詞' : 'CONJ',
48
-
49
- '*' : 'X',
50
-
51
- }
52
-
53
- LOG_LEVEL = 'DEBUG'
54
-
55
- logging.basicConfig(
56
-
57
- level=getattr(logging, LOG_LEVEL),
58
-
59
- format='%(asctime)s [%(levelname)s] %(module)s | %(message)s',
60
-
61
- datefmt='%Y/%m/%d %H:%M:%S',
62
-
63
- )
64
-
65
-
66
-
67
- logger = logging.getLogger(__name__)
68
-
69
- # Const of database name
70
-
71
- POLITELY_DICT_DB = "politely_dict"
72
-
73
- NPTABLE_COLLECTION_NAME = "np_table"
74
-
75
- SENTIDIC_COLLECTION_NAME = "sent_dic"
76
-
77
-
78
-
79
- # NAMING CONVENTIONS:
80
-
81
- # "get" - should be a fast find() or find_one()
82
-
83
- # "load" - potentially slow find() that usually returns a large set of data
84
-
85
- # "init" - called once and only once when the collection is created
86
-
87
- # (usually for setting indexes)
88
-
89
- # "save" - fast update() or update_one(). Avoid using insert() as much
90
-
91
- # as possible, because it is not idempotent.
92
-
93
-
94
-
95
-
96
-
97
- def get_db(db_name):
98
-
99
- config = configparser.ConfigParser()
100
-
101
- config.read(app_home + '/myvenv/lib/config.ini')
102
-
103
- client = MongoClient('localhost')
104
-
105
- client['admin'].authenticate(config.get('mongo', 'id'), config.get('mongo', 'password'))
106
-
107
- # client = get_MongoClient()
108
-
109
- db = client[db_name]
110
-
111
- return db
112
-
113
-
114
-
115
- def load_politly_dic(collection_name):
116
-
117
- db = get_db(POLITELY_DICT_DB)
118
-
119
- cursor = db[collection_name].find()
120
-
121
- df = pd.DataFrame.from_dict(list(cursor)).astype(object)
122
-
123
- # df = df[df['source'] != 'walmart']
124
-
125
- # df['review_date'] = pd.to_datetime(df['review_date'])
126
-
127
- # df = df.replace(np.nan, ' ')
128
-
129
- return df
130
-
131
-
132
-
133
-
134
-
135
- # pn_table
136
-
137
- # pol_file = app_home + '/dataset/posneg_dic/pn_table.dic'
138
-
139
- def init_politely_table(file_name):
140
-
141
- # load file
142
-
143
- logger.info(file_name)
144
-
145
- with open(file_name) as filedata:
146
-
147
- lines = filedata.readlines()
148
-
149
- for line in lines:
150
-
151
- try:
152
-
153
- text = line.strip()
154
-
155
- #優れる:すぐれる:動詞:1
156
-
157
- data = text.split(':')
158
-
159
- logger.info( data )
160
-
161
- # save
162
-
163
- db = get_db(POLITELY_DICT_DB)
164
-
165
- # TODO: set eng
166
-
167
- db[NPTABLE_COLLECTION_NAME].update({'headword':data[0], 'POS_jp':data[2] },
168
-
169
- {'$set':{'headword':data[0], 'reading':data[1], 'POS':POS_DIC[data[2]], 'POS_jp':data[2], 'posneg':data[3], 'eng':''}},
170
-
171
- upsert=True)
172
-
173
- except:
174
-
175
- logger.error(data)
176
-
177
- continue
178
-
179
-
180
-
181
-
182
-
183
- def get_politely_score(headword, *, pos='*'):
184
-
185
- db = get_db(POLITELY_DICT_DB)
186
-
187
- # logger.info(headword)
188
-
189
- # TOOD: if pos == '*' else
190
-
191
- res = db[NPTABLE_COLLECTION_NAME].find_one({'headword':headword}, {'posneg':1, '_id':0})
192
-
193
- if res == None:
194
-
195
- res = db[NPTABLE_COLLECTION_NAME].find_one({'reading':headword}, {'posneg':1, '_id':0})
196
-
197
-
198
-
199
- score = 0
200
-
201
- if res:
202
-
203
- # tuning
204
-
205
- if (-0.5 < float(res['posneg']) and float(res['posneg']) < 0):
206
-
207
- score = 0
208
-
209
- else:
210
-
211
- score = float(res['posneg'])
212
-
213
- return score
214
-
215
-
216
-
217
- # senti_dic
218
-
219
- # ==> dataset/posneg_dic/sent_nouns.dic <==
220
-
221
- # 13314 lines
222
-
223
- # headword label(p>e>n) detail
224
-
225
- # "2,3日" e 〜である・になる(状態)客観
226
-
227
- # ==> dataset/posneg_dic/sent_verb_adj.dic <==
228
-
229
- # 5277 lines
230
-
231
- # あがく n ネガ(経験)
232
-
233
-
234
-
235
- def init_senti_dic(file_name, data_type):
236
-
237
- # load file
238
-
239
- logger.info(file_name)
240
-
241
- with open(file_name) as filedata:
242
-
243
- lines = filedata.readlines()
244
-
245
- for line in lines:
246
-
247
- try:
248
-
249
- # headword label(p>e>n) detail
250
-
251
- # "2,3日" e 〜である・になる(状態)客観
252
-
253
- text = line.strip()
254
-
255
- data = text.split("\t")
256
-
257
- # logger.info( data )
258
-
259
- # get score
260
-
261
- score = 0
262
-
263
- if data[1] == 'p':
264
-
265
- score = 1
266
-
267
- elif data[1] == 'n':
268
-
269
- score = -1
270
-
271
- # save
272
-
273
- db = get_db(POLITELY_DICT_DB)
274
-
275
-
276
-
277
- # TODO: set eng / too slow
278
-
279
- # eng = erans.ed('/usr/local/cai_venv/lib/config.-1ini')rans_ja2en(data[0])
280
-
281
- eng = ''
282
-
283
- # logger.info({'headword':data[0], 'type':data_type, 'detail':data[2], 'score':score, 'eng':trans.trans_ja2en(data[0])})
284
-
285
- db[SENTIDIC_COLLECTION_NAME].update({'headword':data[0]},
286
-
287
- {'$set':{'headword':data[0], 'type':data_type,\
288
-
289
- 'detail':data[2], 'score':score, 'eng':eng}},
290
-
291
- upsert=True)
292
-
293
- except:
294
-
295
- logger.error(data)
296
-
297
- continue
298
-
299
-
300
-
301
- # TODO: multi word (ex.あきれる た
302
-
303
- def get_sentidic_score(headword, *, type=None):
304
-
305
- db = get_db(POLITELY_DICT_DB)
306
-
307
- # TODO: type
308
-
309
- res = db[SENTIDIC_COLLECTION_NAME].find_one({'headword':headword})
310
-
311
- logger.info(res)
312
-
313
- score = 0
314
-
315
- if res:
316
-
317
- score = res['score']
318
-
319
- return score
320
-
321
-
322
-
323
- if __name__ == "__main__":
324
-
325
- pd = load_politly_dic(SENTIDIC_COLLECTION_NAME)
326
-
327
- print(pd.shape, pd.head(10))
328
-
329
-
330
-
331
- # db[NPTABLE_COLLECTION_NAME].update({'headword':'優れる', 'POS_jp':'動詞' },
332
-
333
- # {'$set':{'headword':'優れる', 'reading': 'すぐれる', 'POS':'VERB', 'POS_jp':'動詞', 'eng':'be excellent'}},
334
-
335
- # upsert=True)
336
-
337
-
338
-
339
- senti_file_noun = app_home + '/dataset/posneg_dic/sent_nouns.dic'
340
-
341
- # init_senti_dic(senti_file_noun, 'NOUN')
342
-
343
- senti_file_verb = app_home + '/dataset/posneg_dic/sent_verb_adj.dic'
344
-
345
- # get_sentidic_score('ない')
346
-
347
- # get_sentidic_score('合う')
348
-
349
- # get_sentidic_score('おいしい')
350
-
351
- get_sentidic_score('無い')
352
-
353
- get_sentidic_score('無駄')
354
-
355
-
356
-
357
-
358
-
359
- # init_senti_dic(senti_file_verb, 'VERB')
360
-
361
- # logger.info('優れる')
362
-
363
- # logger.info(get_politely_score('優れる') )
364
-
365
- # logger.info('だめ')
366
-
367
- # logger.info(get_politely_score('だめ') )
368
-
369
- # logger.info('いい')
370
-
371
- # logger.info(get_politely_score('いい') )
372
-
373
- #
374
-
375
- # db = get_db(POLITELY_DICT_DB)
376
-
377
- # res = db[NPTABLE_COLLECTION_NAME].find()
378
-
379
- # res = db[NPTABLE_COLLECTION_NAME].find({'headword':'優れる'})
380
-
381
- # for r in res:
382
-
383
- #     logger.info(r)
### ヘディングのテキスト
384
-
385
- ```
386
4
 
387
5
 
388
6