Question edit history

2

Added the source code

2019/07/14 15:14

Posted

canaria369

Score: 25

test CHANGED
@@ -1 +1 @@
1
- torchtext: build_vocab when the training data and the inference data differ
1
+ torchtext: about specifying the size of the embedding vectors
test CHANGED
@@ -10,9 +10,11 @@
10
10
 
11
11
 
12
12
 
13
- The number of rows in the input matrix ends up being the number of distinct words appearing in the training data, so inference cannot be run using the inference data.
13
+  The size of TEXT.vocab depends on the vocabulary of the training data, so when the inference data is used
14
+
14
-
15
+  and new embedding vectors are generated from it, the number of dimensions of the input layer no longer matches,
16
+
15
- I would like to make it match the words contained in the word-vector file, but I could not find out how to do this.
17
+  so I would like to specify the dimensions based on the input vector file (model.vec).
16
18
 
17
19
 
18
20
 
@@ -34,24 +36,474 @@
34
36
 
35
37
 ### Relevant source code
36
38
 
39
+ ・Input data
40
+
41
+ train_ja.tsv (val_ja.tsv and test_ja.tsv have the same format)
42
+
43
+ ```text
44
+
45
+ あなたをが好きです。 1
46
+
47
+ 私はマイクが嫌いです。 0
48
+
49
+ 私はマキが好きです。 1
50
+
51
+ ボブが嫌いです。 0
52
+
53
+ ```
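Each line of this file is a sentence, a tab, and a 0/1 label, and train.py below maps the two columns to the Text and Label fields in that order. The following is only a minimal, self-contained sketch of that mapping (the path and field settings are taken from train.py; it is not part of the original question):

```python
# Minimal sketch: how the two tab-separated columns are read (settings as in train.py).
from janome.tokenizer import Tokenizer
from torchtext import data

j_t = Tokenizer()

def tokenizer(text):
    # Japanese text has no whitespace, so it is split into tokens with janome
    return [tok for tok in j_t.tokenize(text, wakati=True)]

TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True,
                  include_lengths=True, batch_first=True)
LABEL = data.Field(sequential=False, use_vocab=False)

# the column order in the TSV matches the order of the (name, field) pairs
train = data.TabularDataset(path='./train_ja.tsv', format='tsv',
                            fields=[('Text', TEXT), ('Label', LABEL)])
print(vars(train[0]))  # e.g. {'Text': ['あなた', 'を', ...], 'Label': '1'}
```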
54
+
55
+ model.vec … pretrained FastText vectors
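For reference, torchtext's `Vectors` loader (which the `JaFastText` class in train.py below wraps) exposes the dimension and word list of such a `.vec` file directly, which is the quantity the question wants to base the input dimensions on. A minimal sketch, assuming model.vec sits in the working directory:

```python
# Minimal sketch: inspect the pretrained fastText vectors before building a vocab.
from torchtext.vocab import Vectors

vectors = Vectors(name='model.vec')   # parses (and caches) the .vec file
print(vectors.dim)                    # embedding dimension, e.g. 300
print(len(vectors.itos))              # number of words stored in model.vec
print(vectors.vectors.size())         # torch.Size([len(itos), dim])
```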
56
+
57
+
58
+
59
+ ・Source code
60
+
61
+ model.py
62
+
37
63
  ```python
38
64
 
65
+ # coding:utf-8
66
+
67
+ import torchtext
68
+
69
+ from torchtext import data
70
+
71
+ from torchtext import datasets
72
+
39
73
  from torchtext.vocab import FastText
40
74
 
75
+ from torchtext.vocab import Vectors
76
+
77
+
78
+
79
+ import torch
80
+
81
+ import torch.nn as nn
82
+
83
+ import torch.optim as optim
84
+
85
+ import torch.nn.functional as F
86
+
87
+ from torch.autograd import Variable
88
+
89
+
90
+
91
+ class EncoderRNN(nn.Module):
92
+
93
+ def __init__(self, emb_dim, h_dim, v_size, gpu=True, v_vec=None, batch_first=True):
94
+
95
+ super(EncoderRNN, self).__init__()
96
+
97
+ self.gpu = gpu
98
+
99
+ self.h_dim = h_dim
100
+
101
+ self.embed = nn.Embedding(v_size, emb_dim)
102
+
103
+ if v_vec is not None:
104
+
105
+ self.embed.weight.data.copy_(v_vec)
106
+
107
+ self.lstm = nn.LSTM(emb_dim, h_dim, batch_first=batch_first,
108
+
109
+ bidirectional=True)
110
+
111
+
112
+
113
+ def init_hidden(self, b_size):
114
+
115
+ h0 = Variable(torch.zeros(1*2, b_size, self.h_dim))
116
+
117
+ c0 = Variable(torch.zeros(1*2, b_size, self.h_dim))
118
+
119
+ if self.gpu:
120
+
121
+ h0 = h0.cuda()
122
+
123
+ c0 = c0.cuda()
124
+
125
+ return (h0, c0)
126
+
127
+
128
+
129
+ def forward(self, sentence, lengths=None):
130
+
131
+ self.hidden = self.init_hidden(sentence.size(0))
132
+
133
+ emb = self.embed(sentence)
134
+
135
+ packed_emb = emb
136
+
137
+
138
+
139
+ if lengths is not None:
140
+
141
+ lengths = lengths.view(-1).tolist()
142
+
143
+ packed_emb = nn.utils.rnn.pack_padded_sequence(emb, lengths, batch_first=True)  # emb is batch-first, so pack it batch-first
144
+
145
+ out, hidden = self.lstm(packed_emb, self.hidden)
146
+
147
+ if lengths is not None:
148
+
149
+ out = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)[0]  # `output` was undefined; unpack the LSTM output batch-first
150
+
151
+ out = out[:, :, :self.h_dim] + out[:, :, self.h_dim:]
152
+
153
+ return out
154
+
155
+
156
+
157
+ class Attn(nn.Module):
158
+
159
+ def __init__(self, h_dim):
160
+
161
+ super(Attn, self).__init__()
162
+
163
+ self.h_dim = h_dim
164
+
165
+ self.main = nn.Sequential(
166
+
167
+ nn.Linear(h_dim, 24),
168
+
169
+ nn.ReLU(True),
170
+
171
+ nn.Linear(24,1)
172
+
173
+ )
174
+
175
+
176
+
177
+ def forward(self, encoder_outputs):
178
+
179
+ b_size = encoder_outputs.size(0)
180
+
181
+ output_cont = encoder_outputs.contiguous()
182
+
183
+ output_view = output_cont.view(-1, self.h_dim)
184
+
185
+ attn_ene = self.main(output_view.to("cpu")) # (b, s, h) -> (b * s, 1)
186
+
187
+ return F.softmax(attn_ene.view(b_size, -1), dim=1).unsqueeze(2) # (b*s, 1) -> (b, s, 1)
188
+
189
+
190
+
191
+ class AttnClassifier(nn.Module):
192
+
193
+ def __init__(self, h_dim, c_num):
194
+
195
+ super(AttnClassifier, self).__init__()
196
+
197
+ self.attn = Attn(h_dim)
198
+
199
+ self.main = nn.Linear(h_dim, c_num)
200
+
201
+
202
+
203
+
204
+
205
+ def forward(self, encoder_outputs):
206
+
207
+ attns = self.attn(encoder_outputs) #(b, s, 1)
208
+
209
+ feats = (encoder_outputs.to("cuda:0") * attns.to("cuda:0")).sum(dim=1) # (b, s, h) -> (b, h)
210
+
41
- TEXT.build_vocab(train, vectors=FastText(language="ja"), min_freq=2)
211
+ return F.log_softmax(self.main(feats.to("cpu")),dim=1), attns
42
212
 
43
213
  ```
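The size issue in the question comes down to `self.embed = nn.Embedding(v_size, emb_dim)` above: its weight has shape `(v_size, emb_dim)`, and train.py passes `len(TEXT.vocab)` as `v_size`, so the row count follows whatever dataset `build_vocab` was given. The following minimal sketch (not the questioner's code; the vocabulary sizes 1000 and 1200 are made up purely for illustration) shows why parameters saved under one vocabulary size cannot be loaded once that size changes:

```python
# Illustrative only: 1000 / 1200 stand in for two different values of len(TEXT.vocab),
# e.g. a vocab built from the training data vs. one built from other data.
import torch
import torch.nn as nn

emb_dim = 300
embed_train = nn.Embedding(1000, emb_dim)   # weight: (1000, 300)
embed_infer = nn.Embedding(1200, emb_dim)   # weight: (1200, 300)

print(embed_train.weight.size())            # torch.Size([1000, 300])
print(embed_infer.weight.size())            # torch.Size([1200, 300])

try:
    # the shapes differ, so the saved parameters cannot be restored
    embed_infer.load_state_dict(embed_train.state_dict())
except RuntimeError as err:
    print(err)                              # size mismatch for weight: ...
```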
44
214
 
45
215
 
46
216
 
217
+ train.py
218
+
219
+ ```python
220
+
221
+ # coding:utf-8
222
+
223
+ import janome
224
+
225
+ from janome.tokenizer import Tokenizer
226
+
227
+
228
+
229
+ import torch
230
+
231
+ import torch.nn as nn
232
+
233
+ import torch.optim as optim
234
+
235
+ import torch.nn.functional as F
236
+
237
+ from torch.autograd import Variable
238
+
239
+
240
+
241
+ from torchtext import data, datasets
242
+
243
+
244
+
245
+ from bs4 import BeautifulSoup
246
+
247
+ import pandas as pd
248
+
249
+ import io
250
+
251
+
252
+
253
+ #from model import *
254
+
255
+
256
+
257
+ emb_dim = 300  # word-embedding dimension
258
+
259
+ h_dim = 3  # dimension of the LSTM hidden layer
260
+
261
+ class_num = 2  # number of classes to predict
262
+
263
+ lr = 0.001  # learning rate
264
+
265
+ epochs = 30  # number of epochs
266
+
267
+
268
+
269
+ #device = torch.device('cuda:0')
270
+
271
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
272
+
273
+ #print(device)
274
+
275
+ torch.manual_seed(0)
276
+
277
+ torch.cuda.manual_seed(0)
278
+
279
+ j_t = Tokenizer()
280
+
281
+
282
+
283
+ def tokenizer(text):
284
+
285
+ #return text.split(' ')
286
+
287
+ return [tok for tok in j_t.tokenize(text, wakati=True)]
288
+
289
+
290
+
291
+ def clean_tokenizer(text):
292
+
293
+ soup = BeautifulSoup(text,"lxml")
294
+
295
+ clean_text = soup.get_text()
296
+
297
+ return [tok for tok in j_t.tokenize(clean_text, wakati=True)]
298
+
299
+
300
+
301
+ class JaFastText(Vectors):
302
+
303
+ def __init__(self, name=None, **kwargs):
304
+
305
+ super(JaFastText, self).__init__(name, url=None, **kwargs)
306
+
307
+
308
+
309
+
310
+
311
+ def train_model(epoch, train_iter, optimizer, log_interval=1, batch_size=2):
312
+
313
+ encoder.train()
314
+
315
+ classifier.train()
316
+
317
+ correct = 0
318
+
319
+ for idx, batch in enumerate(train_iter):
320
+
321
+ (x, x_l), y = batch.Text, batch.Label
322
+
323
+ optimizer.zero_grad()
324
+
325
+ encoder_outputs = encoder(x)
326
+
327
+ output, attn = classifier(encoder_outputs)
328
+
329
+ loss = F.nll_loss(output.to(device), y.to(device))
330
+
331
+ loss.backward()
332
+
333
+ optimizer.step()
334
+
335
+ pred = output.data.max(1, keepdim=True)[1]
336
+
337
+ correct += pred.eq(y.data.view_as(pred).to("cpu")).cpu().sum()
338
+
339
+ if idx % log_interval == 0:
340
+
341
+ print('train epoch: {} [{}/{}], acc:{}, loss:{}'.format(
342
+
343
+ epoch, (idx+1)*len(x), len(train_iter)*batch_size,
344
+
345
+ correct/float(log_interval * len(x)),
346
+
347
+ loss.item()))
348
+
349
+ correct = 0
350
+
351
+
352
+
353
+
354
+
355
+ def test_model(epoch, test_iter):
356
+
357
+ encoder.eval()
358
+
359
+ classifier.eval()
360
+
361
+ correct = 0
362
+
363
+ for idx, batch in enumerate(test_iter):
364
+
365
+ (x, x_l), y = batch.Text, batch.Label
366
+
367
+ encoder_outputs = encoder(x)
368
+
369
+ output, attn = classifier(encoder_outputs)
370
+
371
+ pred = output.data.max(1, keepdim=True)[1]
372
+
373
+ correct += pred.eq(y.data.view_as(pred).to("cpu")).cpu().sum()
374
+
375
+ print('test epoch:{}, acc:{}'.format(epoch, correct/float(len(test_iter.dataset))))  # divide by the size of the evaluated split, not the global `test`
376
+
377
+
378
+
379
+ # init model
380
+
381
+ def weights_init(m):
382
+
383
+ classname = m.__class__.__name__
384
+
385
+ if hasattr(m, 'weight') and (classname.find('Embedding') == -1):
386
+
387
+ nn.init.xavier_uniform(m.weight.data, gain=nn.init.calculate_gain('relu'))
388
+
389
+
390
+
391
+ if __name__ == '__main__':
392
+
393
+ #TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True, include_lengths=True, batch_first=True)
394
+
395
+ TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True, include_lengths=True, batch_first=True)
396
+
397
+ LABEL = data.Field(sequential=False, use_vocab=False)
398
+
399
+ train, val, test = data.TabularDataset.splits(
400
+
401
+ path='./', train='train_ja.tsv',
402
+
403
+ validation='val_ja.tsv', test='test_ja.tsv', format='tsv',
404
+
405
+ fields=[('Text', TEXT), ('Label', LABEL)])
406
+
407
+
408
+
409
+ print('len(train)', len(train))
410
+
411
+ print('vars(train[0])', vars(train[0]))
412
+
413
+
414
+
415
+ fasttext = JaFastText(name='model.vec')
416
+
417
+ TEXT.build_vocab(train, vectors=fasttext, min_freq=1)
418
+
419
+ TEXT.vocab.freqs
420
+
421
+ TEXT.vocab.stoi
422
+
423
+ TEXT.vocab.itos
424
+
425
+ TEXT.vocab.vectors.size()
426
+
427
+ torch.save(TEXT, "TEXT.pkl")  # save the Field (including its vocab) so the same vocabulary can be reused at inference time
428
+
429
+
430
+
431
+ train_iter, val_iter, test_iter = data.Iterator.splits(
432
+
433
+ (train, val, test), batch_sizes=(2, 2, 1), device=device, repeat=False,sort=False)
434
+
435
+ batch = next(iter(train_iter))
436
+
437
+ print(batch.Text)
438
+
439
+ print(batch.Label)
440
+
441
+
442
+
443
+ # make model
444
+
445
+ encoder = EncoderRNN(emb_dim, h_dim, len(TEXT.vocab), gpu=True, v_vec=TEXT.vocab.vectors)  # the embedding size is len(TEXT.vocab), i.e. it follows the vocabulary built above
446
+
447
+ encoder.cuda()
448
+
449
+ classifier = AttnClassifier(h_dim, class_num)
450
+
451
+
452
+
453
+ for m in encoder.modules():
454
+
455
+ print(m.__class__.__name__)
456
+
457
+ weights_init(m)
458
+
459
+
460
+
461
+ for m in classifier.modules():
462
+
463
+ print(m.__class__.__name__)
464
+
465
+ weights_init(m)
466
+
467
+
468
+
469
+ # optim
470
+
471
+ from itertools import chain
472
+
473
+ optimizer = optim.Adam(chain(encoder.parameters(),classifier.parameters()), lr=lr)
474
+
475
+
476
+
477
+ # train model
478
+
479
+ for epoch in range(epochs):
480
+
481
+ train_model(epoch + 1, train_iter, optimizer)
482
+
483
+ test_model(epoch + 1, val_iter)
484
+
485
+ #torch.save(encoder.state_dict(), "model/encoder_epoch"+ str(epoch + 1) +".pkl")
486
+
487
+ #torch.save(classifier.state_dict(), "model/classifier_epoch"+ str(epoch + 1) +".pkl")
488
+
489
+
490
+
491
+ # save model
492
+
493
+ torch.save(encoder.state_dict(),"encoder.pkl")
494
+
495
+ torch.save(classifier.state_dict(),"classifier.pkl")
496
+
497
+ ```
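train.py already saves the Field with `torch.save(TEXT, "TEXT.pkl")`. One common way to keep the input dimensions fixed at inference time is to reload that Field instead of calling `build_vocab` again, so that `len(TEXT.vocab)` (and hence the embedding size) matches the trained encoder, while words that only appear in the inference data map to `<unk>`. The following is only a sketch, not the questioner's code: the file name `infer.py`, the use of `test_ja.tsv`, and the assumption that `EncoderRNN`/`AttnClassifier` can be imported from model.py are all illustrative, and a CUDA device is assumed because `AttnClassifier.forward` hard-codes `cuda:0`.

```python
# infer.py -- illustrative sketch only (file and split names are hypothetical).
# Reuses the Field pickled by train.py so that len(TEXT.vocab), and therefore the
# embedding size expected by the saved encoder weights, stays identical at inference.
import torch
from torchtext import data
from janome.tokenizer import Tokenizer

from model import EncoderRNN, AttnClassifier  # assumes the classes live in model.py

emb_dim, h_dim, class_num = 300, 3, 2
device = torch.device("cuda:0")  # AttnClassifier.forward hard-codes cuda:0

j_t = Tokenizer()

def tokenizer(text):
    # must match the training-time tokenizer; it also has to exist in this script
    # so that the pickled Field (which references it by name) can be restored
    return [tok for tok in j_t.tokenize(text, wakati=True)]

TEXT = torch.load("TEXT.pkl")  # vocabulary built during training; new words map to <unk>
LABEL = data.Field(sequential=False, use_vocab=False)

test = data.TabularDataset(path='./test_ja.tsv', format='tsv',
                           fields=[('Text', TEXT), ('Label', LABEL)])
test_iter = data.Iterator(test, batch_size=1, device=device,
                          train=False, repeat=False, sort=False)

encoder = EncoderRNN(emb_dim, h_dim, len(TEXT.vocab), gpu=True,
                     v_vec=TEXT.vocab.vectors)
encoder.load_state_dict(torch.load("encoder.pkl", map_location="cpu"))
encoder.cuda()
classifier = AttnClassifier(h_dim, class_num)
classifier.load_state_dict(torch.load("classifier.pkl", map_location="cpu"))

encoder.eval()
classifier.eval()
with torch.no_grad():
    for batch in test_iter:
        (x, x_l), y = batch.Text, batch.Label
        output, attn = classifier(encoder(x))
        print(output.argmax(dim=1).item(), y.item())
```

Whether reusing the training vocabulary like this or rebuilding the vocabulary from the full word list of model.vec is preferable depends on how many out-of-vocabulary words the inference data contains.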
498
+
47
499
 ### Supplementary information (framework/tool versions, etc.)
48
500
 
49
501
 
50
502
 
51
- python3.6.8
503
+ Colaboratory Python 3 GPU runtime
52
-
53
-
54
-
504
+
505
+
506
+
55
- torch1.1.0
507
+ pytorch 1.1.0
56
-
508
+
57
- torchtext0.4.0
509
+ torchtext 0.4.0

1

Changed the title

2019/07/14 15:13

Posted

canaria369

Score: 25

test CHANGED
@@ -1 +1 @@
1
- torchtext: how to generate general-purpose input data
1
+ torchtext: build_vocab when the training data and the inference data differ
test CHANGED
File without changes