Question edit history

Edit 2: appended the source code

test CHANGED
@@ -1 +1 @@
-torchtext
+torchtext: specifying the size of the embedding vectors
test CHANGED
@@ -10,9 +10,11 @@
+The size of Text.vocab ends up depending on the vocabulary of the training data, so when
+the inference data is used and new embedding vectors are generated, the dimensionality of
+the input layer no longer matches. I would like to specify the dimensionality based on the
+input vector file (model.vec) instead (see the sketch after this edit record).
@@ -34,24 +36,474 @@

### Relevant source code

・Input data

train_ja.tsv (val_ja.tsv and test_ja.tsv share the same format):

```text
あなたをが好きです。 1
私はマイクが嫌いです。 0
私はマキが好きです。 1
ボブが嫌いです。 0
```

model.vec … pretrained FastText vectors

・Source code

model.py

```python
# coding:utf-8
import torchtext
from torchtext import data
from torchtext import datasets
from torchtext.vocab import FastText
from torchtext.vocab import Vectors

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable


class EncoderRNN(nn.Module):
    def __init__(self, emb_dim, h_dim, v_size, gpu=True, v_vec=None, batch_first=True):
        super(EncoderRNN, self).__init__()
        self.gpu = gpu
        self.h_dim = h_dim
        self.embed = nn.Embedding(v_size, emb_dim)
        if v_vec is not None:
            self.embed.weight.data.copy_(v_vec)
        self.lstm = nn.LSTM(emb_dim, h_dim, batch_first=batch_first,
                            bidirectional=True)

    def init_hidden(self, b_size):
        h0 = Variable(torch.zeros(1*2, b_size, self.h_dim))
        c0 = Variable(torch.zeros(1*2, b_size, self.h_dim))
        if self.gpu:
            h0 = h0.cuda()
            c0 = c0.cuda()
        return (h0, c0)

    def forward(self, sentence, lengths=None):
        self.hidden = self.init_hidden(sentence.size(0))
        emb = self.embed(sentence)
        packed_emb = emb

        if lengths is not None:
            lengths = lengths.view(-1).tolist()
            # emb is batch-first, so pack/pad need batch_first=True as well
            packed_emb = nn.utils.rnn.pack_padded_sequence(emb, lengths, batch_first=True)
        out, hidden = self.lstm(packed_emb, self.hidden)
        if lengths is not None:
            out = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)[0]  # was `output`, an undefined name
        out = out[:, :, :self.h_dim] + out[:, :, self.h_dim:]  # sum the two LSTM directions
        return out


class Attn(nn.Module):
    def __init__(self, h_dim):
        super(Attn, self).__init__()
        self.h_dim = h_dim
        self.main = nn.Sequential(
            nn.Linear(h_dim, 24),
            nn.ReLU(True),
            nn.Linear(24, 1)
        )

    def forward(self, encoder_outputs):
        b_size = encoder_outputs.size(0)
        output_cont = encoder_outputs.contiguous()
        output_view = output_cont.view(-1, self.h_dim)
        attn_ene = self.main(output_view.to("cpu"))  # (b, s, h) -> (b * s, 1)
        return F.softmax(attn_ene.view(b_size, -1), dim=1).unsqueeze(2)  # (b * s, 1) -> (b, s, 1)


class AttnClassifier(nn.Module):
    def __init__(self, h_dim, c_num):
        super(AttnClassifier, self).__init__()
        self.attn = Attn(h_dim)
        self.main = nn.Linear(h_dim, c_num)

    def forward(self, encoder_outputs):
        attns = self.attn(encoder_outputs)  # (b, s, 1)
        feats = (encoder_outputs.to("cuda:0") * attns.to("cuda:0")).sum(dim=1)  # (b, s, h) -> (b, h)
        return F.log_softmax(self.main(feats.to("cpu")), dim=1), attns
```

train.py

```python
# coding:utf-8
import janome
from janome.tokenizer import Tokenizer

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

from torchtext import data, datasets
from torchtext.vocab import Vectors  # needed: JaFastText below subclasses Vectors

from bs4 import BeautifulSoup
import pandas as pd
import io

from model import *  # EncoderRNN and AttnClassifier from model.py (was commented out)

emb_dim = 300    # word-embedding dimensionality
h_dim = 3        # dimensionality of the LSTM hidden layer
class_num = 2    # number of predicted classes
lr = 0.001       # learning rate
epochs = 30      # number of epochs

#device = torch.device('cuda:0')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#print(device)
torch.manual_seed(0)
torch.cuda.manual_seed(0)
j_t = Tokenizer()


def tokenizer(text):
    #return text.split(' ')
    return [tok for tok in j_t.tokenize(text, wakati=True)]


def clean_tokenizer(text):
    soup = BeautifulSoup(text, "lxml")
    clean_text = soup.get_text()
    return [tok for tok in j_t.tokenize(clean_text, wakati=True)]


class JaFastText(Vectors):
    def __init__(self, name=None, **kwargs):
        super(JaFastText, self).__init__(name, url=None, **kwargs)


def train_model(epoch, train_iter, optimizer, log_interval=1, batch_size=2):
    encoder.train()
    classifier.train()
    correct = 0
    for idx, batch in enumerate(train_iter):
        (x, x_l), y = batch.Text, batch.Label
        optimizer.zero_grad()
        encoder_outputs = encoder(x)
        output, attn = classifier(encoder_outputs)
        loss = F.nll_loss(output.to(device), y.to(device))
        loss.backward()
        optimizer.step()
        pred = output.data.max(1, keepdim=True)[1]
        correct += pred.eq(y.data.view_as(pred).to("cpu")).cpu().sum()
        if idx % log_interval == 0:
            print('train epoch: {} [{}/{}], acc:{}, loss:{}'.format(
                epoch, (idx+1)*len(x), len(train_iter)*batch_size,
                correct/float(log_interval * len(x)),
                loss.item()))
            correct = 0


def test_model(epoch, test_iter):
    encoder.eval()
    classifier.eval()
    correct = 0
    for idx, batch in enumerate(test_iter):
        (x, x_l), y = batch.Text, batch.Label
        encoder_outputs = encoder(x)
        output, attn = classifier(encoder_outputs)
        pred = output.data.max(1, keepdim=True)[1]
        correct += pred.eq(y.data.view_as(pred).to("cpu")).cpu().sum()
    print('test epoch:{}, acc:{}'.format(epoch, correct/float(len(test))))


# init model
def weights_init(m):
    classname = m.__class__.__name__
    if hasattr(m, 'weight') and (classname.find('Embedding') == -1):
        nn.init.xavier_uniform(m.weight.data, gain=nn.init.calculate_gain('relu'))


if __name__ == '__main__':
    TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True, include_lengths=True, batch_first=True)
    LABEL = data.Field(sequential=False, use_vocab=False)
    train, val, test = data.TabularDataset.splits(
        path='./', train='train_ja.tsv',
        validation='val_ja.tsv', test='test_ja.tsv', format='tsv',
        fields=[('Text', TEXT), ('Label', LABEL)])

    print('len(train)', len(train))
    print('vars(train[0])', vars(train[0]))

    fasttext = JaFastText(name='model.vec')
    TEXT.build_vocab(train, vectors=fasttext, min_freq=1)
    TEXT.vocab.freqs
    TEXT.vocab.stoi
    TEXT.vocab.itos
    TEXT.vocab.vectors.size()
    torch.save(TEXT, "TEXT.pkl")  # the fitted field can be reloaded at inference time (see the sketch at the end)

    train_iter, val_iter, test_iter = data.Iterator.splits(
        (train, val, test), batch_sizes=(2, 2, 1), device=device, repeat=False, sort=False)
    batch = next(iter(train_iter))
    print(batch.Text)
    print(batch.Label)

    # make model
    encoder = EncoderRNN(emb_dim, h_dim, len(TEXT.vocab), gpu=True, v_vec=TEXT.vocab.vectors)
    encoder.cuda()
    classifier = AttnClassifier(h_dim, class_num)

    for m in encoder.modules():
        print(m.__class__.__name__)
        weights_init(m)

    for m in classifier.modules():
        print(m.__class__.__name__)
        weights_init(m)

    # optim
    from itertools import chain
    optimizer = optim.Adam(chain(encoder.parameters(), classifier.parameters()), lr=lr)

    # train model
    for epoch in range(epochs):
        train_model(epoch + 1, train_iter, optimizer)
        test_model(epoch + 1, val_iter)
        #torch.save(encoder.state_dict(), "model/encoder_epoch"+ str(epoch + 1) +".pkl")
        #torch.save(classifier.state_dict(), "model/classifier_epoch"+ str(epoch + 1) +".pkl")

    # save model
    torch.save(encoder.state_dict(), "encoder.pkl")
    torch.save(classifier.state_dict(), "classifier.pkl")
```

### Supplementary information (FW/tool versions, etc.)

+colaboratory Python 3 GPU runtime
-torch
+pytorch 1.1.0
-torchtext
+torchtext 0.4.0
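
One way to get what the question asks for is to size the vocabulary from the vector file itself rather than from the training split. A minimal sketch, assuming torchtext 0.4's `Vocab.extend` and `Vocab.load_vectors`; `TEXT`, `train`, and `model.vec` are the objects and file from train.py above:

```python
# Sketch: pin the embedding input size to model.vec instead of the training corpus.
# Assumes torchtext 0.4 (torchtext.vocab.Vectors, Vocab.extend, Vocab.load_vectors).
from torchtext.vocab import Vectors

vectors = Vectors(name='model.vec')

TEXT.build_vocab(train, min_freq=1)  # start from the training vocabulary
TEXT.vocab.extend(vectors)           # append every word that model.vec covers
TEXT.vocab.load_vectors(vectors)     # attach vectors for the enlarged vocab

# len(TEXT.vocab) is now governed by model.vec, so
# nn.Embedding(len(TEXT.vocab), emb_dim) keeps the same input size
# no matter which split was used to build the vocab.
print(TEXT.vocab.vectors.size())
```

Words absent from model.vec still fall back to `<unk>`, but the embedding matrix no longer changes shape between training and inference.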

Edit 1: changed the title

test CHANGED
@@ -1 +1 @@
-torchtext
+torchtext: build_vocab when the training data and the inference data differ

test CHANGED
File without changes
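
Alternatively, since train.py already saves the fitted field with `torch.save(TEXT, "TEXT.pkl")`, the training-time vocab can simply be reloaded for inference instead of being rebuilt. A hedged sketch; `predict_ja.tsv` is a hypothetical inference file in the same TSV format, and reloading requires that the `tokenizer` function be importable under the same name:

```python
# Sketch: reuse the training-time vocab at inference so the embedding
# input size stays identical. Assumes TEXT.pkl was written by train.py;
# 'predict_ja.tsv' is a hypothetical file in the train_ja.tsv format.
import torch
from torchtext import data

TEXT = torch.load("TEXT.pkl")   # field with the training vocab attached
LABEL = data.Field(sequential=False, use_vocab=False)

predict = data.TabularDataset(
    path='./predict_ja.tsv', format='tsv',
    fields=[('Text', TEXT), ('Label', LABEL)])

# No TEXT.build_vocab() here: numericalization uses the saved vocab,
# so len(TEXT.vocab), and hence the nn.Embedding input size, is unchanged.
predict_iter = data.Iterator(predict, batch_size=1, device="cpu",
                             train=False, sort=False)
```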