Background
I am building a system that trains a document classification model from tweet data labeled on a four-level scale (0 to 3).
I implemented the code below following the book 「BERTによる自然言語処理入門」, and got the error shown here.
What I want to achieve
A document classification system for tweets.
Problem and error message
```
Traceback (most recent call last):
  File "bert_fin2.py", line 105, in <module>
    trainer.fit(model, dataloader_train, dataloader_val)
  File "C:\Users\kanon\VR_en\cor_twi_bert_bigginer\test\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 499, in fit
    self.dispatch()
  File "C:\Users\kanon\VR_en\cor_twi_bert_bigginer\test\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 546, in dispatch
    self.accelerator.start_training(self)
  File "C:\Users\kanon\VR_en\cor_twi_bert_bigginer\test\lib\site-packages\pytorch_lightning\accelerators\accelerator.py", line 73, in start_training
    self.training_type_plugin.start_training(trainer)
  File "C:\Users\kanon\VR_en\cor_twi_bert_bigginer\test\lib\site-packages\pytorch_lightning\plugins\training_type\training_type_plugin.py", line 114, in start_training
    self._results = trainer.run_train()
  File "C:\Users\kanon\VR_en\cor_twi_bert_bigginer\test\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 607, in run_train
    self.run_sanity_check(self.lightning_module)
  File "C:\Users\kanon\VR_en\cor_twi_bert_bigginer\test\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 850, in run_sanity_check
    self.reset_val_dataloader(ref_model)
  File "C:\Users\kanon\VR_en\cor_twi_bert_bigginer\test\lib\site-packages\pytorch_lightning\trainer\data_loading.py", line 364, in reset_val_dataloader
    self.num_val_batches, self.val_dataloaders = self._reset_eval_dataloader(model, 'val')
  File "C:\Users\kanon\VR_en\cor_twi_bert_bigginer\test\lib\site-packages\pytorch_lightning\trainer\data_loading.py", line 325, in _reset_eval_dataloader
    num_batches = len(dataloader) if has_len(dataloader) else float('inf')
  File "C:\Users\kanon\VR_en\cor_twi_bert_bigginer\test\lib\site-packages\pytorch_lightning\utilities\data.py", line 33, in has_len
    raise ValueError('`Dataloader` returned 0 length. Please make sure that it returns at least 1 batch')
ValueError: `Dataloader` returned 0 length. Please make sure that it returns at least 1 batch
```
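For reference, this ValueError is raised whenever a DataLoader with zero batches reaches trainer.fit as validation data. A hypothetical minimal reproduction, assuming an empty validation dataset:

```python
# Hypothetical minimal reproduction: an empty dataset gives a zero-length DataLoader.
from torch.utils.data import DataLoader

empty_loader = DataLoader([], batch_size=64)
print(len(empty_loader))  # 0 batches -> pytorch-lightning's sanity check raises the ValueError above
```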
Relevant source code
```python
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import BertJapaneseTokenizer, BertForSequenceClassification
import pytorch_lightning as pl
import random
import glob

MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)

data = pd.read_csv("C:\\Users\\kanon\\VR_en\\cor_twi_bert_bigginer\\test\\DateSet\\DS-ALL_4sec_20220920.csv", encoding="utf-8")

ds1 = data[['DS1ツイート','DS1ラベル']]

max_length = 150
dataset_for_loader = []
for index, row in ds1.iterrows():
    encoding = tokenizer(
        row['DS1ツイート'],
        max_length=max_length,
        padding='max_length',
        truncation=True
    )
    encoding['labels'] = row['DS1ラベル']
    encoding = {k: torch.tensor(v) for k, v in encoding.items()}
    dataset_for_loader.append(encoding)

##print(dataset_for_loader[0])

random.shuffle(dataset_for_loader)
n = len(dataset_for_loader)
n_train = int(0.6*n)
n_val = int(0.2*n)
dataset_train = dataset_for_loader[:n_train]
dataset_val = dataset_for_loader[n_train:n_val]
dataset_test = dataset_for_loader[n_train+n_val:]

dataloader_train = DataLoader(
    dataset_train, batch_size=32, shuffle=True
)
dataloader_val = DataLoader(dataset_val, batch_size=64)
dataloader_test = DataLoader(dataset_test, batch_size=64)

## Training, fine-tuning, and validation from here
class BertForSequenceClassification_pl(pl.LightningModule):
    def __init__(self, model_name, num_labels, lr):
        super().__init__()
        self.save_hyperparameters()
        self.bert_sc = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )

    def training_step(self, batch, batch_idx):
        output = self.bert_sc(**batch)
        loss = output.loss
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        output = self.bert_sc(**batch)
        val_loss = output.val_loss
        self.log('val_loss', val_loss)

    def test_step(self, batch, batch_idx):
        labels = batch.pop('labels')
        output = self.bert_sc(**batch)
        labels_predicted = output.logits.argmax(-1)
        num_correct = (labels_predicted == labels).sum().item()
        accuracy = num_correct / labels.size(0)
        self.log('accuracy', accuracy)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

checkpoint = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    save_weights_only=True,
    dirpath="C:\\Users\\kanon\\VR_en\\cor_twi_bert_bigginer\\model"
)

trainer = pl.Trainer(
    gpus=0,
    max_epochs=10,
    callbacks=[checkpoint]
)

model = BertForSequenceClassification_pl(
    MODEL_NAME, num_labels=4, lr=1e-5
)

trainer.fit(model, dataloader_train, dataloader_val)

test = trainer.test(test_dataloaders=dataloader_test)

print(f'accuracy: {test[0]["accuracy"]:.2f}')
```
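For context, here is a minimal sketch of the slicing used in the split above, with an illustrative dataset size of 100. With a 60/20/20 split, n_val (20% of n) is smaller than n_train (60% of n), so the slice data[n_train:n_val] comes out empty, which matches the zero-length validation dataloader in the traceback:

```python
# Sketch of the train/val/test slicing above (assumption: n = 100 for illustration).
data = list(range(100))
n = len(data)
n_train = int(0.6 * n)                     # 60
n_val = int(0.2 * n)                       # 20
val = data[n_train:n_val]                  # data[60:20] -> [] (stop index is before start index)
val_fixed = data[n_train:n_train + n_val]  # data[60:80] -> 20 items
print(len(val), len(val_fixed))            # 0 20
```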
What I tried
I checked the length of the training dataloader with
print(len(dataloader_train))
It printed the number of batches, not 0.
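A sketch extending the same check to all three loaders (variable names as in the code above; len() of a DataLoader is the number of batches it yields):

```python
# Sketch: print the batch count of each loader, not only the training one.
for name, loader in [('train', dataloader_train),
                     ('val', dataloader_val),
                     ('test', dataloader_test)]:
    print(name, len(loader))  # with the split above, 'val' prints 0, matching the traceback
```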
Supplementary information (framework/tool versions, etc.)
python 3.7.9
transformers 4.10.2
fugashi 1.1.2
ipadic 1.0.0
pytorch-lightning 1.2.7