### Background / What I want to achieve
The tf.data.Dataset created by a module I added inside run_tf_glue.py, executed from the command line, should be usable by the processing that follows.
### Problem / error message
There is no error message, but none of the processing that uses the Dataset afterwards is executed.
### Relevant source code
```Python
# -------- code related to the error --------
# Caller side:
#   ds, cnt = gen_dataset(data_dir + mode.value)
#   where data_dir is 'data/' and mode.value is 'train' / 'validation'
# Callee side:
def gen_dataset(subf):
    categories = ['0', '1']
    category_to_id = {
        category: index for index, category in enumerate(categories)
    }
    origin = 'c:/tools/python37/transformers/' + subf + '.tsv'
    origin = origin.replace('validation', 'dev')
    logging.info('*** origin(%s)', origin)
    with open(origin, 'r', encoding='utf-8') as fin:
        reader = csv.reader(fin, delimiter='\t')
        rows = [{
            'idx': index,
            'sentence': row[0],
            'label': category_to_id[row[1]],
        } for index, row in enumerate(reader) if row[1] in categories]
    datasets = tf.data.Dataset.from_generator(
        lambda: rows, {'idx': tf.int64, 'sentence': tf.string, 'label': tf.int64}
    )
    logging.info('** records(%d) type(%s)**', len(rows), type(datasets))
    # list(datasets.as_numpy_iterator())
    for line in datasets:
        logging.info('*** line(%s)', line)
    sys.exit(7)
    return datasets, len(rows)
```

### What I tried

Today I broke everything apart, leaving only the function in question plus main, and calling the function at the top of main worked without any problem. As I then gradually restored the rest of the code, the symptom appeared at the point where I reinserted the import statements; until then the Dataset could be accessed without issue. I cannot understand why there is no problem in Colaboratory.

When run_tf_glue.py is executed from the command line and the generated tf.data.Dataset is accessed, the program terminates after a while with no error message. The logging.info inside the for loop is never printed, and the exit code is not the '7' passed to sys.exit, so I suspect the Dataset is not being generated correctly.
The same code runs in Colaboratory with no problem at all, and the Dataset contents can be inspected.
Running only the problematic module on its own in PyCharm also works, and the contents can be checked.
Modifying the above code for batch invocation and running it also lets me check the Dataset contents without any problem.

### Supplementary information (FW/tool versions, etc.)

TensorFlow version: 2.3.1
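For reference, the reduced standalone check mentioned above looks roughly like this; the sample rows here are a hypothetical stand-in for the TSV contents. Run on its own, it prints both records and exits with code 7:

```Python
# Minimal sketch of the same from_generator pattern, isolated from the
# rest of run_tf_glue.py. The rows below are hypothetical sample data.
import logging
import sys
import tensorflow as tf

logging.basicConfig(level=logging.INFO)

rows = [
    {'idx': 0, 'sentence': 'sentence a', 'label': 0},
    {'idx': 1, 'sentence': 'sentence b', 'label': 1},
]

datasets = tf.data.Dataset.from_generator(
    lambda: rows,  # the callable is re-invoked each time the dataset is iterated
    {'idx': tf.int64, 'sentence': tf.string, 'label': tf.int64},  # output_types
)

for line in datasets:
    logging.info('*** line(%s)', line)
sys.exit(7)  # exit code 7 confirms the iteration completed
```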
Presenting complete, verifiable source code, including the import statements, would make it easier to get answers.
Below is the source in question. Various other code is intertwined with it, so it will not run as-is.
```Python
# coding=utf-8
""" Fine-tuning the library models for sequence classification."""
import logging
import os
from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, Optional
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from transformers import (
AutoConfig,
AutoTokenizer,
EvalPrediction,
HfArgumentParser,
PreTrainedTokenizer,
TFAutoModelForSequenceClassification,
TFTrainer,
TFTrainingArguments,
glue_compute_metrics,
glue_convert_examples_to_features,
glue_output_modes,
glue_processors,
glue_tasks_num_labels,
)
class Split(Enum):
train = "train"
dev = "validation"
test = "test"
import sys,csv
def gen_dataset(subf):
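    # Added loader: builds {'idx', 'sentence', 'label'} records from a local TSV,
    # replacing the tfds.load call in get_tfds below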
categories = ['0', '1']
category_to_id = {
category: index for index, category in enumerate(categories)
}
origin='/content/drive/My Drive/bert/Tabelog/'+subf+'.tsv'
with open(origin,'r',encoding='utf-8') as fin:
reader = csv.reader(fin,delimiter='\t')
rows = [{
'idx': index,
'sentence': row[0],
'label': category_to_id[row[1]],
} for index, row in enumerate(reader) if row[1] in categories]
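    # The dict of dtypes is the second positional argument, output_types
    # (TF 2.3 signature: from_generator(generator, output_types=None, output_shapes=None, args=None))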
datasets = tf.data.Dataset.from_generator(
lambda: rows,{'idx': tf.int64, 'sentence': tf.string, 'label': tf.int64}
)
    logging.info('*** cnt(%d)', len(rows))
    sys.exit(7)  # debug stop: an exit code of 7 confirms this point was reached
    return datasets, len(rows)
def get_tfds(
task_name: str,
tokenizer: PreTrainedTokenizer,
max_seq_length: Optional[int] = None,
mode: Split = Split.train,
data_dir: str = None,
):
if task_name == "mnli-mm" and mode == Split.dev:
tfds_name = "mnli_mismatched"
elif task_name == "mnli-mm" and mode == Split.train:
tfds_name = "mnli"
elif task_name == "mnli" and mode == Split.dev:
tfds_name = "mnli_matched"
elif task_name == "sst-2":
tfds_name = "sst2"
elif task_name == "sts-b":
tfds_name = "stsb"
else:
tfds_name = task_name
logging.info('mode.value(%s), tfds_name(%s)',mode.value,tfds_name)
#ds, info = tfds.load("glue/" + tfds_name, split=mode.value, with_info=True, data_dir=data_dir)
#logging.info('info(%s)',info.splits[mode.value])
ds,cnt=gen_dataset(data_dir+mode.value)
logging.info('before convert')
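    # glue_convert_examples_to_features tokenizes each {'idx', 'sentence', 'label'}
    # example and returns a tf.data.Dataset of (features, label) pairs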
ds = glue_convert_examples_to_features(ds, tokenizer, max_seq_length, task_name)
logging.info('after convert_examples')
#ds = ds.apply(tf.data.experimental.assert_cardinality(info.splits[mode.value].num_examples))
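    # from_generator yields a dataset of unknown cardinality, so attach the known record count explicitly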
ds = ds.apply(tf.data.experimental.assert_cardinality(cnt))
"""ds, info = tfds.load("glue/" + tfds_name, split=mode.value, with_info=True, data_dir=data_dir)
ds = glue_convert_examples_to_features(ds, tokenizer, max_seq_length, task_name)
ds = ds.apply(tf.data.experimental.assert_cardinality(info.splits[mode.value].num_examples))"""
return ds
logger = logging.getLogger(__name__)
@dataclass
class GlueDataTrainingArguments:
"""
Arguments pertaining to what data we are going to input our model for training and eval.
Using `HfArgumentParser` we can turn this class
into argparse arguments to be able to specify them on
the command line.
"""
task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(glue_processors.keys())})
data_dir: Optional[str] = field(default=None, metadata={"help": "The input/output data dir for TFDS."})
max_seq_length: int = field(
default=128,
metadata={
"help": "The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded."
},
)
overwrite_cache: bool = field(
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
def __post_init__(self):
self.task_name = self.task_name.lower()
@dataclass
class ModelArguments:
"""
Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
"""
model_name_or_path: str = field(
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
)
config_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
)
tokenizer_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
use_fast: bool = field(default=False, metadata={"help": "Set this flag to use fast tokenization."})
# If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
# or just modify its tokenizer_config.json.
cache_dir: Optional[str] = field(
default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
)
def main():
# See all possible arguments in src/transformers/training_args.py
# or by passing the --help flag to this script.
# We now keep distinct sets of args, for a cleaner separation of concerns.
parser = HfArgumentParser((ModelArguments, GlueDataTrainingArguments, TFTrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
if (
os.path.exists(training_args.output_dir)
and os.listdir(training_args.output_dir)
and training_args.do_train
and not training_args.overwrite_output_dir
):
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
)
# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO,
)
logger.info(
"n_replicas: %s, distributed training: %s, 16-bits training: %s",
training_args.n_replicas,
bool(training_args.n_replicas > 1),
training_args.fp16,
)
logger.info("Training/evaluation parameters %s", training_args)
try:
num_labels = glue_tasks_num_labels["mnli" if data_args.task_name == "mnli-mm" else data_args.task_name]
output_mode = glue_output_modes[data_args.task_name]
except KeyError:
raise ValueError("Task not found: %s" % (data_args.task_name))
# Load pretrained model and tokenizer
#
# Distributed training:
# The .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
num_labels=num_labels,
finetuning_task=data_args.task_name,
cache_dir=model_args.cache_dir,
)
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
)
with training_args.strategy.scope():
model = TFAutoModelForSequenceClassification.from_pretrained(
model_args.model_name_or_path,
from_pt=bool(".bin" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
)
# Get datasets
train_dataset = (
get_tfds(
task_name=data_args.task_name,
tokenizer=tokenizer,
max_seq_length=data_args.max_seq_length,
data_dir=data_args.data_dir,
)
if training_args.do_train
else None
)
eval_dataset = (
get_tfds(
task_name=data_args.task_name,
tokenizer=tokenizer,
max_seq_length=data_args.max_seq_length,
mode=Split.dev,
data_dir=data_args.data_dir,
)
if training_args.do_eval
else None
)
def compute_metrics(p: EvalPrediction) -> Dict:
if output_mode == "classification":
preds = np.argmax(p.predictions, axis=1)
elif output_mode == "regression":
preds = np.squeeze(p.predictions)
return glue_compute_metrics(data_args.task_name, preds, p.label_ids)
# Initialize our Trainer
trainer = TFTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
compute_metrics=compute_metrics,
)
# Training
if training_args.do_train:
trainer.train()
trainer.save_model()
tokenizer.save_pretrained(training_args.output_dir)
# Evaluation
results = {}
if training_args.do_eval:
logger.info("*** Evaluate ***")
result = trainer.evaluate()
output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
with open(output_eval_file, "w") as writer:
logger.info("***** Eval results *****")
for key, value in result.items():
logger.info(" %s = %s", key, value)
writer.write("%s = %s\n" % (key, value))
results.update(result)
return results
if __name__ == "__main__":
main()
```
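As one way to isolate the problem, from_generator could be taken out of the picture entirely by building the dataset with from_tensor_slices from the already-loaded rows. Below is a sketch of gen_dataset rewritten that way (same TSV layout as above; not verified against the full run_tf_glue.py pipeline):

```Python
import csv
import tensorflow as tf

def gen_dataset_slices(subf):
    # Same TSV loading as gen_dataset, but the dataset is materialized with
    # from_tensor_slices instead of from_generator (untested sketch).
    categories = ['0', '1']
    category_to_id = {c: i for i, c in enumerate(categories)}
    origin = '/content/drive/My Drive/bert/Tabelog/' + subf + '.tsv'
    with open(origin, 'r', encoding='utf-8') as fin:
        reader = csv.reader(fin, delimiter='\t')
        records = [(index, row[0], category_to_id[row[1]])
                   for index, row in enumerate(reader) if row[1] in categories]
    idx, sentence, label = zip(*records)  # assumes at least one valid record
    datasets = tf.data.Dataset.from_tensor_slices({
        'idx': tf.constant(idx, dtype=tf.int64),  # match the original output_types
        'sentence': tf.constant(sentence),        # tf.string
        'label': tf.constant(label, dtype=tf.int64),
    })
    return datasets, len(records)
```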