質問の変更申し訳ありません。
GPUで実行すると下記のエラーが出ます
実行環境はAWSのp2インスタンス(p2.8xlarge)なので、メモリが足りないことはないと思うのですが、バッチサイズを8にしてもこのエラーが出てしまいます。
何が原因なのでしょうか?
ちなみにjupyter上ではなくAWSのEC2のターミナル上で実行しました
ご教示お願いします。
追記
python
# Create a session config that logs which device each op is placed on.
config = tf.ConfigProto(log_device_placement=True)
# Open a session with that config.
sess = tf.Session(config=config)

# Hand the session to the Keras backend.
K.set_session(sess)
に変更したうえで、画像サイズを小さくし、input関数で画像枚数を増やしている処理をなくせば良いのかなと思うのですが、いかがでしょうか。
python
1#エラー 2W tensorflow/core/common_runtime/bfc_allocator.cc:279] *************************************************************************************************xxx 32018-07-24 08:58:04.962110: W tensorflow/core/framework/op_kernel.cc:1295] OP_REQUIRES failed at constant_op.cc:75 : Resource exhausted: OOM when allocating tensor of shape [1,1,1088,192] and type float 42018-07-24 08:58:04.962293: E tensorflow/core/common_runtime/executor.cc:660] Executor failed to create kernel. Resource exhausted: OOM when allocating tensor of shape [1,1,1088,192] and type float 5 [[Node: training/SGD/zeros_176 = Const[dtype=DT_FLOAT, value=Tensor<type: float shape: [1,1,1088,192] values: [[[0 0 0]]]...>, _device="/job:localhost/replica:0/task:0/device:GPU:0"]()]] 6error 7Traceback (most recent call last): 8 File "Inception_resnet_v2_train.py", line 303, in <module> 9 coord.join(threads) 10 File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/coordinator.py", line 389, in join 11 six.reraise(*self._exc_info_to_raise) 12 File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/six.py", line 693, in reraise 13 raise value 14 File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/queue_runner_impl.py", line 252, in _run 15 enqueue_callable() 16 File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1244, in _single_operation_run 17 self._call_tf_sessionrun(None, {}, [], target_list, None) 18 File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1409, in _call_tf_sessionrun 19 run_metadata) 20tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[150,150,3] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc 21 [[Node: Cast_1 = Cast[DstT=DT_FLOAT, SrcT=DT_UINT8, _class=["loc:@random_flip_left_right/Switch_1"], 
_device="/job:localhost/replica:0/task:0/device:GPU:0"](Reshape)]] 22Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. 23 24 [[Node: per_image_standardization/_25 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_58_per_image_standardization", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]] 25Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
コード(一部抜粋)
python
1#input用の関数 2from __future__ import print_function 3from __future__ import absolute_import 4 5import warnings 6import time 7import os 8import math 9import numpy as np 10import tensorflow as tf 11from keras.optimizers import SGD 12from keras.callbacks import History 13from keras.callbacks import Callback 14from keras.callbacks import ModelCheckpoint 15from keras.callbacks import TensorBoard 16from keras.callbacks import CSVLogger 17from keras import layers 18from keras.preprocessing import image 19from keras.models import Model 20from keras.layers import Activation 21from keras.layers import AveragePooling2D 22from keras.layers import BatchNormalization 23from keras.layers import Concatenate 24from keras.layers import Conv2D 25from keras.layers import Dense 26from keras.layers import GlobalAveragePooling2D 27from keras.layers import GlobalMaxPooling2D 28from keras.layers import Input 29from keras.layers import Lambda 30from keras.layers import MaxPooling2D 31from keras.utils.data_utils import get_file 32from keras.engine.topology import get_source_inputs 33from keras import backend as K 34from keras import metrics 35from keras import utils as np_utils 36from keras.utils.vis_utils import plot_model, model_to_dot 37import matplotlib.pyplot as plt 38from keras.callbacks import EarlyStopping 39tf.logging.set_verbosity(tf.logging.ERROR) 40 41 42# In[2]: 43 44 45from tensorflow.python.client import device_lib 46device_lib.list_local_devices() 47 48 49# In[4]: 50 51 52def input_data(data_dir, batch_size, distort=False): 53 54 num_class = 45 55 filenames = [os.path.join(data_dir, 'train_%d.tfrecords' % i) 56 for i in range(1, 61)] 57 for f in filenames: 58 if not tf.gfile.Exists(f): 59 raise ValueError('Failed to find file: ' + f) 60 61 # Create a queue that produces the filenames to read. 
62 filename_queue = tf.train.string_input_producer(filenames) 63 reader = tf.TFRecordReader() 64 _, serialized_example = reader.read(filename_queue) 65 66 features = tf.parse_single_example(serialized_example, 67 features={"label": tf.FixedLenFeature([], tf.int64), 68 "image": tf.FixedLenFeature([], tf.string)}) 69 70 label = tf.cast(features["label"], tf.int32) 71 imgin = tf.reshape(tf.decode_raw(features["image"], tf.uint8), tf.stack([150, 150, 3])) 72 float_image = tf.cast(imgin, tf.float32) 73 74 num_preprocess_threads = 16 75 min_fraction_of_examples_in_queue = 0.4 76 NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 2900000 77 78 if distort is True: 79 distorted_image = tf.image.random_flip_left_right(float_image) 80 81 distorted_image = tf.image.random_brightness(distorted_image, max_delta=63) 82 distorted_image = tf.image.random_contrast(distorted_image, lower=0.2, upper=1.8) 83 distorted_image = tf.image.per_image_standardization(distorted_image) 84 distorted_image.set_shape([150, 150, 3]) 85 86 min_fraction_of_examples_in_queue = 0.4 87 min_queue_examples = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN * 88 min_fraction_of_examples_in_queue) 89 print ('Filling queue with %d CIFAR images before starting to train. ' 90 'This will take a few minutes.' 
% min_queue_examples) 91 92 images, label_batch = tf.train.shuffle_batch([distorted_image, label], batch_size=batch_size, 93 num_threads=num_preprocess_threads, capacity=min_queue_examples + 3 * batch_size, 94 min_after_dequeue=min_queue_examples) 95 96 else: 97 98 images, label_batch = tf.train.batch([float_image, label], batch_size=batch_size, 99 num_threads=num_preprocess_threads, capacity=min_queue_examples + 3 * batch_size, 100 min_after_dequeue=min_queue_examples) 101 102 return tf.subtract(tf.div(images,127.5), 1.0), tf.one_hot(tf.reshape(label_batch, [batch_size]),num_class) 103 104#session実行部 105config = tf.ConfigProto(allow_soft_placement=True) 106config.gpu_options.allocator_type = 'BFC' 107config.gpu_options.per_process_gpu_memory_fraction = 0.40 108config.gpu_options.allow_growth=True 109 110sess = K.get_session() 111train_image, train_labels = input_data('/home/ubuntu/train_tf',16, distort=True) 112input_ = Input(tensor=train_image) 113output_ = InceptionResNetV2(img_input=input_) 114train_model = Model(input_, output_, name='inception_resnet_v2') 115train_model.compile(optimizer=SGD(decay=0.1, momentum=0.9, nesterov=True), 116 loss='categorical_crossentropy', 117 metrics=['accuracy'], target_tensors=[train_labels]) 118 119 120# In[7]: 121 122 123history = History() 124callback = [] 125# callbacks.append(ModelCheckpoint(filepath="model.best.h5", save_best_only=True)) 126callback.append(history) 127callback.append(ModelCheckpoint(filepath="/home/ubuntu/check_dir/model.ep{epoch:02d}.h5")) 128callback.append(EarlyStopping("loss", patience=1)) 129 130# In[8]: 131coord = tf.train.Coordinator() 132threads = tf.train.start_queue_runners(sess, coord) 133try: 134 history = train_model.fit(epochs=10, steps_per_epoch=int(np.ceil(2900000/16)), callbacks=callback) 135 print(history) 136except: 137 print('error') 138 139coord.request_stop() 140coord.join(threads)
回答1件