GPUのエラー'OOM when allocating tensor'について

質問の変更申し訳ありません。

GPUで実行すると下記のエラーが出ます

実行環境はAWSのp2インスタンスのp2.8xlargeなのでメモリが足りないことはないと思うのですが、バッチを8にしてもこのエラーが出てしまいます。

何が原因なのでしょうか？
ちなみにjupyter上ではなくAWSのEC２のターミナル上で実行しました
ご教授お願いします

追記

python
1config = tf.ConfigProto(log_device_placement=True) 
2sess = tf.Session(config=config) 
3
4K.set_session(sess)

に変更して、画像サイズ減らす、input関数の画像枚数増やす処理なくせば良いのかなと思うのですが

python
1＃エラー
2W tensorflow/core/common_runtime/bfc_allocator.cc:279] *************************************************************************************************xxx
32018-07-24 08:58:04.962110: W tensorflow/core/framework/op_kernel.cc:1295] OP_REQUIRES failed at constant_op.cc:75 : Resource exhausted: OOM when allocating tensor of shape [1,1,1088,192] and type float
42018-07-24 08:58:04.962293: E tensorflow/core/common_runtime/executor.cc:660] Executor failed to create kernel. Resource exhausted: OOM when allocating tensor of shape [1,1,1088,192] and type float
5	 [[Node: training/SGD/zeros_176 = Const[dtype=DT_FLOAT, value=Tensor<type: float shape: [1,1,1088,192] values: [[[0 0 0]]]...>, _device="/job:localhost/replica:0/task:0/device:GPU:0"]()]]
6error
7Traceback (most recent call last):
8  File "Inception_resnet_v2_train.py", line 303, in <module>
9    coord.join(threads) 
10  File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/coordinator.py", line 389, in join
11    six.reraise(*self._exc_info_to_raise)
12  File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/six.py", line 693, in reraise
13    raise value
14  File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/queue_runner_impl.py", line 252, in _run
15    enqueue_callable()
16  File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1244, in _single_operation_run
17    self._call_tf_sessionrun(None, {}, [], target_list, None)
18  File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1409, in _call_tf_sessionrun
19    run_metadata)
20tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[150,150,3] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
21	 [[Node: Cast_1 = Cast[DstT=DT_FLOAT, SrcT=DT_UINT8, _class=["loc:@random_flip_left_right/Switch_1"], _device="/job:localhost/replica:0/task:0/device:GPU:0"](Reshape)]]
22Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
23
24	 [[Node: per_image_standardization/_25 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_58_per_image_standardization", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
25Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

コード（一部抜粋）

python
1＃input用の関数
2from __future__ import print_function
3from __future__ import absolute_import
4
5import warnings
6import time
7import os
8import math
9import numpy as np
10import tensorflow as tf
11from keras.optimizers import SGD
12from keras.callbacks import History
13from keras.callbacks import Callback
14from keras.callbacks import ModelCheckpoint
15from keras.callbacks import TensorBoard
16from keras.callbacks import CSVLogger
17from keras import layers
18from keras.preprocessing import image
19from keras.models import Model
20from keras.layers import Activation
21from keras.layers import AveragePooling2D
22from keras.layers import BatchNormalization
23from keras.layers import Concatenate
24from keras.layers import Conv2D
25from keras.layers import Dense
26from keras.layers import GlobalAveragePooling2D
27from keras.layers import GlobalMaxPooling2D
28from keras.layers import Input
29from keras.layers import Lambda
30from keras.layers import MaxPooling2D
31from keras.utils.data_utils import get_file
32from keras.engine.topology import get_source_inputs
33from keras import backend as K
34from keras import metrics
35from keras import utils as np_utils
36from keras.utils.vis_utils import plot_model, model_to_dot
37import matplotlib.pyplot as plt
38from keras.callbacks import EarlyStopping
39tf.logging.set_verbosity(tf.logging.ERROR)
40
41
42# In[2]:
43
44
45from tensorflow.python.client import device_lib
46device_lib.list_local_devices()
47
48
49# In[4]:
50
51
52def input_data(data_dir, batch_size, distort=False):
53    
54    num_class = 45
55    filenames = [os.path.join(data_dir, 'train_%d.tfrecords' % i)
56               for i in range(1, 61)]
57    for f in filenames:
58        if not tf.gfile.Exists(f):
59            raise ValueError('Failed to find file: ' + f)
60
61    # Create a queue that produces the filenames to read.
62    filename_queue = tf.train.string_input_producer(filenames)
63    reader = tf.TFRecordReader()
64    _, serialized_example = reader.read(filename_queue)
65    
66    features = tf.parse_single_example(serialized_example,
67      features={"label": tf.FixedLenFeature([], tf.int64),
68          "image": tf.FixedLenFeature([], tf.string)})
69    
70    label = tf.cast(features["label"], tf.int32)
71    imgin = tf.reshape(tf.decode_raw(features["image"], tf.uint8), tf.stack([150, 150, 3]))
72    float_image = tf.cast(imgin, tf.float32)
73    
74    num_preprocess_threads = 16
75    min_fraction_of_examples_in_queue = 0.4
76    NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 2900000
77    
78    if distort is True:
79        distorted_image = tf.image.random_flip_left_right(float_image)
80    
81        distorted_image = tf.image.random_brightness(distorted_image, max_delta=63)
82        distorted_image = tf.image.random_contrast(distorted_image, lower=0.2, upper=1.8)
83        distorted_image = tf.image.per_image_standardization(distorted_image)
84        distorted_image.set_shape([150, 150, 3])
85        
86        min_fraction_of_examples_in_queue = 0.4
87        min_queue_examples = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN *
88                            min_fraction_of_examples_in_queue)
89        print ('Filling queue with %d CIFAR images before starting to train. '
90         'This will take a few minutes.' % min_queue_examples)
91      
92        images, label_batch = tf.train.shuffle_batch([distorted_image, label], batch_size=batch_size,
93        num_threads=num_preprocess_threads, capacity=min_queue_examples + 3 * batch_size,
94        min_after_dequeue=min_queue_examples)
95    
96    else:
97    
98        images, label_batch = tf.train.batch([float_image, label], batch_size=batch_size,
99        num_threads=num_preprocess_threads, capacity=min_queue_examples + 3 * batch_size,
100        min_after_dequeue=min_queue_examples)
101        
102    return tf.subtract(tf.div(images,127.5), 1.0), tf.one_hot(tf.reshape(label_batch, [batch_size]),num_class)
103
104＃session実行部
105config = tf.ConfigProto(allow_soft_placement=True)
106config.gpu_options.allocator_type = 'BFC'
107config.gpu_options.per_process_gpu_memory_fraction = 0.40
108config.gpu_options.allow_growth=True
109
110sess = K.get_session()
111train_image, train_labels = input_data('/home/ubuntu/train_tf',16, distort=True)
112input_ = Input(tensor=train_image)
113output_ = InceptionResNetV2(img_input=input_)
114train_model = Model(input_, output_, name='inception_resnet_v2')
115train_model.compile(optimizer=SGD(decay=0.1, momentum=0.9, nesterov=True),
116                        loss='categorical_crossentropy',
117                    metrics=['accuracy'], target_tensors=[train_labels])
118
119
120# In[7]:
121
122
123history = History()
124callback = []
125# callbacks.append(ModelCheckpoint(filepath="model.best.h5", save_best_only=True))
126callback.append(history)
127callback.append(ModelCheckpoint(filepath="/home/ubuntu/check_dir/model.ep{epoch:02d}.h5"))
128callback.append(EarlyStopping("loss", patience=1))
129
130# In[8]:
131coord = tf.train.Coordinator()
132threads = tf.train.start_queue_runners(sess, coord)
133try:
134    history = train_model.fit(epochs=10, steps_per_epoch=int(np.ceil(2900000/16)), callbacks=callback)
135    print(history)
136except:
137    print('error')
138
139coord.request_stop()
140coord.join(threads)

t_obara

2018/07/24 03:14

提示された資料はなんのラベルづけもないような時系列データに対する異常検知であり、単語という一意の情報の単位時間あたりの発生数を見るのであれば、ご自身がご提示されているようにcountしたものだけに着目して、単位時間あたりの上昇率のみで判断するのでも十分に思うのですが、何かご懸念されている点はあるのでしょうか？

trafalbad

2018/07/24 04:00 編集

目的は特定の単語の上昇の検知です。countしたものは複数の単語数であり、単一の単語ではないので、特定の単語の上昇によるものか判断できないためです。異常検知手法で上昇数から単語の特定というのは可能でしょうか？

行動規範の内容に同意します

回答1件

自己解決

python
1resize

と

python
1 #distorted_image = tf.image.random_brightness(distorted_image, max_delta=63)
2        #distorted_image = tf.image.random_contrast(distorted_image, lower=0.2, upper=1.8)
3        #distorted_image = tf.image.per_image_standardization(distorted_image)
4

とって、configの設定以下の通りにした

python
1config = tf.ConfigProto(log_device_placement=True) 
2sess = tf.Session(config=config) 
3K.set_session(sess)
4
5＃ input関数
6
7def input_data(data_dir, batch_size, distort=False):
8    
9    num_class = 45
10    filenames = [os.path.join(data_dir, 'train_%d.tfrecords' % i)
11               for i in range(1, 61)]
12    for f in filenames:
13        if not tf.gfile.Exists(f):
14            raise ValueError('Failed to find file: ' + f)
15
16    # Create a queue that produces the filenames to read.
17    filename_queue = tf.train.string_input_producer(filenames)
18    reader = tf.TFRecordReader()
19    _, serialized_example = reader.read(filename_queue)
20    
21    features = tf.parse_single_example(serialized_example,
22      features={"label": tf.FixedLenFeature([], tf.int64),
23          "image": tf.FixedLenFeature([], tf.string)})
24    
25    label = tf.cast(features["label"], tf.int32)
26    imgin = tf.reshape(tf.decode_raw(features["image"], tf.uint8), tf.stack([150, 150, 3]))
27    float_image = tf.cast(imgin, tf.float32)
28    float_image = tf.image.resize_images(float_image, [90, 90])
29    num_preprocess_threads = 16
30    min_fraction_of_examples_in_queue = 0.4
31    NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 2900000
32    
33    if distort is True:
34        distorted_image = tf.image.random_flip_left_right(float_image)
35    
36        #distorted_image = tf.image.random_brightness(distorted_image, max_delta=63)
37        #distorted_image = tf.image.random_contrast(distorted_image, lower=0.2, upper=1.8)
38        #distorted_image = tf.image.per_image_standardization(distorted_image)
39        distorted_image.set_shape([90, 90, 3])
40        
41        min_queue_examples = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN *
42                            min_fraction_of_examples_in_queue)
43        print ('Filling queue with %d CIFAR images before starting to train. '
44         'This will take a few minutes.' % min_queue_examples)
45      
46        images, label_batch = tf.train.shuffle_batch([distorted_image, label], batch_size=batch_size,
47        num_threads=num_preprocess_threads, capacity=min_queue_examples + 3 * batch_size,
48        min_after_dequeue=min_queue_examples)
49    
50    else:
51    
52        images, label_batch = tf.train.batch([float_image, label], batch_size=batch_size,
53        num_threads=num_preprocess_threads, capacity=min_queue_examples + 3 * batch_size,
54        min_after_dequeue=min_queue_examples)
55        
56    return tf.subtract(tf.div(images,127.5), 1.0), tf.one_hot(tf.reshape(label_batch, [batch_size]),num_class)