Kerasを２台のGPUを使って並列化させたが、逆に遅くなってしまう

前提・実現したいこと

GPUが２台あるので、Kerasで学習を並列化して計算を速くしたいのですが、結果としてかえって遅くなってしまいました。
参考にした記事はこちらです。http://tech.wonderpla.net/entry/2018/01/09/110000

この記事では、generatorがボトルネックになるため、fit_generator()もさらに並列化しています。
同じ対応を行いましたが、それでもGPU1台のほうが速いです。

発生している問題・エラーメッセージ

GPU1台：９秒
GPU2台：15秒
GPU2台＋fit_generatorを並列化:12秒

該当のソースコード

python
1# GPU１台の場合
2from __future__ import print_function
3import keras
4from keras.datasets import cifar10
5from keras.preprocessing.image import ImageDataGenerator
6from keras.models import Sequential
7from keras.layers import Dense, Dropout, Activation, Flatten
8from keras.layers import Conv2D, MaxPooling2D
9import os
10
11
# Hyper-parameters for the single-GPU baseline run.
batch_size = 32
num_classes = 10
epochs = 5

num_predictions = 20  # not referenced anywhere else in this script


# Load CIFAR-10 (already split into train and test sets) and report sizes.
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# One-hot encode the integer class labels.
y_train, y_test = [keras.utils.to_categorical(labels, num_classes)
                   for labels in (y_train, y_test)]

# Build the network: two conv/conv/pool/dropout stages followed by a
# dense classifier head. Layers are listed in the order they are applied.
cnn_layers = [
    Conv2D(32, (3, 3), padding='same', input_shape=x_train.shape[1:]),
    Activation('relu'),
    Conv2D(32, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),

    Conv2D(64, (3, 3), padding='same'),
    Activation('relu'),
    Conv2D(64, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),

    Flatten(),
    Dense(512),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
]
model = Sequential(cnn_layers)

# Train with RMSprop on categorical cross-entropy, tracking accuracy.
opt = keras.optimizers.rmsprop(lr=0.0001, decay=1e-6)
model.compile(optimizer=opt,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Convert the images to float32 and scale them by 1/255.
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255


print('Using real-time data augmentation.')
# Preprocessing / real-time augmentation settings. Only the random shifts
# and the horizontal flip are switched on; everything else is disabled.
augmentation_options = dict(
    featurewise_center=False,             # set input mean to 0 over the dataset
    samplewise_center=False,              # set each sample mean to 0
    featurewise_std_normalization=False,  # divide inputs by std of the dataset
    samplewise_std_normalization=False,   # divide each input by its std
    zca_whitening=False,                  # apply ZCA whitening
    rotation_range=0,                     # random rotation range in degrees
    width_shift_range=0.1,                # horizontal shift, fraction of width
    height_shift_range=0.1,               # vertical shift, fraction of height
    horizontal_flip=True,                 # randomly flip images horizontally
    vertical_flip=False,                  # randomly flip images vertically
)
datagen = ImageDataGenerator(**augmentation_options)

# Compute the statistics used by feature-wise normalization / ZCA whitening.
# All of those options are disabled above; the call is kept so the script
# performs the same steps as before.
datagen.fit(x_train)

# Train on the augmented batches produced by datagen.flow().
train_batches = datagen.flow(x_train, y_train, batch_size=batch_size)
model.fit_generator(train_batches,
                    steps_per_epoch=1000,
                    epochs=epochs,
                    validation_data=(x_test, y_test),
                    workers=4)


# Evaluate the trained model on the test set and report loss and accuracy.
scores = model.evaluate(x_test, y_test, verbose=1)
for label, value in zip(('Test loss:', 'Test accuracy:'), scores):
    print(label, value)

python
1# GPU2台の場合
2from __future__ import print_function
3import keras
4from keras.datasets import cifar10
5from keras.preprocessing.image import ImageDataGenerator
6from keras.models import Sequential
7from keras.layers import Dense, Dropout, Activation, Flatten
8from keras.layers import Conv2D, MaxPooling2D
9import os
10import tensorflow as tf # add
11from keras.utils import multi_gpu_model # add
12
gpu_count = 2  # number of GPUs the model is replicated on

# Each GPU receives 32 samples per step, so the global batch grows with the
# number of GPUs.
batch_size = 32 * gpu_count
# The single-GPU baseline runs 1000 steps of 32 samples per epoch. Because
# the global batch is gpu_count times larger here, the step count is divided
# by gpu_count so every epoch still covers the same number of samples.
# Leaving it at 1000 would make each epoch process gpu_count times more data
# and make the epoch timings impossible to compare.
steps_per_epoch = 1000 // gpu_count
num_classes = 10
epochs = 5
data_augmentation = True
num_predictions = 20  # not referenced anywhere else in this script
save_dir = os.path.join(os.getcwd(), 'saved_models')
model_name = 'keras_cifar10_trained_model.h5'


# The data, shuffled and split between train and test sets:
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# Convert class vectors to binary class matrices.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

# Build the template model on the CPU so that its weights are hosted there;
# multi_gpu_model() then creates one replica per GPU that shares them.
with tf.device("/cpu:0"):
    template_model = Sequential()
    template_model.add(Conv2D(32, (3, 3), padding='same',
                              input_shape=x_train.shape[1:]))
    template_model.add(Activation('relu'))
    template_model.add(Conv2D(32, (3, 3)))
    template_model.add(Activation('relu'))
    template_model.add(MaxPooling2D(pool_size=(2, 2)))
    template_model.add(Dropout(0.25))

    template_model.add(Conv2D(64, (3, 3), padding='same'))
    template_model.add(Activation('relu'))
    template_model.add(Conv2D(64, (3, 3)))
    template_model.add(Activation('relu'))
    template_model.add(MaxPooling2D(pool_size=(2, 2)))
    template_model.add(Dropout(0.25))

    template_model.add(Flatten())
    template_model.add(Dense(512))
    template_model.add(Activation('relu'))
    template_model.add(Dropout(0.5))
    template_model.add(Dense(num_classes))
    template_model.add(Activation('softmax'))

# Data-parallel wrapper used for training and evaluation. The template model
# is kept under its own name because it is the one that gets saved below.
model = multi_gpu_model(template_model, gpus=gpu_count)

# initiate RMSprop optimizer (canonical class name rather than the
# lowercase alias)
opt = keras.optimizers.RMSprop(lr=0.0001, decay=1e-6)

# Let's train the model using RMSprop
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

# Convert the images to float32 and scale them by 1/255.
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255

if not data_augmentation:
    print('Not using data augmentation.')
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_test, y_test),
              shuffle=True)
else:
    print('Using real-time data augmentation.')
    # This will do preprocessing and realtime data augmentation:
    datagen = ImageDataGenerator(
        featurewise_center=False,  # set input mean to 0 over the dataset
        samplewise_center=False,  # set each sample mean to 0
        featurewise_std_normalization=False,  # divide inputs by std of the dataset
        samplewise_std_normalization=False,  # divide each input by its std
        zca_whitening=False,  # apply ZCA whitening
        rotation_range=0,  # randomly rotate images in the range (degrees, 0 to 180)
        width_shift_range=0.1,  # randomly shift images horizontally (fraction of total width)
        height_shift_range=0.1,  # randomly shift images vertically (fraction of total height)
        horizontal_flip=True,  # randomly flip images horizontally
        vertical_flip=False)  # randomly flip images vertically

    # Compute quantities required for feature-wise normalization
    # (std, mean, and principal components if ZCA whitening is applied).
    # All of those options are disabled above; the call is kept for parity
    # with the single-GPU script.
    datagen.fit(x_train)

    # Fit the model on the batches generated by datagen.flow(). Batch
    # generation is spread over worker processes with a deeper queue than
    # the baseline.
    model.fit_generator(datagen.flow(x_train, y_train,
                                     batch_size=batch_size),
                        steps_per_epoch=steps_per_epoch,
                        epochs=epochs,
                        validation_data=(x_test, y_test),
                        workers=32,
                        max_queue_size=64,
                        use_multiprocessing=True)


# Save model and weights. The template model shares its weights with the
# multi-GPU wrapper, and the Keras documentation for multi_gpu_model says to
# save through the template rather than through the wrapper.
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
model_path = os.path.join(save_dir, model_name)
template_model.save(model_path)
print('Saved trained model at %s ' % model_path)

# Score trained model.
scores = model.evaluate(x_test, y_test, verbose=1)
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])