Background and what I want to achieve
I implemented MobileNet V1 in TensorFlow and trained it on ImageNet image classification, but the loss barely decreases and the accuracy stays stuck at a constant value (neither the training nor the test accuracy moves, and on the training set both top-1 and top-5 accuracy stay at 0).
What kinds of causes should I suspect in this situation? (Is my implementation wrong, or is the model architecture itself unreasonable?)
Also, since the paper does not describe them in detail, I would appreciate details on the hyperparameters, the training environment, the input preprocessing, and so on. (I want to run under the same conditions as the paper so that I can judge whether this is a bug.)
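For concreteness, the kind of input pipeline I have in mind is the minimal sketch below (resize to 224x224 and scale pixels to [-1, 1]); this is purely my own assumption rather than anything taken from the paper, so please correct it if the paper's setup differs.

import tensorflow as tf

def preprocess_image(image_bytes):
    # Assumed preprocessing (not from the paper): decode the JPEG, resize to
    # the 224x224 input resolution of MobileNet V1, and scale pixels to [-1, 1].
    image = tf.image.decode_jpeg(image_bytes, channels=3)
    image = tf.cast(image, tf.float32)
    image = tf.image.resize_images(image, [224, 224])
    return image / 127.5 - 1.0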
Problem / error messages
The loss of the model I built (MobileNet V1) does not decrease (more precisely, it keeps oscillating between roughly 6.8 and 6.9).
From various experiments with modified versions of the source code, the problem appears to lie either in the depthwise separable convolution or in how the model is assembled.
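For reference, the value the loss is stuck at is essentially the cross-entropy of a uniform prediction over the 1000 ImageNet classes, which is why I believe the network is only producing chance-level outputs:

import math
# Cross-entropy of a uniform guess over 1000 classes:
# -log(1/1000) = log(1000) ≈ 6.91, matching the 6.8-6.9 range I observe.
print(math.log(1000))  # 6.907755...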
Relevant source code
Definition of a standard convolution
import tensorflow as tf

# _variable_with_weight_decay / _variable_on_device are helper functions
# defined elsewhere in my code (variable creation, with an optional weight-decay term).
def conv(batch_input, c_i, c_o, k_h, k_w, s_h, s_w, padding, weight_decay, name, variance=0.01, bias=0.0):
    # standard convolution + bias + ReLU (note: no batch normalization in this layer)
    with tf.variable_scope(name):
        weights = _variable_with_weight_decay('weight', [k_h, k_w, c_i, c_o],
                                              initializer=tf.random_normal_initializer(0.0, variance),
                                              weight_decay=weight_decay)
        biases = _variable_on_device('bias', [c_o], initializer=tf.constant_initializer(bias))
        return tf.nn.relu(tf.nn.bias_add(
            tf.nn.conv2d(batch_input, weights, [1, s_h, s_w, 1], padding=padding), biases))
Definition of the depthwise convolution
def depthwise_conv(batch_input, c_i, k_h, k_w, s_h, s_w, padding, weight_decay, name, variance=0.01, bias=0.0):
    # depthwise convolution + bias + batch normalization + ReLU
    # (tf.nn.batch_normalization is called with the constants mean=0, variance=1,
    #  offset=0, scale=1, epsilon=1e-8)
    with tf.variable_scope(name):
        weights = _variable_with_weight_decay('weight', [k_h, k_w, c_i, 1],
                                              initializer=tf.random_normal_initializer(0.0, variance),
                                              weight_decay=weight_decay)
        biases = _variable_on_device('bias', [c_i], initializer=tf.constant_initializer(bias))
        return tf.nn.relu(tf.nn.batch_normalization(
            tf.nn.bias_add(tf.nn.depthwise_conv2d(batch_input, weights, [1, s_h, s_w, 1], padding=padding), biases),
            0, 1, 0, 1, 1e-8))
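For comparison, my understanding of how tf.nn.batch_normalization is usually driven, with per-batch statistics from tf.nn.moments and a learnable offset/scale, is roughly the sketch below; in the code above I instead pass the constants 0, 1, 0, 1. Whether that difference matters is part of what I am asking.

def batch_norm(x, name, eps=1e-8):
    # Sketch only: normalize with the statistics of the current batch and a
    # learnable offset (beta) / scale (gamma). A full implementation would also
    # track moving averages for use at test time; that part is omitted here.
    with tf.variable_scope(name):
        channels = x.get_shape().as_list()[-1]
        beta = tf.get_variable('beta', [channels], initializer=tf.zeros_initializer())
        gamma = tf.get_variable('gamma', [channels], initializer=tf.ones_initializer())
        mean, variance = tf.nn.moments(x, axes=[0, 1, 2])
        return tf.nn.batch_normalization(x, mean, variance, beta, gamma, eps)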
Definition of the pointwise convolution
def pointwise_conv(batch_input, c_i, c_o, s_h, s_w, padding, weight_decay, name, variance=0.01, bias=0.0):
    # 1x1 (pointwise) convolution + bias + batch normalization + ReLU
    # (same constant batch-normalization parameters as in depthwise_conv)
    with tf.variable_scope(name):
        weights = _variable_with_weight_decay('weight', [1, 1, c_i, c_o],
                                              initializer=tf.random_normal_initializer(0.0, variance),
                                              weight_decay=weight_decay)
        biases = _variable_on_device('bias', [c_o], initializer=tf.constant_initializer(bias))
        return tf.nn.relu(tf.nn.batch_normalization(
            tf.nn.bias_add(tf.nn.conv2d(batch_input, weights, [1, s_h, s_w, 1], padding=padding), biases),
            0, 1, 0, 1, 1e-8))
Definition of average pooling
def avg_pool(batch_input, kernel_size, stride, name):
    with tf.variable_scope(name):
        return tf.nn.avg_pool(batch_input,
                              ksize=[1, kernel_size, kernel_size, 1],
                              strides=[1, stride, stride, 1],
                              padding='VALID')
Definition of the fully connected layer
def output(batch_input, c_i, c_o, weight_decay, name, variance=0.01, bias=0.0):
    with tf.variable_scope(name):
        weights = _variable_with_weight_decay('weight', [c_i, c_o],
                                              initializer=tf.random_normal_initializer(0.0, variance),
                                              weight_decay=weight_decay)
        biases = _variable_on_device('bias', [c_o], initializer=tf.constant_initializer(bias))
        return tf.nn.bias_add(tf.matmul(batch_input, weights), biases)
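This layer returns raw logits with no softmax applied. To make the setup concrete, a standard way to turn such logits into a loss in TF 1.x is the sketch below (placeholder names; not necessarily identical to my training script):

labels = tf.placeholder(tf.int64, [None])
# `logits` stands for the (batch, 1000) output of the model function defined next.
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))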
Definition of the model
def separable(inputs, weight_decay):
    conv1_1   = conv(inputs, c_i=3, c_o=32, k_h=3, k_w=3, s_h=2, s_w=2, padding='SAME', weight_decay=weight_decay, name='conv1_1')
    conv2_1_1 = depthwise_conv(conv1_1, c_i=32, k_h=3, k_w=3, s_h=1, s_w=1, padding='SAME', weight_decay=None, name='conv2_1_1')
    conv2_1_2 = pointwise_conv(conv2_1_1, c_i=32, c_o=64, s_h=1, s_w=1, padding='SAME', weight_decay=weight_decay, name='conv2_1_2')
    conv2_2_1 = depthwise_conv(conv2_1_2, c_i=64, k_h=3, k_w=3, s_h=2, s_w=2, padding='SAME', weight_decay=None, name='conv2_2_1')
    conv2_2_2 = pointwise_conv(conv2_2_1, c_i=64, c_o=128, s_h=1, s_w=1, padding='SAME', weight_decay=weight_decay, name='conv2_2_2')
    conv3_1_1 = depthwise_conv(conv2_2_2, c_i=128, k_h=3, k_w=3, s_h=1, s_w=1, padding='SAME', weight_decay=None, name='conv3_1_1')
    conv3_1_2 = pointwise_conv(conv3_1_1, c_i=128, c_o=128, s_h=1, s_w=1, padding='SAME', weight_decay=weight_decay, name='conv3_1_2')
    conv3_2_1 = depthwise_conv(conv3_1_2, c_i=128, k_h=3, k_w=3, s_h=2, s_w=2, padding='SAME', weight_decay=None, name='conv3_2_1')
    conv3_2_2 = pointwise_conv(conv3_2_1, c_i=128, c_o=256, s_h=1, s_w=1, padding='SAME', weight_decay=weight_decay, name='conv3_2_2')
    conv4_1_1 = depthwise_conv(conv3_2_2, c_i=256, k_h=3, k_w=3, s_h=1, s_w=1, padding='SAME', weight_decay=None, name='conv4_1_1')
    conv4_1_2 = pointwise_conv(conv4_1_1, c_i=256, c_o=256, s_h=1, s_w=1, padding='SAME', weight_decay=weight_decay, name='conv4_1_2')
    conv4_2_1 = depthwise_conv(conv4_1_2, c_i=256, k_h=3, k_w=3, s_h=2, s_w=2, padding='SAME', weight_decay=None, name='conv4_2_1')
    conv4_2_2 = pointwise_conv(conv4_2_1, c_i=256, c_o=512, s_h=1, s_w=1, padding='SAME', weight_decay=weight_decay, name='conv4_2_2')
    conv5_1_1 = depthwise_conv(conv4_2_2, c_i=512, k_h=3, k_w=3, s_h=1, s_w=1, padding='SAME', weight_decay=None, name='conv5_1_1')
    conv5_1_2 = pointwise_conv(conv5_1_1, c_i=512, c_o=512, s_h=1, s_w=1, padding='SAME', weight_decay=weight_decay, name='conv5_1_2')
    conv5_2_1 = depthwise_conv(conv5_1_2, c_i=512, k_h=3, k_w=3, s_h=1, s_w=1, padding='SAME', weight_decay=None, name='conv5_2_1')
    conv5_2_2 = pointwise_conv(conv5_2_1, c_i=512, c_o=512, s_h=1, s_w=1, padding='SAME', weight_decay=weight_decay, name='conv5_2_2')
    conv5_3_1 = depthwise_conv(conv5_2_2, c_i=512, k_h=3, k_w=3, s_h=1, s_w=1, padding='SAME', weight_decay=None, name='conv5_3_1')
    conv5_3_2 = pointwise_conv(conv5_3_1, c_i=512, c_o=512, s_h=1, s_w=1, padding='SAME', weight_decay=weight_decay, name='conv5_3_2')
    conv5_4_1 = depthwise_conv(conv5_3_2, c_i=512, k_h=3, k_w=3, s_h=1, s_w=1, padding='SAME', weight_decay=None, name='conv5_4_1')
    conv5_4_2 = pointwise_conv(conv5_4_1, c_i=512, c_o=512, s_h=1, s_w=1, padding='SAME', weight_decay=weight_decay, name='conv5_4_2')
    conv5_5_1 = depthwise_conv(conv5_4_2, c_i=512, k_h=3, k_w=3, s_h=1, s_w=1, padding='SAME', weight_decay=None, name='conv5_5_1')
    conv5_5_2 = pointwise_conv(conv5_5_1, c_i=512, c_o=512, s_h=1, s_w=1, padding='SAME', weight_decay=weight_decay, name='conv5_5_2')
    conv5_6_1 = depthwise_conv(conv5_5_2, c_i=512, k_h=3, k_w=3, s_h=2, s_w=2, padding='SAME', weight_decay=None, name='conv5_6_1')
    conv5_6_2 = pointwise_conv(conv5_6_1, c_i=512, c_o=1024, s_h=1, s_w=1, padding='SAME', weight_decay=weight_decay, name='conv5_6_2')
    conv6_1_1 = depthwise_conv(conv5_6_2, c_i=1024, k_h=3, k_w=3, s_h=1, s_w=1, padding='SAME', weight_decay=None, name='conv6_1_1')
    conv6_1_2 = pointwise_conv(conv6_1_1, c_i=1024, c_o=1024, s_h=1, s_w=1, padding='SAME', weight_decay=weight_decay, name='conv6_1_2')
    pool6 = avg_pool(conv6_1_2, kernel_size=7, stride=1, name='pool6')
    flat = tf.reshape(pool6, [-1, 1 * 1 * 1024])
    fc1 = output(flat, c_i=1 * 1 * 1024, c_o=1000, weight_decay=weight_decay, name='fc1', variance=0.01, bias=0.0)
    return fc1
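As a quick structural sanity check (a rough sketch reusing the functions above), building the graph with a dummy 224x224x3 input should give logits of shape (?, 1000), since the five stride-2 stages reduce 224 down to 7 before the 7x7 average pool:

inputs = tf.placeholder(tf.float32, [None, 224, 224, 3])
logits = separable(inputs, weight_decay=0.0)  # weight_decay value here is a placeholder
print(logits.get_shape())  # expecting (?, 1000)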
What I have tried
・I implemented VGG16 and trained it under the same conditions, and it learned fine, so the bug does not appear to be in the loss or gradient computation.
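・Another check I am considering, to confirm that gradients actually reach the depthwise kernels (a rough sketch, assuming a `loss` tensor like the one sketched after the fully connected layer definition):

# Rough sketch of a gradient-flow check.
grads = tf.gradients(loss, tf.trainable_variables())
for var, grad in zip(tf.trainable_variables(), grads):
    if grad is None:
        print('no gradient flows into', var.name)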
Additional information (framework / tool versions, etc.)
The TensorFlow version is 1.14.
I realize the source code is fragmentary and may be hard to follow; I will add more details as needed in response to replies.