keras Seq2seq: TimeDistributed()を使って各タイムステップにCustomLayerを付加した時のテンソルサイズエラー

初投稿です。
kerasを用いたseq2seqモデルへのAttentionの実装を行なっています。
下記のコードのmyModel()内のようにTimeDistributed()を使ってdecoderの各タイムステップにCustomLayerを付加する形で実装を試みたところ学習時に次のような形でエラーが出ました（一部抜粋）。

※batch_size=200, encoderのタイムステップ長=118, decoderのタイムステップ長=8, 隠れ状態ベクトルの次元数=300

Train on 50272 samples, validate on 5586 samples
Epoch 1/300
2019-04-11 17:53:00.020675: W tensorflow/core/framework/op_kernel.cc:1192] Invalid argument: Incompatible shapes: [1600,118,300] vs. [200,118,300]

エラー発生箇所はMyLayer()のcall()内です。
[1600,118,300]に注目してほしいのですが、CustomLayerへの入力x(dec)でバッチサイズ次元がbatch_size(200)* decoderのタイムステップ長(8)に変更されてしまっているようで、その後の演算を行う際にエラーが出ているようです。TimeDistributed()がうまく機能していないように思えるのですが原因等ご教授ください。

python3
1import keras
2from keras.models import Model
3from keras.layers import Dense,Lambda
4from keras.engine.topology import Input
5from keras.layers.embeddings import Embedding
6from keras.layers.recurrent import LSTM,GRU,SimpleRNN
7from keras.layers.wrappers import TimeDistributed
8
9from keras import backend as K
10from keras.engine.topology import Layer
11
12class MyLayer(Layer):
13
14    def __init__(self, encoder=None,input_length=1,**kwargs):
15        self.output_dim = 300
16        self.encoder=encoder
17        self.input_length=input_length
18        super().__init__(**kwargs)
19
20    def build(self,input_shape):
21
22        self.WA = self.add_weight(name='wa',
23                                 shape=(300, 600),
24                                 initializer='uniform',
25                                 trainable=True)
26
27    def call(self,x):
28        enc = self.encoder #(batchsize,118,300)
29        dec = x #(batchsize,300) 学習時に(batchsize*8,300)になってしまう。
30        WA = self.WA
31        IL = self.input_length
32
33        dec_norm = K.repeat(dec,IL) #(batchsize,118,300)
34
35        omega =  dec_norm * enc #(batchsize,118,300)*(batchsize,118,300)=(batchsize,118,300)
36        omega_sum = K.sum(omega,axis=2)  #(batchsize,118)
37
38        a = K.softmax(omega_sum)
39        a = K.expand_dims(a,axis=-1)
40        a = K.repeat_elements(a,300,axis=-1) #(batchsize,118,300)
41
42        h_bar = K.sum(a*enc, axis=1) #(batchsize,118,300)*(batchsize,118,300)=(batchsize,118,300) =>(batchsize,300)
43
44        vecs = K.concatenate([h_bar,dec]) #(batchsize,600)
45        vecs = K.reshape(vecs, (-1,600,1)) #(batchsize,600,1)
46
47        output = K.tanh(K.dot(WA,vecs))
48        output = K.permute_dimensions(output,(1,0,2))
49        output = K.reshape(output,(-1,300)) #(batch_size,300)
50
51        return output
52
53LATENT_DIM=300
54INPUT_VOCAB_SIZE=5000
55
56
57def myModel(len_input):
58
59　　#encoder
60    encoder_inputs = Input(shape=(None,))
61    encoder_embedding = Embedding(INPUT_VOCAB_SIZE, 500, name='enc_emb')(encoder_inputs)
62    encoder = GRU(LATENT_DIM, return_sequences=True, return_state=True, name='encoder')
63    encoder_outputs, encoder_states = encoder(encoder_embedding)
64
65    #decoder
66    decoder_inputs = Input(shape=(None,))
67    decoder_embedding = Embedding(39, 100, name='dec_emb')(decoder_inputs)
68    decoder = GRU(LATENT_DIM, return_sequences=True, return_state=True, name='decoder')
69    decoder_outputs, _ = decoder(decoder_embedding, initial_state=encoder_states)
70
71    #custom_layer
72    attention = MyLayer(encoder=encoder_outputs, input_length=len_input) 
73    att = TimeDistributed(attention)(decoder_outputs) 
74
75    decoder_dense = Dense(39, activation='softmax', name='dec_dense')
76    decoder_final_outputs = decoder_dense(att)
77
78    model = Model([encoder_inputs, decoder_inputs], decoder_final_outputs)
79
80    return model
81
82
83def main():
84
85    model = myModel(len_input) #len_input encoderのタイムステップ長118
86
87    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
88    model.summary()
89
90    history = model.fit([x_train, y_train_input],y_train_output, batch_size=200, nb_epoch=300,validation_split=0.1)
91
92
93if __name__ == '__main__':
94    main()

参考　エラーメッセージ全貼り

Train on 50272 samples, validate on 5586 samples
Epoch 1/300
2019-04-11 17:53:00.020675: W tensorflow/core/framework/op_kernel.cc:1192] Invalid argument: Incompatible shapes: [1600,118,300] vs. [200,118,300]
	 [[Node: time_distributed_1/mul = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](time_distributed_1/Tile, encoder/transpose_1)]]
Traceback (most recent call last):
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1323, in _do_call
    return fn(*args)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1302, in _run_fn
    status, run_metadata)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/errors_impl.py", line 473, in __exit__
    c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [1600,118,300] vs. [200,118,300]
	 [[Node: time_distributed_1/mul = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](time_distributed_1/Tile, encoder/transpose_1)]]
	 [[Node: loss/mul/_77 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_4737_loss/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "attention0411.py", line 250, in <module>
    main()
  File "attention0411.py", line 246, in main
    history = model.fit([x_train, y_train_input],y_train_output, batch_size=200, nb_epoch=300,validation_split=0.1)
  File "/usr/local/lib/python3.5/dist-packages/keras/engine/training.py", line 1598, in fit
    validation_steps=validation_steps)
  File "/usr/local/lib/python3.5/dist-packages/keras/engine/training.py", line 1183, in _fit_loop
    outs = f(ins_batch)
  File "/usr/local/lib/python3.5/dist-packages/keras/backend/tensorflow_backend.py", line 2273, in __call__
    **self.session_kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 889, in run
    run_metadata_ptr)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1120, in _run
    feed_dict_tensor, options, run_metadata)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1317, in _do_run
    options, run_metadata)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1336, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [1600,118,300] vs. [200,118,300]
	 [[Node: time_distributed_1/mul = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](time_distributed_1/Tile, encoder/transpose_1)]]
	 [[Node: loss/mul/_77 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_4737_loss/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'time_distributed_1/mul', defined at:
  File "attention0411.py", line 250, in <module>
    main()
  File "attention0411.py", line 241, in main
    model = myModel(max_word_len_input)
  File "attention0411.py", line 214, in myModel
    att=TimeDistributed(attention)(decoder_outputs)
  File "/usr/local/lib/python3.5/dist-packages/keras/engine/topology.py", line 602, in __call__
    output = self.call(inputs, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/keras/layers/wrappers.py", line 203, in call
    y = self.layer.call(inputs, **kwargs)
  File "attention0411.py", line 165, in call
    omega =  dec_norm * enc #(batchsize,118,300)*(batchsize,118,300)=(batchsize,118,300)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/math_ops.py", line 894, in binary_op_wrapper
    return func(x, y, name=name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/math_ops.py", line 1117, in _mul_dispatch
    return gen_math_ops._mul(x, y, name=name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_math_ops.py", line 2726, in _mul
    "Mul", x=x, y=y, name=name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 2956, in create_op
    op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): Incompatible shapes: [1600,118,300] vs. [200,118,300]
	 [[Node: time_distributed_1/mul = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](time_distributed_1/Tile, encoder/transpose_1)]]
	 [[Node: loss/mul/_77 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_4737_loss/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]