Commit 88974072 authored by Xin Pan

Better usage for multi-gpu

Users must set num_gpus in config.py to the number of GPUs available.
Parent de683692
@@ -6,6 +6,9 @@ class TrainTaskConfig(object):
     # number of sequences contained in a mini-batch.
     batch_size = 32
+    # number of gpu devices
+    num_gpus = 4
+
     # the hyper params for Adam optimizer.
     learning_rate = 0.001
     beta1 = 0.9
...
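With `num_gpus` in TrainTaskConfig, every hard-coded batch dimension in the model becomes `batch_size / num_gpus`, i.e. the per-device share of the mini-batch. A minimal sketch of the intended setup (the divisibility check is illustrative, not part of this commit):

```python
from config import TrainTaskConfig

# num_gpus must match the number of devices actually visible to the process.
assert TrainTaskConfig.batch_size % TrainTaskConfig.num_gpus == 0, (
    "batch_size must be divisible by num_gpus so every device gets an "
    "equally sized shard of the mini-batch")

# This is the batch dimension each device sees inside ParallelDo.
per_device_batch = TrainTaskConfig.batch_size // TrainTaskConfig.num_gpus
print("sequences per GPU per step: %d" % per_device_batch)
```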
@@ -10,6 +10,7 @@ from config import TrainTaskConfig, input_data_names, pos_enc_param_names

 # FIXME(guosheng): Remove out the batch_size from the model.
 batch_size = TrainTaskConfig.batch_size
+num_gpus = TrainTaskConfig.num_gpus


 def position_encoding_init(n_position, d_pos_vec):
@@ -86,7 +87,8 @@ def multi_head_attention(queries,
         hidden_size = x.shape[-1]
         # FIXME(guosheng): Decouple the program desc with batch_size.
         reshaped = layers.reshape(
-            x=x, shape=[batch_size / 2, -1, n_head, hidden_size // n_head])
+            x=x, shape=[batch_size / num_gpus, -1, n_head,
+                        hidden_size // n_head])

         # permuate the dimensions into:
         # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
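For reference, the reshape/transpose pair above is the standard multi-head split; a plain NumPy sketch of the same shape arithmetic (illustrative only, the model itself uses fluid.layers and the per-device batch size):

```python
import numpy as np

per_device_batch, seq_len, d_model, n_head = 8, 16, 512, 8
x = np.zeros((per_device_batch, seq_len, d_model))

# [batch, seq, d_model] -> [batch, seq, n_head, d_model // n_head]
reshaped = x.reshape(per_device_batch, -1, n_head, d_model // n_head)
# permute into [batch, n_head, seq, d_model // n_head], as the comment above describes
split_heads = reshaped.transpose(0, 2, 1, 3)
assert split_heads.shape == (per_device_batch, n_head, seq_len, d_model // n_head)
```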
@@ -106,7 +108,8 @@ def multi_head_attention(queries,
         return layers.reshape(
             x=trans_x,
             shape=map(int,
-                      [batch_size / 2, -1, trans_x.shape[2] * trans_x.shape[3]]))
+                      [batch_size / num_gpus, -1,
+                       trans_x.shape[2] * trans_x.shape[3]]))


 def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
     """
@@ -233,7 +236,7 @@ def prepare_encoder(src_word,
     # FIXME(guosheng): Decouple the program desc with batch_size.
     enc_input = layers.reshape(x=enc_input,
-                               shape=[batch_size / 2, -1, src_emb_dim])
+                               shape=[batch_size / num_gpus, -1, src_emb_dim])
     return layers.dropout(
         enc_input, dropout_prob=dropout,
         is_test=False) if dropout else enc_input
@@ -465,7 +468,7 @@ def transformer(
         append_batch_size=False)

     places = fluid.layers.get_places()
-    pd = fluid.layers.ParallelDo(places, use_nccl=False)
+    pd = fluid.layers.ParallelDo(places, use_nccl=True)

     src_word = fluid.layers.reshape(x=src_word,
                                     shape=[batch_size, -1, 1])
...
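Switching to `use_nccl=True` lets ParallelDo aggregate gradients with NCCL instead of on the CPU, and the `batch_size / num_gpus` reshapes above account for each device only seeing its shard of the inputs. A rough sketch of the ParallelDo pattern in the Fluid API of this era; `build_net` and the variable names are placeholders, not the transformer code itself:

```python
import paddle.fluid as fluid

def parallel_loss(build_net, feed_vars):
    """Replicate build_net on every device and average the per-device losses.

    build_net(*shards) -> loss stands in for the real network builder;
    ParallelDo splits each feed variable along the batch dimension across
    the places returned by get_places().
    """
    places = fluid.layers.get_places()                  # one place per GPU
    pd = fluid.layers.ParallelDo(places, use_nccl=True)
    with pd.do():
        shards = [pd.read_input(v) for v in feed_vars]  # per-device slices
        loss = build_net(*shards)
        pd.write_output(loss)
    loss = pd()                                         # gathered device losses
    return fluid.layers.mean(x=loss)
```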
@@ -146,6 +146,8 @@ def main():
                           " cost = " + str(cost_val))
         return time.time() - t1

+    # with open('/tmp/program', 'w') as f:
+    #     f.write('%s' % fluid.framework.default_main_program())
     total_time = 0.0
     count = 0
     for pass_id in xrange(TrainTaskConfig.pass_num):
...
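The commented-out block above is a debugging aid: dumping the program desc shows how ParallelDo rewrote the main program. To enable it, uncomment it (the /tmp/program path is just an example and the dump can be large):

```python
# Dump the generated program desc to a file for inspection before training starts.
with open('/tmp/program', 'w') as f:
    f.write('%s' % fluid.framework.default_main_program())
```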