Commit 8ac5d6f4 authored by dangqingqing

Merge branch 'develop' of https://github.com/PaddlePaddle/models into mobilenet_ssd

@@ -47,7 +47,9 @@ def main():
     optimizer = fluid.optimizer.Adam(learning_rate=0.01)
     optimizer.minimize(avg_cost)
-    accuracy = fluid.evaluator.Accuracy(input=logits, label=label)
+    batch_size = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=logits, label=label, total=batch_size)

     BATCH_SIZE = 50
     PASS_NUM = 3
@@ -63,20 +65,22 @@ def main():
     feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
     exe.run(fluid.default_startup_program())

+    pass_acc = fluid.average.WeightedAverage()
     for pass_id in range(PASS_NUM):
-        accuracy.reset(exe)
+        pass_acc.reset()
         for data in train_reader():
-            loss, acc = exe.run(fluid.default_main_program(),
-                                feed=feeder.feed(data),
-                                fetch_list=[avg_cost] + accuracy.metrics)
-            pass_acc = accuracy.eval(exe)
-            print("pass_id=" + str(pass_id) + " acc=" + str(acc) + " pass_acc="
-                  + str(pass_acc))
+            loss, acc, b_size = exe.run(
+                fluid.default_main_program(),
+                feed=feeder.feed(data),
+                fetch_list=[avg_cost, batch_acc, batch_size])
+            pass_acc.add(value=acc, weight=b_size)
+            print("pass_id=" + str(pass_id) + " acc=" + str(acc[0]) +
+                  " pass_acc=" + str(pass_acc.eval()[0]))
             if loss < LOSS_THRESHOLD and pass_acc > ACC_THRESHOLD:
                 break
-        pass_acc = accuracy.eval(exe)
-        print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc))
+        print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc.eval()[
+            0]))
     fluid.io.save_params(
         exe, dirname='./mnist', main_program=fluid.default_main_program())
     print('train mnist done')
...
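Note: this hunk replaces the removed fluid.evaluator.Accuracy wrapper with a fluid.layers.accuracy op that also reports the batch size, then accumulates the pass-level metric on the host via fluid.average.WeightedAverage. A minimal sketch of the accumulation this relies on, assuming WeightedAverage behaves as a plain running weighted mean (which is all the code above needs):

    # Sketch of the pass-level bookkeeping, assuming WeightedAverage is a
    # running weighted mean; not Paddle's actual implementation.
    class WeightedAverage(object):
        def __init__(self):
            self.reset()

        def reset(self):
            self.numerator = 0.0    # sum of value * weight
            self.denominator = 0.0  # sum of weights (samples seen)

        def add(self, value, weight):
            self.numerator += float(value) * float(weight)
            self.denominator += float(weight)

        def eval(self):
            return self.numerator / self.denominator

    pass_acc = WeightedAverage()
    pass_acc.add(value=0.90, weight=50)  # batch 1: acc 0.90 over 50 samples
    pass_acc.add(value=0.80, weight=30)  # batch 2: acc 0.80 over 30 samples
    print(pass_acc.eval())  # 0.8625, not the naive batch mean 0.85

The weighting matters because the last batch of a pass is usually smaller than BATCH_SIZE; an unweighted mean of per-batch accuracies would skew the pass accuracy.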
@@ -172,15 +172,16 @@ def train(learning_rate, batch_size, num_passes, model_save_dir='model'):
         momentum=0.9,
         regularization=fluid.regularizer.L2Decay(5 * 1e-5))
     opts = optimizer.minimize(avg_cost)
-    accuracy = fluid.evaluator.Accuracy(input=out, label=label)
+    b_size_var = fluid.layers.create_tensor(dtype='int64')
+    b_acc_var = fluid.layers.accuracy(input=out, label=label, total=b_size_var)

     inference_program = fluid.default_main_program().clone()
     with fluid.program_guard(inference_program):
-        test_accuracy = fluid.evaluator.Accuracy(input=out, label=label)
-        test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states
-        inference_program = fluid.io.get_inference_program(test_target)
+        inference_program = fluid.io.get_inference_program(
+            target_vars=[b_acc_var, b_size_var])

-    place = fluid.CUDAPlace(0)
+    place = fluid.CPUPlace()
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
@@ -190,24 +191,29 @@ def train(learning_rate, batch_size, num_passes, model_save_dir='model'):
         paddle.dataset.flowers.test(), batch_size=batch_size)
     feeder = fluid.DataFeeder(place=place, feed_list=[image, label])

+    train_pass_acc_evaluator = fluid.average.WeightedAverage()
+    test_pass_acc_evaluator = fluid.average.WeightedAverage()
     for pass_id in range(num_passes):
-        accuracy.reset(exe)
+        train_pass_acc_evaluator.reset()
         for batch_id, data in enumerate(train_reader()):
-            loss, acc = exe.run(fluid.default_main_program(),
-                                feed=feeder.feed(data),
-                                fetch_list=[avg_cost] + accuracy.metrics)
+            loss, acc, size = exe.run(
+                fluid.default_main_program(),
+                feed=feeder.feed(data),
+                fetch_list=[avg_cost, b_acc_var, b_size_var])
+            train_pass_acc_evaluator.add(value=acc, weight=size)
             print("Pass {0}, batch {1}, loss {2}, acc {3}".format(
                 pass_id, batch_id, loss[0], acc[0]))
-        pass_acc = accuracy.eval(exe)
-        test_accuracy.reset(exe)
+        test_pass_acc_evaluator.reset()
         for data in test_reader():
-            loss, acc = exe.run(inference_program,
-                                feed=feeder.feed(data),
-                                fetch_list=[avg_cost] + test_accuracy.metrics)
-        test_pass_acc = test_accuracy.eval(exe)
+            loss, acc, size = exe.run(
+                inference_program,
+                feed=feeder.feed(data),
+                fetch_list=[avg_cost, b_acc_var, b_size_var])
+            test_pass_acc_evaluator.add(value=acc, weight=size)
         print("End pass {0}, train_acc {1}, test_acc {2}".format(
-            pass_id, pass_acc, test_pass_acc))
+            pass_id,
+            train_pass_acc_evaluator.eval(), test_pass_acc_evaluator.eval()))
         if pass_id % 10 == 0:
             model_path = os.path.join(model_save_dir, str(pass_id))
             print 'save models to %s' % (model_path)
...
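Note: besides the same evaluator-to-layers migration as in the MNIST hunk, the test program here is derived by cloning the training program and pruning the clone to the accuracy targets. A self-contained sketch of that clone-and-prune pattern; the one-layer fc classifier and shapes are placeholders, not the SE-ResNeXt network from the file:

    # Sketch of the train/test program split used above; only the pruning
    # pattern is the point, the model is a stand-in.
    import paddle.fluid as fluid

    image = fluid.layers.data(name='image', shape=[3, 224, 224], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    out = fluid.layers.fc(input=image, size=102, act='softmax')  # placeholder model
    avg_cost = fluid.layers.mean(
        x=fluid.layers.cross_entropy(input=out, label=label))

    b_size_var = fluid.layers.create_tensor(dtype='int64')
    b_acc_var = fluid.layers.accuracy(input=out, label=label, total=b_size_var)

    # Clone the graph, then prune the clone down to the evaluation targets so
    # the test pass executes only the forward ops those targets need.
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
            target_vars=[b_acc_var, b_size_var])

Cloning before pruning keeps the training program untouched, so the same build of the network can be run with or without the optimizer ops.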
 from functools import partial
 import numpy as np
-import paddle.v2 as paddle
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
@@ -31,7 +30,7 @@ def multi_head_attention(queries,
                          d_key,
                          d_value,
                          d_model,
-                         num_heads=1,
+                         n_head=1,
                          dropout_rate=0.):
     """
     Multi-Head Attention. Note that attn_bias is added to the logit before
@@ -42,41 +41,53 @@ def multi_head_attention(queries,
         raise ValueError(
             "Inputs: quries, keys and values should all be 3-D tensors.")

-    def __compute_qkv(queries, keys, values, num_heads, d_key, d_value):
+    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
         """
         Add linear projection to queries, keys, and values.
         """
         q = layers.fc(input=queries,
-                      size=d_key * num_heads,
+                      size=d_key * n_head,
+                      param_attr=fluid.initializer.Xavier(
+                          uniform=False,
+                          fan_in=d_model * d_key,
+                          fan_out=n_head * d_key),
                       bias_attr=False,
                       num_flatten_dims=2)
         k = layers.fc(input=keys,
-                      size=d_key * num_heads,
+                      size=d_key * n_head,
+                      param_attr=fluid.initializer.Xavier(
+                          uniform=False,
+                          fan_in=d_model * d_key,
+                          fan_out=n_head * d_key),
                       bias_attr=False,
                       num_flatten_dims=2)
         v = layers.fc(input=values,
-                      size=d_value * num_heads,
+                      size=d_value * n_head,
+                      param_attr=fluid.initializer.Xavier(
+                          uniform=False,
+                          fan_in=d_model * d_value,
+                          fan_out=n_head * d_value),
                       bias_attr=False,
                       num_flatten_dims=2)
         return q, k, v

-    def __split_heads(x, num_heads):
+    def __split_heads(x, n_head):
         """
         Reshape the last dimension of inpunt tensor x so that it becomes two
         dimensions and then transpose. Specifically, input a tensor with shape
-        [bs, max_sequence_length, num_heads * hidden_dim] then output a tensor
-        with shape [bs, num_heads, max_sequence_length, hidden_dim].
+        [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
+        with shape [bs, n_head, max_sequence_length, hidden_dim].
         """
-        if num_heads == 1:
+        if n_head == 1:
             return x

         hidden_size = x.shape[-1]
         # FIXME(guosheng): Decouple the program desc with batch_size.
         reshaped = layers.reshape(
-            x=x, shape=[batch_size, -1, num_heads, hidden_size // num_heads])
+            x=x, shape=[batch_size, -1, n_head, hidden_size // n_head])

         # permuate the dimensions into:
-        # [batch_size, num_heads, max_sequence_len, hidden_size_per_head]
+        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
         return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])

     def __combine_heads(x):
@@ -95,7 +106,7 @@ def multi_head_attention(queries,
             shape=map(int,
                       [batch_size, -1, trans_x.shape[2] * trans_x.shape[3]]))

-    def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
+    def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
         """
         Scaled Dot-Product Attention
         """
@@ -114,7 +125,7 @@ def multi_head_attention(queries,
             sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False)
             return layers.elementwise_div(x=exp_out, y=sum_out, axis=0)

-        scaled_q = layers.scale(x=q, scale=d_key**-0.5)
+        scaled_q = layers.scale(x=q, scale=d_model**-0.5)
         product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
         weights = __softmax(layers.elementwise_add(x=product, y=attn_bias))
         if dropout_rate:
@@ -123,13 +134,13 @@ def multi_head_attention(queries,
         out = layers.matmul(weights, v)
         return out

-    q, k, v = __compute_qkv(queries, keys, values, num_heads, d_key, d_value)
+    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)

-    q = __split_heads(q, num_heads)
-    k = __split_heads(k, num_heads)
-    v = __split_heads(v, num_heads)
+    q = __split_heads(q, n_head)
+    k = __split_heads(k, n_head)
+    v = __split_heads(v, n_head)

-    ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,
+    ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_model,
                                                   dropout_rate)

     out = __combine_heads(ctx_multiheads)
@@ -137,6 +148,7 @@ def multi_head_attention(queries,
     # Project back to the model size.
     proj_out = layers.fc(input=out,
                          size=d_model,
+                         param_attr=fluid.initializer.Xavier(uniform=False),
                          bias_attr=False,
                          num_flatten_dims=2)
     return proj_out
@@ -151,8 +163,14 @@ def positionwise_feed_forward(x, d_inner_hid, d_hid):
     hidden = layers.fc(input=x,
                        size=d_inner_hid,
                        num_flatten_dims=2,
+                       param_attr=fluid.initializer.Uniform(
+                           low=-(d_hid**-0.5), high=(d_hid**-0.5)),
                        act="relu")
-    out = layers.fc(input=hidden, size=d_hid, num_flatten_dims=2)
+    out = layers.fc(input=hidden,
+                    size=d_hid,
+                    num_flatten_dims=2,
+                    param_attr=fluid.initializer.Uniform(
+                        low=-(d_inner_hid**-0.5), high=(d_inner_hid**-0.5)))
     return out
@@ -168,7 +186,11 @@ def pre_post_process_layer(prev_out, out, process_cmd, dropout=0.):
         if cmd == "a":  # add residual connection
             out = out + prev_out if prev_out else out
         elif cmd == "n":  # add layer normalization
-            out = layers.layer_norm(out, begin_norm_axis=len(out.shape) - 1)
+            out = layers.layer_norm(
+                out,
+                begin_norm_axis=len(out.shape) - 1,
+                param_attr=fluid.initializer.Constant(1.),
+                bias_attr=fluid.initializer.Constant(0.))
         elif cmd == "d":  # add dropout
             if dropout:
                 out = layers.dropout(out, dropout_prob=dropout, is_test=False)
@@ -195,7 +217,10 @@ def prepare_encoder(src_word,
     This module is used at the bottom of the encoder stacks.
     """
     src_word_emb = layers.embedding(
-        src_word, size=[src_vocab_size, src_emb_dim], padding_idx=src_pad_idx)
+        src_word,
+        size=[src_vocab_size, src_emb_dim],
+        padding_idx=src_pad_idx,
+        param_attr=fluid.initializer.Normal(0., 1.))
     src_pos_enc = layers.embedding(
         src_pos,
         size=[src_max_len, src_emb_dim],
@@ -462,6 +487,7 @@ def transformer(
     predict = layers.reshape(
         x=layers.fc(input=dec_output,
                     size=trg_vocab_size,
+                    param_attr=fluid.initializer.Xavier(uniform=False),
                     bias_attr=False,
                     num_flatten_dims=2),
         shape=[-1, trg_vocab_size],
...
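Note: the transformer hunks mainly rename num_heads to n_head and attach explicit initializers. In the Q/K/V projections, fan_in and fan_out are passed to fluid.initializer.Xavier explicitly instead of being inferred from the [d_model, n_head * d_key] weight shape. A small sketch of what that override does to the sampled distribution, assuming the usual Glorot-normal convention std = sqrt(2 / (fan_in + fan_out)) for uniform=False; the hyperparameter values are illustrative, not from the diff:

    import math

    # Illustrative sizes; the diff leaves d_model, d_key, n_head to the config.
    d_model, d_key, n_head = 512, 64, 8

    # Fans inferred from the weight shape [d_model, n_head * d_key]:
    default_std = math.sqrt(2.0 / (d_model + n_head * d_key))

    # Fans as overridden in the diff:
    # fan_in=d_model * d_key, fan_out=n_head * d_key
    override_std = math.sqrt(2.0 / (d_model * d_key + n_head * d_key))

    print(default_std)   # ~0.0442
    print(override_std)  # ~0.0078, a much smaller initialization scale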
@@ -115,7 +115,7 @@ def main():
         paddle.reader.shuffle(
             paddle.dataset.wmt16.train(ModelHyperParams.src_vocab_size,
                                        ModelHyperParams.trg_vocab_size),
-            buf_size=51200),
+            buf_size=100000),
         batch_size=TrainTaskConfig.batch_size)

     # Initialize the parameters.
@@ -143,7 +143,7 @@ def main():
                            fetch_list=[cost])
             cost_val = np.array(outs[0])
             print("pass_id = " + str(pass_id) + " batch = " + str(batch_id) +
-                  " avg_cost = " + str(cost_val))
+                  " cost = " + str(cost_val))

 if __name__ == "__main__":
...
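Note: buf_size is the size of the in-memory buffer that paddle.reader.shuffle fills before shuffling, so raising it from 51200 to 100000 mixes the WMT16 stream more thoroughly at the cost of memory. A behavioral sketch of that buffered shuffle (not Paddle's actual implementation):

    import random

    # Buffered shuffle: fill a buffer of buf_size samples, shuffle it,
    # yield everything, repeat for the next buffer.
    def shuffle(reader, buf_size):
        def shuffled_reader():
            buf = []
            for sample in reader():
                buf.append(sample)
                if len(buf) >= buf_size:
                    random.shuffle(buf)
                    for s in buf:
                        yield s
                    buf = []
            random.shuffle(buf)  # flush the final partial buffer
            for s in buf:
                yield s
        return shuffled_reader

    toy = lambda: iter(range(10))
    print(list(shuffle(toy, buf_size=4)()))  # locally shuffled 0..9

A buf_size at least as large as the dataset gives a full shuffle; smaller values only mix samples within each buffer.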
@@ -89,12 +89,14 @@ def main(dict_path):
     sgd_optimizer = fluid.optimizer.SGD(learning_rate=conf.learning_rate)
     sgd_optimizer.minimize(avg_cost)

-    accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
+    batch_size_var = fluid.layers.create_tensor(dtype='int64')
+    batch_acc_var = fluid.layers.accuracy(
+        input=prediction, label=label, total=batch_size_var)

     inference_program = fluid.default_main_program().clone()
     with fluid.program_guard(inference_program):
-        test_target = accuracy.metrics + accuracy.states
-        inference_program = fluid.io.get_inference_program(test_target)
+        inference_program = fluid.io.get_inference_program(
+            target_vars=[batch_acc_var, batch_size_var])

     # The training data set.
     train_reader = paddle.batch(
@@ -119,31 +121,37 @@ def main(dict_path):
     exe.run(fluid.default_startup_program())

+    train_pass_acc_evaluator = fluid.average.WeightedAverage()
+    test_pass_acc_evaluator = fluid.average.WeightedAverage()
+
     def test(exe):
-        accuracy.reset(exe)
+        test_pass_acc_evaluator.reset()
         for batch_id, data in enumerate(test_reader()):
             input_seq = to_lodtensor(map(lambda x: x[0], data), place)
             y_data = np.array(map(lambda x: x[1], data)).astype("int64")
             y_data = y_data.reshape([-1, 1])
-            acc = exe.run(inference_program,
-                          feed={"words": input_seq,
-                                "label": y_data})
-        test_acc = accuracy.eval(exe)
+            b_acc, b_size = exe.run(inference_program,
+                                    feed={"words": input_seq,
+                                          "label": y_data},
+                                    fetch_list=[batch_acc_var, batch_size_var])
+            test_pass_acc_evaluator.add(value=b_acc, weight=b_size)
+        test_acc = test_pass_acc_evaluator.eval()
         return test_acc

     total_time = 0.
     for pass_id in xrange(conf.num_passes):
-        accuracy.reset(exe)
+        train_pass_acc_evaluator.reset()
         start_time = time.time()
         for batch_id, data in enumerate(train_reader()):
-            cost_val, acc_val = exe.run(
+            cost_val, acc_val, size_val = exe.run(
                 fluid.default_main_program(),
                 feed=feeder.feed(data),
-                fetch_list=[avg_cost, accuracy.metrics[0]])
-            pass_acc = accuracy.eval(exe)
+                fetch_list=[avg_cost, batch_acc_var, batch_size_var])
+            train_pass_acc_evaluator.add(value=acc_val, weight=size_val)
             if batch_id and batch_id % conf.log_period == 0:
-                print("Pass id: %d, batch id: %d, cost: %f, pass_acc %f" %
-                      (pass_id, batch_id, cost_val, pass_acc))
+                print("Pass id: %d, batch id: %d, cost: %f, pass_acc: %f" %
+                      (pass_id, batch_id, cost_val,
+                       train_pass_acc_evaluator.eval()))
         end_time = time.time()
         total_time += (end_time - start_time)
         pass_test_acc = test(exe)
...
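Note: the rewritten test function feeds input_seq built by to_lodtensor, a helper defined outside this diff. For reference, a typical definition from Fluid examples of the same era (an assumption, not part of this commit): variable-length sequences are flattened into one int64 tensor described by cumulative LoD offsets:

    import numpy as np
    import paddle.fluid as fluid

    # Assumed helper, not shown in this diff: pack a list of variable-length
    # int sequences into a single LoDTensor on the given place.
    def to_lodtensor(data, place):
        seq_lens = [len(seq) for seq in data]
        cur_len = 0
        lod = [cur_len]
        for l in seq_lens:
            cur_len += l
            lod.append(cur_len)  # cumulative offsets, e.g. [0, 3, 5, ...]
        flattened_data = np.concatenate(data, axis=0).astype("int64")
        flattened_data = flattened_data.reshape([len(flattened_data), 1])
        res = fluid.LoDTensor()
        res.set(flattened_data, place)
        res.set_lod([lod])
        return res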