提交 8ac5d6f4 编写于 作者: D dangqingqing

Merge branch 'develop' of https://github.com/PaddlePaddle/models into mobilenet_ssd

......@@ -47,7 +47,9 @@ def main():
optimizer = fluid.optimizer.Adam(learning_rate=0.01)
optimizer.minimize(avg_cost)
accuracy = fluid.evaluator.Accuracy(input=logits, label=label)
batch_size = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=logits, label=label, total=batch_size)
BATCH_SIZE = 50
PASS_NUM = 3
......@@ -63,20 +65,22 @@ def main():
feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
exe.run(fluid.default_startup_program())
pass_acc = fluid.average.WeightedAverage()
for pass_id in range(PASS_NUM):
accuracy.reset(exe)
pass_acc.reset()
for data in train_reader():
loss, acc = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost] + accuracy.metrics)
pass_acc = accuracy.eval(exe)
print("pass_id=" + str(pass_id) + " acc=" + str(acc) + " pass_acc="
+ str(pass_acc))
loss, acc, b_size = exe.run(
fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost, batch_acc, batch_size])
pass_acc.add(value=acc, weight=b_size)
print("pass_id=" + str(pass_id) + " acc=" + str(acc[0]) +
" pass_acc=" + str(pass_acc.eval()[0]))
if loss < LOSS_THRESHOLD and pass_acc > ACC_THRESHOLD:
break
pass_acc = accuracy.eval(exe)
print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc))
print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc.eval()[
0]))
fluid.io.save_params(
exe, dirname='./mnist', main_program=fluid.default_main_program())
print('train mnist done')
......
......@@ -172,15 +172,16 @@ def train(learning_rate, batch_size, num_passes, model_save_dir='model'):
momentum=0.9,
regularization=fluid.regularizer.L2Decay(5 * 1e-5))
opts = optimizer.minimize(avg_cost)
accuracy = fluid.evaluator.Accuracy(input=out, label=label)
b_size_var = fluid.layers.create_tensor(dtype='int64')
b_acc_var = fluid.layers.accuracy(input=out, label=label, total=b_size_var)
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
test_accuracy = fluid.evaluator.Accuracy(input=out, label=label)
test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states
inference_program = fluid.io.get_inference_program(test_target)
inference_program = fluid.io.get_inference_program(
target_vars=[b_acc_var, b_size_var])
place = fluid.CUDAPlace(0)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
......@@ -190,24 +191,29 @@ def train(learning_rate, batch_size, num_passes, model_save_dir='model'):
paddle.dataset.flowers.test(), batch_size=batch_size)
feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
train_pass_acc_evaluator = fluid.average.WeightedAverage()
test_pass_acc_evaluator = fluid.average.WeightedAverage()
for pass_id in range(num_passes):
accuracy.reset(exe)
train_pass_acc_evaluator.reset()
for batch_id, data in enumerate(train_reader()):
loss, acc = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost] + accuracy.metrics)
loss, acc, size = exe.run(
fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost, b_acc_var, b_size_var])
train_pass_acc_evaluator.add(value=acc, weight=size)
print("Pass {0}, batch {1}, loss {2}, acc {3}".format(
pass_id, batch_id, loss[0], acc[0]))
pass_acc = accuracy.eval(exe)
test_accuracy.reset(exe)
test_pass_acc_evaluator.reset()
for data in test_reader():
loss, acc = exe.run(inference_program,
feed=feeder.feed(data),
fetch_list=[avg_cost] + test_accuracy.metrics)
test_pass_acc = test_accuracy.eval(exe)
loss, acc, size = exe.run(
inference_program,
feed=feeder.feed(data),
fetch_list=[avg_cost, b_acc_var, b_size_var])
test_pass_acc_evaluator.add(value=acc, weight=size)
print("End pass {0}, train_acc {1}, test_acc {2}".format(
pass_id, pass_acc, test_pass_acc))
pass_id,
train_pass_acc_evaluator.eval(), test_pass_acc_evaluator.eval()))
if pass_id % 10 == 0:
model_path = os.path.join(model_save_dir, str(pass_id))
print 'save models to %s' % (model_path)
......
from functools import partial
import numpy as np
import paddle.v2 as paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
......@@ -31,7 +30,7 @@ def multi_head_attention(queries,
d_key,
d_value,
d_model,
num_heads=1,
n_head=1,
dropout_rate=0.):
"""
Multi-Head Attention. Note that attn_bias is added to the logit before
......@@ -42,41 +41,53 @@ def multi_head_attention(queries,
raise ValueError(
"Inputs: quries, keys and values should all be 3-D tensors.")
def __compute_qkv(queries, keys, values, num_heads, d_key, d_value):
def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
"""
Add linear projection to queries, keys, and values.
"""
q = layers.fc(input=queries,
size=d_key * num_heads,
size=d_key * n_head,
param_attr=fluid.initializer.Xavier(
uniform=False,
fan_in=d_model * d_key,
fan_out=n_head * d_key),
bias_attr=False,
num_flatten_dims=2)
k = layers.fc(input=keys,
size=d_key * num_heads,
size=d_key * n_head,
param_attr=fluid.initializer.Xavier(
uniform=False,
fan_in=d_model * d_key,
fan_out=n_head * d_key),
bias_attr=False,
num_flatten_dims=2)
v = layers.fc(input=values,
size=d_value * num_heads,
size=d_value * n_head,
param_attr=fluid.initializer.Xavier(
uniform=False,
fan_in=d_model * d_value,
fan_out=n_head * d_value),
bias_attr=False,
num_flatten_dims=2)
return q, k, v
def __split_heads(x, num_heads):
def __split_heads(x, n_head):
"""
Reshape the last dimension of input tensor x so that it becomes two
dimensions and then transpose. Specifically, input a tensor with shape
[bs, max_sequence_length, num_heads * hidden_dim] then output a tensor
with shape [bs, num_heads, max_sequence_length, hidden_dim].
[bs, max_sequence_length, n_head * hidden_dim] then output a tensor
with shape [bs, n_head, max_sequence_length, hidden_dim].
"""
if num_heads == 1:
if n_head == 1:
return x
hidden_size = x.shape[-1]
# FIXME(guosheng): Decouple the program desc with batch_size.
reshaped = layers.reshape(
x=x, shape=[batch_size, -1, num_heads, hidden_size // num_heads])
x=x, shape=[batch_size, -1, n_head, hidden_size // n_head])
# permute the dimensions into:
# [batch_size, num_heads, max_sequence_len, hidden_size_per_head]
# [batch_size, n_head, max_sequence_len, hidden_size_per_head]
return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
def __combine_heads(x):
......@@ -95,7 +106,7 @@ def multi_head_attention(queries,
shape=map(int,
[batch_size, -1, trans_x.shape[2] * trans_x.shape[3]]))
def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
"""
Scaled Dot-Product Attention
"""
......@@ -114,7 +125,7 @@ def multi_head_attention(queries,
sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False)
return layers.elementwise_div(x=exp_out, y=sum_out, axis=0)
scaled_q = layers.scale(x=q, scale=d_key**-0.5)
scaled_q = layers.scale(x=q, scale=d_model**-0.5)
product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
weights = __softmax(layers.elementwise_add(x=product, y=attn_bias))
if dropout_rate:
......@@ -123,13 +134,13 @@ def multi_head_attention(queries,
out = layers.matmul(weights, v)
return out
q, k, v = __compute_qkv(queries, keys, values, num_heads, d_key, d_value)
q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
q = __split_heads(q, num_heads)
k = __split_heads(k, num_heads)
v = __split_heads(v, num_heads)
q = __split_heads(q, n_head)
k = __split_heads(k, n_head)
v = __split_heads(v, n_head)
ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,
ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_model,
dropout_rate)
out = __combine_heads(ctx_multiheads)
......@@ -137,6 +148,7 @@ def multi_head_attention(queries,
# Project back to the model size.
proj_out = layers.fc(input=out,
size=d_model,
param_attr=fluid.initializer.Xavier(uniform=False),
bias_attr=False,
num_flatten_dims=2)
return proj_out
......@@ -151,8 +163,14 @@ def positionwise_feed_forward(x, d_inner_hid, d_hid):
hidden = layers.fc(input=x,
size=d_inner_hid,
num_flatten_dims=2,
param_attr=fluid.initializer.Uniform(
low=-(d_hid**-0.5), high=(d_hid**-0.5)),
act="relu")
out = layers.fc(input=hidden, size=d_hid, num_flatten_dims=2)
out = layers.fc(input=hidden,
size=d_hid,
num_flatten_dims=2,
param_attr=fluid.initializer.Uniform(
low=-(d_inner_hid**-0.5), high=(d_inner_hid**-0.5)))
return out
......@@ -168,7 +186,11 @@ def pre_post_process_layer(prev_out, out, process_cmd, dropout=0.):
if cmd == "a": # add residual connection
out = out + prev_out if prev_out else out
elif cmd == "n": # add layer normalization
out = layers.layer_norm(out, begin_norm_axis=len(out.shape) - 1)
out = layers.layer_norm(
out,
begin_norm_axis=len(out.shape) - 1,
param_attr=fluid.initializer.Constant(1.),
bias_attr=fluid.initializer.Constant(0.))
elif cmd == "d": # add dropout
if dropout:
out = layers.dropout(out, dropout_prob=dropout, is_test=False)
......@@ -195,7 +217,10 @@ def prepare_encoder(src_word,
This module is used at the bottom of the encoder stacks.
"""
src_word_emb = layers.embedding(
src_word, size=[src_vocab_size, src_emb_dim], padding_idx=src_pad_idx)
src_word,
size=[src_vocab_size, src_emb_dim],
padding_idx=src_pad_idx,
param_attr=fluid.initializer.Normal(0., 1.))
src_pos_enc = layers.embedding(
src_pos,
size=[src_max_len, src_emb_dim],
......@@ -462,6 +487,7 @@ def transformer(
predict = layers.reshape(
x=layers.fc(input=dec_output,
size=trg_vocab_size,
param_attr=fluid.initializer.Xavier(uniform=False),
bias_attr=False,
num_flatten_dims=2),
shape=[-1, trg_vocab_size],
......
......@@ -115,7 +115,7 @@ def main():
paddle.reader.shuffle(
paddle.dataset.wmt16.train(ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size),
buf_size=51200),
buf_size=100000),
batch_size=TrainTaskConfig.batch_size)
# Initialize the parameters.
......@@ -143,7 +143,7 @@ def main():
fetch_list=[cost])
cost_val = np.array(outs[0])
print("pass_id = " + str(pass_id) + " batch = " + str(batch_id) +
" avg_cost = " + str(cost_val))
" cost = " + str(cost_val))
if __name__ == "__main__":
......
......@@ -89,12 +89,14 @@ def main(dict_path):
sgd_optimizer = fluid.optimizer.SGD(learning_rate=conf.learning_rate)
sgd_optimizer.minimize(avg_cost)
accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
batch_size_var = fluid.layers.create_tensor(dtype='int64')
batch_acc_var = fluid.layers.accuracy(
input=prediction, label=label, total=batch_size_var)
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
test_target = accuracy.metrics + accuracy.states
inference_program = fluid.io.get_inference_program(test_target)
inference_program = fluid.io.get_inference_program(
target_vars=[batch_acc_var, batch_size_var])
# The training data set.
train_reader = paddle.batch(
......@@ -119,31 +121,37 @@ def main(dict_path):
exe.run(fluid.default_startup_program())
train_pass_acc_evaluator = fluid.average.WeightedAverage()
test_pass_acc_evaluator = fluid.average.WeightedAverage()
def test(exe):
accuracy.reset(exe)
test_pass_acc_evaluator.reset()
for batch_id, data in enumerate(test_reader()):
input_seq = to_lodtensor(map(lambda x: x[0], data), place)
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([-1, 1])
acc = exe.run(inference_program,
feed={"words": input_seq,
"label": y_data})
test_acc = accuracy.eval(exe)
b_acc, b_size = exe.run(inference_program,
feed={"words": input_seq,
"label": y_data},
fetch_list=[batch_acc_var, batch_size_var])
test_pass_acc_evaluator.add(value=b_acc, weight=b_size)
test_acc = test_pass_acc_evaluator.eval()
return test_acc
total_time = 0.
for pass_id in xrange(conf.num_passes):
accuracy.reset(exe)
train_pass_acc_evaluator.reset()
start_time = time.time()
for batch_id, data in enumerate(train_reader()):
cost_val, acc_val = exe.run(
cost_val, acc_val, size_val = exe.run(
fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost, accuracy.metrics[0]])
pass_acc = accuracy.eval(exe)
fetch_list=[avg_cost, batch_acc_var, batch_size_var])
train_pass_acc_evaluator.add(value=acc_val, weight=size_val)
if batch_id and batch_id % conf.log_period == 0:
print("Pass id: %d, batch id: %d, cost: %f, pass_acc %f" %
(pass_id, batch_id, cost_val, pass_acc))
print("Pass id: %d, batch id: %d, cost: %f, pass_acc: %f" %
(pass_id, batch_id, cost_val,
train_pass_acc_evaluator.eval()))
end_time = time.time()
total_time += (end_time - start_time)
pass_test_acc = test(exe)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册