Unverified · Commit c0539d13 · authored by Bai Yifan, committed by GitHub

Merge branch 'develop' into ce

...@@ -24,15 +24,10 @@ def calc_diff(f1, f2):
    #print d2.shape
    #print d1[0, 0, 0:10, 0:10]
    #print d2[0, 0, 0:10, 0:10]

    d1 = d1.flatten()
    d2 = d2.flatten()

    d1_num = reduce(lambda x, y: x * y, d1.shape)
    d2_num = reduce(lambda x, y: x * y, d2.shape)
    if d1_num != d2_num:
...@@ -41,7 +36,11 @@ def calc_diff(f1, f2):
    assert (d1_num == d2_num), "their shape is not consistent"

    try:
        mask = np.abs(d1) >= np.abs(d2)
        mask = mask.astype('int32')
        df = np.abs(d1 - d2)
        df = df / (1.0e-10 + np.abs(d1) * mask + np.abs(d2) * (1 - mask))
        max_df = np.max(df)
        sq_df = np.mean(df * df)
        return max_df, sq_df
......
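The masked denominator above normalizes each element-wise difference by the larger of the two magnitudes, so the metric is symmetric in d1 and d2 and stays at or below 1 whenever the compared values share a sign. A minimal standalone sketch of the same computation, assuming NumPy arrays of equal size (relative_diff is an illustrative name, not part of this commit):

import numpy as np

def relative_diff(d1, d2, eps=1.0e-10):
    # Normalize |d1 - d2| by the larger magnitude at each position,
    # mirroring the mask trick in calc_diff above.
    d1, d2 = d1.flatten(), d2.flatten()
    mask = (np.abs(d1) >= np.abs(d2)).astype('int32')
    df = np.abs(d1 - d2) / (eps + np.abs(d1) * mask + np.abs(d2) * (1 - mask))
    return np.max(df), np.mean(df * df)

a = np.array([1.0, -2.0, 3.0])
print(relative_diff(a, a))         # identical tensors -> (0.0, 0.0)
print(relative_diff(a, a * 1.01))  # ~1% perturbation -> max_df ~= 0.0099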
...@@ -39,6 +39,7 @@ LAYER_DESCRIPTORS = {
    'Pooling': shape_pool,
    'Power': shape_identity,
    'ReLU': shape_identity,
    'PReLU': shape_identity,
    'Scale': shape_identity,
    'Sigmoid': shape_identity,
    'SigmoidCrossEntropyLoss': shape_scalar,
......
...@@ -240,10 +240,16 @@ class Network(object):
    @layer
    def relu(self, input, name):
        fluid = import_fluid()
        output = fluid.layers.relu(input)
        return output

    @layer
    def prelu(self, input, channel_shared, name):
        #fluid = import_fluid()
        #output = fluid.layers.relu(input)
        #return output
        raise NotImplementedError('prelu not implemented')
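    # A possible future body for the stub above, assuming the target Fluid
    # version ships a PReLU operator; fluid.layers.prelu and its mode argument
    # are assumptions, not something this commit provides, hence the
    # NotImplementedError and the commented-out sketch:
    #
    #     # 'all' shares one slope across channels; 'channel' learns one per channel
    #     mode = 'all' if channel_shared else 'channel'
    #     return fluid.layers.prelu(input, mode=mode)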
    def pool(self, pool_type, input, k_h, k_w, s_h, s_w, ceil_mode, padding,
             name):
        # Get the number of channels in the input
...@@ -382,7 +388,8 @@ class Network(object):
                            name,
                            scale_offset=True,
                            eps=1e-5,
                            relu=False,
                            relu_negative_slope=0.0):
        # NOTE: Currently, only inference is supported
        fluid = import_fluid()
        prefix = name + '_'
...@@ -392,6 +399,15 @@ class Network(object):
            name=prefix + 'offset')
        mean_name = prefix + 'mean'
        variance_name = prefix + 'variance'

        leaky_relu = False
        act = 'relu'
        if relu is False:
            act = None
        elif relu_negative_slope != 0.0:
            leaky_relu = True
            act = None

        output = fluid.layers.batch_norm(
            name=self.get_unique_output_name(name, 'batch_norm'),
            input=input,
...@@ -401,7 +417,10 @@ class Network(object):
            moving_mean_name=mean_name,
            moving_variance_name=variance_name,
            epsilon=eps,
            act=act)
        if leaky_relu:
            output = fluid.layers.leaky_relu(output, alpha=relu_negative_slope)

        return output
......
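The activation bookkeeping in batch_normalization collapses to three cases: no activation, a plain relu fused into batch_norm, or a separate leaky_relu applied afterwards, since batch_norm's act string carries no negative-slope parameter. A small standalone sketch of the same decision logic (resolve_bn_act is an illustrative helper, not part of this commit):

def resolve_bn_act(relu, relu_negative_slope):
    # Returns (act to pass to batch_norm, whether to apply leaky_relu after).
    if not relu:
        return None, False
    if relu_negative_slope != 0.0:
        return None, True
    return 'relu', False

assert resolve_bn_act(False, 0.0) == (None, False)
assert resolve_bn_act(True, 0.0) == ('relu', False)
assert resolve_bn_act(True, 0.1) == (None, True)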
...@@ -112,6 +112,13 @@ class PaddleMapper(NodeMapper):
    def map_relu(self, node):
        return PaddleNode('relu')

    def map_prelu(self, node):
        channel_shared = getattr(node.parameters, 'channel_shared', False)
        return PaddleNode('prelu', channel_shared)

    def map_tanh(self, node):
        return PaddleNode('tanh')

    def map_pooling(self, node):
        pool_type = node.parameters.pool
        if pool_type == 0:
......
#!/bin/bash
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1

cudaid=${language_model:=0}  # use card 0 by default
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train.py | python _ce.py

cudaid=${language_model_m:=0,1,2,3}  # use cards 0,1,2,3 by default
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train.py | python _ce.py
# This file is only used for the continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi
from kpi import DurationKpi
imikolov_20_avg_ppl_kpi = CostKpi('imikolov_20_avg_ppl', 0.2, 0)
imikolov_20_pass_duration_kpi = DurationKpi(
'imikolov_20_pass_duration', 0.02, 0, actived=True)
imikolov_20_avg_ppl_kpi_card4 = CostKpi('imikolov_20_avg_ppl_card4', 0.2, 0)
imikolov_20_pass_duration_kpi_card4 = DurationKpi(
'imikolov_20_pass_duration_card4', 0.03, 0, actived=True)
tracking_kpis = [
imikolov_20_avg_ppl_kpi,
imikolov_20_pass_duration_kpi,
imikolov_20_avg_ppl_kpi_card4,
imikolov_20_pass_duration_kpi_card4,
]
def parse_log(log):
    '''
    This method should be implemented by model developers.
    Suggestion: emit one KPI per line, tab-separated as "kpis\t<name>\t<value>",
    for example:
    "
    kpis\ttrain_cost\t1.0
    kpis\ttest_cost\t1.0
    kpis\ttrain_acc\t1.2
    "
    '''
    for line in log.split('\n'):
        fs = line.strip().split('\t')
        print(fs)
        if len(fs) == 3 and fs[0] == 'kpis':
            kpi_name = fs[1]
            kpi_value = float(fs[2])
            yield kpi_name, kpi_value
def log_to_ce(log):
kpi_tracker = {}
for kpi in tracking_kpis:
kpi_tracker[kpi.name] = kpi
for (kpi_name, kpi_value) in parse_log(log):
print(kpi_name, kpi_value)
kpi_tracker[kpi_name].add_record(kpi_value)
kpi_tracker[kpi_name].persist()
if __name__ == '__main__':
log = sys.stdin.read()
log_to_ce(log)
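For illustration, a hedged usage sketch of the parser above; sample_log is made up, but its lines follow the kpis\t<name>\t<value> format that train.py emits under --enable_ce:

sample_log = ("kpis\timikolov_20_avg_ppl\t41.2\n"
              "kpis\timikolov_20_pass_duration\t123.4\n")
for name, value in parse_log(sample_log):
    print(name, value)
# yields ('imikolov_20_avg_ppl', 41.2), then ('imikolov_20_pass_duration', 123.4)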
import os
import sys
import time
import numpy as np
import math
import argparse

import paddle.fluid as fluid
import paddle
import utils
SEED = 102
def parse_args():
parser = argparse.ArgumentParser("language_model benchmark.")
parser.add_argument(
'--enable_ce',
action='store_true',
        help='If set, run the task with continuous evaluation logs.')
args = parser.parse_args()
return args
def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound):
    """ network definition """
...@@ -63,31 +77,26 @@ def train(train_reader,
          init_low_bound=-0.04,
          init_high_bound=0.04):
    """ train network """
    args = parse_args()
    if args.enable_ce:
        # The random seed must be set before configuring the network.
        fluid.default_startup_program().random_seed = SEED
    vocab_size = len(vocab)

    # Input data
    src_wordseq = fluid.layers.data(
        name="src_wordseq", shape=[1], dtype="int64", lod_level=1)
    dst_wordseq = fluid.layers.data(
        name="dst_wordseq", shape=[1], dtype="int64", lod_level=1)

    # Train program
    avg_cost = None
    cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
                   init_low_bound, init_high_bound)
    avg_cost = fluid.layers.mean(x=cost)
    # Optimization to minimize loss
    sgd_optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=base_lr,
...@@ -96,39 +105,56 @@ def train(train_reader,
            staircase=True))
    sgd_optimizer.minimize(avg_cost)

    # Initialize executor
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    train_exe = fluid.ParallelExecutor(use_cuda=use_cuda, loss_name=avg_cost.name)
    total_time = 0.0
    fetch_list = [avg_cost.name]
    for pass_idx in xrange(pass_num):
        epoch_idx = pass_idx + 1
        print "epoch_%d start" % epoch_idx

        t0 = time.time()
        i = 0
        newest_ppl = 0
        for data in train_reader():
            i += 1
            lod_src_wordseq = utils.to_lodtensor(
                map(lambda x: x[0], data), place)
            lod_dst_wordseq = utils.to_lodtensor(
                map(lambda x: x[1], data), place)
            ret_avg_cost = train_exe.run(feed={
                "src_wordseq": lod_src_wordseq,
                "dst_wordseq": lod_dst_wordseq
            },
                                         fetch_list=fetch_list)
            avg_ppl = np.exp(ret_avg_cost[0])
            newest_ppl = np.mean(avg_ppl)
            if i % 100 == 0:
                print "step:%d ppl:%.3f" % (i, newest_ppl)
        t1 = time.time()
        total_time += t1 - t0
        print "epoch:%d num_steps:%d time_cost(s):%f" % (epoch_idx, i,
                                                         total_time / epoch_idx)
        if pass_idx == pass_num - 1 and args.enable_ce:
            # Note: the following logs are used only for CE monitoring;
            # other runs do not need to care about them.
            gpu_num = get_cards(args.enable_ce)
            if gpu_num == 1:
                print("kpis\timikolov_20_pass_duration\t%s" %
                      (total_time / epoch_idx))
                print("kpis\timikolov_20_avg_ppl\t%s" % newest_ppl)
            else:
                print("kpis\timikolov_20_pass_duration_card%s\t%s" %
                      (gpu_num, total_time / epoch_idx))
                print("kpis\timikolov_20_avg_ppl_card%s\t%s" %
                      (gpu_num, newest_ppl))
save_dir = "%s/epoch_%d" % (model_dir, epoch_idx) save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
feed_var_names = ["src_wordseq", "dst_wordseq"] feed_var_names = ["src_wordseq", "dst_wordseq"]
fetch_vars = [avg_cost] fetch_vars = [avg_cost]
...@@ -138,11 +164,22 @@ def train(train_reader,
    print("finish training")
def get_cards(enable_ce):
if enable_ce:
cards = os.environ.get('CUDA_VISIBLE_DEVICES')
num = len(cards.split(","))
return num
else:
return fluid.core.get_cuda_device_count()
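# Under CE the card count is parsed from CUDA_VISIBLE_DEVICES, which run.xsh
# exports, so the KPI suffix matches the device set; otherwise the CUDA device
# count is queried. A sketch of the expected behavior (the env value below is
# illustrative):
#
#   os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
#   assert get_cards(enable_ce=True) == 4
#   # get_cards(enable_ce=False) asks fluid.core.get_cuda_device_count() instead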
def train_net():
    """ do training """
    batch_size = 20
    args = parse_args()
    vocab, train_reader, test_reader = utils.prepare_data(
        batch_size=batch_size * get_cards(args.enable_ce), buffer_size=1000, \
        word_freq_threshold=0, enable_ce=args.enable_ce)
    train(
        train_reader=train_reader,
        vocab=vocab,
...@@ -152,7 +189,7 @@ def train_net():
        batch_size=batch_size,
        pass_num=12,
        use_cuda=True,
        parallel=True,
        model_dir="model",
        init_low_bound=-0.1,
        init_high_bound=0.1)
......
...@@ -3,7 +3,7 @@ import time
import numpy as np

import paddle.fluid as fluid
import paddle


def to_lodtensor(data, place):
...@@ -22,17 +22,28 @@ def to_lodtensor(data, place):
    return res
def prepare_data(batch_size,
                 buffer_size=1000,
                 word_freq_threshold=0,
                 enable_ce=False):
    """ prepare the English Penn Treebank (PTB) data """
    vocab = paddle.dataset.imikolov.build_dict(word_freq_threshold)
    if enable_ce:
        train_reader = paddle.batch(
            paddle.dataset.imikolov.train(
                vocab,
                buffer_size,
                data_type=paddle.dataset.imikolov.DataType.SEQ),
            batch_size)
    else:
        train_reader = paddle.batch(
            paddle.reader.shuffle(
                paddle.dataset.imikolov.train(
                    vocab,
                    buffer_size,
                    data_type=paddle.dataset.imikolov.DataType.SEQ),
                buf_size=buffer_size),
            batch_size)
    test_reader = paddle.batch(
        paddle.dataset.imikolov.test(
            vocab, buffer_size, data_type=paddle.dataset.imikolov.DataType.SEQ),
......
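A brief usage note: with enable_ce=True the paddle.reader.shuffle wrapper is skipped, so samples arrive in a fixed order and the logged KPIs stay reproducible across runs. A hypothetical call:

vocab, train_reader, test_reader = prepare_data(
    batch_size=20, buffer_size=1000, word_freq_threshold=0, enable_ce=True)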
#!/bin/bash
# This file is only used for continuous evaluation.
model_file='train.py'
python $model_file --pass_num 1 --learning_rate 0.001 --save_interval 10 --enable_ce | python _ce.py
# This file is only used for the continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE: kpi.py should be shared across models in some way!!!!
train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True)
test_cost_kpi = CostKpi('test_cost', 0.005, 0, actived=True)
train_duration_kpi = DurationKpi('train_duration', 0.06, 0, actived=True)
tracking_kpis = [
train_cost_kpi,
test_cost_kpi,
train_duration_kpi,
]
def parse_log(log):
    '''
    This method should be implemented by model developers.
    Suggestion: emit one KPI per line, tab-separated as "kpis\t<name>\t<value>",
    for example:
    "
    kpis\ttrain_cost\t1.0
    kpis\ttest_cost\t1.0
    kpis\ttrain_acc\t1.2
    "
    '''
    for line in log.split('\n'):
        fs = line.strip().split('\t')
        print(fs)
        if len(fs) == 3 and fs[0] == 'kpis':
            print("-----%s" % fs)
            kpi_name = fs[1]
            kpi_value = float(fs[2])
            yield kpi_name, kpi_value
def log_to_ce(log):
kpi_tracker = {}
for kpi in tracking_kpis:
kpi_tracker[kpi.name] = kpi
for (kpi_name, kpi_value) in parse_log(log):
print(kpi_name, kpi_value)
kpi_tracker[kpi_name].add_record(kpi_value)
kpi_tracker[kpi_name].persist()
if __name__ == '__main__':
log = sys.stdin.read()
print("*****")
print(log)
print("****")
log_to_ce(log)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import distutils.util
def parse_args():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--embedding_dim",
type=int,
default=512,
help="The dimension of embedding table. (default: %(default)d)")
parser.add_argument(
"--encoder_size",
type=int,
default=512,
help="The size of encoder bi-rnn unit. (default: %(default)d)")
parser.add_argument(
"--decoder_size",
type=int,
default=512,
help="The size of decoder rnn unit. (default: %(default)d)")
parser.add_argument(
"--batch_size",
type=int,
default=32,
help="The sequence number of a mini-batch data. (default: %(default)d)")
parser.add_argument(
"--dict_size",
type=int,
default=30000,
help="The dictionary capacity. Dictionaries of source sequence and "
"target dictionary have same capacity. (default: %(default)d)")
parser.add_argument(
"--pass_num",
type=int,
default=5,
help="The pass number to train. (default: %(default)d)")
parser.add_argument(
"--learning_rate",
type=float,
default=0.01,
help="Learning rate used to train the model. (default: %(default)f)")
parser.add_argument(
"--no_attention",
action='store_true',
help="If set, run no attention model instead of attention model.")
parser.add_argument(
"--beam_size",
type=int,
default=3,
help="The width for beam searching. (default: %(default)d)")
parser.add_argument(
"--use_gpu",
type=distutils.util.strtobool,
default=True,
help="Whether to use gpu. (default: %(default)d)")
parser.add_argument(
"--max_length",
type=int,
default=50,
help="The maximum length of sequence when doing generation. "
"(default: %(default)d)")
parser.add_argument(
"--save_dir",
type=str,
default="model",
help="Specify the path to save trained models.")
parser.add_argument(
"--save_interval",
type=int,
default=1,
help="Save the trained model every n passes."
"(default: %(default)d)")
parser.add_argument(
"--enable_ce",
action='store_true',
help="If set, run the task with continuous evaluation logs.")
args = parser.parse_args()
return args
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.contrib.decoder.beam_search_decoder import *
def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
def linear(inputs):
return fluid.layers.fc(input=inputs, size=size, bias_attr=True)
forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))
cell_t = fluid.layers.sums(input=[
fluid.layers.elementwise_mul(
x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul(
x=input_gate, y=cell_tilde)
])
hidden_t = fluid.layers.elementwise_mul(
x=output_gate, y=fluid.layers.tanh(x=cell_t))
return hidden_t, cell_t
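# For reference, lstm_step above computes the standard LSTM recurrence, each
# linear() call realizing one W [h_{t-1}; x_t] + b projection (a sketch of the
# equations the code implements, matching the variable names):
#
#   f_t = \sigma(W_f [h_{t-1}; x_t] + b_f)          (forget_gate)
#   i_t = \sigma(W_i [h_{t-1}; x_t] + b_i)          (input_gate)
#   o_t = \sigma(W_o [h_{t-1}; x_t] + b_o)          (output_gate)
#   \tilde{c}_t = \tanh(W_c [h_{t-1}; x_t] + b_c)   (cell_tilde)
#   c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c}_t (cell_t)
#   h_t = o_t \odot \tanh(c_t)                      (hidden_t)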
def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
target_dict_dim, is_generating, beam_size, max_length):
"""Construct a seq2seq network."""
def bi_lstm_encoder(input_seq, gate_size):
        # A bi-directional LSTM encoder implementation.
        # The linear transformations for the input gate, output gate, forget
        # gate, and cell activation vectors must be done outside dynamic_lstm,
        # so the projection output size is 4 times gate_size.
input_forward_proj = fluid.layers.fc(input=input_seq,
size=gate_size * 4,
act='tanh',
bias_attr=False)
forward, _ = fluid.layers.dynamic_lstm(
input=input_forward_proj, size=gate_size * 4, use_peepholes=False)
input_reversed_proj = fluid.layers.fc(input=input_seq,
size=gate_size * 4,
act='tanh',
bias_attr=False)
reversed, _ = fluid.layers.dynamic_lstm(
input=input_reversed_proj,
size=gate_size * 4,
is_reverse=True,
use_peepholes=False)
return forward, reversed
# The encoding process. Encodes the input words into tensors.
src_word_idx = fluid.layers.data(
name='source_sequence', shape=[1], dtype='int64', lod_level=1)
src_embedding = fluid.layers.embedding(
input=src_word_idx,
size=[source_dict_dim, embedding_dim],
dtype='float32')
src_forward, src_reversed = bi_lstm_encoder(
input_seq=src_embedding, gate_size=encoder_size)
encoded_vector = fluid.layers.concat(
input=[src_forward, src_reversed], axis=1)
encoded_proj = fluid.layers.fc(input=encoded_vector,
size=decoder_size,
bias_attr=False)
backward_first = fluid.layers.sequence_pool(
input=src_reversed, pool_type='first')
decoder_boot = fluid.layers.fc(input=backward_first,
size=decoder_size,
bias_attr=False,
act='tanh')
cell_init = fluid.layers.fill_constant_batch_size_like(
input=decoder_boot,
value=0.0,
shape=[-1, decoder_size],
dtype='float32')
cell_init.stop_gradient = False
# Create a RNN state cell by providing the input and hidden states, and
# specifies the hidden state as output.
h = InitState(init=decoder_boot, need_reorder=True)
c = InitState(init=cell_init)
state_cell = StateCell(
inputs={'x': None,
'encoder_vec': None,
'encoder_proj': None},
states={'h': h,
'c': c},
out_state='h')
def simple_attention(encoder_vec, encoder_proj, decoder_state):
# The implementation of simple attention model
decoder_state_proj = fluid.layers.fc(input=decoder_state,
size=decoder_size,
bias_attr=False)
decoder_state_expand = fluid.layers.sequence_expand(
x=decoder_state_proj, y=encoder_proj)
        # The concatenated tensor's LoD should be inherited from encoder_proj.
concated = fluid.layers.concat(
input=[encoder_proj, decoder_state_expand], axis=1)
attention_weights = fluid.layers.fc(input=concated,
size=1,
bias_attr=False)
attention_weights = fluid.layers.sequence_softmax(
input=attention_weights)
        weights_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1])
        scaled = fluid.layers.elementwise_mul(
            x=encoder_vec, y=weights_reshape, axis=0)
context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
return context
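    # simple_attention is additive attention with bias terms disabled: the
    # previous decoder state s_{t-1} is projected, broadcast over the encoder
    # sequence, scored against each encoder projection p_j by a single-output
    # fc, normalized with a sequence softmax, and used to pool the encoder
    # vectors h_j into a context (a sketch of the equations):
    #
    #   e_{tj} = w^\top [p_j; W_s s_{t-1}]
    #   \alpha_{tj} = \exp(e_{tj}) / \sum_k \exp(e_{tk})
    #   c_t = \sum_j \alpha_{tj} h_j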
@state_cell.state_updater
def state_updater(state_cell):
# Define the updater of RNN state cell
current_word = state_cell.get_input('x')
encoder_vec = state_cell.get_input('encoder_vec')
encoder_proj = state_cell.get_input('encoder_proj')
prev_h = state_cell.get_state('h')
prev_c = state_cell.get_state('c')
context = simple_attention(encoder_vec, encoder_proj, prev_h)
decoder_inputs = fluid.layers.concat(
input=[context, current_word], axis=1)
h, c = lstm_step(decoder_inputs, prev_h, prev_c, decoder_size)
state_cell.set_state('h', h)
state_cell.set_state('c', c)
# Define the decoding process
if not is_generating:
# Training process
trg_word_idx = fluid.layers.data(
name='target_sequence', shape=[1], dtype='int64', lod_level=1)
trg_embedding = fluid.layers.embedding(
input=trg_word_idx,
size=[target_dict_dim, embedding_dim],
dtype='float32')
# A decoder for training
decoder = TrainingDecoder(state_cell)
with decoder.block():
current_word = decoder.step_input(trg_embedding)
encoder_vec = decoder.static_input(encoded_vector)
encoder_proj = decoder.static_input(encoded_proj)
decoder.state_cell.compute_state(inputs={
'x': current_word,
'encoder_vec': encoder_vec,
'encoder_proj': encoder_proj
})
h = decoder.state_cell.get_state('h')
decoder.state_cell.update_states()
out = fluid.layers.fc(input=h,
size=target_dict_dim,
bias_attr=True,
act='softmax')
decoder.output(out)
label = fluid.layers.data(
name='label_sequence', shape=[1], dtype='int64', lod_level=1)
cost = fluid.layers.cross_entropy(input=decoder(), label=label)
avg_cost = fluid.layers.mean(x=cost)
feeding_list = ["source_sequence", "target_sequence", "label_sequence"]
return avg_cost, feeding_list
else:
# Inference
init_ids = fluid.layers.data(
name="init_ids", shape=[1], dtype="int64", lod_level=2)
init_scores = fluid.layers.data(
name="init_scores", shape=[1], dtype="float32", lod_level=2)
# A beam search decoder
decoder = BeamSearchDecoder(
state_cell=state_cell,
init_ids=init_ids,
init_scores=init_scores,
target_dict_dim=target_dict_dim,
word_dim=embedding_dim,
input_var_dict={
'encoder_vec': encoded_vector,
'encoder_proj': encoded_proj
},
topk_size=50,
sparse_emb=True,
max_len=max_length,
beam_size=beam_size,
end_id=1,
name=None)
decoder.decode()
translation_ids, translation_scores = decoder()
feeding_list = ["source_sequence"]
return translation_ids, translation_scores, feeding_list
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import os
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.framework as framework
from paddle.fluid.executor import Executor
from paddle.fluid.contrib.decoder.beam_search_decoder import *
from args import *
import attention_model
import no_attention_model
def infer():
args = parse_args()
# Inference
if args.no_attention:
translation_ids, translation_scores, feed_order = \
no_attention_model.seq_to_seq_net(
args.embedding_dim,
args.encoder_size,
args.decoder_size,
args.dict_size,
args.dict_size,
True,
beam_size=args.beam_size,
max_length=args.max_length)
else:
translation_ids, translation_scores, feed_order = \
attention_model.seq_to_seq_net(
args.embedding_dim,
args.encoder_size,
args.decoder_size,
args.dict_size,
args.dict_size,
True,
beam_size=args.beam_size,
max_length=args.max_length)
test_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
batch_size=args.batch_size,
drop_last=False)
place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
exe = Executor(place)
exe.run(framework.default_startup_program())
model_path = os.path.join(args.save_dir, str(args.pass_num))
fluid.io.load_persistables(
executor=exe,
dirname=model_path,
main_program=framework.default_main_program())
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size)
feed_list = [
framework.default_main_program().global_block().var(var_name)
for var_name in feed_order[0:1]
]
feeder = fluid.DataFeeder(feed_list, place)
for batch_id, data in enumerate(test_batch_generator()):
# The value of batch_size may vary in the last batch
batch_size = len(data)
# Setup initial ids and scores lod tensor
init_ids_data = np.array([0 for _ in range(batch_size)], dtype='int64')
init_scores_data = np.array(
[1. for _ in range(batch_size)], dtype='float32')
init_ids_data = init_ids_data.reshape((batch_size, 1))
init_scores_data = init_scores_data.reshape((batch_size, 1))
init_recursive_seq_lens = [1] * batch_size
init_recursive_seq_lens = [
init_recursive_seq_lens, init_recursive_seq_lens
]
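        # The two identical levels give init_ids/init_scores a LoD of
        # [[1] * batch_size, [1] * batch_size]: level 0 says each source
        # sentence starts with exactly one candidate prefix, and level 1 says
        # that prefix holds a single token (the start id 0 scored 1.0 above).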
init_ids = fluid.create_lod_tensor(init_ids_data,
init_recursive_seq_lens, place)
init_scores = fluid.create_lod_tensor(init_scores_data,
init_recursive_seq_lens, place)
# Feed dict for inference
feed_dict = feeder.feed(map(lambda x: [x[0]], data))
feed_dict['init_ids'] = init_ids
feed_dict['init_scores'] = init_scores
fetch_outs = exe.run(framework.default_main_program(),
feed=feed_dict,
fetch_list=[translation_ids, translation_scores],
return_numpy=False)
# Split the output words by lod levels
lod_level_1 = fetch_outs[0].lod()[1]
token_array = np.array(fetch_outs[0])
result = []
for i in xrange(len(lod_level_1) - 1):
sentence_list = [
trg_dict[token]
for token in token_array[lod_level_1[i]:lod_level_1[i + 1]]
]
sentence = " ".join(sentence_list[1:-1])
result.append(sentence)
lod_level_0 = fetch_outs[0].lod()[0]
paragraphs = [
result[lod_level_0[i]:lod_level_0[i + 1]]
for i in xrange(len(lod_level_0) - 1)
]
for paragraph in paragraphs:
print(paragraph)
if __name__ == '__main__':
infer()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid.layers as layers
from paddle.fluid.contrib.decoder.beam_search_decoder import *
def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
target_dict_dim, is_generating, beam_size, max_length):
def encoder():
# Encoder implementation of RNN translation
src_word = layers.data(
name="src_word", shape=[1], dtype='int64', lod_level=1)
src_embedding = layers.embedding(
input=src_word,
size=[source_dict_dim, embedding_dim],
dtype='float32',
is_sparse=True)
fc1 = layers.fc(input=src_embedding, size=encoder_size * 4, act='tanh')
lstm_hidden0, lstm_0 = layers.dynamic_lstm(
input=fc1, size=encoder_size * 4)
encoder_out = layers.sequence_last_step(input=lstm_hidden0)
return encoder_out
def decoder_state_cell(context):
# Decoder state cell, specifies the hidden state variable and its updater
h = InitState(init=context, need_reorder=True)
state_cell = StateCell(
inputs={'x': None}, states={'h': h}, out_state='h')
@state_cell.state_updater
def updater(state_cell):
current_word = state_cell.get_input('x')
prev_h = state_cell.get_state('h')
            # Make sure the LoD of h is inherited from prev_h.
h = layers.fc(input=[prev_h, current_word],
size=decoder_size,
act='tanh')
state_cell.set_state('h', h)
return state_cell
def decoder_train(state_cell):
# Decoder for training implementation of RNN translation
trg_word = layers.data(
name="target_word", shape=[1], dtype='int64', lod_level=1)
trg_embedding = layers.embedding(
input=trg_word,
size=[target_dict_dim, embedding_dim],
dtype='float32',
is_sparse=True)
# A training decoder
decoder = TrainingDecoder(state_cell)
# Define the computation in each RNN step done by decoder
with decoder.block():
current_word = decoder.step_input(trg_embedding)
decoder.state_cell.compute_state(inputs={'x': current_word})
current_score = layers.fc(input=decoder.state_cell.get_state('h'),
size=target_dict_dim,
act='softmax')
decoder.state_cell.update_states()
decoder.output(current_score)
return decoder()
def decoder_infer(state_cell):
# Decoder for inference implementation
init_ids = layers.data(
name="init_ids", shape=[1], dtype="int64", lod_level=2)
init_scores = layers.data(
name="init_scores", shape=[1], dtype="float32", lod_level=2)
# A beam search decoder for inference
decoder = BeamSearchDecoder(
state_cell=state_cell,
init_ids=init_ids,
init_scores=init_scores,
target_dict_dim=target_dict_dim,
word_dim=embedding_dim,
input_var_dict={},
topk_size=50,
sparse_emb=True,
max_len=max_length,
beam_size=beam_size,
end_id=1,
name=None)
decoder.decode()
translation_ids, translation_scores = decoder()
return translation_ids, translation_scores
context = encoder()
state_cell = decoder_state_cell(context)
if not is_generating:
label = layers.data(
name="target_next_word", shape=[1], dtype='int64', lod_level=1)
rnn_out = decoder_train(state_cell)
cost = layers.cross_entropy(input=rnn_out, label=label)
avg_cost = layers.mean(x=cost)
feeding_list = ['src_word', 'target_word', 'target_next_word']
return avg_cost, feeding_list
else:
translation_ids, translation_scores = decoder_infer(state_cell)
feeding_list = ['src_word']
return translation_ids, translation_scores, feeding_list
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import time
import os
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.framework as framework
from paddle.fluid.executor import Executor
from paddle.fluid.contrib.decoder.beam_search_decoder import *
from args import *
import attention_model
import no_attention_model
def train():
args = parse_args()
if args.enable_ce:
framework.default_startup_program().random_seed = 111
# Training process
if args.no_attention:
avg_cost, feed_order = no_attention_model.seq_to_seq_net(
args.embedding_dim,
args.encoder_size,
args.decoder_size,
args.dict_size,
args.dict_size,
False,
beam_size=args.beam_size,
max_length=args.max_length)
else:
avg_cost, feed_order = attention_model.seq_to_seq_net(
args.embedding_dim,
args.encoder_size,
args.decoder_size,
args.dict_size,
args.dict_size,
False,
beam_size=args.beam_size,
max_length=args.max_length)
# clone from default main program and use it as the validation program
main_program = fluid.default_main_program()
inference_program = fluid.default_main_program().clone()
optimizer = fluid.optimizer.Adam(
learning_rate=args.learning_rate,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-5))
optimizer.minimize(avg_cost)
# Disable shuffle for Continuous Evaluation only
if not args.enable_ce:
train_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
batch_size=args.batch_size,
drop_last=False)
test_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
batch_size=args.batch_size,
drop_last=False)
else:
train_batch_generator = paddle.batch(
paddle.dataset.wmt14.train(args.dict_size),
batch_size=args.batch_size,
drop_last=False)
test_batch_generator = paddle.batch(
paddle.dataset.wmt14.test(args.dict_size),
batch_size=args.batch_size,
drop_last=False)
place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
exe = Executor(place)
exe.run(framework.default_startup_program())
feed_list = [
main_program.global_block().var(var_name) for var_name in feed_order
]
feeder = fluid.DataFeeder(feed_list, place)
def validation():
# Use test set as validation each pass
total_loss = 0.0
count = 0
val_feed_list = [
inference_program.global_block().var(var_name)
for var_name in feed_order
]
val_feeder = fluid.DataFeeder(val_feed_list, place)
for batch_id, data in enumerate(test_batch_generator()):
val_fetch_outs = exe.run(inference_program,
feed=val_feeder.feed(data),
fetch_list=[avg_cost],
return_numpy=False)
total_loss += np.array(val_fetch_outs[0])[0]
count += 1
return total_loss / count
for pass_id in range(1, args.pass_num + 1):
pass_start_time = time.time()
words_seen = 0
for batch_id, data in enumerate(train_batch_generator()):
words_seen += len(data) * 2
fetch_outs = exe.run(framework.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost])
avg_cost_train = np.array(fetch_outs[0])
print('pass_id=%d, batch_id=%d, train_loss: %f' %
(pass_id, batch_id, avg_cost_train))
# This is for continuous evaluation only
if args.enable_ce and batch_id >= 100:
break
pass_end_time = time.time()
test_loss = validation()
time_consumed = pass_end_time - pass_start_time
words_per_sec = words_seen / time_consumed
print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
(pass_id, test_loss, words_per_sec, time_consumed))
# This log is for continuous evaluation only
if args.enable_ce:
print("kpis\ttrain_cost\t%f" % avg_cost_train)
print("kpis\ttest_cost\t%f" % test_loss)
print("kpis\ttrain_duration\t%f" % time_consumed)
if pass_id % args.save_interval == 0:
model_path = os.path.join(args.save_dir, str(pass_id))
if not os.path.isdir(model_path):
os.makedirs(model_path)
fluid.io.save_persistables(
executor=exe,
dirname=model_path,
main_program=framework.default_main_program())
if __name__ == '__main__':
train()
./neural_machine_translation/rnn_search
\ No newline at end of file
...@@ -7,12 +7,12 @@ from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE: kpi.py should be shared across models in some way!!!!

train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True)
test_acc_kpi = AccKpi('test_acc', 0.01, 0, actived=True)
train_speed_kpi = AccKpi('train_speed', 0.2, 0, actived=True)
train_cost_card4_kpi = CostKpi('train_cost_card4', 0.02, 0, actived=True)
test_acc_card4_kpi = AccKpi('test_acc_card4', 0.01, 0, actived=True)
train_speed_card4_kpi = AccKpi('train_speed_card4', 0.2, 0, actived=True)

tracking_kpis = [
    train_cost_kpi,
......