Unverified commit c0539d13, authored by Bai Yifan, committed by GitHub

Merge branch 'develop' into ce

......@@ -24,15 +24,10 @@ def calc_diff(f1, f2):
#print d2.shape
#print d1[0, 0, 0:10, 0:10]
#print d2[0, 0, 0:10, 0:10]
#d1 = d1[:, :, 1:-2, 1:-2]
#d2 = d2[:, :, 1:-2, 1:-2]
d1 = d1.flatten()
d2 = d2.flatten()
#print d1[:10]
#print d2[:10]
d1_num = reduce(lambda x, y: x * y, d1.shape)
d2_num = reduce(lambda x, y: x * y, d2.shape)
if d1_num != d2_num:
......@@ -41,7 +36,11 @@ def calc_diff(f1, f2):
assert (d1_num == d2_num), "their shapes are not consistent"
try:
mask = np.abs(d1) >= np.abs(d2)
mask = mask.astype('int32')
df = np.abs(d1 - d2)
df = df / (1.0e-10 + np.abs(d1) * mask + np.abs(d2) * (1 - mask))
max_df = np.max(df)
sq_df = np.mean(df * df)
return max_df, sq_df
......
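For reference, the mask arithmetic above is just an elementwise max: df = |d1 - d2| / (max(|d1|, |d2|) + 1e-10). A minimal standalone sketch of the same metric (file names in the usage comment are illustrative):

import numpy as np

def relative_diff(a, b, eps=1.0e-10):
    """Max and mean-squared elementwise relative difference."""
    a, b = a.flatten(), b.flatten()
    denom = np.maximum(np.abs(a), np.abs(b)) + eps  # equivalent to the mask trick
    df = np.abs(a - b) / denom
    return np.max(df), np.mean(df * df)

# e.g. comparing a Caffe activation dump against the converted fluid output:
# max_df, sq_df = relative_diff(np.load('caffe_out.npy'), np.load('fluid_out.npy'))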
......@@ -39,6 +39,7 @@ LAYER_DESCRIPTORS = {
'Pooling': shape_pool,
'Power': shape_identity,
'ReLU': shape_identity,
'PReLU': shape_identity,
'Scale': shape_identity,
'Sigmoid': shape_identity,
'SigmoidCrossEntropyLoss': shape_scalar,
......
......@@ -240,10 +240,16 @@ class Network(object):
@layer
def relu(self, input, name):
fluid = import_fluid()
output = fluid.layers.relu(
name=self.get_unique_output_name(name, 'relu'), x=input)
output = fluid.layers.relu(input)
return output
@layer
def prelu(self, input, channel_shared, name):
#fluid = import_fluid()
#output = fluid.layers.relu(input)
#return output
raise NotImplementedError('prelu not implemented')
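If the installed fluid version provides a native PReLU op, the stub could be filled in roughly as below. This is a sketch, not the converter's actual implementation; it assumes fluid.layers.prelu with 'all'/'channel' modes exists in the target version:

@layer
def prelu(self, input, channel_shared, name):
    # hypothetical mapping: Caffe's channel_shared flag selects between a
    # single shared slope ('all') and one slope per channel ('channel')
    fluid = import_fluid()
    mode = 'all' if channel_shared else 'channel'
    return fluid.layers.prelu(input, mode=mode)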
def pool(self, pool_type, input, k_h, k_w, s_h, s_w, ceil_mode, padding,
name):
# Get the number of channels in the input
......@@ -382,7 +388,8 @@ class Network(object):
name,
scale_offset=True,
eps=1e-5,
relu=False):
relu=False,
relu_negative_slope=0.0):
# NOTE: Currently, only inference is supported
fluid = import_fluid()
prefix = name + '_'
......@@ -392,6 +399,15 @@ class Network(object):
name=prefix + 'offset')
mean_name = prefix + 'mean'
variance_name = prefix + 'variance'
leaky_relu = False
act = 'relu'
if relu is False:
act = None
elif relu_negative_slope != 0.0:
leaky_relu = True
act = None
output = fluid.layers.batch_norm(
name=self.get_unique_output_name(name, 'batch_norm'),
input=input,
......@@ -401,7 +417,10 @@ class Network(object):
moving_mean_name=mean_name,
moving_variance_name=variance_name,
epsilon=eps,
act='relu' if relu is True else None)
act=act)
if leaky_relu:
output = fluid.layers.leaky_relu(output, alpha=relu_negative_slope)
return output
......
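The batch_norm change above exists because batch_norm's act argument can only fuse a plain relu; a Caffe ReLU with a nonzero negative_slope therefore becomes a separate leaky_relu op. Numerically, leaky_relu computes (a minimal numpy sketch):

import numpy as np

def leaky_relu(x, alpha=0.0):
    # y = x for x > 0, alpha * x otherwise; alpha = 0.0 reduces to plain relu
    return np.where(x > 0, x, alpha * x)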
......@@ -112,6 +112,13 @@ class PaddleMapper(NodeMapper):
def map_relu(self, node):
return PaddleNode('relu')
def map_prelu(self, node):
channel_shared = getattr(node.parameters, 'channel_shared', False)
return PaddleNode('prelu', channel_shared)
def map_tanh(self, node):
return PaddleNode('tanh')
def map_pooling(self, node):
pool_type = node.parameters.pool
if pool_type == 0:
......
#!/bin/bash
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1
cudaid=${language_model:=0} # use 0-th card as default
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train.py | python _ce.py
cudaid=${language_model_m:=0,1,2,3} # use 0,1,2,3 card as default
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train.py | python _ce.py
# This file is only used for the continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi
from kpi import DurationKpi
imikolov_20_avg_ppl_kpi = CostKpi('imikolov_20_avg_ppl', 0.2, 0)
imikolov_20_pass_duration_kpi = DurationKpi(
'imikolov_20_pass_duration', 0.02, 0, actived=True)
imikolov_20_avg_ppl_kpi_card4 = CostKpi('imikolov_20_avg_ppl_card4', 0.2, 0)
imikolov_20_pass_duration_kpi_card4 = DurationKpi(
'imikolov_20_pass_duration_card4', 0.03, 0, actived=True)
tracking_kpis = [
imikolov_20_avg_ppl_kpi,
imikolov_20_pass_duration_kpi,
imikolov_20_avg_ppl_kpi_card4,
imikolov_20_pass_duration_kpi_card4,
]
def parse_log(log):
'''
This method should be implemented by model developers.
Suggested format: each log line is a tab-separated key/value record,
for example:
"
train_cost\t1.0
test_cost\t1.0
train_cost\t1.0
train_cost\t1.0
train_acc\t1.2
"
(The parser below expects KPI lines of the form "kpis\t<name>\t<value>".)
'''
for line in log.split('\n'):
fs = line.strip().split('\t')
print(fs)
if len(fs) == 3 and fs[0] == 'kpis':
kpi_name = fs[1]
kpi_value = float(fs[2])
yield kpi_name, kpi_value
def log_to_ce(log):
kpi_tracker = {}
for kpi in tracking_kpis:
kpi_tracker[kpi.name] = kpi
for (kpi_name, kpi_value) in parse_log(log):
print(kpi_name, kpi_value)
kpi_tracker[kpi_name].add_record(kpi_value)
kpi_tracker[kpi_name].persist()
if __name__ == '__main__':
log = sys.stdin.read()
log_to_ce(log)
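For illustration, a stdout line like the following (tab-separated, hypothetical value) is what parse_log above picks up and records against the matching KPI:

sample = "kpis\timikolov_20_avg_ppl\t41.27"
print(list(parse_log(sample)))  # [('imikolov_20_avg_ppl', 41.27)]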
import os
import sys
import time
import numpy as np
import math
import argparse
import paddle.fluid as fluid
import paddle.v2 as paddle
import paddle
import utils
SEED = 102
def parse_args():
parser = argparse.ArgumentParser("language_model benchmark.")
parser.add_argument(
'--enable_ce',
action='store_true',
help='If set, run \
the task with continuous evaluation logs.')
args = parser.parse_args()
return args
def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound):
""" network definition """
......@@ -63,31 +77,26 @@ def train(train_reader,
init_low_bound=-0.04,
init_high_bound=0.04):
""" train network """
args = parse_args()
if args.enable_ce:
# The random seed must be set before configuring the network.
fluid.default_startup_program().random_seed = SEED
vocab_size = len(vocab)
# Input data
src_wordseq = fluid.layers.data(
name="src_wordseq", shape=[1], dtype="int64", lod_level=1)
dst_wordseq = fluid.layers.data(
name="dst_wordseq", shape=[1], dtype="int64", lod_level=1)
# Train program
avg_cost = None
if not parallel:
cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
init_low_bound, init_high_bound)
avg_cost = fluid.layers.mean(x=cost)
else:
places = fluid.layers.get_places()
pd = fluid.layers.ParallelDo(places)
with pd.do():
cost = network(
pd.read_input(src_wordseq),
pd.read_input(dst_wordseq), vocab_size, hid_size,
init_low_bound, init_high_bound)
pd.write_output(cost)
cost = pd()
avg_cost = fluid.layers.mean(x=cost)
cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
init_low_bound, init_high_bound)
avg_cost = fluid.layers.mean(x=cost)
# Optimization to minimize loss
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
learning_rate=base_lr,
......@@ -96,39 +105,56 @@ def train(train_reader,
staircase=True))
sgd_optimizer.minimize(avg_cost)
# Initialize executor
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
train_exe = fluid.ParallelExecutor(use_cuda=use_cuda, loss_name=avg_cost.name)
total_time = 0.0
fetch_list = [avg_cost.name]
for pass_idx in xrange(pass_num):
epoch_idx = pass_idx + 1
print "epoch_%d start" % epoch_idx
t0 = time.time()
i = 0
newest_ppl = 0
for data in train_reader():
i += 1
lod_src_wordseq = utils.to_lodtensor(
map(lambda x: x[0], data), place)
lod_dst_wordseq = utils.to_lodtensor(
map(lambda x: x[1], data), place)
ret_avg_cost = exe.run(fluid.default_main_program(),
feed={
"src_wordseq": lod_src_wordseq,
"dst_wordseq": lod_dst_wordseq
},
fetch_list=[avg_cost],
use_program_cache=True)
avg_ppl = math.exp(ret_avg_cost[0])
ret_avg_cost = train_exe.run(feed={
"src_wordseq": lod_src_wordseq,
"dst_wordseq": lod_dst_wordseq
},
fetch_list=fetch_list)
avg_ppl = np.exp(ret_avg_cost[0])
newest_ppl = np.mean(avg_ppl)
if i % 100 == 0:
print "step:%d ppl:%.3f" % (i, avg_ppl)
print "step:%d ppl:%.3f" % (i, newest_ppl)
t1 = time.time()
total_time += t1 - t0
print "epoch:%d num_steps:%d time_cost(s):%f" % (epoch_idx, i,
total_time / epoch_idx)
if pass_idx == pass_num - 1 and args.enable_ce:
# Note: the following logs are used for CE monitoring only.
# Other situations do not need to care about these logs.
gpu_num = get_cards(args.enable_ce)
if gpu_num == 1:
print("kpis\timikolov_20_pass_duration\t%s" %
(total_time / epoch_idx))
print("kpis\timikolov_20_avg_ppl\t%s" % newest_ppl)
else:
print("kpis\timikolov_20_pass_duration_card%s\t%s" % \
(gpu_num, total_time / epoch_idx))
print("kpis\timikolov_20_avg_ppl_card%s\t%s" %
(gpu_num, newest_ppl))
save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
feed_var_names = ["src_wordseq", "dst_wordseq"]
fetch_vars = [avg_cost]
......@@ -138,11 +164,22 @@ def train(train_reader,
print("finish training")
def get_cards(enable_ce):
if enable_ce:
cards = os.environ.get('CUDA_VISIBLE_DEVICES')
num = len(cards.split(","))
return num
else:
return fluid.core.get_cuda_device_count()
def train_net():
""" do training """
batch_size = 20
args = parse_args()
vocab, train_reader, test_reader = utils.prepare_data(
batch_size=batch_size, buffer_size=1000, word_freq_threshold=0)
batch_size=batch_size * get_cards(args.enable_ce), buffer_size=1000, \
word_freq_threshold=0, enable_ce=args.enable_ce)
train(
train_reader=train_reader,
vocab=vocab,
......@@ -152,7 +189,7 @@ def train_net():
batch_size=batch_size,
pass_num=12,
use_cuda=True,
parallel=False,
parallel=True,
model_dir="model",
init_low_bound=-0.1,
init_high_bound=0.1)
......
......@@ -3,7 +3,7 @@ import time
import numpy as np
import paddle.fluid as fluid
import paddle.v2 as paddle
import paddle
def to_lodtensor(data, place):
......@@ -22,17 +22,28 @@ def to_lodtensor(data, place):
return res
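The body of to_lodtensor is elided above; a typical implementation from this era packs a batch of variable-length sequences into one LoDTensor roughly as follows (a sketch under that assumption, reusing the file's np and fluid imports):

def to_lodtensor(data, place):
    # flatten the batch and record cumulative sequence lengths as level-0 LoD
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    flattened = np.concatenate(data, axis=0).astype("int64")
    flattened = flattened.reshape([len(flattened), 1])
    res = fluid.core.LoDTensor()
    res.set(flattened, place)
    res.set_lod([lod])
    return res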
def prepare_data(batch_size, buffer_size=1000, word_freq_threshold=0):
def prepare_data(batch_size,
buffer_size=1000,
word_freq_threshold=0,
enable_ce=False):
""" prepare the English Pann Treebank (PTB) data """
vocab = paddle.dataset.imikolov.build_dict(word_freq_threshold)
train_reader = paddle.batch(
paddle.reader.shuffle(
if enable_ce:
train_reader = paddle.batch(
paddle.dataset.imikolov.train(
vocab,
buffer_size,
data_type=paddle.dataset.imikolov.DataType.SEQ),
buf_size=buffer_size),
batch_size)
batch_size)
else:
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imikolov.train(
vocab,
buffer_size,
data_type=paddle.dataset.imikolov.DataType.SEQ),
buf_size=buffer_size),
batch_size)
test_reader = paddle.batch(
paddle.dataset.imikolov.test(
vocab, buffer_size, data_type=paddle.dataset.imikolov.DataType.SEQ),
......
#!/bin/bash
#### This file is only used for continuous evaluation.
model_file='train.py'
python $model_file --pass_num 1 --learning_rate 0.001 --save_interval 10 --enable_ce | python _ce.py
#### This file is only used for the continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE: kpi.py should be shared across models in some way!
train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True)
test_cost_kpi = CostKpi('test_cost', 0.005, 0, actived=True)
train_duration_kpi = DurationKpi('train_duration', 0.06, 0, actived=True)
tracking_kpis = [
train_cost_kpi,
test_cost_kpi,
train_duration_kpi,
]
def parse_log(log):
'''
This method should be implemented by model developers.
Suggested format: each log line is a tab-separated key/value record,
for example:
"
train_cost\t1.0
test_cost\t1.0
train_cost\t1.0
train_cost\t1.0
train_acc\t1.2
"
(The parser below expects KPI lines of the form "kpis\t<name>\t<value>".)
'''
for line in log.split('\n'):
fs = line.strip().split('\t')
print(fs)
if len(fs) == 3 and fs[0] == 'kpis':
print("-----%s" % fs)
kpi_name = fs[1]
kpi_value = float(fs[2])
yield kpi_name, kpi_value
def log_to_ce(log):
kpi_tracker = {}
for kpi in tracking_kpis:
kpi_tracker[kpi.name] = kpi
for (kpi_name, kpi_value) in parse_log(log):
print(kpi_name, kpi_value)
kpi_tracker[kpi_name].add_record(kpi_value)
kpi_tracker[kpi_name].persist()
if __name__ == '__main__':
log = sys.stdin.read()
print("*****")
print(log)
print("****")
log_to_ce(log)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import distutils.util
def parse_args():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--embedding_dim",
type=int,
default=512,
help="The dimension of embedding table. (default: %(default)d)")
parser.add_argument(
"--encoder_size",
type=int,
default=512,
help="The size of encoder bi-rnn unit. (default: %(default)d)")
parser.add_argument(
"--decoder_size",
type=int,
default=512,
help="The size of decoder rnn unit. (default: %(default)d)")
parser.add_argument(
"--batch_size",
type=int,
default=32,
help="The sequence number of a mini-batch data. (default: %(default)d)")
parser.add_argument(
"--dict_size",
type=int,
default=30000,
help="The dictionary capacity. Dictionaries of source sequence and "
"target dictionary have same capacity. (default: %(default)d)")
parser.add_argument(
"--pass_num",
type=int,
default=5,
help="The pass number to train. (default: %(default)d)")
parser.add_argument(
"--learning_rate",
type=float,
default=0.01,
help="Learning rate used to train the model. (default: %(default)f)")
parser.add_argument(
"--no_attention",
action='store_true',
help="If set, run no attention model instead of attention model.")
parser.add_argument(
"--beam_size",
type=int,
default=3,
help="The width for beam searching. (default: %(default)d)")
parser.add_argument(
"--use_gpu",
type=distutils.util.strtobool,
default=True,
help="Whether to use gpu. (default: %(default)d)")
parser.add_argument(
"--max_length",
type=int,
default=50,
help="The maximum length of sequence when doing generation. "
"(default: %(default)d)")
parser.add_argument(
"--save_dir",
type=str,
default="model",
help="Specify the path to save trained models.")
parser.add_argument(
"--save_interval",
type=int,
default=1,
help="Save the trained model every n passes."
"(default: %(default)d)")
parser.add_argument(
"--enable_ce",
action='store_true',
help="If set, run the task with continuous evaluation logs.")
args = parser.parse_args()
return args
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.contrib.decoder.beam_search_decoder import *
def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
def linear(inputs):
return fluid.layers.fc(input=inputs, size=size, bias_attr=True)
forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))
cell_t = fluid.layers.sums(input=[
fluid.layers.elementwise_mul(
x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul(
x=input_gate, y=cell_tilde)
])
hidden_t = fluid.layers.elementwise_mul(
x=output_gate, y=fluid.layers.tanh(x=cell_t))
return hidden_t, cell_t
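lstm_step above is a standard LSTM cell assembled from fc layers; the math it implements, as a plain numpy reference (the weight dict W is illustrative, with biases folded into the projections for brevity):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step_ref(x_t, h_prev, c_prev, W):
    z = np.concatenate([h_prev, x_t])
    f = sigmoid(np.dot(W['f'], z))        # forget gate
    i = sigmoid(np.dot(W['i'], z))        # input gate
    o = sigmoid(np.dot(W['o'], z))        # output gate
    c_tilde = np.tanh(np.dot(W['c'], z))  # candidate cell state
    c_t = f * c_prev + i * c_tilde
    h_t = o * np.tanh(c_t)
    return h_t, c_t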
def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
target_dict_dim, is_generating, beam_size, max_length):
"""Construct a seq2seq network."""
def bi_lstm_encoder(input_seq, gate_size):
# A bi-directional LSTM encoder implementation.
# The linear transformations for the input gate, output gate, forget gate
# and cell activation vectors need to be done outside of dynamic_lstm,
# so the output size is 4 * gate_size.
input_forward_proj = fluid.layers.fc(input=input_seq,
size=gate_size * 4,
act='tanh',
bias_attr=False)
forward, _ = fluid.layers.dynamic_lstm(
input=input_forward_proj, size=gate_size * 4, use_peepholes=False)
input_reversed_proj = fluid.layers.fc(input=input_seq,
size=gate_size * 4,
act='tanh',
bias_attr=False)
reversed, _ = fluid.layers.dynamic_lstm(
input=input_reversed_proj,
size=gate_size * 4,
is_reverse=True,
use_peepholes=False)
return forward, reversed
# The encoding process. Encodes the input words into tensors.
src_word_idx = fluid.layers.data(
name='source_sequence', shape=[1], dtype='int64', lod_level=1)
src_embedding = fluid.layers.embedding(
input=src_word_idx,
size=[source_dict_dim, embedding_dim],
dtype='float32')
src_forward, src_reversed = bi_lstm_encoder(
input_seq=src_embedding, gate_size=encoder_size)
encoded_vector = fluid.layers.concat(
input=[src_forward, src_reversed], axis=1)
encoded_proj = fluid.layers.fc(input=encoded_vector,
size=decoder_size,
bias_attr=False)
backward_first = fluid.layers.sequence_pool(
input=src_reversed, pool_type='first')
decoder_boot = fluid.layers.fc(input=backward_first,
size=decoder_size,
bias_attr=False,
act='tanh')
cell_init = fluid.layers.fill_constant_batch_size_like(
input=decoder_boot,
value=0.0,
shape=[-1, decoder_size],
dtype='float32')
cell_init.stop_gradient = False
# Create an RNN state cell by providing the input and hidden states, and
# specify the hidden state as the output.
h = InitState(init=decoder_boot, need_reorder=True)
c = InitState(init=cell_init)
state_cell = StateCell(
inputs={'x': None,
'encoder_vec': None,
'encoder_proj': None},
states={'h': h,
'c': c},
out_state='h')
def simple_attention(encoder_vec, encoder_proj, decoder_state):
# The implementation of a simple additive attention model
decoder_state_proj = fluid.layers.fc(input=decoder_state,
size=decoder_size,
bias_attr=False)
decoder_state_expand = fluid.layers.sequence_expand(
x=decoder_state_proj, y=encoder_proj)
# The concatenated tensor's LoD should be inherited from encoder_proj
concated = fluid.layers.concat(
input=[encoder_proj, decoder_state_expand], axis=1)
attention_weights = fluid.layers.fc(input=concated,
size=1,
bias_attr=False)
attention_weights = fluid.layers.sequence_softmax(
input=attention_weights)
weights_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1])
scaled = fluid.layers.elementwise_mul(
x=encoder_vec, y=weights_reshape, axis=0)
context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
return context
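simple_attention above is additive attention with a one-unit fc scoring each encoder step; a numpy reference of the same computation (the score vector w is illustrative):

import numpy as np

def simple_attention_ref(encoder_vec, encoder_proj, decoder_state_proj, w):
    # one score per encoder step: w . [encoder_proj_j, decoder_state_proj]
    scores = np.array([np.dot(w, np.concatenate([e, decoder_state_proj]))
                       for e in encoder_proj])
    weights = np.exp(scores - scores.max())
    weights /= weights.sum()                      # sequence softmax
    return (weights[:, None] * encoder_vec).sum(axis=0)  # context vector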
@state_cell.state_updater
def state_updater(state_cell):
# Define the updater of RNN state cell
current_word = state_cell.get_input('x')
encoder_vec = state_cell.get_input('encoder_vec')
encoder_proj = state_cell.get_input('encoder_proj')
prev_h = state_cell.get_state('h')
prev_c = state_cell.get_state('c')
context = simple_attention(encoder_vec, encoder_proj, prev_h)
decoder_inputs = fluid.layers.concat(
input=[context, current_word], axis=1)
h, c = lstm_step(decoder_inputs, prev_h, prev_c, decoder_size)
state_cell.set_state('h', h)
state_cell.set_state('c', c)
# Define the decoding process
if not is_generating:
# Training process
trg_word_idx = fluid.layers.data(
name='target_sequence', shape=[1], dtype='int64', lod_level=1)
trg_embedding = fluid.layers.embedding(
input=trg_word_idx,
size=[target_dict_dim, embedding_dim],
dtype='float32')
# A decoder for training
decoder = TrainingDecoder(state_cell)
with decoder.block():
current_word = decoder.step_input(trg_embedding)
encoder_vec = decoder.static_input(encoded_vector)
encoder_proj = decoder.static_input(encoded_proj)
decoder.state_cell.compute_state(inputs={
'x': current_word,
'encoder_vec': encoder_vec,
'encoder_proj': encoder_proj
})
h = decoder.state_cell.get_state('h')
decoder.state_cell.update_states()
out = fluid.layers.fc(input=h,
size=target_dict_dim,
bias_attr=True,
act='softmax')
decoder.output(out)
label = fluid.layers.data(
name='label_sequence', shape=[1], dtype='int64', lod_level=1)
cost = fluid.layers.cross_entropy(input=decoder(), label=label)
avg_cost = fluid.layers.mean(x=cost)
feeding_list = ["source_sequence", "target_sequence", "label_sequence"]
return avg_cost, feeding_list
else:
# Inference
init_ids = fluid.layers.data(
name="init_ids", shape=[1], dtype="int64", lod_level=2)
init_scores = fluid.layers.data(
name="init_scores", shape=[1], dtype="float32", lod_level=2)
# A beam search decoder
decoder = BeamSearchDecoder(
state_cell=state_cell,
init_ids=init_ids,
init_scores=init_scores,
target_dict_dim=target_dict_dim,
word_dim=embedding_dim,
input_var_dict={
'encoder_vec': encoded_vector,
'encoder_proj': encoded_proj
},
topk_size=50,
sparse_emb=True,
max_len=max_length,
beam_size=beam_size,
end_id=1,
name=None)
decoder.decode()
translation_ids, translation_scores = decoder()
feeding_list = ["source_sequence"]
return translation_ids, translation_scores, feeding_list
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import os
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.framework as framework
from paddle.fluid.executor import Executor
from paddle.fluid.contrib.decoder.beam_search_decoder import *
from args import *
import attention_model
import no_attention_model
def infer():
args = parse_args()
# Inference
if args.no_attention:
translation_ids, translation_scores, feed_order = \
no_attention_model.seq_to_seq_net(
args.embedding_dim,
args.encoder_size,
args.decoder_size,
args.dict_size,
args.dict_size,
True,
beam_size=args.beam_size,
max_length=args.max_length)
else:
translation_ids, translation_scores, feed_order = \
attention_model.seq_to_seq_net(
args.embedding_dim,
args.encoder_size,
args.decoder_size,
args.dict_size,
args.dict_size,
True,
beam_size=args.beam_size,
max_length=args.max_length)
test_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
batch_size=args.batch_size,
drop_last=False)
place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
exe = Executor(place)
exe.run(framework.default_startup_program())
model_path = os.path.join(args.save_dir, str(args.pass_num))
fluid.io.load_persistables(
executor=exe,
dirname=model_path,
main_program=framework.default_main_program())
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size)
feed_list = [
framework.default_main_program().global_block().var(var_name)
for var_name in feed_order[0:1]
]
feeder = fluid.DataFeeder(feed_list, place)
for batch_id, data in enumerate(test_batch_generator()):
# The value of batch_size may vary in the last batch
batch_size = len(data)
# Setup initial ids and scores lod tensor
init_ids_data = np.array([0 for _ in range(batch_size)], dtype='int64')
init_scores_data = np.array(
[1. for _ in range(batch_size)], dtype='float32')
init_ids_data = init_ids_data.reshape((batch_size, 1))
init_scores_data = init_scores_data.reshape((batch_size, 1))
init_recursive_seq_lens = [1] * batch_size
init_recursive_seq_lens = [
init_recursive_seq_lens, init_recursive_seq_lens
]
init_ids = fluid.create_lod_tensor(init_ids_data,
init_recursive_seq_lens, place)
init_scores = fluid.create_lod_tensor(init_scores_data,
init_recursive_seq_lens, place)
# Feed dict for inference
feed_dict = feeder.feed(map(lambda x: [x[0]], data))
feed_dict['init_ids'] = init_ids
feed_dict['init_scores'] = init_scores
fetch_outs = exe.run(framework.default_main_program(),
feed=feed_dict,
fetch_list=[translation_ids, translation_scores],
return_numpy=False)
# Split the output words by lod levels
lod_level_1 = fetch_outs[0].lod()[1]
token_array = np.array(fetch_outs[0])
result = []
for i in xrange(len(lod_level_1) - 1):
sentence_list = [
trg_dict[token]
for token in token_array[lod_level_1[i]:lod_level_1[i + 1]]
]
sentence = " ".join(sentence_list[1:-1])
result.append(sentence)
lod_level_0 = fetch_outs[0].lod()[0]
paragraphs = [
result[lod_level_0[i]:lod_level_0[i + 1]]
for i in xrange(len(lod_level_0) - 1)
]
for paragraph in paragraphs:
print(paragraph)
if __name__ == '__main__':
infer()
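The two-level LoD used above groups the beam-search output as sentences -> candidate translations -> tokens; a tiny self-contained illustration of the level-1 slicing (offsets are hypothetical):

lod_level_1 = [0, 4, 9, 12]            # hypothetical token offsets per candidate
token_array = list(range(12))          # stand-in for the flattened id array
candidates = [token_array[lod_level_1[i]:lod_level_1[i + 1]]
              for i in range(len(lod_level_1) - 1)]
print(candidates)  # [[0, 1, 2, 3], [4, 5, 6, 7, 8], [9, 10, 11]]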
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid.layers as layers
from paddle.fluid.contrib.decoder.beam_search_decoder import *
def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
target_dict_dim, is_generating, beam_size, max_length):
def encoder():
# Encoder implementation of RNN translation
src_word = layers.data(
name="src_word", shape=[1], dtype='int64', lod_level=1)
src_embedding = layers.embedding(
input=src_word,
size=[source_dict_dim, embedding_dim],
dtype='float32',
is_sparse=True)
fc1 = layers.fc(input=src_embedding, size=encoder_size * 4, act='tanh')
lstm_hidden0, lstm_0 = layers.dynamic_lstm(
input=fc1, size=encoder_size * 4)
encoder_out = layers.sequence_last_step(input=lstm_hidden0)
return encoder_out
def decoder_state_cell(context):
# Decoder state cell, specifies the hidden state variable and its updater
h = InitState(init=context, need_reorder=True)
state_cell = StateCell(
inputs={'x': None}, states={'h': h}, out_state='h')
@state_cell.state_updater
def updater(state_cell):
current_word = state_cell.get_input('x')
prev_h = state_cell.get_state('h')
# make sure the LoD of h is inherited from prev_h
h = layers.fc(input=[prev_h, current_word],
size=decoder_size,
act='tanh')
state_cell.set_state('h', h)
return state_cell
def decoder_train(state_cell):
# Decoder for training implementation of RNN translation
trg_word = layers.data(
name="target_word", shape=[1], dtype='int64', lod_level=1)
trg_embedding = layers.embedding(
input=trg_word,
size=[target_dict_dim, embedding_dim],
dtype='float32',
is_sparse=True)
# A training decoder
decoder = TrainingDecoder(state_cell)
# Define the computation done by the decoder in each RNN step
with decoder.block():
current_word = decoder.step_input(trg_embedding)
decoder.state_cell.compute_state(inputs={'x': current_word})
current_score = layers.fc(input=decoder.state_cell.get_state('h'),
size=target_dict_dim,
act='softmax')
decoder.state_cell.update_states()
decoder.output(current_score)
return decoder()
def decoder_infer(state_cell):
# Decoder for inference implementation
init_ids = layers.data(
name="init_ids", shape=[1], dtype="int64", lod_level=2)
init_scores = layers.data(
name="init_scores", shape=[1], dtype="float32", lod_level=2)
# A beam search decoder for inference
decoder = BeamSearchDecoder(
state_cell=state_cell,
init_ids=init_ids,
init_scores=init_scores,
target_dict_dim=target_dict_dim,
word_dim=embedding_dim,
input_var_dict={},
topk_size=50,
sparse_emb=True,
max_len=max_length,
beam_size=beam_size,
end_id=1,
name=None)
decoder.decode()
translation_ids, translation_scores = decoder()
return translation_ids, translation_scores
context = encoder()
state_cell = decoder_state_cell(context)
if not is_generating:
label = layers.data(
name="target_next_word", shape=[1], dtype='int64', lod_level=1)
rnn_out = decoder_train(state_cell)
cost = layers.cross_entropy(input=rnn_out, label=label)
avg_cost = layers.mean(x=cost)
feeding_list = ['src_word', 'target_word', 'target_next_word']
return avg_cost, feeding_list
else:
translation_ids, translation_scores = decoder_infer(state_cell)
feeding_list = ['src_word']
return translation_ids, translation_scores, feeding_list
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import time
import os
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.framework as framework
from paddle.fluid.executor import Executor
from paddle.fluid.contrib.decoder.beam_search_decoder import *
from args import *
import attention_model
import no_attention_model
def train():
args = parse_args()
if args.enable_ce:
framework.default_startup_program().random_seed = 111
# Training process
if args.no_attention:
avg_cost, feed_order = no_attention_model.seq_to_seq_net(
args.embedding_dim,
args.encoder_size,
args.decoder_size,
args.dict_size,
args.dict_size,
False,
beam_size=args.beam_size,
max_length=args.max_length)
else:
avg_cost, feed_order = attention_model.seq_to_seq_net(
args.embedding_dim,
args.encoder_size,
args.decoder_size,
args.dict_size,
args.dict_size,
False,
beam_size=args.beam_size,
max_length=args.max_length)
# clone from default main program and use it as the validation program
main_program = fluid.default_main_program()
inference_program = fluid.default_main_program().clone()
optimizer = fluid.optimizer.Adam(
learning_rate=args.learning_rate,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-5))
optimizer.minimize(avg_cost)
# Disable shuffle for Continuous Evaluation only
if not args.enable_ce:
train_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
batch_size=args.batch_size,
drop_last=False)
test_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
batch_size=args.batch_size,
drop_last=False)
else:
train_batch_generator = paddle.batch(
paddle.dataset.wmt14.train(args.dict_size),
batch_size=args.batch_size,
drop_last=False)
test_batch_generator = paddle.batch(
paddle.dataset.wmt14.test(args.dict_size),
batch_size=args.batch_size,
drop_last=False)
place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
exe = Executor(place)
exe.run(framework.default_startup_program())
feed_list = [
main_program.global_block().var(var_name) for var_name in feed_order
]
feeder = fluid.DataFeeder(feed_list, place)
def validation():
# Use the test set for validation after each pass
total_loss = 0.0
count = 0
val_feed_list = [
inference_program.global_block().var(var_name)
for var_name in feed_order
]
val_feeder = fluid.DataFeeder(val_feed_list, place)
for batch_id, data in enumerate(test_batch_generator()):
val_fetch_outs = exe.run(inference_program,
feed=val_feeder.feed(data),
fetch_list=[avg_cost],
return_numpy=False)
total_loss += np.array(val_fetch_outs[0])[0]
count += 1
return total_loss / count
for pass_id in range(1, args.pass_num + 1):
pass_start_time = time.time()
words_seen = 0
for batch_id, data in enumerate(train_batch_generator()):
words_seen += len(data) * 2
fetch_outs = exe.run(framework.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost])
avg_cost_train = np.array(fetch_outs[0])
print('pass_id=%d, batch_id=%d, train_loss: %f' %
(pass_id, batch_id, avg_cost_train))
# This is for continuous evaluation only
if args.enable_ce and batch_id >= 100:
break
pass_end_time = time.time()
test_loss = validation()
time_consumed = pass_end_time - pass_start_time
words_per_sec = words_seen / time_consumed
print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
(pass_id, test_loss, words_per_sec, time_consumed))
# This log is for continuous evaluation only
if args.enable_ce:
print("kpis\ttrain_cost\t%f" % avg_cost_train)
print("kpis\ttest_cost\t%f" % test_loss)
print("kpis\ttrain_duration\t%f" % time_consumed)
if pass_id % args.save_interval == 0:
model_path = os.path.join(args.save_dir, str(pass_id))
if not os.path.isdir(model_path):
os.makedirs(model_path)
fluid.io.save_persistables(
executor=exe,
dirname=model_path,
main_program=framework.default_main_program())
if __name__ == '__main__':
train()
./neural_machine_translation/rnn_search
\ No newline at end of file
......@@ -7,12 +7,12 @@ from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE: kpi.py should be shared across models in some way!
train_cost_kpi = CostKpi('train_cost', 0.02, actived=True)
test_acc_kpi = AccKpi('test_acc', 0.01, actived=True)
train_speed_kpi = AccKpi('train_speed', 0.2, actived=True)
train_cost_card4_kpi = CostKpi('train_cost_card4', 0.02, actived=True)
test_acc_card4_kpi = AccKpi('test_acc_card4', 0.01, actived=True)
train_speed_card4_kpi = AccKpi('train_speed_card4', 0.2, actived=True)
train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True)
test_acc_kpi = AccKpi('test_acc', 0.01, 0, actived=True)
train_speed_kpi = AccKpi('train_speed', 0.2, 0, actived=True)
train_cost_card4_kpi = CostKpi('train_cost_card4', 0.02, 0, actived=True)
test_acc_card4_kpi = AccKpi('test_acc_card4', 0.01, 0, actived=True)
train_speed_card4_kpi = AccKpi('train_speed_card4', 0.2, 0, actived=True)
tracking_kpis = [
train_cost_kpi,
......