Unverified commit f3afe346 authored by G Guo Sheng and committed by GitHub

Merge pull request #1190 from guoshengCS/support-py3-transformer

Support python3 in Transformer
......@@ -115,11 +115,11 @@ seq_len = ModelHyperParams.max_length
# compile time.
input_descs = {
# The actual data shape of src_word is:
# [batch_size * max_src_len_in_batch, 1]
"src_word": [(batch_size, seq_len, 1L), "int64", 2],
# [batch_size, max_src_len_in_batch, 1]
"src_word": [(batch_size, seq_len, 1), "int64", 2],
# The actual data shape of src_pos is:
# [batch_size * max_src_len_in_batch, 1]
"src_pos": [(batch_size, seq_len, 1L), "int64"],
# [batch_size, max_src_len_in_batch, 1]
"src_pos": [(batch_size, seq_len, 1), "int64"],
# This input is used to remove attention weights on paddings in the
# encoder.
# The actual data shape of src_slf_attn_bias is:
......@@ -127,12 +127,12 @@ input_descs = {
"src_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# The actual data shape of trg_word is:
# [batch_size * max_trg_len_in_batch, 1]
"trg_word": [(batch_size, seq_len, 1L), "int64",
# [batch_size, max_trg_len_in_batch, 1]
"trg_word": [(batch_size, seq_len, 1), "int64",
2], # lod_level is only used in fast decoder.
# The actual data shape of trg_pos is:
# [batch_size * max_trg_len_in_batch, 1]
"trg_pos": [(batch_size, seq_len, 1L), "int64"],
# [batch_size, max_trg_len_in_batch, 1]
"trg_pos": [(batch_size, seq_len, 1), "int64"],
# This input is used to remove attention weights on paddings and
# subsequent words in the decoder.
# The actual data shape of trg_slf_attn_bias is:
......@@ -151,15 +151,13 @@ input_descs = {
"enc_output": [(batch_size, seq_len, ModelHyperParams.d_model), "float32"],
# The actual data shape of label_word is:
# [batch_size * max_trg_len_in_batch, 1]
"lbl_word": [(batch_size * seq_len, 1L), "int64"],
"lbl_word": [(batch_size * seq_len, 1), "int64"],
# This input is used to mask out the loss of padding tokens.
# The actual data shape of label_weight is:
# [batch_size * max_trg_len_in_batch, 1]
"lbl_weight": [(batch_size * seq_len, 1L), "float32"],
# These inputs are used to change the shape tensor in beam-search decoder.
"trg_slf_attn_pre_softmax_shape_delta": [(2L, ), "int32"],
"trg_slf_attn_post_softmax_shape_delta": [(4L, ), "int32"],
"init_score": [(batch_size, 1L), "float32"],
"lbl_weight": [(batch_size * seq_len, 1), "float32"],
# This input is used in beam-search decoder.
"init_score": [(batch_size, 1), "float32"],
}
# Names of word embedding table which might be reused for weight sharing.
......
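A note on the 1L → 1 edits above: the L long-integer suffix is Python 2-only syntax and a SyntaxError under Python 3, where int is already arbitrary-precision, so a plain literal is the portable spelling. A tiny illustration with placeholder dimension values, not the real config:

batch_size, seq_len = -1, 256                # placeholder values for illustration only
src_word_shape = (batch_size, seq_len, 1)    # (batch_size, seq_len, 1L) would not even parse on Python 3
assert all(isinstance(d, int) for d in src_word_shape)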
......@@ -59,8 +59,7 @@ def parse_args():
"provided in util.py to do this.")
parser.add_argument(
"--token_delimiter",
type=partial(
str.decode, encoding="string-escape"),
type=lambda x: str(x.encode().decode("unicode-escape")),
default=" ",
help="The delimiter used to split tokens in source or target sentences. "
"For EN-DE BPE data we provided, use spaces as token delimiter.; "
......@@ -99,11 +98,11 @@ def post_process_seq(seq,
if idx == eos_idx:
eos_pos = i
break
seq = seq[:eos_pos + 1]
return filter(
lambda idx: (output_bos or idx != bos_idx) and \
(output_eos or idx != eos_idx),
seq)
seq = [
idx for idx in seq[:eos_pos + 1]
if (output_bos or idx != bos_idx) and (output_eos or idx != eos_idx)
]
return seq
def prepare_batch_input(insts, data_input_names, src_pad_idx, bos_idx, n_head,
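The rewrite of post_process_seq above matters because filter() returns a lazy iterator under Python 3 rather than a list, so code that later indexes or re-iterates the result breaks; a list comprehension yields a concrete list on both interpreters. A small self-contained sketch with made-up token ids, simplified in that it always drops the special tokens instead of honouring the output_bos/output_eos flags:

bos_idx, eos_idx = 0, 1              # illustrative special-token ids
seq = [0, 7, 8, 9, 1, 0, 0]          # <bos> ... <eos> followed by padding

# Truncate at the first <eos>, then drop the special tokens, keeping a real list.
eos_pos = seq.index(eos_idx)
cleaned = [idx for idx in seq[:eos_pos + 1]
           if idx != bos_idx and idx != eos_idx]
assert cleaned == [7, 8, 9]
assert isinstance(cleaned, list)     # filter(...) would give a lazy iterator on Python 3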
......@@ -164,8 +163,10 @@ def fast_infer(test_data, trg_idx2word, use_wordpiece):
fluid.io.load_vars(
exe,
InferTaskConfig.model_path,
vars=filter(lambda var: isinstance(var, fluid.framework.Parameter),
fluid.default_main_program().list_vars()))
vars=[
var for var in fluid.default_main_program().list_vars()
if isinstance(var, fluid.framework.Parameter)
])
# This is used here to set dropout to the test mode.
infer_program = fluid.default_main_program().inference_optimize()
......@@ -203,7 +204,7 @@ def fast_infer(test_data, trg_idx2word, use_wordpiece):
post_process_seq(np.array(seq_ids)[sub_start:sub_end]),
trg_idx2word))
scores[i].append(np.array(seq_scores)[sub_end - 1])
print hyps[i][-1]
print(hyps[i][-1])
if len(hyps[i]) >= InferTaskConfig.n_best:
break
......
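The print statements converted throughout this pull request rely on the function form print(...). As a general compatibility note (not something this commit necessarily adds), the same files keep working under Python 2 if the future import appears at the top of each module:

from __future__ import print_function   # makes print(...) a function on Python 2 as well

print("beam hypothesis:", 0)             # identical behaviour on Python 2 and Python 3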
......@@ -12,7 +12,7 @@ def position_encoding_init(n_position, d_pos_vec):
Generate the initial values for the sinusoid position encoding table.
"""
position_enc = np.array([[
pos / np.power(10000, 2 * (j // 2) / d_pos_vec)
pos / np.power(10000, 2. * (j // 2) / d_pos_vec)
for j in range(d_pos_vec)
] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i
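The one-character change in position_encoding_init above (2 * ... to 2. * ...) keeps the exponent a float: without it, Python 2 would floor-divide the plain-int expression and flatten the frequency term, while Python 3 already performs true division. A compact sketch of the sinusoid table under that convention; the function name is made up, and the cosine line for odd dimensions mirrors the sine line shown in the hunk:

import numpy as np

def sinusoid_table(n_position, d_pos_vec):
    # PE(pos, 2i) = sin(pos / 10000^(2i/d)), PE(pos, 2i+1) = cos(pos / 10000^(2i/d));
    # row 0 stays zero for the padding position.
    table = np.array([[
        pos / np.power(10000, 2. * (j // 2) / d_pos_vec)  # 2. forces float division on Python 2 too
        for j in range(d_pos_vec)
    ] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
    table[1:, 0::2] = np.sin(table[1:, 0::2])  # even dimensions
    table[1:, 1::2] = np.cos(table[1:, 1::2])  # odd dimensions
    return table

table = sinusoid_table(8, 4)   # tiny illustrative sizes
assert table.shape == (8, 4)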
......@@ -90,8 +90,7 @@ def multi_head_attention(queries,
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
return layers.reshape(
x=trans_x,
shape=map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]]))
x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]])
def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
"""
......
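Dropping map(int, ...) from the reshape call above reflects another Python 3 change: map() now returns a lazy iterator, while a shape argument needs concrete integers, and a plain list already contains them. A quick pure-Python illustration with made-up tensor sizes:

trans_x_shape = (16, 8, 4, 64)   # illustrative [batch, n_head, seq_len, head_dim] sizes

# map() is a lazy iterator on Python 3; a plain list of ints is the portable spelling.
shape_arg = [0, 0, trans_x_shape[2] * trans_x_shape[3]]
assert shape_arg == [0, 0, 256]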
import glob
import os
import random
import tarfile
import cPickle
import numpy as np
class SortType(object):
......@@ -204,7 +204,8 @@ class DataReader(object):
self._token_delimiter = token_delimiter
self.load_src_trg_ids(end_mark, fpattern, start_mark, tar_fname,
unk_mark)
self._random = random.Random(x=seed)
self._random = np.random
self._random.seed(seed)
def load_src_trg_ids(self, end_mark, fpattern, start_mark, tar_fname,
unk_mark):
......
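The reader now seeds NumPy's random module instead of constructing random.Random(x=seed); the observable effect is the same kind of reproducible shuffling. A minimal sketch of the pattern, with placeholder data:

import numpy as np

rng = np.random                # the module used directly as the RNG handle, as in DataReader above
rng.seed(1024)                 # fixed seed, so the shuffle order repeats across runs

sample_ids = list(range(10))   # placeholder sample indices
rng.shuffle(sample_ids)
print(sample_ids)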
......@@ -2,8 +2,8 @@ import argparse
import ast
import multiprocessing
import os
import six
import time
from functools import partial
import numpy as np
import paddle.fluid as fluid
......@@ -78,8 +78,7 @@ def parse_args():
help="The <bos>, <eos> and <unk> tokens in the dictionary.")
parser.add_argument(
"--token_delimiter",
type=partial(
str.decode, encoding="string-escape"),
type=lambda x: str(x.encode().decode("unicode-escape")),
default=" ",
help="The delimiter used to split tokens in source or target sentences. "
"For EN-DE BPE data we provided, use spaces as token delimiter. "
......@@ -138,8 +137,6 @@ def pad_batch_data(insts,
"""
return_list = []
max_len = max(len(inst) for inst in insts)
num_token = reduce(lambda x, y: x + y,
[len(inst) for inst in insts]) if return_num_token else 0
# Any token included in dict can be used to pad, since the paddings' loss
# will be masked out by weights and have no effect on parameter gradients.
inst_data = np.array(
......@@ -151,7 +148,7 @@ def pad_batch_data(insts,
return_list += [inst_weight.astype("float32").reshape([-1, 1])]
else: # position data
inst_pos = np.array([
range(1, len(inst) + 1) + [0] * (max_len - len(inst))
list(range(1, len(inst) + 1)) + [0] * (max_len - len(inst))
for inst in insts
])
return_list += [inst_pos.astype("int64").reshape([-1, 1])]
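The list(range(...)) wrapper above is needed because Python 3's range is a lazy sequence object and cannot be concatenated to a list with +, whereas Python 2's range returned a real list. A tiny sketch of the padded-position construction with made-up lengths:

max_len = 5
inst = [11, 12, 13]      # illustrative token ids for one instance

# Positions 1..len(inst), padded with 0 up to max_len; range() must be
# materialised with list() before it can be concatenated on Python 3.
inst_pos = list(range(1, len(inst) + 1)) + [0] * (max_len - len(inst))
assert inst_pos == [1, 2, 3, 0, 0]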
......@@ -176,6 +173,9 @@ def pad_batch_data(insts,
if return_max_len:
return_list += [max_len]
if return_num_token:
num_token = 0
for inst in insts:
num_token += len(inst)
return_list += [num_token]
return return_list if len(return_list) > 1 else return_list[0]
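The removed reduce(...) call above would need functools.reduce on Python 3, where the builtin was dropped; the patch counts tokens with an explicit loop instead, and the builtin sum() is an equally portable one-liner. A short sketch with a placeholder batch:

insts = [[4, 5, 6], [7, 8], [9]]   # placeholder batch of token-id lists

# Explicit accumulation, as in pad_batch_data above; no reduce() needed.
num_token = 0
for inst in insts:
    num_token += len(inst)

assert num_token == sum(len(inst) for inst in insts) == 6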
......@@ -323,7 +323,7 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path)
lr_scheduler.current_steps = TrainTaskConfig.start_step
else:
print "init fluid.framework.default_startup_program"
print("init fluid.framework.default_startup_program")
exe.run(fluid.framework.default_startup_program())
train_data = reader.DataReader(
......@@ -371,8 +371,11 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
)) + TrainTaskConfig.label_smooth_eps *
np.log(TrainTaskConfig.label_smooth_eps / (
ModelHyperParams.trg_vocab_size - 1) + 1e-20))
step_idx = 0
inst_num = 0
init = False
for pass_id in xrange(TrainTaskConfig.pass_num):
for pass_id in six.moves.xrange(TrainTaskConfig.pass_num):
pass_start_time = time.time()
for batch_id, data in enumerate(train_data()):
feed_list = []
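Switching to six.moves.xrange above keeps the lazy-range behaviour on both interpreters: Python 3 has no xrange builtin (range itself is lazy there), so six.moves resolves to xrange on Python 2 and to range on Python 3. A minimal sketch with a placeholder pass count:

import six

pass_num = 3   # placeholder for TrainTaskConfig.pass_num

for pass_id in six.moves.xrange(pass_num):   # xrange on Python 2, range on Python 3
    print("starting pass %d" % pass_id)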
......@@ -387,11 +390,12 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
ModelHyperParams.eos_idx, ModelHyperParams.n_head,
ModelHyperParams.d_model)
total_num_token += num_token
feed_kv_pairs = data_input_dict.items()
inst_num += len(data_buffer)
feed_kv_pairs = list(data_input_dict.items())
if args.local:
feed_kv_pairs += {
feed_kv_pairs += list({
lr_scheduler.learning_rate.name: lr_rate
}.items()
}.items())
feed_list.append(dict(feed_kv_pairs))
if not init:
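Wrapping .items() in list(...) above is required because Python 3 returns a dict view rather than a list, and a view cannot be extended in place with +=; materialising it restores the Python 2 behaviour. A small sketch with placeholder feed entries:

data_input_dict = {"src_word": 0, "src_pos": 1}   # placeholder feed entries
lr_feed = {"learning_rate": 0.001}                # placeholder scheduler entry

feed_kv_pairs = list(data_input_dict.items())     # a real list, so += below works on Python 3
feed_kv_pairs += list(lr_feed.items())

feed = dict(feed_kv_pairs)
assert set(feed) == {"src_word", "src_pos", "learning_rate"}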
......@@ -409,14 +413,17 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
) # sum the cost from multi-devices
total_token_num = token_num_val.sum()
total_avg_cost = total_sum_cost / total_token_num
print("epoch: %d, batch: %d, avg loss: %f, normalized loss: %f,"
" ppl: %f" % (pass_id, batch_id, total_avg_cost,
total_avg_cost - loss_normalizer,
np.exp([min(total_avg_cost, 100)])))
print(
"step_idx: %d, total samples: %d, epoch: %d, batch: %d, avg loss: %f, "
"normalized loss: %f, ppl: %f" %
(step_idx, inst_num, pass_id, batch_id, total_avg_cost,
total_avg_cost - loss_normalizer,
np.exp([min(total_avg_cost, 100)])))
if batch_id > 0 and batch_id % 1000 == 0:
fluid.io.save_persistables(
exe,
os.path.join(TrainTaskConfig.ckpt_dir, "latest.checkpoint"))
step_idx += 1
init = True
time_consumed = time.time() - pass_start_time
......@@ -449,7 +456,7 @@ def train(args):
is_local = os.getenv("PADDLE_IS_LOCAL", "1")
if is_local == '0':
args.local = False
print args
print(args)
if args.device == 'CPU':
TrainTaskConfig.use_gpu = False
......@@ -530,7 +537,7 @@ def train(args):
pserver_startup = t.get_startup_program(current_endpoint,
pserver_prog)
print "psserver begin run"
print("psserver begin run")
with open('pserver_startup.desc', 'w') as f:
f.write(str(pserver_startup))
with open('pserver_prog.desc', 'w') as f:
......
......@@ -17,6 +17,35 @@ _ALPHANUMERIC_CHAR_SET = set(
unicodedata.category(six.unichr(i)).startswith("N")))
# Unicode utility functions that work with Python 2 and 3
def native_to_unicode(s):
return s if is_unicode(s) else to_unicode(s)
def unicode_to_native(s):
if six.PY2:
return s.encode("utf-8") if is_unicode(s) else s
else:
return s
def is_unicode(s):
if six.PY2:
if isinstance(s, unicode):
return True
else:
if isinstance(s, str):
return True
return False
def to_unicode(s, ignore_errors=False):
if is_unicode(s):
return s
error_mode = "ignore" if ignore_errors else "strict"
return s.decode("utf-8", errors=error_mode)
def unescape_token(escaped_token):
"""
Inverse of encoding escaping.
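The helpers added above centralise the str/unicode juggling: native_to_unicode always yields the text type, and unicode_to_native converts back to the interpreter's native str (UTF-8 bytes on Python 2, plain str on Python 3). A small round-trip sketch, assuming the helpers above are in scope:

token = native_to_unicode("machine_translation")   # always the text type
assert is_unicode(token)

native = unicode_to_native(token)   # UTF-8 bytes on Python 2, unchanged str on Python 3
assert isinstance(native, str)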
......@@ -44,9 +73,7 @@ def subtoken_ids_to_str(subtoken_ids, vocabs):
subtokens = [vocabs.get(subtoken_id, u"") for subtoken_id in subtoken_ids]
# Convert a list of subtokens to a list of tokens.
concatenated = "".join([
t if isinstance(t, unicode) else t.decode("utf-8") for t in subtokens
])
concatenated = "".join([native_to_unicode(t) for t in subtokens])
split = concatenated.split("_")
tokens = []
for t in split:
......@@ -65,4 +92,4 @@ def subtoken_ids_to_str(subtoken_ids, vocabs):
ret.append(token)
seq = "".join(ret)
return seq.encode("utf-8")
return unicode_to_native(seq)