Commit 6431daed authored by G guosheng

Fix conditions on variables that may be None

Parent 3c682920
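The core of the change is replacing implicit truthiness tests on optional tensors (e.g. `if attn_bias:`) with explicit `is not None` checks: the optional bias may be a dygraph Variable, and its truth value is not a reliable way to test whether it was provided. Below is a minimal standalone sketch of that pattern; it is not part of this repo, and the helper name `scaled_product` and the shapes used are made up for illustration.

```python
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import to_variable

# Minimal sketch, not part of this repo: `scaled_product` is a made-up helper
# illustrating the `is not None` pattern used in MultiHeadAttention below.
def scaled_product(q, k, attn_bias=None, d_model=512):
    product = fluid.layers.matmul(
        x=q, y=k, transpose_y=True, alpha=d_model**-0.5)
    # Test whether the optional bias was passed at all; `if attn_bias:` would
    # instead ask for the truth value of a tensor, which is not what we want.
    if attn_bias is not None:
        product += attn_bias
    return fluid.layers.softmax(product)

if __name__ == "__main__":
    with fluid.dygraph.guard():
        q = to_variable(np.random.rand(2, 8, 4, 64).astype("float32"))
        k = to_variable(np.random.rand(2, 8, 4, 64).astype("float32"))
        out = scaled_product(q, k)  # bias omitted: the branch is skipped
```

The same reasoning applies to the residual lambda in `PrePostProcessLayer`, where `y` may legitimately be `None`.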
@@ -91,8 +91,6 @@ def do_predict(args):
         dataset=dataset,
         batch_sampler=batch_sampler,
         places=device,
-        feed_list=None
-        if fluid.in_dygraph_mode() else [x.forward() for x in inputs],
         collate_fn=partial(
             prepare_infer_input, src_pad_idx=args.eos_idx, n_head=args.n_head),
         num_workers=0,
...
@@ -22,7 +22,6 @@ from functools import partial
 import numpy as np
 import paddle
 import paddle.fluid as fluid
-from paddle.fluid.dygraph import to_variable
 from paddle.fluid.io import DataLoader
 
 from utils.configure import PDConfig
@@ -31,32 +30,33 @@ from utils.check import check_gpu, check_version
 from model import Input, set_device
 from callbacks import ProgBarLogger
 from reader import prepare_train_input, Seq2SeqDataset, Seq2SeqBatchSampler
-from transformer import Transformer, CrossEntropyCriterion, NoamDecay
+from transformer import Transformer, CrossEntropyCriterion
 
 
-class LoggerCallback(ProgBarLogger):
+class TrainCallback(ProgBarLogger):
     def __init__(self, log_freq=1, verbose=2, loss_normalizer=0.):
-        super(LoggerCallback, self).__init__(log_freq, verbose)
+        super(TrainCallback, self).__init__(log_freq, verbose)
         # TODO: wrap these override function to simplify
         self.loss_normalizer = loss_normalizer
 
     def on_train_begin(self, logs=None):
-        super(LoggerCallback, self).on_train_begin(logs)
+        super(TrainCallback, self).on_train_begin(logs)
         self.train_metrics += ["normalized loss", "ppl"]
 
     def on_train_batch_end(self, step, logs=None):
         logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer
         logs["ppl"] = np.exp(min(logs["loss"][0], 100))
-        super(LoggerCallback, self).on_train_batch_end(step, logs)
+        super(TrainCallback, self).on_train_batch_end(step, logs)
 
     def on_eval_begin(self, logs=None):
-        super(LoggerCallback, self).on_eval_begin(logs)
-        self.eval_metrics += ["normalized loss", "ppl"]
+        super(TrainCallback, self).on_eval_begin(logs)
+        self.eval_metrics = list(
+            self.eval_metrics) + ["normalized loss", "ppl"]
 
     def on_eval_batch_end(self, step, logs=None):
         logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer
         logs["ppl"] = np.exp(min(logs["loss"][0], 100))
-        super(LoggerCallback, self).on_eval_batch_end(step, logs)
+        super(TrainCallback, self).on_eval_batch_end(step, logs)
 
 
 def do_train(args):
@@ -127,8 +127,6 @@ def do_train(args):
         dataset=dataset,
         batch_sampler=batch_sampler,
         places=device,
-        feed_list=None if fluid.in_dygraph_mode() else
-        [x.forward() for x in inputs + labels],
         collate_fn=partial(
             prepare_train_input,
             src_pad_idx=args.eos_idx,
@@ -149,8 +147,10 @@ def do_train(args):
 
     transformer.prepare(
         fluid.optimizer.Adam(
-            learning_rate=fluid.layers.noam_decay(args.d_model,
-                                                  args.warmup_steps),
+            learning_rate=fluid.layers.noam_decay(
+                args.d_model,
+                args.warmup_steps,
+                learning_rate=args.learning_rate),
             beta1=args.beta1,
             beta2=args.beta2,
             epsilon=float(args.eps),
@@ -161,13 +161,10 @@ def do_train(args):
     ## init from some checkpoint, to resume the previous training
     if args.init_from_checkpoint:
-        transformer.load(
-            os.path.join(args.init_from_checkpoint, "transformer"))
+        transformer.load(args.init_from_checkpoint)
 
     ## init from some pretrain models, to better solve the current task
     if args.init_from_pretrain_model:
-        transformer.load(
-            os.path.join(args.init_from_pretrain_model, "transformer"),
-            reset_optimizer=True)
+        transformer.load(args.init_from_pretrain_model, reset_optimizer=True)
 
     # the best cross-entropy value with label smoothing
     loss_normalizer = -(
@@ -178,12 +175,13 @@ def do_train(args):
     # model train
     transformer.fit(train_data=train_loader,
                     eval_data=eval_loader,
-                    epochs=1,
+                    epochs=args.epoch,
                     eval_freq=1,
                     save_freq=1,
+                    save_dir=args.save_model,
                     verbose=2,
                     callbacks=[
-                        LoggerCallback(
+                        TrainCallback(
                             log_freq=args.print_step,
                             loss_normalizer=loss_normalizer)
                     ])
...
@@ -79,7 +79,8 @@ class PrePostProcessLayer(Layer):
         self.functors = []
         for cmd in self.process_cmd:
             if cmd == "a":  # add residual connection
-                self.functors.append(lambda x, y: x + y if y else x)
+                self.functors.append(
+                    lambda x, y: x + y if y is not None else x)
             elif cmd == "n":  # add layer normalization
                 self.functors.append(
                     self.add_sublayer(
@@ -169,7 +170,7 @@ class MultiHeadAttention(Layer):
         # scale dot product attention
         product = layers.matmul(
             x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5)
-        if attn_bias:
+        if attn_bias is not None:
             product += attn_bias
         weights = layers.softmax(product)
         if self.dropout_rate:
...