From 1cb6a6430b307308f8c23cd4dc794dcf474b2dbe Mon Sep 17 00:00:00 2001 From: liu zhengxi <380185688@qq.com> Date: Thu, 14 May 2020 16:27:18 +0800 Subject: [PATCH] update api to 1.8 and update readme (#4609) * update api to 1.8 for transformer and similarity_net, test=develop * update readme, test=develop --- .../machine_translation/transformer/README.md | 4 +- .../transformer/transformer.py | 19 +- .../models/matching/paddle_layers.py | 8 +- PaddleNLP/similarity_net/README.md | 2 +- PaddleNLP/similarity_net/run_classifier.py | 129 ++++---- dygraph/similarity_net/README.md | 2 +- dygraph/similarity_net/model_check.py | 21 +- dygraph/similarity_net/nets/paddle_layers.py | 106 ++++--- dygraph/similarity_net/run_classifier.py | 260 +++++++++------- dygraph/transformer/README.md | 4 +- dygraph/transformer/config.py | 21 +- dygraph/transformer/model.py | 290 +++++++++--------- dygraph/transformer/reader.py | 41 +-- 13 files changed, 472 insertions(+), 435 deletions(-) diff --git a/PaddleNLP/machine_translation/transformer/README.md b/PaddleNLP/machine_translation/transformer/README.md index ee945ea0..9bc8462b 100644 --- a/PaddleNLP/machine_translation/transformer/README.md +++ b/PaddleNLP/machine_translation/transformer/README.md @@ -32,7 +32,7 @@ 1. paddle安装 - 本项目依赖于 PaddlePaddle 1.6及以上版本或适当的develop版本,请参考 [安装指南](http://www.paddlepaddle.org/#quick-start) 进行安装 + 本项目依赖于 PaddlePaddle 1.8及以上版本或适当的develop版本,请参考 [安装指南](https://www.paddlepaddle.org.cn/install/quick) 进行安装 2. 下载代码 @@ -44,7 +44,7 @@ 3. 环境依赖 - 请参考PaddlePaddle[安装说明](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html)部分的内容 + 请参考PaddlePaddle[安装说明](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/index_cn.html)部分的内容 ### 数据准备 diff --git a/PaddleNLP/machine_translation/transformer/transformer.py b/PaddleNLP/machine_translation/transformer/transformer.py index a73ce9f3..66901bb1 100644 --- a/PaddleNLP/machine_translation/transformer/transformer.py +++ b/PaddleNLP/machine_translation/transformer/transformer.py @@ -752,18 +752,17 @@ def fast_decode(model_input, src_vocab_size, trg_vocab_size, max_in_len, # caches contains states of history steps in decoder self-attention # and static encoder output projections in encoder-decoder attention # to reduce redundant computation. + batch_size = layers.shape(start_tokens)[0] caches = [ { "k": # for self attention - layers.fill_constant_batch_size_like( - input=start_tokens, - shape=[-1, n_head, 0, d_key], + layers.fill_constant( + shape=[batch_size, n_head, 0, d_key], dtype=enc_output.dtype, value=0), "v": # for self attention - layers.fill_constant_batch_size_like( - input=start_tokens, - shape=[-1, n_head, 0, d_value], + layers.fill_constant( + shape=[batch_size, n_head, 0, d_value], dtype=enc_output.dtype, value=0), "static_k": # for encoder-decoder attention @@ -792,12 +791,10 @@ def fast_decode(model_input, src_vocab_size, trg_vocab_size, max_in_len, lambda x: layers.gather(x, index=gather_idx), caches) pre_src_attn_bias = layers.gather( trg_src_attn_bias, index=gather_idx) + bias_batch_size = layers.shape(pre_src_attn_bias)[0] pre_pos = layers.elementwise_mul( - x=layers.fill_constant_batch_size_like( - input=pre_src_attn_bias, # cann't use lod tensor here - value=1, - shape=[-1, 1], - dtype=pre_ids.dtype), + x=layers.fill_constant( + value=1, shape=[bias_batch_size, 1], dtype=pre_ids.dtype), y=step_idx, axis=0) logits = wrap_decoder( diff --git a/PaddleNLP/shared_modules/models/matching/paddle_layers.py b/PaddleNLP/shared_modules/models/matching/paddle_layers.py index f8782a93..c3650f5c 100644 --- a/PaddleNLP/shared_modules/models/matching/paddle_layers.py +++ b/PaddleNLP/shared_modules/models/matching/paddle_layers.py @@ -210,7 +210,7 @@ class DataLayer(object): """ operation """ - data = fluid.layers.data( + data = fluid.data( name=name, shape=shape, dtype=dtype, lod_level=lod_level) return data @@ -383,8 +383,10 @@ class ConstantLayer(object): """ operation """ - constant = fluid.layers.fill_constant_batch_size_like(input, shape, - dtype, value) + shape = list(shape) + input_shape = fluid.layers.shape(input) + shape[0] = input_shape[0] + constant = fluid.layers.fill_constant(shape, dtype, value) return constant diff --git a/PaddleNLP/similarity_net/README.md b/PaddleNLP/similarity_net/README.md index 61e8ce60..90208707 100644 --- a/PaddleNLP/similarity_net/README.md +++ b/PaddleNLP/similarity_net/README.md @@ -22,7 +22,7 @@ |UNICOM|联通客服|客服| ## 快速开始 #### 版本依赖 -本项目依赖于 Paddlepaddle Fluid 1.6,请参考[安装指南](http://www.paddlepaddle.org/#quick-start)进行安装。 +本项目依赖于 Paddlepaddle Fluid 1.8,请参考[安装指南](https://www.paddlepaddle.org.cn/install/quick)进行安装。 python版本依赖python 2.7 #### 安装代码 diff --git a/PaddleNLP/similarity_net/run_classifier.py b/PaddleNLP/similarity_net/run_classifier.py index 77271c46..4c25cf2c 100644 --- a/PaddleNLP/similarity_net/run_classifier.py +++ b/PaddleNLP/similarity_net/run_classifier.py @@ -47,46 +47,51 @@ from models.model_check import check_version from models.model_check import check_cuda -def create_model(args, pyreader_name, is_inference=False, is_pointwise=False): +def create_model(args, is_inference=False, is_pointwise=False): """ Create Model for simnet """ if is_inference: - inf_pyreader = fluid.layers.py_reader( - capacity=16, - shapes=([-1], [-1]), - dtypes=('int64', 'int64'), - lod_levels=(1, 1), - name=pyreader_name, - use_double_buffer=False) + left = fluid.data(name='left', shape=[None], dtype='int64', lod_level=1) + pos_right = fluid.data( + name='pos_right', shape=[None], dtype='int64', lod_level=1) + inf_loader = fluid.io.DataLoader.from_generator( + capacity=16, + feed_list=[left, pos_right], + iterable=False, + use_double_buffer=False) - left, pos_right = fluid.layers.read_file(inf_pyreader) - return inf_pyreader, left, pos_right + return inf_loader, left, pos_right else: if is_pointwise: - pointwise_pyreader = fluid.layers.py_reader( - capacity=16, - shapes=([-1], [-1], [-1]), - dtypes=('int64', 'int64', 'int64'), - lod_levels=(1, 1, 0), - name=pyreader_name, - use_double_buffer=False) - - left, right, label = fluid.layers.read_file(pointwise_pyreader) - return pointwise_pyreader, left, right, label + left = fluid.data( + name='left', shape=[None], dtype='int64', lod_level=1) + right = fluid.data( + name='right', shape=[None], dtype='int64', lod_level=1) + label = fluid.data(name='label', shape=[None], dtype='int64') + pointwise_loader = fluid.io.DataLoader.from_generator( + capacity=16, + feed_list=[left, right, label], + iterable=False, + use_double_buffer=False) + + return pointwise_loader, left, right, label else: - pairwise_pyreader = fluid.layers.py_reader( - capacity=16, - shapes=([-1], [-1], [-1]), - dtypes=('int64', 'int64', 'int64'), - lod_levels=(1, 1, 1), - name=pyreader_name, - use_double_buffer=False) + left = fluid.data( + name='left', shape=[None], dtype='int64', lod_level=1) + pos_right = fluid.data( + name='pos_right', shape=[None], dtype='int64', lod_level=1) + neg_right = fluid.data( + name='neg_right', shape=[None], dtype='int64', lod_level=1) + pairwise_loader = fluid.io.DataLoader.from_generator( + capacity=16, + feed_list=[left, pos_right, neg_right], + iterable=False, + use_double_buffer=False) - left, pos_right, neg_right = fluid.layers.read_file(pairwise_pyreader) - return pairwise_pyreader, left, pos_right, neg_right + return pairwise_loader, left, pos_right, neg_right def train(conf_dict, args): @@ -131,8 +136,7 @@ def train(conf_dict, args): # Build network with fluid.program_guard(train_program, startup_prog): with fluid.unique_name.guard(): - train_pyreader, left, pos_right, neg_right = create_model( - args, pyreader_name='train_reader') + train_loader, left, pos_right, neg_right = create_model(args) left_feat, pos_score = net.predict(left, pos_right) pred = pos_score _, neg_score = net.predict(left, neg_right) @@ -147,8 +151,8 @@ def train(conf_dict, args): test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): - test_pyreader, left, pos_right = create_model( - args, pyreader_name='test_reader', is_inference=True) + test_loader, left, pos_right = create_model( + args, is_inference=True) left_feat, pos_score = net.predict(left, pos_right) pred = pos_score test_prog = test_prog.clone(for_test=True) @@ -157,8 +161,8 @@ def train(conf_dict, args): # Build network with fluid.program_guard(train_program, startup_prog): with fluid.unique_name.guard(): - train_pyreader, left, right, label = create_model( - args, pyreader_name='train_reader', is_pointwise=True) + train_loader, left, right, label = create_model( + args, is_pointwise=True) left_feat, pred = net.predict(left, right) avg_cost = loss.compute(pred, label) avg_cost.persistable = True @@ -171,15 +175,15 @@ def train(conf_dict, args): test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): - test_pyreader, left, right = create_model( - args, pyreader_name='test_reader', is_inference=True) + test_loader, left, right = create_model( + args, is_inference=True) left_feat, pred = net.predict(left, right) test_prog = test_prog.clone(for_test=True) if args.init_checkpoint is not "": utils.init_checkpoint(exe, args.init_checkpoint, startup_prog) - def valid_and_test(test_program, test_pyreader, get_valid_examples, process, + def valid_and_test(test_program, test_loader, get_valid_examples, process, mode, exe, fetch_list): """ return auc and acc @@ -187,15 +191,15 @@ def train(conf_dict, args): # Get Batch Data batch_data = fluid.io.batch( get_valid_examples, args.batch_size, drop_last=False) - test_pyreader.decorate_paddle_reader(batch_data) - test_pyreader.start() + test_loader.set_sample_list_generator(batch_data) + test_loader.start() pred_list = [] while True: try: _pred = exe.run(program=test_program, fetch_list=[pred.name]) pred_list += list(_pred) except fluid.core.EOFException: - test_pyreader.reset() + test_loader.reset() break pred_list = np.vstack(pred_list) if mode == "test": @@ -233,8 +237,8 @@ def train(conf_dict, args): get_train_examples, buf_size=10000), args.batch_size, drop_last=False) - train_pyreader.decorate_paddle_reader(train_batch_data) - train_pyreader.start() + train_loader.set_sample_list_generator(train_batch_data) + train_loader.start() exe.run(startup_prog) losses = [] start_time = time.time() @@ -248,8 +252,8 @@ def train(conf_dict, args): if args.do_valid and global_step % args.validation_steps == 0: get_valid_examples = simnet_process.get_reader("valid") valid_result = valid_and_test( - test_prog, test_pyreader, get_valid_examples, - simnet_process, "valid", exe, [pred.name]) + test_prog, test_loader, get_valid_examples, simnet_process, + "valid", exe, [pred.name]) if args.compute_accuracy: valid_auc, valid_acc = valid_result logging.info( @@ -281,7 +285,7 @@ def train(conf_dict, args): logging.info("saving infer model in %s" % model_path) except fluid.core.EOFException: - train_pyreader.reset() + train_loader.reset() break end_time = time.time() #logging.info("epoch: %d, loss: %f, used time: %d sec" % @@ -327,9 +331,8 @@ def train(conf_dict, args): else: # Get Feeder and Reader get_test_examples = simnet_process.get_reader("test") - test_result = valid_and_test(test_prog, test_pyreader, - get_test_examples, simnet_process, "test", - exe, [pred.name]) + test_result = valid_and_test(test_prog, test_loader, get_test_examples, + simnet_process, "test", exe, [pred.name]) if args.compute_accuracy: test_auc, test_acc = test_result logging.info("AUC of test is %f, Accuracy of test is %f" % @@ -371,8 +374,8 @@ def test(conf_dict, args): if args.task_mode == "pairwise": with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): - test_pyreader, left, pos_right = create_model( - args, pyreader_name='test_reader', is_inference=True) + test_loader, left, pos_right = create_model( + args, is_inference=True) left_feat, pos_score = net.predict(left, pos_right) pred = pos_score test_prog = test_prog.clone(for_test=True) @@ -380,8 +383,8 @@ def test(conf_dict, args): else: with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): - test_pyreader, left, right = create_model( - args, pyreader_name='test_reader', is_inference=True) + test_loader, left, right = create_model( + args, is_inference=True) left_feat, pred = net.predict(left, right) test_prog = test_prog.clone(for_test=True) @@ -390,10 +393,10 @@ def test(conf_dict, args): utils.init_checkpoint(exe, args.init_checkpoint, main_program=test_prog) test_exe = exe - test_pyreader.decorate_paddle_reader(batch_data) + test_loader.set_sample_list_generator(batch_data) logging.info("start test process ...") - test_pyreader.start() + test_loader.start() pred_list = [] fetch_list = [pred.name] output = [] @@ -412,7 +415,7 @@ def test(conf_dict, args): map(lambda item: str(np.argmax(item)), output[0])) + "\n") except fluid.core.EOFException: - test_pyreader.reset() + test_loader.reset() break if args.task_mode == "pairwise": pred_list = np.array(pred_list).reshape((-1, 1)) @@ -468,16 +471,16 @@ def infer(conf_dict, args): if args.task_mode == "pairwise": with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): - infer_pyreader, left, pos_right = create_model( - args, pyreader_name='infer_reader', is_inference=True) + infer_loader, left, pos_right = create_model( + args, is_inference=True) left_feat, pos_score = net.predict(left, pos_right) pred = pos_score test_prog = test_prog.clone(for_test=True) else: with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): - infer_pyreader, left, right = create_model( - args, pyreader_name='infer_reader', is_inference=True) + infer_loader, left, right = create_model( + args, is_inference=True) left_feat, pred = net.predict(left, right) test_prog = test_prog.clone(for_test=True) @@ -486,13 +489,13 @@ def infer(conf_dict, args): utils.init_checkpoint(exe, args.init_checkpoint, main_program=test_prog) test_exe = exe - infer_pyreader.decorate_sample_list_generator(batch_data) + infer_loader.set_sample_list_generator(batch_data) logging.info("start test process ...") preds_list = [] fetch_list = [pred.name] output = [] - infer_pyreader.start() + infer_loader.start() while True: try: output = test_exe.run(program=test_prog, fetch_list=fetch_list) @@ -502,7 +505,7 @@ def infer(conf_dict, args): else: preds_list += map(lambda item: str(np.argmax(item)), output[0]) except fluid.core.EOFException: - infer_pyreader.reset() + infer_loader.reset() break with io.open(args.infer_result_path, "w", encoding="utf8") as infer_file: for _data, _pred in zip(simnet_process.get_infer_data(), preds_list): diff --git a/dygraph/similarity_net/README.md b/dygraph/similarity_net/README.md index 4f7270b0..dfb6ecbb 100644 --- a/dygraph/similarity_net/README.md +++ b/dygraph/similarity_net/README.md @@ -23,7 +23,7 @@ ## 快速开始 #### 版本依赖 -本项目依赖于 Paddlepaddle Fluid 1.7,请参考[安装指南](http://www.paddlepaddle.org/#quick-start)进行安装。 +本项目依赖于 Paddlepaddle Fluid 1.8,请参考[安装指南](https://www.paddlepaddle.org.cn/install/quick)进行安装。 #### 安装代码 diff --git a/dygraph/similarity_net/model_check.py b/dygraph/similarity_net/model_check.py index 51713452..853a4a92 100644 --- a/dygraph/similarity_net/model_check.py +++ b/dygraph/similarity_net/model_check.py @@ -33,20 +33,21 @@ def check_cuda(use_cuda, err = \ except Exception as e: pass + def check_version(): - """ + """ Log error and exit when the installed version of paddlepaddle is not satisfied. """ - err = "PaddlePaddle version 1.6 or higher is required, " \ - "or a suitable develop version is satisfied as well. \n" \ - "Please make sure the version is good with your code." \ + err = "PaddlePaddle version 1.6 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." \ - try: - fluid.require_version('1.6.0') - except Exception as e: - print(err) - sys.exit(1) + try: + fluid.require_version('1.8.0') + except Exception as e: + print(err) + sys.exit(1) def check_version(): @@ -59,7 +60,7 @@ def check_version(): "Please make sure the version is good with your code." \ try: - fluid.require_version('1.6.0') + fluid.require_version('1.8.0') except Exception as e: print(err) sys.exit(1) diff --git a/dygraph/similarity_net/nets/paddle_layers.py b/dygraph/similarity_net/nets/paddle_layers.py index 6a797c22..d0f5c0bf 100644 --- a/dygraph/similarity_net/nets/paddle_layers.py +++ b/dygraph/similarity_net/nets/paddle_layers.py @@ -30,6 +30,7 @@ import paddle.fluid.layers.utils as utils from paddle.fluid.dygraph import Embedding, Conv2D, GRUUnit, Layer, to_variable from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as + class EmbeddingLayer(object): """ Embedding Layer class @@ -52,12 +53,12 @@ class EmbeddingLayer(object): size=[self.dict_size, self.emb_dim], is_sparse=True, padding_idx=self.padding_idx, - param_attr=attr.ParamAttr(name=self.name, initializer=fluid.initializer.Xavier())) + param_attr=attr.ParamAttr( + name=self.name, initializer=fluid.initializer.Xavier())) return emb - class FCLayer(object): """ Fully Connect Layer class @@ -76,9 +77,9 @@ class FCLayer(object): operation """ fc = FC(size=self.fc_dim, - param_attr=attr.ParamAttr(name="%s.w" % self.name), - bias_attr=attr.ParamAttr(name="%s.b" % self.name), - act=self.act) + param_attr=attr.ParamAttr(name="%s.w" % self.name), + bias_attr=attr.ParamAttr(name="%s.b" % self.name), + act=self.act) return fc @@ -93,7 +94,7 @@ class DynamicGRULayer(object): """ self.gru_dim = gru_dim self.name = name - + def ops(self): """ operation @@ -117,11 +118,13 @@ class DynamicLSTMLayer(object): self.lstm_dim = lstm_dim self.name = name self.is_reverse = is_reverse + def ops(self): """ operation """ - lstm_cell = BasicLSTMUnit(hidden_size=self.lstm_dim, input_size=self.lstm_dim*4) + lstm_cell = BasicLSTMUnit( + hidden_size=self.lstm_dim, input_size=self.lstm_dim * 4) lstm = RNN(cell=lstm_cell, time_major=True, is_reverse=self.is_reverse) return lstm @@ -141,7 +144,7 @@ class DataLayer(object): """ operation """ - data = fluid.layers.data( + data = fluid.data( name=name, shape=shape, dtype=dtype, lod_level=lod_level) return data @@ -314,8 +317,10 @@ class ConstantLayer(object): """ operation """ - constant = fluid.layers.fill_constant_batch_size_like(input, shape, - dtype, value) + shape = list(shape) + input_shape = fluid.layers.shape(input) + shape[0] = input_shape[0] + constant = fluid.layers.fill_constant(shape, dtype, value) return constant @@ -358,26 +363,23 @@ class SoftsignLayer(object): class SimpleConvPool(Layer): - def __init__(self, - num_channels, - num_filters, - filter_size, - use_cudnn=False - ): + def __init__(self, num_channels, num_filters, filter_size, use_cudnn=False): super(SimpleConvPool, self).__init__() - self._conv2d = Conv2D(num_channels = num_channels, + self._conv2d = Conv2D( + num_channels=num_channels, num_filters=num_filters, filter_size=filter_size, - padding=[1, 1], + padding=[1, 1], use_cudnn=use_cudnn, act='relu') def forward(self, inputs): x = self._conv2d(inputs) x = fluid.layers.reduce_max(x, dim=-1) - x = fluid.layers.reshape(x, shape=[x.shape[0], -1]) + x = fluid.layers.reshape(x, shape=[x.shape[0], -1]) return x + class FC(Layer): """ This interface is used to construct a callable object of the ``FC`` class. @@ -580,7 +582,7 @@ class DynamicGRU(Layer): gate_activation='sigmoid', candidate_activation='tanh', origin_mode=False, - init_size = None): + init_size=None): super(DynamicGRU, self).__init__() self.gru_unit = GRUUnit( size * 3, @@ -591,16 +593,19 @@ class DynamicGRU(Layer): origin_mode=origin_mode) self.size = size self.is_reverse = is_reverse + def forward(self, inputs, h_0): hidden = h_0 res = [] for i in range(inputs.shape[1]): if self.is_reverse: i = inputs.shape[1] - 1 - i - input_ = inputs[ :, i:i+1, :] - input_ = fluid.layers.reshape(input_, [-1, input_.shape[2]], inplace=False) + input_ = inputs[:, i:i + 1, :] + input_ = fluid.layers.reshape( + input_, [-1, input_.shape[2]], inplace=False) hidden, reset, gate = self.gru_unit(input_, hidden) - hidden_ = fluid.layers.reshape(hidden, [-1, 1, hidden.shape[1]], inplace=False) + hidden_ = fluid.layers.reshape( + hidden, [-1, 1, hidden.shape[1]], inplace=False) res.append(hidden_) if self.is_reverse: res = res[::-1] @@ -786,18 +791,21 @@ class BasicLSTMUnit(RNNUnit): self._weight = self.create_parameter( attr=self._param_attr, - shape=[self._input_size + self._hidden_size, 4 * self._hidden_size], + shape=[ + self._input_size + self._hidden_size, 4 * self._hidden_size + ], dtype=self._dtype) - - self._bias = self.create_parameter(attr=self._bias_attr, - shape=[4 * self._hidden_size], - dtype=self._dtype, - is_bias=True) + + self._bias = self.create_parameter( + attr=self._bias_attr, + shape=[4 * self._hidden_size], + dtype=self._dtype, + is_bias=True) def forward(self, input, state): pre_hidden, pre_cell = state concat_input_hidden = layers.concat([input, pre_hidden], axis=1) - + gate_input = layers.matmul(x=concat_input_hidden, y=self._weight) gate_input = layers.elementwise_add(gate_input, self._bias) @@ -817,11 +825,7 @@ class BasicLSTMUnit(RNNUnit): class RNN(Layer): - def __init__(self, - cell, - is_reverse=False, - time_major=False, - **kwargs): + def __init__(self, cell, is_reverse=False, time_major=False, **kwargs): super(RNN, self).__init__() self.cell = cell if not hasattr(self.cell, "call"): @@ -831,12 +835,17 @@ class RNN(Layer): self.batch_index, self.time_step_index = (1, 0) if time_major else (0, 1) - def forward(self, inputs, initial_states=None, sequence_length=None, **kwargs): + def forward(self, + inputs, + initial_states=None, + sequence_length=None, + **kwargs): if fluid.in_dygraph_mode(): class OutputArray(object): def __init__(self, x): self.array = [x] + def append(self, x): self.array.append(x) @@ -844,9 +853,8 @@ class RNN(Layer): # TODO: use where_op new_state = fluid.layers.elementwise_mul( new_state, step_mask, - axis=0) - fluid.layers.elementwise_mul(state, - (step_mask - 1), - axis=0) + axis=0) - fluid.layers.elementwise_mul( + state, (step_mask - 1), axis=0) return new_state flat_inputs = flatten(inputs) @@ -872,16 +880,20 @@ class RNN(Layer): if self.is_reverse: inputs = map_structure(lambda x: fluid.layers.reverse(x, axis=[0]), inputs) - mask = fluid.layers.reverse(mask, axis=[0]) if sequence_length is not None else None + mask = fluid.layers.reverse( + mask, axis=[0]) if sequence_length is not None else None states = initial_states outputs = [] for i in range(time_steps): - step_inputs = map_structure(lambda x:x[i], inputs) - step_outputs, new_states = self.cell(step_inputs, states, **kwargs) + step_inputs = map_structure(lambda x: x[i], inputs) + step_outputs, new_states = self.cell(step_inputs, states, + **kwargs) if sequence_length is not None: new_states = map_structure( - partial(_maybe_copy, step_mask=mask[i]), states, + partial( + _maybe_copy, step_mask=mask[i]), + states, new_states) states = new_states if i == 0: @@ -922,10 +934,9 @@ class EncoderCell(RNNUnit): self.lstm_cells = list() for i in range(self.num_layers): self.lstm_cells.append( - self.add_sublayer( - "layer_%d" % i, - BasicLSTMUnit(input_size if i == 0 else hidden_size, - hidden_size))) + self.add_sublayer("layer_%d" % i, + BasicLSTMUnit(input_size if i == 0 else + hidden_size, hidden_size))) def forward(self, step_input, states): new_states = [] @@ -1040,4 +1051,3 @@ class BasicGRUUnit(Layer): new_hidden = u * pre_hidden + (1 - u) * c return new_hidden - diff --git a/dygraph/similarity_net/run_classifier.py b/dygraph/similarity_net/run_classifier.py index ff82fdfb..ff464e43 100644 --- a/dygraph/similarity_net/run_classifier.py +++ b/dygraph/similarity_net/run_classifier.py @@ -47,18 +47,18 @@ from utils import load_dygraph from model_check import check_version from model_check import check_cuda - + def train(conf_dict, args): """ train process """ - + # Get device if args.use_cuda: place = fluid.CUDAPlace(0) else: place = fluid.CPUPlace() - + # run train logging.info("start train process ...") @@ -84,7 +84,6 @@ def train(conf_dict, args): return auc, acc else: return auc - with fluid.dygraph.guard(place): # used for continuous evaluation @@ -100,35 +99,35 @@ def train(conf_dict, args): conf_dict['seq_len'] = args.seq_len # Load network structure dynamically - net = utils.import_class("./nets", - conf_dict["net"]["module_name"], - conf_dict["net"]["class_name"])(conf_dict) + net = utils.import_class("./nets", conf_dict["net"]["module_name"], + conf_dict["net"]["class_name"])(conf_dict) if args.init_checkpoint is not "": model, _ = load_dygraph(args.init_checkpoint) net.set_dict(model) # Load loss function dynamically loss = utils.import_class("./nets/losses", - conf_dict["loss"]["module_name"], - conf_dict["loss"]["class_name"])(conf_dict) + conf_dict["loss"]["module_name"], + conf_dict["loss"]["class_name"])(conf_dict) # Load Optimization method learning_rate = conf_dict["optimizer"]["learning_rate"] optimizer_name = conf_dict["optimizer"]["class_name"] - if optimizer_name=='SGDOptimizer': - optimizer = fluid.optimizer.SGDOptimizer(learning_rate,parameter_list=net.parameters()) - elif optimizer_name=='AdamOptimizer': + if optimizer_name == 'SGDOptimizer': + optimizer = fluid.optimizer.SGDOptimizer( + learning_rate, parameter_list=net.parameters()) + elif optimizer_name == 'AdamOptimizer': beta1 = conf_dict["optimizer"]["beta1"] beta2 = conf_dict["optimizer"]["beta2"] epsilon = conf_dict["optimizer"]["epsilon"] optimizer = fluid.optimizer.AdamOptimizer( - learning_rate, - beta1=beta1, - beta2=beta2, - epsilon=epsilon, - parameter_list=net.parameters()) + learning_rate, + beta1=beta1, + beta2=beta2, + epsilon=epsilon, + parameter_list=net.parameters()) # load auc method metric = fluid.metrics.Auc(name="auc") - simnet_process = reader.SimNetProcessor(args, vocab) + simnet_process = reader.SimNetProcessor(args, vocab) # set global step global_step = 0 @@ -136,23 +135,33 @@ def train(conf_dict, args): losses = [] start_time = time.time() - train_pyreader = fluid.io.PyReader(capacity=16, return_list=True, use_double_buffer=True) - get_train_examples = simnet_process.get_reader("train",epoch=args.epoch) - train_pyreader.decorate_sample_list_generator( - paddle.batch(get_train_examples, batch_size=args.batch_size), - place) + train_loader = fluid.io.DataLoader.from_generator( + capacity=16, + return_list=True, + iterable=True, + use_double_buffer=True) + get_train_examples = simnet_process.get_reader( + "train", epoch=args.epoch) + train_loader.set_sample_list_generator( + paddle.batch( + get_train_examples, batch_size=args.batch_size), place) if args.do_valid: - valid_pyreader = fluid.io.PyReader(capacity=16, return_list=True, use_double_buffer=True) - get_valid_examples = simnet_process.get_reader("valid") - valid_pyreader.decorate_sample_list_generator( - paddle.batch(get_valid_examples, batch_size=args.batch_size), + valid_loader = fluid.io.DataLoader.from_generator( + capacity=16, + return_list=True, + iterable=True, + use_double_buffer=True) + get_valid_examples = simnet_process.get_reader("valid") + valid_loader.set_sample_list_generator( + paddle.batch( + get_valid_examples, batch_size=args.batch_size), place) pred_list = [] if args.task_mode == "pairwise": - - for left, pos_right, neg_right in train_pyreader(): - + + for left, pos_right, neg_right in train_loader(): + left = fluid.layers.reshape(left, shape=[-1, 1]) pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) neg_right = fluid.layers.reshape(neg_right, shape=[-1, 1]) @@ -162,92 +171,98 @@ def train(conf_dict, args): pred = pos_score _, neg_score = net(left, neg_right) avg_cost = loss.compute(pos_score, neg_score) - losses.append(np.mean(avg_cost.numpy())) + losses.append(np.mean(avg_cost.numpy())) avg_cost.backward() optimizer.minimize(avg_cost) net.clear_gradients() - + if args.do_valid and global_step % args.validation_steps == 0: - for left, pos_right in valid_pyreader(): + for left, pos_right in valid_loader(): left = fluid.layers.reshape(left, shape=[-1, 1]) - pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) + pos_right = fluid.layers.reshape( + pos_right, shape=[-1, 1]) net.eval() left_feat, pos_score = net(left, pos_right) pred = pos_score - - pred_list += list(pred.numpy()) - valid_result = valid_and_test(pred_list, simnet_process, "valid") + + pred_list += list(pred.numpy()) + valid_result = valid_and_test(pred_list, simnet_process, + "valid") if args.compute_accuracy: valid_auc, valid_acc = valid_result logging.info( - "global_steps: %d, valid_auc: %f, valid_acc: %f, valid_loss: %f" % - (global_step, valid_auc, valid_acc, np.mean(losses))) + "global_steps: %d, valid_auc: %f, valid_acc: %f, valid_loss: %f" + % (global_step, valid_auc, valid_acc, + np.mean(losses))) else: valid_auc = valid_result - logging.info("global_steps: %d, valid_auc: %f, valid_loss: %f" % - (global_step, valid_auc, np.mean(losses))) + logging.info( + "global_steps: %d, valid_auc: %f, valid_loss: %f" % + (global_step, valid_auc, np.mean(losses))) if global_step % args.save_steps == 0: model_save_dir = os.path.join(args.output_dir, - conf_dict["model_path"]) + conf_dict["model_path"]) model_path = os.path.join(model_save_dir, str(global_step)) - + if not os.path.exists(model_save_dir): os.makedirs(model_save_dir) fluid.dygraph.save_dygraph(net.state_dict(), model_path) - + logging.info("saving infer model in %s" % model_path) else: - for left, right, label in train_pyreader(): + for left, right, label in train_loader(): left = fluid.layers.reshape(left, shape=[-1, 1]) right = fluid.layers.reshape(right, shape=[-1, 1]) label = fluid.layers.reshape(label, shape=[-1, 1]) net.train() - global_step += 1 + global_step += 1 left_feat, pred = net(left, right) avg_cost = loss.compute(pred, label) - losses.append(np.mean(avg_cost.numpy())) + losses.append(np.mean(avg_cost.numpy())) avg_cost.backward() optimizer.minimize(avg_cost) net.clear_gradients() - + if args.do_valid and global_step % args.validation_steps == 0: - for left, right in valid_pyreader(): + for left, right in valid_loader(): left = fluid.layers.reshape(left, shape=[-1, 1]) right = fluid.layers.reshape(right, shape=[-1, 1]) net.eval() left_feat, pred = net(left, right) pred_list += list(pred.numpy()) - valid_result = valid_and_test(pred_list, simnet_process, "valid") + valid_result = valid_and_test(pred_list, simnet_process, + "valid") if args.compute_accuracy: valid_auc, valid_acc = valid_result logging.info( - "global_steps: %d, valid_auc: %f, valid_acc: %f, valid_loss: %f" % - (global_step, valid_auc, valid_acc, np.mean(losses))) + "global_steps: %d, valid_auc: %f, valid_acc: %f, valid_loss: %f" + % (global_step, valid_auc, valid_acc, + np.mean(losses))) else: valid_auc = valid_result - logging.info("global_steps: %d, valid_auc: %f, valid_loss: %f" % - (global_step, valid_auc, np.mean(losses))) + logging.info( + "global_steps: %d, valid_auc: %f, valid_loss: %f" % + (global_step, valid_auc, np.mean(losses))) if global_step % args.save_steps == 0: model_save_dir = os.path.join(args.output_dir, - conf_dict["model_path"]) + conf_dict["model_path"]) model_path = os.path.join(model_save_dir, str(global_step)) - + if not os.path.exists(model_save_dir): os.makedirs(model_save_dir) fluid.dygraph.save_dygraph(net.state_dict(), model_path) - + logging.info("saving infer model in %s" % model_path) - end_time = time.time() + end_time = time.time() ce_info.append([np.mean(losses), end_time - start_time]) # final save - logging.info("the final step is %s" % global_step) - model_save_dir = os.path.join(args.output_dir, - conf_dict["model_path"]) + logging.info("the final step is %s" % global_step) + model_save_dir = os.path.join(args.output_dir, conf_dict["model_path"]) model_path = os.path.join(model_save_dir, str(global_step)) - + if not os.path.exists(model_save_dir): os.makedirs(model_save_dir) fluid.dygraph.save_dygraph(net.state_dict(), model_path) @@ -263,19 +278,24 @@ def train(conf_dict, args): except: logging.info("ce info err!") print("kpis\teach_step_duration_%s_card%s\t%s" % - (args.task_name, card_num, ce_time)) + (args.task_name, card_num, ce_time)) print("kpis\ttrain_loss_%s_card%s\t%f" % - (args.task_name, card_num, ce_loss)) + (args.task_name, card_num, ce_loss)) if args.do_test: # Get Feeder and Reader - test_pyreader = fluid.io.PyReader(capacity=16, return_list=True, use_double_buffer=True) + test_loader = fluid.io.DataLoader.from_generator( + capacity=16, + return_list=True, + iterable=True, + use_double_buffer=True) get_test_examples = simnet_process.get_reader("test") - test_pyreader.decorate_sample_list_generator( - paddle.batch(get_test_examples, batch_size=args.batch_size), + test_loader.set_sample_list_generator( + paddle.batch( + get_test_examples, batch_size=args.batch_size), place) pred_list = [] - for left, pos_right in test_pyreader(): + for left, pos_right in test_loader(): left = fluid.layers.reshape(left, shape=[-1, 1]) pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) net.eval() @@ -284,15 +304,15 @@ def train(conf_dict, args): left_feat, pos_score = net(left, pos_right) pred = pos_score pred_list += list(pred.numpy()) - test_result = valid_and_test(pred_list, simnet_process, "test") - if args.compute_accuracy: + test_result = valid_and_test(pred_list, simnet_process, "test") + if args.compute_accuracy: test_auc, test_acc = test_result logging.info("AUC of test is %f, Accuracy of test is %f" % - (test_auc, test_acc)) + (test_auc, test_acc)) else: test_auc = test_result logging.info("AUC of test is %f" % test_auc) - + def test(conf_dict, args): """ @@ -307,47 +327,53 @@ def test(conf_dict, args): vocab = utils.load_vocab(args.vocab_path) simnet_process = reader.SimNetProcessor(args, vocab) - test_pyreader = fluid.io.PyReader(capacity=16, return_list=True, use_double_buffer=True) + test_loader = fluid.io.DataLoader.from_generator( + capacity=16, + return_list=True, + iterable=True, + use_double_buffer=True) get_test_examples = simnet_process.get_reader("test") - test_pyreader.decorate_sample_list_generator( - paddle.batch(get_test_examples, batch_size=args.batch_size), - place) - + test_loader.set_sample_list_generator( + paddle.batch( + get_test_examples, batch_size=args.batch_size), place) - conf_dict['dict_size'] = len(vocab) + conf_dict['dict_size'] = len(vocab) conf_dict['seq_len'] = args.seq_len - net = utils.import_class("./nets", - conf_dict["net"]["module_name"], - conf_dict["net"]["class_name"])(conf_dict) - + net = utils.import_class("./nets", conf_dict["net"]["module_name"], + conf_dict["net"]["class_name"])(conf_dict) + model, _ = load_dygraph(args.init_checkpoint) net.set_dict(model) metric = fluid.metrics.Auc(name="auc") pred_list = [] - with io.open("predictions.txt", "w", encoding="utf8") as predictions_file: + with io.open( + "predictions.txt", "w", encoding="utf8") as predictions_file: if args.task_mode == "pairwise": - for left, pos_right in test_pyreader(): + for left, pos_right in test_loader(): left = fluid.layers.reshape(left, shape=[-1, 1]) pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) - + left_feat, pos_score = net(left, pos_right) pred = pos_score - pred_list += list(map(lambda item: float(item[0]), pred.numpy())) + pred_list += list( + map(lambda item: float(item[0]), pred.numpy())) predictions_file.write(u"\n".join( - map(lambda item: str((item[0] + 1) / 2), pred.numpy())) + "\n") + map(lambda item: str((item[0] + 1) / 2), pred.numpy())) + + "\n") else: - for left, right in test_pyreader(): + for left, right in test_loader(): left = fluid.layers.reshape(left, shape=[-1, 1]) right = fluid.layers.reshape(right, shape=[-1, 1]) left_feat, pred = net(left, right) - pred_list += list(map(lambda item: float(item[0]), pred.numpy())) + pred_list += list( + map(lambda item: float(item[0]), pred.numpy())) predictions_file.write(u"\n".join( - map(lambda item: str(np.argmax(item)), pred.numpy())) + "\n") - + map(lambda item: str(np.argmax(item)), pred.numpy())) + + "\n") if args.task_mode == "pairwise": pred_list = np.array(pred_list).reshape((-1, 1)) @@ -361,16 +387,16 @@ def test(conf_dict, args): metric.update(pred_list, labels) if args.compute_accuracy: acc = utils.get_accuracy(pred_list, labels, args.task_mode, - args.lamda) + args.lamda) logging.info("AUC of test is %f, Accuracy of test is %f" % - (metric.eval(), acc)) + (metric.eval(), acc)) else: logging.info("AUC of test is %f" % metric.eval()) if args.verbose_result: utils.get_result_file(args) logging.info("test result saved in %s" % - os.path.join(os.getcwd(), args.test_result_path)) + os.path.join(os.getcwd(), args.test_result_path)) def infer(conf_dict, args): @@ -382,50 +408,53 @@ def infer(conf_dict, args): place = fluid.CUDAPlace(0) else: place = fluid.CPUPlace() - with fluid.dygraph.guard(place): vocab = utils.load_vocab(args.vocab_path) simnet_process = reader.SimNetProcessor(args, vocab) get_infer_examples = simnet_process.get_infer_reader - infer_pyreader = fluid.io.PyReader(capacity=16, return_list=True, use_double_buffer=True) - infer_pyreader.decorate_sample_list_generator( - paddle.batch(get_infer_examples, batch_size=args.batch_size), - place) - - conf_dict['dict_size'] = len(vocab) + infer_loader = fluid.io.DataLoader.from_generator( + capacity=16, + return_list=True, + iterable=True, + use_double_buffer=True) + infer_loader.set_sample_list_generator( + paddle.batch( + get_infer_examples, batch_size=args.batch_size), place) + + conf_dict['dict_size'] = len(vocab) conf_dict['seq_len'] = args.seq_len - net = utils.import_class("./nets", - conf_dict["net"]["module_name"], - conf_dict["net"]["class_name"])(conf_dict) + net = utils.import_class("./nets", conf_dict["net"]["module_name"], + conf_dict["net"]["class_name"])(conf_dict) model, _ = load_dygraph(args.init_checkpoint) net.set_dict(model) - + pred_list = [] if args.task_mode == "pairwise": - for left, pos_right in infer_pyreader(): + for left, pos_right in infer_loader(): left = fluid.layers.reshape(left, shape=[-1, 1]) pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) - + left_feat, pos_score = net(left, pos_right) pred = pos_score pred_list += list( - map(lambda item: str((item[0] + 1) / 2), pred.numpy())) - + map(lambda item: str((item[0] + 1) / 2), pred.numpy())) + else: - for left, right in infer_pyreader(): + for left, right in infer_loader(): left = fluid.layers.reshape(left, shape=[-1, 1]) pos_right = fluid.layers.reshape(right, shape=[-1, 1]) left_feat, pred = net(left, right) - pred_list += map(lambda item: str(np.argmax(item)), pred.numpy()) - - - with io.open(args.infer_result_path, "w", encoding="utf8") as infer_file: + pred_list += map(lambda item: str(np.argmax(item)), + pred.numpy()) + + with io.open( + args.infer_result_path, "w", encoding="utf8") as infer_file: for _data, _pred in zip(simnet_process.get_infer_data(), pred_list): infer_file.write(_data + "\t" + _pred + "\n") logging.info("infer result saved in %s" % - os.path.join(os.getcwd(), args.infer_result_path)) + os.path.join(os.getcwd(), args.infer_result_path)) def get_cards(): @@ -435,6 +464,7 @@ def get_cards(): num = len(cards.split(",")) return num + if __name__ == "__main__": args = ArgConfig() diff --git a/dygraph/transformer/README.md b/dygraph/transformer/README.md index 6776e618..6cec2d79 100644 --- a/dygraph/transformer/README.md +++ b/dygraph/transformer/README.md @@ -28,7 +28,7 @@ 1. paddle安装 - 本项目依赖于 PaddlePaddle 1.7及以上版本或适当的develop版本,请参考 [安装指南](http://www.paddlepaddle.org/#quick-start) 进行安装 + 本项目依赖于 PaddlePaddle 1.8及以上版本或适当的develop版本,请参考 [安装指南](https://www.paddlepaddle.org.cn/install/quick) 进行安装 2. 下载代码 @@ -40,7 +40,7 @@ 3. 环境依赖 - 请参考PaddlePaddle[安装说明](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.6/beginners_guide/install/index_cn.html)部分的内容 + 请参考PaddlePaddle[安装说明](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/index_cn.html)部分的内容 ### 数据准备 diff --git a/dygraph/transformer/config.py b/dygraph/transformer/config.py index b6e1b2bb..2841e04d 100644 --- a/dygraph/transformer/config.py +++ b/dygraph/transformer/config.py @@ -42,12 +42,11 @@ class InferTaskConfig(object): batch_size = 4 # the parameters for beam search. beam_size = 4 - alpha=0.6 + alpha = 0.6 # max decoded length, should be less than ModelHyperParams.max_length max_out_len = 30 - class ModelHyperParams(object): """ ModelHyperParams @@ -156,38 +155,32 @@ input_descs = { # Names of word embedding table which might be reused for weight sharing. word_emb_param_names = ( "src_word_emb_table", - "trg_word_emb_table", -) + "trg_word_emb_table", ) # Names of position encoding table which will be initialized externally. pos_enc_param_names = ( "src_pos_enc_table", - "trg_pos_enc_table", -) + "trg_pos_enc_table", ) # separated inputs for different usages. encoder_data_input_fields = ( "src_word", "src_pos", - "src_slf_attn_bias", -) + "src_slf_attn_bias", ) decoder_data_input_fields = ( "trg_word", "trg_pos", "trg_slf_attn_bias", "trg_src_attn_bias", - "enc_output", -) + "enc_output", ) label_data_input_fields = ( "lbl_word", - "lbl_weight", -) + "lbl_weight", ) # In fast decoder, trg_pos (only containing the current time step) is generated # by ops and trg_slf_attn_bias is not needed. fast_decoder_data_input_fields = ( "trg_word", # "init_score", # "init_idx", - "trg_src_attn_bias", -) + "trg_src_attn_bias", ) def merge_cfg_from_list(cfg_list, g_cfgs): diff --git a/dygraph/transformer/model.py b/dygraph/transformer/model.py index 3e8ec488..1693a8a7 100644 --- a/dygraph/transformer/model.py +++ b/dygraph/transformer/model.py @@ -34,10 +34,10 @@ def position_encoding_init(n_position, d_pos_vec): num_timescales = channels // 2 log_timescale_increment = (np.log(float(1e4) / float(1)) / (num_timescales - 1)) - inv_timescales = np.exp( - np.arange(num_timescales)) * -log_timescale_increment - scaled_time = np.expand_dims(position, 1) * np.expand_dims( - inv_timescales, 0) + inv_timescales = np.exp(np.arange( + num_timescales)) * -log_timescale_increment + scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales, + 0) signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1) signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant') position_enc = signal @@ -48,6 +48,7 @@ class NoamDecay(LearningRateDecay): """ learning rate scheduler """ + def __init__(self, d_model, warmup_steps, @@ -72,6 +73,7 @@ class PrePostProcessLayer(Layer): """ PrePostProcessLayer """ + def __init__(self, process_cmd, d_model, dropout_rate): super(PrePostProcessLayer, self).__init__() self.process_cmd = process_cmd @@ -82,8 +84,8 @@ class PrePostProcessLayer(Layer): elif cmd == "n": # add layer normalization self.functors.append( self.add_sublayer( - "layer_norm_%d" % - len(self.sublayers(include_sublayers=False)), + "layer_norm_%d" % len( + self.sublayers(include_sublayers=False)), LayerNorm( normalized_shape=d_model, param_attr=fluid.ParamAttr( @@ -108,6 +110,7 @@ class MultiHeadAttention(Layer): """ Multi-Head Attention """ + def __init__(self, d_key, d_value, d_model, n_head=1, dropout_rate=0.): super(MultiHeadAttention, self).__init__() self.n_head = n_head @@ -115,18 +118,14 @@ class MultiHeadAttention(Layer): self.d_value = d_value self.d_model = d_model self.dropout_rate = dropout_rate - self.q_fc = Linear(input_dim=d_model, - output_dim=d_key * n_head, - bias_attr=False) - self.k_fc = Linear(input_dim=d_model, - output_dim=d_key * n_head, - bias_attr=False) - self.v_fc = Linear(input_dim=d_model, - output_dim=d_value * n_head, - bias_attr=False) - self.proj_fc = Linear(input_dim=d_value * n_head, - output_dim=d_model, - bias_attr=False) + self.q_fc = Linear( + input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) + self.k_fc = Linear( + input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) + self.v_fc = Linear( + input_dim=d_model, output_dim=d_value * n_head, bias_attr=False) + self.proj_fc = Linear( + input_dim=d_value * n_head, output_dim=d_model, bias_attr=False) def forward(self, queries, keys, values, attn_bias, cache=None): # compute q ,k ,v @@ -152,17 +151,14 @@ class MultiHeadAttention(Layer): cache["k"], cache["v"] = k, v # scale dot product attention - product = layers.matmul(x=q, - y=k, - transpose_y=True, - alpha=self.d_model**-0.5) + product = layers.matmul( + x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5) if attn_bias is not None: product += attn_bias weights = layers.softmax(product) if self.dropout_rate: - weights = layers.dropout(weights, - dropout_prob=self.dropout_rate, - is_test=False) + weights = layers.dropout( + weights, dropout_prob=self.dropout_rate, is_test=False) out = layers.matmul(weights, v) @@ -179,6 +175,7 @@ class FFN(Layer): """ Feed-Forward Network """ + def __init__(self, d_inner_hid, d_model, dropout_rate): super(FFN, self).__init__() self.dropout_rate = dropout_rate @@ -188,9 +185,8 @@ class FFN(Layer): def forward(self, x): hidden = self.fc1(x) if self.dropout_rate: - hidden = layers.dropout(hidden, - dropout_prob=self.dropout_rate, - is_test=False) + hidden = layers.dropout( + hidden, dropout_prob=self.dropout_rate, is_test=False) out = self.fc2(hidden) return out @@ -199,6 +195,7 @@ class EncoderLayer(Layer): """ EncoderLayer """ + def __init__(self, n_head, d_key, @@ -227,8 +224,8 @@ class EncoderLayer(Layer): prepostprocess_dropout) def forward(self, enc_input, attn_bias): - attn_output = self.self_attn(self.preprocesser1(enc_input), None, None, - attn_bias) + attn_output = self.self_attn( + self.preprocesser1(enc_input), None, None, attn_bias) attn_output = self.postprocesser1(attn_output, enc_input) ffn_output = self.ffn(self.preprocesser2(attn_output)) @@ -240,6 +237,7 @@ class Encoder(Layer): """ encoder """ + def __init__(self, n_layer, n_head, @@ -279,6 +277,7 @@ class Embedder(Layer): """ Word Embedding + Position Encoding """ + def __init__(self, vocab_size, emb_dim, bos_idx=0): super(Embedder, self).__init__() @@ -297,6 +296,7 @@ class WrapEncoder(Layer): """ embedder + encoder """ + def __init__(self, src_vocab_size, max_length, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, attention_dropout, relu_dropout, preprocess_cmd, @@ -324,9 +324,9 @@ class WrapEncoder(Layer): pos_enc = self.pos_encoder(src_pos) pos_enc.stop_gradient = True emb = word_emb + pos_enc - enc_input = layers.dropout(emb, - dropout_prob=self.emb_dropout, - is_test=False) if self.emb_dropout else emb + enc_input = layers.dropout( + emb, dropout_prob=self.emb_dropout, + is_test=False) if self.emb_dropout else emb enc_output = self.encoder(enc_input, src_slf_attn_bias) return enc_output @@ -336,6 +336,7 @@ class DecoderLayer(Layer): """ decoder """ + def __init__(self, n_head, d_key, @@ -375,8 +376,8 @@ class DecoderLayer(Layer): self_attn_bias, cross_attn_bias, cache=None): - self_attn_output = self.self_attn(self.preprocesser1(dec_input), None, - None, self_attn_bias, cache) + self_attn_output = self.self_attn( + self.preprocesser1(dec_input), None, None, self_attn_bias, cache) self_attn_output = self.postprocesser1(self_attn_output, dec_input) cross_attn_output = self.cross_attn( @@ -395,6 +396,7 @@ class Decoder(Layer): """ decoder """ + def __init__(self, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd): @@ -420,8 +422,8 @@ class Decoder(Layer): caches=None): for i, decoder_layer in enumerate(self.decoder_layers): dec_output = decoder_layer(dec_input, enc_output, self_attn_bias, - cross_attn_bias, - None if caches is None else caches[i]) + cross_attn_bias, None + if caches is None else caches[i]) dec_input = dec_output return self.processer(dec_output) @@ -431,6 +433,7 @@ class WrapDecoder(Layer): """ embedder + decoder """ + def __init__(self, trg_vocab_size, max_length, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, attention_dropout, relu_dropout, preprocess_cmd, @@ -458,9 +461,8 @@ class WrapDecoder(Layer): word_embedder.weight, transpose_y=True) else: - self.linear = Linear(input_dim=d_model, - output_dim=trg_vocab_size, - bias_attr=False) + self.linear = Linear( + input_dim=d_model, output_dim=trg_vocab_size, bias_attr=False) def forward(self, trg_word, @@ -474,15 +476,14 @@ class WrapDecoder(Layer): pos_enc = self.pos_encoder(trg_pos) pos_enc.stop_gradient = True emb = word_emb + pos_enc - dec_input = layers.dropout(emb, - dropout_prob=self.emb_dropout, - is_test=False) if self.emb_dropout else emb + dec_input = layers.dropout( + emb, dropout_prob=self.emb_dropout, + is_test=False) if self.emb_dropout else emb dec_output = self.decoder(dec_input, enc_output, trg_slf_attn_bias, trg_src_attn_bias, caches) dec_output = layers.reshape( dec_output, - shape=[-1, dec_output.shape[-1]], - ) + shape=[-1, dec_output.shape[-1]], ) logits = self.linear(dec_output) return logits @@ -493,9 +494,10 @@ class CrossEntropyCriterion(object): def __call__(self, predict, label, weights): if self.label_smooth_eps: - label_out = layers.label_smooth(label=layers.one_hot( - input=label, depth=predict.shape[-1]), - epsilon=self.label_smooth_eps) + label_out = layers.label_smooth( + label=layers.one_hot( + input=label, depth=predict.shape[-1]), + epsilon=self.label_smooth_eps) cost = layers.softmax_with_cross_entropy( logits=predict, @@ -513,6 +515,7 @@ class Transformer(Layer): """ model """ + def __init__(self, src_vocab_size, trg_vocab_size, @@ -532,29 +535,25 @@ class Transformer(Layer): bos_id=0, eos_id=1): super(Transformer, self).__init__() - src_word_embedder = Embedder(vocab_size=src_vocab_size, - emb_dim=d_model, - bos_idx=bos_id) - self.encoder = WrapEncoder(src_vocab_size, max_length, n_layer, n_head, - d_key, d_value, d_model, d_inner_hid, - prepostprocess_dropout, attention_dropout, - relu_dropout, preprocess_cmd, - postprocess_cmd, src_word_embedder) + src_word_embedder = Embedder( + vocab_size=src_vocab_size, emb_dim=d_model, bos_idx=bos_id) + self.encoder = WrapEncoder( + src_vocab_size, max_length, n_layer, n_head, d_key, d_value, + d_model, d_inner_hid, prepostprocess_dropout, attention_dropout, + relu_dropout, preprocess_cmd, postprocess_cmd, src_word_embedder) if weight_sharing: assert src_vocab_size == trg_vocab_size, ( "Vocabularies in source and target should be same for weight sharing." ) trg_word_embedder = src_word_embedder else: - trg_word_embedder = Embedder(vocab_size=trg_vocab_size, - emb_dim=d_model, - bos_idx=bos_id) - self.decoder = WrapDecoder(trg_vocab_size, max_length, n_layer, n_head, - d_key, d_value, d_model, d_inner_hid, - prepostprocess_dropout, attention_dropout, - relu_dropout, preprocess_cmd, - postprocess_cmd, weight_sharing, - trg_word_embedder) + trg_word_embedder = Embedder( + vocab_size=trg_vocab_size, emb_dim=d_model, bos_idx=bos_id) + self.decoder = WrapDecoder( + trg_vocab_size, max_length, n_layer, n_head, d_key, d_value, + d_model, d_inner_hid, prepostprocess_dropout, attention_dropout, + relu_dropout, preprocess_cmd, postprocess_cmd, weight_sharing, + trg_word_embedder) self.trg_vocab_size = trg_vocab_size self.n_layer = n_layer @@ -595,6 +594,7 @@ class Transformer(Layer): and newly added finished candidates from `grow_topk`, and selects the top `beam_size` finished candidates. """ + def expand_to_beam_size(tensor, beam_size): tensor = layers.reshape(tensor, [tensor.shape[0], 1] + tensor.shape[1:]) @@ -616,19 +616,23 @@ class Transformer(Layer): ### initialize states of beam search ### ## init for the alive ## initial_log_probs = to_variable( - np.array([[0.] + [-inf] * (beam_size - 1)], dtype="float32")) + np.array( + [[0.] + [-inf] * (beam_size - 1)], dtype="float32")) alive_log_probs = layers.expand(initial_log_probs, [batch_size, 1]) alive_seq = to_variable( - np.tile(np.array([[[bos_id]]], dtype="int64"), - (batch_size, beam_size, 1))) + np.tile( + np.array( + [[[bos_id]]], dtype="int64"), (batch_size, beam_size, 1))) ## init for the finished ## finished_scores = to_variable( - np.array([[-inf] * beam_size], dtype="float32")) + np.array( + [[-inf] * beam_size], dtype="float32")) finished_scores = layers.expand(finished_scores, [batch_size, 1]) finished_seq = to_variable( - np.tile(np.array([[[bos_id]]], dtype="int64"), - (batch_size, beam_size, 1))) + np.tile( + np.array( + [[[bos_id]]], dtype="int64"), (batch_size, beam_size, 1))) finished_flags = layers.zeros_like(finished_scores) ### initialize inputs and states of transformer decoder ### @@ -640,13 +644,11 @@ class Transformer(Layer): enc_output = merge_beam_dim(expand_to_beam_size(enc_output, beam_size)) ## init states (caches) for transformer, need to be updated according to selected beam caches = [{ - "k": - layers.fill_constant( + "k": layers.fill_constant( shape=[batch_size * beam_size, self.n_head, 0, self.d_key], dtype=enc_output.dtype, value=0), - "v": - layers.fill_constant( + "v": layers.fill_constant( shape=[batch_size * beam_size, self.n_head, 0, self.d_value], dtype=enc_output.dtype, value=0), @@ -665,11 +667,11 @@ class Transformer(Layer): beam_size, batch_size, need_flat=True): - batch_idx = layers.range(0, batch_size, 1, - dtype="int64") * beam_size + batch_idx = layers.range( + 0, batch_size, 1, dtype="int64") * beam_size flat_tensor = merge_beam_dim(tensor_nd) if need_flat else tensor_nd - idx = layers.reshape(layers.elementwise_add(beam_idx, batch_idx, 0), - [-1]) + idx = layers.reshape( + layers.elementwise_add(beam_idx, batch_idx, 0), [-1]) new_flat_tensor = layers.gather(flat_tensor, idx) new_tensor_nd = layers.reshape( new_flat_tensor, @@ -681,7 +683,8 @@ class Transformer(Layer): finished_in_finished): max_length_penalty = np.power(((5. + max_len) / 6.), alpha) # The best possible score of the most likely alive sequence - lower_bound_alive_scores = alive_log_probs[:, 0] / max_length_penalty + lower_bound_alive_scores = alive_log_probs[:, + 0] / max_length_penalty # Now to compute the lowest score of a finished sequence in finished # If the sequence isn't finished, we multiply it's score by 0. since @@ -711,8 +714,8 @@ class Transformer(Layer): curr_scores = log_probs / length_penalty flat_curr_scores = layers.reshape(curr_scores, [batch_size, -1]) - topk_scores, topk_ids = layers.topk(flat_curr_scores, - k=beam_size * 2) + topk_scores, topk_ids = layers.topk( + flat_curr_scores, k=beam_size * 2) topk_log_probs = topk_scores * length_penalty @@ -723,13 +726,11 @@ class Transformer(Layer): topk_seq = gather_2d_by_gather(alive_seq, topk_beam_index, beam_size, batch_size) topk_seq = layers.concat( - [topk_seq, - layers.reshape(topk_ids, topk_ids.shape + [1])], + [topk_seq, layers.reshape(topk_ids, topk_ids.shape + [1])], axis=2) states = update_states(states, topk_beam_index, beam_size) - eos = layers.fill_constant(shape=topk_ids.shape, - dtype="int64", - value=eos_id) + eos = layers.fill_constant( + shape=topk_ids.shape, dtype="int64", value=eos_id) topk_finished = layers.cast(layers.equal(topk_ids, eos), "float32") #topk_seq: [batch_size, 2*beam_size, i+1] @@ -751,37 +752,35 @@ class Transformer(Layer): def grow_finished(finished_seq, finished_scores, finished_flags, curr_seq, curr_scores, curr_finished): # finished scores - finished_seq = layers.concat([ - finished_seq, - layers.fill_constant(shape=[batch_size, beam_size, 1], - dtype="int64", - value=eos_id) - ], - axis=2) + finished_seq = layers.concat( + [ + finished_seq, layers.fill_constant( + shape=[batch_size, beam_size, 1], + dtype="int64", + value=eos_id) + ], + axis=2) # Set the scores of the unfinished seq in curr_seq to large negative # values curr_scores += (1. - curr_finished) * -inf # concatenating the sequences and scores along beam axis curr_finished_seq = layers.concat([finished_seq, curr_seq], axis=1) - curr_finished_scores = layers.concat([finished_scores, curr_scores], - axis=1) - curr_finished_flags = layers.concat([finished_flags, curr_finished], - axis=1) + curr_finished_scores = layers.concat( + [finished_scores, curr_scores], axis=1) + curr_finished_flags = layers.concat( + [finished_flags, curr_finished], axis=1) _, topk_indexes = layers.topk(curr_finished_scores, k=beam_size) finished_seq = gather_2d_by_gather(curr_finished_seq, topk_indexes, beam_size * 3, batch_size) - finished_scores = gather_2d_by_gather(curr_finished_scores, - topk_indexes, beam_size * 3, - batch_size) - finished_flags = gather_2d_by_gather(curr_finished_flags, - topk_indexes, beam_size * 3, - batch_size) + finished_scores = gather_2d_by_gather( + curr_finished_scores, topk_indexes, beam_size * 3, batch_size) + finished_flags = gather_2d_by_gather( + curr_finished_flags, topk_indexes, beam_size * 3, batch_size) return finished_seq, finished_scores, finished_flags for i in range(max_len): - trg_pos = layers.fill_constant(shape=trg_word.shape, - dtype="int64", - value=i) + trg_pos = layers.fill_constant( + shape=trg_word.shape, dtype="int64", value=i) logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias, enc_output, caches) topk_seq, topk_log_probs, topk_scores, topk_finished, states = grow_topk( @@ -818,20 +817,23 @@ class Transformer(Layer): return layers.expand(tensor, tile_dims) def merge_batch_beams(tensor): - return layers.reshape(tensor, [tensor.shape[0] * tensor.shape[1]] + - tensor.shape[2:]) + return layers.reshape( + tensor, [tensor.shape[0] * tensor.shape[1]] + tensor.shape[2:]) def split_batch_beams(tensor): - return fluid.layers.reshape(tensor, - shape=[-1, beam_size] + - list(tensor.shape[1:])) + return fluid.layers.reshape( + tensor, shape=[-1, beam_size] + list(tensor.shape[1:])) def mask_probs(probs, finished, noend_mask_tensor): # TODO: use where_op finished = layers.cast(finished, dtype=probs.dtype) probs = layers.elementwise_mul( - layers.expand(layers.unsqueeze(finished, [2]), [1, 1, self.trg_vocab_size]), - noend_mask_tensor, axis=-1) - layers.elementwise_mul(probs, (finished - 1), axis=0) + layers.expand( + layers.unsqueeze(finished, [2]), + [1, 1, self.trg_vocab_size]), + noend_mask_tensor, + axis=-1) - layers.elementwise_mul( + probs, (finished - 1), axis=0) return probs def gather(x, indices, batch_pos): @@ -845,54 +847,53 @@ class Transformer(Layer): inf = float(1. * 1e7) batch_size = enc_output.shape[0] max_len = (enc_output.shape[1] + 20) if max_len is None else max_len - vocab_size_tensor = layers.fill_constant(shape=[1], - dtype="int64", - value=self.trg_vocab_size) + vocab_size_tensor = layers.fill_constant( + shape=[1], dtype="int64", value=self.trg_vocab_size) end_token_tensor = to_variable( - np.full([batch_size, beam_size], eos_id, dtype="int64")) + np.full( + [batch_size, beam_size], eos_id, dtype="int64")) noend_array = [-inf] * self.trg_vocab_size noend_array[eos_id] = 0 - noend_mask_tensor = to_variable(np.array(noend_array,dtype="float32")) + noend_mask_tensor = to_variable(np.array(noend_array, dtype="float32")) batch_pos = layers.expand( layers.unsqueeze( - to_variable(np.arange(0, batch_size, 1, dtype="int64")), [1]), - [1, beam_size]) + to_variable(np.arange( + 0, batch_size, 1, dtype="int64")), [1]), [1, beam_size]) predict_ids = [] parent_ids = [] ### initialize states of beam search ### log_probs = to_variable( - np.array([[0.] + [-inf] * (beam_size - 1)] * batch_size, - dtype="float32")) - finished = to_variable(np.full([batch_size, beam_size], 0, - dtype="bool")) + np.array( + [[0.] + [-inf] * (beam_size - 1)] * batch_size, + dtype="float32")) + finished = to_variable( + np.full( + [batch_size, beam_size], 0, dtype="bool")) ### initialize inputs and states of transformer decoder ### ## init inputs for decoder, shaped `[batch_size*beam_size, ...]` - trg_word = layers.fill_constant(shape=[batch_size * beam_size, 1], - dtype="int64", - value=bos_id) + trg_word = layers.fill_constant( + shape=[batch_size * beam_size, 1], dtype="int64", value=bos_id) trg_pos = layers.zeros_like(trg_word) trg_src_attn_bias = merge_batch_beams( expand_to_beam_size(trg_src_attn_bias, beam_size)) - enc_output = merge_batch_beams(expand_to_beam_size(enc_output, beam_size)) + enc_output = merge_batch_beams( + expand_to_beam_size(enc_output, beam_size)) ## init states (caches) for transformer, need to be updated according to selected beam caches = [{ - "k": - layers.fill_constant( + "k": layers.fill_constant( shape=[batch_size * beam_size, self.n_head, 0, self.d_key], dtype=enc_output.dtype, value=0), - "v": - layers.fill_constant( + "v": layers.fill_constant( shape=[batch_size * beam_size, self.n_head, 0, self.d_value], dtype=enc_output.dtype, value=0), } for i in range(self.n_layer)] for i in range(max_len): - trg_pos = layers.fill_constant(shape=trg_word.shape, - dtype="int64", - value=i) + trg_pos = layers.fill_constant( + shape=trg_word.shape, dtype="int64", value=i) caches = map_structure( # can not be reshaped since the 0 size lambda x: x if i == 0 else merge_batch_beams(x), caches) logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias, @@ -902,18 +903,17 @@ class Transformer(Layer): fluid.layers.log(fluid.layers.softmax(logits))) step_log_probs = mask_probs(step_log_probs, finished, noend_mask_tensor) - log_probs = layers.elementwise_add(x=step_log_probs, - y=log_probs, - axis=0) + log_probs = layers.elementwise_add( + x=step_log_probs, y=log_probs, axis=0) log_probs = layers.reshape(log_probs, [-1, beam_size * self.trg_vocab_size]) scores = log_probs - topk_scores, topk_indices = fluid.layers.topk(input=scores, - k=beam_size) - beam_indices = fluid.layers.elementwise_floordiv( - topk_indices, vocab_size_tensor) - token_indices = fluid.layers.elementwise_mod( - topk_indices, vocab_size_tensor) + topk_scores, topk_indices = fluid.layers.topk( + input=scores, k=beam_size) + beam_indices = fluid.layers.elementwise_floordiv(topk_indices, + vocab_size_tensor) + token_indices = fluid.layers.elementwise_mod(topk_indices, + vocab_size_tensor) # update states caches = map_structure(lambda x: gather(x, beam_indices, batch_pos), diff --git a/dygraph/transformer/reader.py b/dygraph/transformer/reader.py index ef23c5e1..0ac62a1b 100644 --- a/dygraph/transformer/reader.py +++ b/dygraph/transformer/reader.py @@ -306,6 +306,7 @@ class DataProcessor(object): :param seed: The seed for random. :type seed: int """ + def __init__(self, src_vocab_fpath, trg_vocab_fpath, @@ -360,21 +361,23 @@ class DataProcessor(object): def load_src_trg_ids(self, fpattern, tar_fname): converters = [ - Converter(vocab=self._src_vocab, - beg=self._bos_idx, - end=self._eos_idx, - unk=self._unk_idx, - delimiter=self._token_delimiter, - add_beg=False) + Converter( + vocab=self._src_vocab, + beg=self._bos_idx, + end=self._eos_idx, + unk=self._unk_idx, + delimiter=self._token_delimiter, + add_beg=False) ] if not self._only_src: converters.append( - Converter(vocab=self._trg_vocab, - beg=self._bos_idx, - end=self._eos_idx, - unk=self._unk_idx, - delimiter=self._token_delimiter, - add_beg=True)) + Converter( + vocab=self._trg_vocab, + beg=self._bos_idx, + end=self._eos_idx, + unk=self._unk_idx, + delimiter=self._token_delimiter, + add_beg=True)) converters = ComposedConverter(converters) @@ -402,9 +405,8 @@ class DataProcessor(object): f = tarfile.open(fpaths[0], "rb") for line in f.extractfile(tar_fname): fields = line.strip(b"\n").split(self._field_delimiter) - if (not self._only_src - and len(fields) == 2) or (self._only_src - and len(fields) == 1): + if (not self._only_src and len(fields) == 2) or ( + self._only_src and len(fields) == 1): yield fields else: for fpath in fpaths: @@ -414,9 +416,8 @@ class DataProcessor(object): with open(fpath, "rb") as f: for line in f: fields = line.strip(b"\n").split(self._field_delimiter) - if (not self._only_src - and len(fields) == 2) or (self._only_src - and len(fields) == 1): + if (not self._only_src and len(fields) == 2) or ( + self._only_src and len(fields) == 1): yield fields @staticmethod @@ -512,8 +513,8 @@ class DataProcessor(object): for item in data_reader(): inst_num_per_part = len(item) // count for i in range(count): - yield item[inst_num_per_part * i:inst_num_per_part * - (i + 1)] + yield item[inst_num_per_part * i:inst_num_per_part * (i + 1 + )] return __impl__ -- GitLab