diff --git a/python/paddle_fl/mobile/README.md b/python/paddle_fl/mobile/README.md index cc14e3a9c2133f87107e6e8cc2bc9e19633b7e0f..714b09579b12e6cf78996d74a78d148ed278a5d6 100644 --- a/python/paddle_fl/mobile/README.md +++ b/python/paddle_fl/mobile/README.md @@ -60,10 +60,10 @@ mpirun -np 2 python application.py lm_data ### 训练结果 ```shell -framework.py : INFO infer results: 0.085723 +framework.py : INFO infer results: 0.116334 ``` -即:在测试集上的,测试Top1为 8.6% +即:在测试集上的,测试Top1为 11.6% ## 添加自己的数据集和Trainer @@ -89,7 +89,7 @@ framework.py : INFO infer results: 0.085723 - Step1 模型初始化 1. 全局参数初始化:由编号为0的simulator来做模型初始化工作,初始化之后,它会通过UpdateGlobalParams()接口将参数传递给Scheduler; - + 2. 个性化参数初始化 - Step2 模型分发 diff --git a/python/paddle_fl/mobile/application.py b/python/paddle_fl/mobile/application.py index 905c2e7375482777b1a18c4fe861eab2aa1fa7c9..7fc4949647cbe040b8ea4872a937007da6a5bc4c 100644 --- a/python/paddle_fl/mobile/application.py +++ b/python/paddle_fl/mobile/application.py @@ -36,17 +36,22 @@ simulator = SimulationFramework(role_maker) language_model_trainer = LanguageModelTrainer() language_model_trainer.set_trainer_configs({ - "epoch": 3, + "epoch": 1, "max_steps_in_epoch": -1, - "lr": 0.1, + "lr": 1.0, "batch_size": 5, + "max_grad_norm": 5, + "n_hidden": 256, + "num_layers": 2, + "init_scale": 0.1, + "dropout_prob": 0.0, }) sampler = UniformSampler() -sampler.set_sample_num(30) +sampler.set_sample_num(10) sampler.set_min_ins_num(1) test_sampler = Test1percentSampler() -fed_avg_optimizer = FedAvgOptimizer(learning_rate=2.0) +fed_avg_optimizer = FedAvgOptimizer(learning_rate=1.85) simulator.set_trainer(language_model_trainer) simulator.set_sampler(sampler) @@ -68,5 +73,8 @@ elif simulator.is_simulator(): print("dates: {}".format(dates)) time.sleep(10) - simulator.run_simulation( - base_path, dates, sim_num_everyday=100, do_test=True, test_skip_day=1) + simulator.run_simulation(base_path, + dates, + sim_num_everyday=100, + do_test=True, + test_skip_day=1) diff --git a/python/paddle_fl/mobile/model/language_model.py b/python/paddle_fl/mobile/model/language_model.py index 454c973e56073e5aa78c5fa6333fa16f7ec5965e..3f6764816f10a662a415d7f1f1552316da9db1e8 100644 --- a/python/paddle_fl/mobile/model/language_model.py +++ b/python/paddle_fl/mobile/model/language_model.py @@ -22,8 +22,6 @@ import paddle.fluid as fluid import paddle.fluid.layers as layers import paddle.fluid as fluid -from paddle.fluid.layers.control_flow import StaticRNN as PaddingRNN -import numpy as np from paddle.fluid import ParamAttr from paddle.fluid.contrib.layers import basic_lstm @@ -31,19 +29,19 @@ from paddle.fluid.contrib.layers import basic_lstm class LanguageModel(ModelBase): def __init__(self): # model args - self.hidden_size_ = 200 - self.vocab_size_ = 10000 + self.seq_len_ = 10 # fixed + self.n_hidden_ = 256 self.num_layers_ = 2 - self.num_steps_ = 10 # fix - self.init_scale_ = 0.1 - self.dropout_ = 0.0 - self.rnn_model_ = 'basic_lstm' self.pad_symbol_ = 0 self.unk_symbol_ = 1 + self.vocab_size_ = 10000 + self.init_scale_ = 0.1 + self.max_grad_norm_ = 5 + self.dropout_prob_ = 0.0 # results self.correct_ = None - self.prediction_ = None + self.pred_ = None self.loss_ = None # private vars @@ -53,6 +51,13 @@ class LanguageModel(ModelBase): self.input_name_list_ = None self.target_var_names_ = [] + def update_params(self, config): + self.n_hidden_ = config.get("n_hidden", 256) + self.num_layers_ = config.get("num_layers", 2) + self.init_scale_ = config.get("init_scale", 0.1) + self.max_grad_norm_ = config.get("max_grad_norm", 5) + 
self.dropout_prob_ = config.get("dropout_prob", 0.0) + def get_model_input_names(self): return self.input_name_list_ @@ -63,381 +68,157 @@ class LanguageModel(ModelBase): return self.loss_.name def get_model_metrics(self): - return {"correct": self.correct_.name} + metrics = { + "init_hidden": self.last_hidden_.name, + "init_cell": self.last_cell_.name, + "correct": self.correct_.name + } + return metrics def get_target_names(self): return self.target_var_names_ def build_model(self, model_configs): - hidden_size = self.hidden_size_ - init_scale = self.init_scale_ - dropout = self.dropout_ - num_layers = self.num_layers_ - num_steps = self.num_steps_ - pad_symbol = self.pad_symbol_ - unk_symbol = self.unk_symbol_ - vocab_size = self.vocab_size_ - rnn_model = self.rnn_model_ - x = fluid.data(name="x", shape=[None, num_steps], dtype='int64') - y = fluid.data(name="y", shape=[None, num_steps], dtype='int64') - x = layers.reshape(x, shape=[-1, num_steps, 1]) - y = layers.reshape(y, shape=[-1, 1]) - self.input_name_list_ = ['x', 'y'] - - init_hidden = layers.fill_constant_batch_size_like( - input=x, - shape=[-1, num_layers, hidden_size], - value=0, - dtype="float32") - init_cell = layers.fill_constant_batch_size_like( - input=x, - shape=[-1, num_layers, hidden_size], - value=0, - dtype="float32") + self.update_params(model_configs) + features = fluid.layers.data(name="features", + shape=[None, self.seq_len_], + dtype='int64') + labels = fluid.layers.data(name="labels", + shape=[None, self.seq_len_], + dtype='int64') + sequence_length_ph = fluid.layers.data(name="seq_len_ph", + shape=[None], + dtype='int64') + sequence_mask_ph = fluid.layers.data(name="seq_mask_ph", + shape=[None], + dtype='float32') + + init_hidden = fluid.layers.data( + name="init_hidden", + shape=[None, self.num_layers_, self.n_hidden_], + dtype='float32') + init_cell = fluid.layers.data( + name="init_cell", + shape=[None, self.num_layers_, self.n_hidden_], + dtype='float32') init_hidden = layers.transpose(init_hidden, perm=[1, 0, 2]) init_cell = layers.transpose(init_cell, perm=[1, 0, 2]) init_hidden_reshape = layers.reshape( - init_hidden, shape=[num_layers, -1, hidden_size]) + init_hidden, shape=[self.num_layers_, -1, self.n_hidden_]) init_cell_reshape = layers.reshape( - init_cell, shape=[num_layers, -1, hidden_size]) + init_cell, shape=[self.num_layers_, -1, self.n_hidden_]) - x_emb = layers.embedding( - input=x, - size=[vocab_size, hidden_size], + features = layers.reshape(features, shape=[-1, self.seq_len_, 1]) + + # word embedding + inputs = layers.embedding( + input=features, + size=[self.vocab_size_, self.n_hidden_], dtype='float32', is_sparse=False, param_attr=fluid.ParamAttr( name='embedding_para', initializer=fluid.initializer.UniformInitializer( - low=-init_scale, high=init_scale))) - - x_emb = layers.reshape( - x_emb, shape=[-1, num_steps, hidden_size], inplace=True) - if dropout != None and dropout > 0.0: - x_emb = layers.dropout( - x_emb, - dropout_prob=dropout, - dropout_implementation='upscale_in_train') - - if rnn_model == "padding": - rnn_out, last_hidden, last_cell = self._padding_rnn( - x_emb, - len=num_steps, - init_hidden=init_hidden_reshape, - init_cell=init_cell_reshape) - elif rnn_model == "static": - rnn_out, last_hidden, last_cell = self._encoder_static( - x_emb, - len=num_steps, - init_hidden=init_hidden_reshape, - init_cell=init_cell_reshape) - elif rnn_model == "cudnn": - x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) - rnn_out, last_hidden, last_cell = layers.lstm( - x_emb, - 
init_hidden_reshape, - init_cell_reshape, - num_steps, - hidden_size, - num_layers, - is_bidirec=False, - default_initializer=fluid.initializer.UniformInitializer( - low=-init_scale, high=init_scale)) - rnn_out = layers.transpose(rnn_out, perm=[1, 0, 2]) - elif rnn_model == "basic_lstm": - rnn_out, last_hidden, last_cell = basic_lstm( - x_emb, - init_hidden, - init_cell, - hidden_size, - num_layers=num_layers, - batch_first=True, - dropout_prob=dropout, - param_attr=ParamAttr( - initializer=fluid.initializer.UniformInitializer( - low=-init_scale, high=init_scale)), - bias_attr=ParamAttr( - initializer=fluid.initializer.Constant(0.0)), - forget_bias=0.0) - else: - raise Exception("type not support") - - rnn_out = layers.reshape( - rnn_out, shape=[-1, num_steps, hidden_size], inplace=True) - - softmax_weight = layers.create_parameter( - [hidden_size, vocab_size], + low=-self.init_scale_, high=self.init_scale_))) + + # LSTM + output, last_hidden, last_cell = self._build_rnn_graph( + inputs, init_hidden, init_cell, sequence_length_ph) + + output = layers.reshape(output, + shape=[-1, self.seq_len_, self.n_hidden_], + inplace=True) + self.last_hidden_ = layers.reshape( + last_hidden, [-1, self.num_layers_, self.n_hidden_]) + self.last_cell_ = layers.reshape( + last_cell, [-1, self.num_layers_, self.n_hidden_]) + + # softmax + softmax_w = layers.create_parameter( + [self.n_hidden_, self.vocab_size_], dtype="float32", - name="softmax_weight", + name="softmax_w", default_initializer=fluid.initializer.UniformInitializer( - low=-init_scale, high=init_scale)) - softmax_bias = layers.create_parameter( - [vocab_size], + low=-self.init_scale_, high=self.init_scale_)) + softmax_b = layers.create_parameter( + [self.vocab_size_], dtype="float32", - name='softmax_bias', + name='softmax_b', default_initializer=fluid.initializer.UniformInitializer( - low=-init_scale, high=init_scale)) + low=-self.init_scale_, high=self.init_scale_)) - projection = layers.matmul(rnn_out, softmax_weight) - projection = layers.elementwise_add(projection, softmax_bias) - projection = layers.reshape( - projection, shape=[-1, vocab_size], inplace=True) + logits = layers.matmul(output, softmax_w) + logits = layers.elementwise_add(logits, softmax_b) + logits = layers.reshape(logits, + shape=[-1, self.vocab_size_], + inplace=True) # correct predictions - labels_reshaped = fluid.layers.reshape(y, [-1]) - pred = fluid.layers.cast( - fluid.layers.argmax(projection, 1), dtype="int64") - correct_pred = fluid.layers.cast( - fluid.layers.equal(pred, labels_reshaped), dtype="int64") - self.prediction_ = pred - self.target_var_names_.append(pred) + labels_reshaped = layers.reshape(labels, [-1]) + pred = layers.cast(layers.argmax(logits, 1), dtype="int64") + correct_pred = layers.cast(layers.equal(pred, labels_reshaped), + dtype="int64") + self.pred_ = pred # predicting unknown is always considered wrong - unk_tensor = fluid.layers.fill_constant( - fluid.layers.shape(labels_reshaped), - value=unk_symbol, - dtype='int64') - pred_unk = fluid.layers.cast( - fluid.layers.equal(pred, unk_tensor), dtype="int64") - correct_unk = fluid.layers.elementwise_mul(pred_unk, correct_pred) + # only in paddle 1.8 + unk_tensor = layers.fill_constant(layers.shape(labels_reshaped), + value=self.unk_symbol_, + dtype='int64') + pred_unk = layers.cast(layers.equal(pred, unk_tensor), dtype="int64") + correct_unk = layers.elementwise_mul(pred_unk, correct_pred) # predicting padding is always considered wrong - pad_tensor = fluid.layers.fill_constant( - 
fluid.layers.shape(labels_reshaped), value=0, dtype='int64') - pred_pad = fluid.layers.cast( - fluid.layers.equal(pred, pad_tensor), dtype="int64") - correct_pad = fluid.layers.elementwise_mul(pred_pad, correct_pred) - - # acc - correct_count = fluid.layers.reduce_sum(correct_pred) \ + pad_tensor = layers.fill_constant(layers.shape(labels_reshaped), + value=self.pad_symbol_, + dtype='int64') + pred_pad = layers.cast(layers.equal(pred, pad_tensor), dtype="int64") + correct_pad = layers.elementwise_mul(pred_pad, correct_pred) + + # Reshape logits to be a 3-D tensor for sequence loss + logits = layers.reshape(logits, [-1, self.seq_len_, self.vocab_size_]) + + labels = layers.reshape(labels, [-1, self.seq_len_, 1]) + loss = layers.softmax_with_cross_entropy(logits=logits, + label=labels, + soft_label=False, + return_softmax=False) + sequence_mask = layers.reshape(sequence_mask_ph, + [-1, self.seq_len_, 1]) + loss = layers.reduce_mean(layers.elementwise_mul(loss, sequence_mask)) + + eval_metric_ops = fluid.layers.reduce_sum(correct_pred) \ - fluid.layers.reduce_sum(correct_unk) \ - fluid.layers.reduce_sum(correct_pad) - self.correct_ = correct_count - self.target_var_names_.append(correct_count) - - loss = layers.softmax_with_cross_entropy( - logits=projection, label=y, soft_label=False) - loss = layers.reshape(loss, shape=[-1, num_steps], inplace=True) - loss = layers.reduce_mean(loss, dim=[0]) - loss = layers.reduce_sum(loss) self.loss_ = loss - self.target_var_names_.append(loss) - - loss.persistable = True - - # This will feed last_hidden, last_cell to init_hidden, init_cell, which - # can be used directly in next batch. This can avoid the fetching of - # last_hidden and last_cell and feeding of init_hidden and init_cell in - # each training step. - #last_hidden = layers.transpose(last_hidden, perm=[1, 0, 2]) - #last_cell = layers.transpose(last_cell, perm=[1, 0, 2]) - #self.input_name_list_ = ['x', 'y', 'init_hidden', 'init_cell'] + self.correct_ = eval_metric_ops + self.input_name_list_ = [ + 'features', 'labels', 'seq_len_ph', 'seq_mask_ph', 'init_hidden', + 'init_cell' + ] + self.target_var_names_ = [ + self.loss_, self.last_hidden_, self.last_cell_, self.correct_ + ] self.program_ = fluid.default_main_program() self.startup_program_ = fluid.default_startup_program() - def _padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None): - weight_1_arr = [] - weight_2_arr = [] - bias_arr = [] - hidden_array = [] - cell_array = [] - mask_array = [] - hidden_size = self.hidden_size_ - init_scale = self.init_scale_ - dropout = slef.dropout_ - num_layers = self.num_layers_ - num_steps = self._num_steps_ - for i in range(num_layers): - weight_1 = layers.create_parameter( - [hidden_size * 2, hidden_size * 4], - dtype="float32", - name="fc_weight1_" + str(i), - default_initializer=fluid.initializer.UniformInitializer( - low=-init_scale, high=init_scale)) - weight_1_arr.append(weight_1) - bias_1 = layers.create_parameter( - [hidden_size * 4], - dtype="float32", - name="fc_bias1_" + str(i), - default_initializer=fluid.initializer.Constant(0.0)) - bias_arr.append(bias_1) - - pre_hidden = layers.slice( - init_hidden, axes=[0], starts=[i], ends=[i + 1]) - pre_cell = layers.slice( - init_cell, axes=[0], starts=[i], ends=[i + 1]) - pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size]) - pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size]) - hidden_array.append(pre_hidden) - cell_array.append(pre_cell) - - input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2]) - rnn
= PaddingRNN() - - with rnn.step(): - input = rnn.step_input(input_embedding) - for k in range(num_layers): - pre_hidden = rnn.memory(init=hidden_array[k]) - pre_cell = rnn.memory(init=cell_array[k]) - weight_1 = weight_1_arr[k] - bias = bias_arr[k] - - nn = layers.concat([input, pre_hidden], 1) - gate_input = layers.matmul(x=nn, y=weight_1) - - gate_input = layers.elementwise_add(gate_input, bias) - i = layers.slice( - gate_input, axes=[1], starts=[0], ends=[hidden_size]) - j = layers.slice( - gate_input, - axes=[1], - starts=[hidden_size], - ends=[hidden_size * 2]) - f = layers.slice( - gate_input, - axes=[1], - starts=[hidden_size * 2], - ends=[hidden_size * 3]) - o = layers.slice( - gate_input, - axes=[1], - starts=[hidden_size * 3], - ends=[hidden_size * 4]) - - c = pre_cell * layers.sigmoid(f) + layers.sigmoid( - i) * layers.tanh(j) - m = layers.tanh(c) * layers.sigmoid(o) - - rnn.update_memory(pre_hidden, m) - rnn.update_memory(pre_cell, c) - - rnn.step_output(m) - rnn.step_output(c) - - input = m - - if dropout != None and dropout > 0.0: - input = layers.dropout( - input, - dropout_prob=dropout, - dropout_implementation='upscale_in_train') - - rnn.step_output(input) - rnnout = rnn() - - last_hidden_array = [] - last_cell_array = [] - real_res = rnnout[-1] - for i in range(num_layers): - m = rnnout[i * 2] - c = rnnout[i * 2 + 1] - m.stop_gradient = True - c.stop_gradient = True - last_h = layers.slice( - m, axes=[0], starts=[num_steps - 1], ends=[num_steps]) - last_hidden_array.append(last_h) - last_c = layers.slice( - c, axes=[0], starts=[num_steps - 1], ends=[num_steps]) - last_cell_array.append(last_c) - real_res = layers.transpose(x=real_res, perm=[1, 0, 2]) - last_hidden = layers.concat(last_hidden_array, 0) - last_cell = layers.concat(last_cell_array, 0) - - return real_res, last_hidden, last_cell - - def _encoder_static(input_embedding, - len=3, - init_hidden=None, - init_cell=None): - weight_1_arr = [] - weight_2_arr = [] - bias_arr = [] - hidden_array = [] - cell_array = [] - mask_array = [] - hidden_size = self.hidden_size_ - init_scale = self.init_scale_ - dropout = slef.dropout_ - num_layers = self.num_layers_ - for i in range(num_layers): - weight_1 = layers.create_parameter( - [hidden_size * 2, hidden_size * 4], - dtype="float32", - name="fc_weight1_" + str(i), - default_initializer=fluid.initializer.UniformInitializer( - low=-init_scale, high=init_scale)) - weight_1_arr.append(weight_1) - bias_1 = layers.create_parameter( - [hidden_size * 4], - dtype="float32", - name="fc_bias1_" + str(i), - default_initializer=fluid.initializer.Constant(0.0)) - bias_arr.append(bias_1) - - pre_hidden = layers.slice( - init_hidden, axes=[0], starts=[i], ends=[i + 1]) - pre_cell = layers.slice( - init_cell, axes=[0], starts=[i], ends=[i + 1]) - pre_hidden = layers.reshape( - pre_hidden, shape=[-1, hidden_size], inplace=True) - pre_cell = layers.reshape( - pre_cell, shape=[-1, hidden_size], inplace=True) - hidden_array.append(pre_hidden) - cell_array.append(pre_cell) - - res = [] - sliced_inputs = layers.split( - input_embedding, num_or_sections=len, dim=1) - - for index in range(len): - input = sliced_inputs[index] - input = layers.reshape( - input, shape=[-1, hidden_size], inplace=True) - for k in range(num_layers): - pre_hidden = hidden_array[k] - pre_cell = cell_array[k] - weight_1 = weight_1_arr[k] - bias = bias_arr[k] - nn = layers.concat([input, pre_hidden], 1) - gate_input = layers.matmul(x=nn, y=weight_1) - - gate_input = layers.elementwise_add(gate_input, bias) - i, j, f, o = 
layers.split( - gate_input, num_or_sections=4, dim=-1) - - c = pre_cell * layers.sigmoid(f) + layers.sigmoid( - i) * layers.tanh(j) - m = layers.tanh(c) * layers.sigmoid(o) - - hidden_array[k] = m - cell_array[k] = c - input = m - - if dropout != None and dropout > 0.0: - input = layers.dropout( - input, - dropout_prob=dropout, - dropout_implementation='upscale_in_train') - - res.append(input) - - last_hidden = layers.concat(hidden_array, 1) - last_hidden = layers.reshape( - last_hidden, shape=[-1, num_layers, hidden_size], inplace=True) - last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2]) - - last_cell = layers.concat(cell_array, 1) - last_cell = layers.reshape( - last_cell, shape=[-1, num_layers, hidden_size]) - last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2]) - - real_res = layers.concat(res, 0) - real_res = layers.reshape( - real_res, shape=[len, -1, hidden_size], inplace=True) - real_res = layers.transpose(x=real_res, perm=[1, 0, 2]) - - return real_res, last_hidden, last_cell + def _build_rnn_graph(self, inputs, init_hidden, init_cell, + sequence_length_ph): + rnn_out, last_hidden, last_cell = basic_lstm( + input=inputs, + init_hidden=init_hidden, + init_cell=init_cell, + hidden_size=self.n_hidden_, + num_layers=self.num_layers_, + batch_first=True, + dropout_prob=self.dropout_prob_, + sequence_length=sequence_length_ph, + param_attr=ParamAttr( + initializer=fluid.initializer.UniformInitializer( + low=-self.init_scale_, high=self.init_scale_)), + bias_attr=ParamAttr(initializer=fluid.initializer.Constant(0.0)), + forget_bias=0.0) + return rnn_out, last_hidden, last_cell diff --git a/python/paddle_fl/mobile/reader/leaf_reddit_reader.py b/python/paddle_fl/mobile/reader/leaf_reddit_reader.py index ee2f926ce6b1da5041e528347f15341b74ad6c7e..fc3f7cd2f3ed68b2790fd76aeeb3b13aa429f255 100644 --- a/python/paddle_fl/mobile/reader/leaf_reddit_reader.py +++ b/python/paddle_fl/mobile/reader/leaf_reddit_reader.py @@ -108,7 +108,7 @@ def train_reader(lines): input_data, input_length = process_x(data_x, VOCAB) target_data = process_y(data_y, VOCAB) - yield [input_data] + [target_data] + yield [input_data] + [target_data] + [input_length] + [data_mask] return local_iter diff --git a/python/paddle_fl/mobile/trainer/language_model_trainer.py b/python/paddle_fl/mobile/trainer/language_model_trainer.py index 8ee382b03820566ef760b4f3ab0fb472d25ced1e..679d981d4ca737df4d8973c10e0718e179c0fb20 100644 --- a/python/paddle_fl/mobile/trainer/language_model_trainer.py +++ b/python/paddle_fl/mobile/trainer/language_model_trainer.py @@ -34,10 +34,10 @@ def train_one_user(arg_dict, trainer_config): max_training_steps = trainer_config["max_training_steps"] batch_size = trainer_config["batch_size"] # logging.info("training one user...") - main_program = fluid.Program.parse_from_string(trainer_config[ - "main_program_desc"]) - startup_program = fluid.Program.parse_from_string(trainer_config[ - "startup_program_desc"]) + main_program = fluid.Program.parse_from_string( + trainer_config["main_program_desc"]) + startup_program = fluid.Program.parse_from_string( + trainer_config["startup_program_desc"]) place = fluid.CPUPlace() exe = fluid.Executor(place) scope = fluid.global_scope() @@ -46,10 +46,9 @@ def train_one_user(arg_dict, trainer_config): exit() exe.run(startup_program) - feeder = fluid.DataFeeder( - feed_list=trainer_config["input_names"], - place=place, - program=main_program) + feeder = fluid.DataFeeder(feed_list=trainer_config["input_names"], + place=place, + program=main_program) 
data_server_endpoints = arg_dict["data_endpoints"] # create data clients data_client = DataClient() @@ -76,36 +75,43 @@ def train_one_user(arg_dict, trainer_config): epoch = trainer_config["epoch"] max_steps_in_epoch = trainer_config.get("max_steps_in_epoch", -1) metrics = trainer_config["metrics"] - metric_keys = metrics.keys() - fetch_list = [main_program.global_block().var(trainer_config["loss_name"])] - for key in metric_keys: - fetch_list.append(main_program.global_block().var(metrics[key])) + fetch_list = [] + for var in trainer_config["target_names"]: + fetch_list.append(var) - seq_len = 10 for ei in range(epoch): + fetch_res_list = [] trained_sample_num = 0 step = 0 - fetch_res_list = [] - total_loss = 0.0 - total_correct = 0 + num_layers = trainer_config["num_layers"] + hidden_size = trainer_config["n_hidden"] + tot_loss, tot_correct = 0, 0 + tot_samples = 0 + init_hidden, init_cell = generate_init_data(batch_size, num_layers, + hidden_size) for data in train_reader(): + feed_data, input_lengths = prepare_input(batch_size, data, + init_hidden, init_cell) fetch_res = exe.run(main_program, - feed=feeder.feed(data), + feed=feeder.feed(feed_data), fetch_list=fetch_list) + loss, last_hidden, last_cell, correct = fetch_res + + init_hidden = np.array(last_hidden) + init_cell = np.array(last_cell) + tot_loss += np.array(loss) + tot_correct += np.array(correct) + tot_samples += np.sum(input_lengths) step += 1 trained_sample_num += len(data) - fetch_res_list.append([x[0] for x in fetch_res]) + fetch_res_list.append([np.array(loss), np.array(correct)]) if max_steps_in_epoch != -1 and step >= max_steps_in_epoch: break if show_metric and trained_sample_num > 0: - loss = sum([x[0] for x in fetch_res_list]) / trained_sample_num - print("loss: {}, ppl: {}".format(loss, np.exp(loss))) - for i, key in enumerate(metric_keys): - if key == "correct": - value = float(sum([x[i + 1] for x in fetch_res_list - ])) / trained_sample_num - print("correct: {}".format(value / seq_len)) + loss = tot_loss / step + acc = float(tot_correct) / tot_samples + print("loss: {}, acc: {}".format(loss, acc)) local_updated_param_dict = {} # update user param @@ -142,10 +148,10 @@ def infer_one_user(arg_dict, trainer_config): # run startup program, set params uid = arg_dict["uid"] batch_size = trainer_config["batch_size"] - startup_program = fluid.Program.parse_from_string(trainer_config[ - "startup_program_desc"]) - infer_program = fluid.Program.parse_from_string(trainer_config[ - "infer_program_desc"]) + startup_program = fluid.Program.parse_from_string( + trainer_config["startup_program_desc"]) + infer_program = fluid.Program.parse_from_string( + trainer_config["infer_program_desc"]) place = fluid.CPUPlace() exe = fluid.Executor(place) scope = fluid.global_scope() @@ -169,7 +175,6 @@ def infer_one_user(arg_dict, trainer_config): arg_dict["global_params"], scope) # reader - date = arg_dict["date"] global_param_dict = arg_dict["global_params"] user_data = data_client.get_data_by_uid(uid, date) @@ -179,36 +184,60 @@ def infer_one_user(arg_dict, trainer_config): # run infer program os.mkdir(arg_dict["infer_result_dir"]) #pred_file = open(arg_dict["infer_result_dir"] + '/' + "pred_file", "w") - feeder = fluid.DataFeeder( - feed_list=trainer_config["input_names"], - place=place, - program=infer_program) + feeder = fluid.DataFeeder(feed_list=trainer_config["input_names"], + place=place, + program=infer_program) fetch_list = trainer_config["target_names"] #logging.info("fetch_list: {}".format(fetch_list)) fetch_res = [] 
sample_count = 0 - total_loss = 0.0 - total_correct = 0 - iters = 0 - steps = 0 - seq_len = 10 + num_layers = trainer_config["num_layers"] + hidden_size = trainer_config["n_hidden"] + tot_correct, tot_loss = 0, 0 + tot_samples, tot_batches = 0, 0 + init_hidden, init_cell = generate_init_data(batch_size, num_layers, + hidden_size) for data in infer_reader(): - # feed_data = [x["features"] + [x["label"]] for x in data] - # prediction, acc_val= exe.run(infer_program, - pred, correct_count, loss = exe.run(infer_program, - feed=feeder.feed(data), - fetch_list=fetch_list) - total_loss += loss - total_correct += correct_count - steps += 1 - sample_count += len(data) - - correct = float(total_correct) / (seq_len * sample_count) - # logging.info("correct: {}".format(correct)) + feed_data, input_lengths = prepare_input(batch_size, data, init_hidden, + init_cell) + fetch_res = exe.run(infer_program, + feed=feeder.feed(feed_data), + fetch_list=fetch_list) + loss, last_hidden, last_cell, correct = fetch_res + + cost_eval = np.array(loss) + init_hidden = np.array(last_hidden) + init_cell = np.array(last_cell) + correct_val = np.array(correct) + tot_loss += cost_eval + tot_correct += correct_val + tot_samples += np.sum(input_lengths) + tot_batches += 1 + + loss = tot_loss / tot_batches + acc = float(tot_correct) / tot_samples + logging.info("infer acc: {}".format(acc)) with open(arg_dict["infer_result_dir"] + "/res", "w") as f: - f.write("%d\t%f\n" % (1, correct)) + f.write("%d\t%f\n" % (1, acc)) + + +def prepare_input(batch_size, data, init_hidden, init_cell): + init_hidden = np.split(init_hidden, batch_size) + init_cell = np.split(init_cell, batch_size) + data = [[features] + [labels] + [seq_len_ph] + [seq_mask_ph] + [init_hidden[i]] + [init_cell[i] ] \ + for i, (features, labels, seq_len_ph, seq_mask_ph) in enumerate(data)] + input_lengths = [x[2] for x in data] + return data, input_lengths + + +def generate_init_data(batch_size, num_layers, hidden_size): + init_hidden = np.zeros((batch_size, num_layers, hidden_size), + dtype='float32') + init_cell = np.zeros((batch_size, num_layers, hidden_size), + dtype='float32') + return init_hidden, init_cell def save_and_upload(arg_dict, trainer_config, dfs_upload_path): @@ -219,7 +248,6 @@ def save_and_upload(arg_dict, trainer_config, dfs_upload_path): def evaluate_a_group(group): group_list = [] for label, pred, _ in group: - # print("%s\t%s\n" % (label, pred)) group_list.append((int(label), float(pred))) random.shuffle(group_list) labels = [x[0] for x in group_list] @@ -236,7 +264,6 @@ class LanguageModelTrainer(TrainerBase): """ LanguageModelTrainer only support training with PaddlePaddle """ - def __init__(self): super(LanguageModelTrainer, self).__init__() self.main_program_ = fluid.Program() @@ -270,10 +297,13 @@ class LanguageModelTrainer(TrainerBase): """ with fluid.program_guard(self.main_program_, self.startup_program_): self.input_model_ = LanguageModel() - model_configs = {} + model_configs = self.trainer_config self.input_model_.build_model(model_configs) + optimizer = fluid.optimizer.SGD( - learning_rate=self.trainer_config["lr"]) + learning_rate=self.trainer_config["lr"], + grad_clip=fluid.clip.GradientClipByGlobalNorm( + clip_norm=self.trainer_config["max_grad_norm"])) optimizer.minimize(self.input_model_.get_model_loss()) self.main_program_desc_ = self.main_program_.desc.serialize_to_string() @@ -283,13 +313,16 @@ class LanguageModelTrainer(TrainerBase): self.input_model_.get_model_loss_name()) self.update_trainer_configs( "input_names", - 
self.input_model_.get_model_input_names(), ) + self.input_model_.get_model_input_names(), + ) self.update_trainer_configs( "target_names", - self.input_model_.get_target_names(), ) + self.input_model_.get_target_names(), + ) self.update_trainer_configs( "metrics", - self.input_model_.get_model_metrics(), ) + self.input_model_.get_model_metrics(), + ) self.update_trainer_configs("show_metric", True) self.update_trainer_configs("max_training_steps", "inf") self.update_trainer_configs("shuffle", False)
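The new training and inference loops carry the LSTM state across batches via `generate_init_data()` and `prepare_input()`, feeding back `last_hidden`/`last_cell` as the next batch's `init_hidden`/`init_cell`. Below is a minimal, Paddle-free NumPy sketch of that state-carrying pattern; `fake_run()` is a hypothetical stand-in for `exe.run()` that only echoes shapes so the loop executes end to end.

```python
# Standalone NumPy sketch (no PaddlePaddle) of the recurrent-state handling
# added in this patch. fake_run() is hypothetical; it mimics the fetch list
# (loss, last_hidden, last_cell, correct) returned by exe.run().
import numpy as np


def generate_init_data(batch_size, num_layers, hidden_size):
    # Zero initial hidden/cell states, one slice per sample in the batch.
    init_hidden = np.zeros((batch_size, num_layers, hidden_size), dtype="float32")
    init_cell = np.zeros((batch_size, num_layers, hidden_size), dtype="float32")
    return init_hidden, init_cell


def prepare_input(batch_size, data, init_hidden, init_cell):
    # Split the batched state into per-sample arrays and append them to each
    # (features, labels, seq_len, seq_mask) record, matching input_name_list_.
    init_hidden = np.split(init_hidden, batch_size)
    init_cell = np.split(init_cell, batch_size)
    feed_data = [
        [features, labels, seq_len, seq_mask, init_hidden[i], init_cell[i]]
        for i, (features, labels, seq_len, seq_mask) in enumerate(data)
    ]
    input_lengths = [x[2] for x in feed_data]
    return feed_data, input_lengths


def fake_run(feed_data, num_layers, hidden_size):
    # Hypothetical stand-in for exe.run(main_program, feed=..., fetch_list=...).
    batch_size = len(feed_data)
    last_hidden = np.random.rand(batch_size, num_layers, hidden_size).astype("float32")
    last_cell = np.random.rand(batch_size, num_layers, hidden_size).astype("float32")
    return np.float32(2.3), last_hidden, last_cell, np.int64(7)


if __name__ == "__main__":
    batch_size, num_layers, hidden_size, seq_len = 5, 2, 256, 10
    # Toy reader: three batches of (features, labels, seq_len, seq_mask) tuples.
    reader = [
        [(np.zeros(seq_len, dtype="int64"), np.zeros(seq_len, dtype="int64"),
          seq_len, np.ones(seq_len, dtype="float32")) for _ in range(batch_size)]
        for _ in range(3)
    ]
    init_hidden, init_cell = generate_init_data(batch_size, num_layers, hidden_size)
    tot_loss, tot_correct, tot_samples = 0.0, 0, 0
    for data in reader:
        feed_data, input_lengths = prepare_input(batch_size, data, init_hidden, init_cell)
        loss, last_hidden, last_cell, correct = fake_run(feed_data, num_layers, hidden_size)
        # Carry the LSTM state into the next batch, as the new trainer loop does.
        init_hidden, init_cell = np.array(last_hidden), np.array(last_cell)
        tot_loss += float(loss)
        tot_correct += int(correct)
        tot_samples += int(np.sum(input_lengths))
    print("loss: {}, acc: {}".format(tot_loss / len(reader), float(tot_correct) / tot_samples))
```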
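The model's `correct` metric keeps the rule from the original code: a prediction only counts if it matches the label and is neither the `<unk>` symbol (id 1) nor the padding symbol (id 0). A small NumPy illustration of that rule, assuming no Paddle dependency:

```python
# NumPy sketch of the accuracy rule: correct_pred minus correct_unk minus
# correct_pad, mirroring eval_metric_ops in language_model.py.
import numpy as np


def count_correct(pred, labels, unk_symbol=1, pad_symbol=0):
    correct_pred = (pred == labels).astype(np.int64)
    correct_unk = (pred == unk_symbol).astype(np.int64) * correct_pred
    correct_pad = (pred == pad_symbol).astype(np.int64) * correct_pred
    return int(correct_pred.sum() - correct_unk.sum() - correct_pad.sum())


pred = np.array([3, 1, 0, 7], dtype=np.int64)
labels = np.array([3, 1, 0, 5], dtype=np.int64)
print(count_correct(pred, labels))  # 1: only the first token counts
```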