# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import math import numpy as np import paddle.v2 as paddle import paddle.v2.dataset.conll05 as conll05 import paddle.fluid as fluid import time import os word_dict, verb_dict, label_dict = conll05.get_dict() word_dict_len = len(word_dict) label_dict_len = len(label_dict) pred_len = len(verb_dict) mark_dict_len = 2 word_dim = 32 mark_dim = 5 hidden_dim = 512 depth = 8 mix_hidden_lr = 1e-3 IS_SPARSE = True PASS_NUM = 10 BATCH_SIZE = 20 embedding_name = 'emb' def load_parameter(file_name, h, w): with open(file_name, 'rb') as f: f.read(16) # skip header. return np.fromfile(f, dtype=np.float32).reshape(h, w) def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, **ignored): # 8 features predicate_embedding = fluid.layers.embedding( input=predicate, size=[pred_len, word_dim], dtype='float32', is_sparse=IS_SPARSE, param_attr='vemb') mark_embedding = fluid.layers.embedding( input=mark, size=[mark_dict_len, mark_dim], dtype='float32', is_sparse=IS_SPARSE) word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] emb_layers = [ fluid.layers.embedding( size=[word_dict_len, word_dim], input=x, param_attr=fluid.ParamAttr( name=embedding_name, trainable=False)) for x in word_input ] emb_layers.append(predicate_embedding) emb_layers.append(mark_embedding) hidden_0_layers = [ fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers ] hidden_0 = fluid.layers.sums(input=hidden_0_layers) lstm_0 = fluid.layers.dynamic_lstm( input=hidden_0, size=hidden_dim, candidate_activation='relu', gate_activation='sigmoid', cell_activation='sigmoid') # stack L-LSTM and R-LSTM with direct edges input_tmp = [hidden_0, lstm_0] for i in range(1, depth): mix_hidden = fluid.layers.sums(input=[ fluid.layers.fc(input=input_tmp[0], size=hidden_dim), fluid.layers.fc(input=input_tmp[1], size=hidden_dim) ]) lstm = fluid.layers.dynamic_lstm( input=mix_hidden, size=hidden_dim, candidate_activation='relu', gate_activation='sigmoid', cell_activation='sigmoid', is_reverse=((i % 2) == 1)) input_tmp = [mix_hidden, lstm] feature_out = fluid.layers.sums(input=[ fluid.layers.fc(input=input_tmp[0], size=label_dict_len), fluid.layers.fc(input=input_tmp[1], size=label_dict_len) ]) return feature_out def to_lodtensor(data, place): seq_lens = [len(seq) for seq in data] cur_len = 0 lod = [cur_len] for l in seq_lens: cur_len += l lod.append(cur_len) flattened_data = np.concatenate(data, axis=0).astype("int64") flattened_data = flattened_data.reshape([len(flattened_data), 1]) res = fluid.LoDTensor() res.set(flattened_data, place) res.set_lod([lod]) return res def main(): # define network topology word = fluid.layers.data( name='word_data', shape=[1], dtype='int64', lod_level=1) predicate = fluid.layers.data( name='verb_data', shape=[1], dtype='int64', lod_level=1) ctx_n2 = fluid.layers.data( name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) ctx_n1 = fluid.layers.data( name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) ctx_0 = fluid.layers.data( name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) ctx_p1 = fluid.layers.data( name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) ctx_p2 = fluid.layers.data( name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) mark = fluid.layers.data( name='mark_data', shape=[1], dtype='int64', lod_level=1) feature_out = db_lstm(**locals()) target = fluid.layers.data( name='target', shape=[1], dtype='int64', lod_level=1) crf_cost = fluid.layers.linear_chain_crf( input=feature_out, label=target, param_attr=fluid.ParamAttr( name='crfw', learning_rate=mix_hidden_lr)) avg_cost = fluid.layers.mean(crf_cost) # TODO(qiao) # check other optimizers and check why out will be NAN sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001) optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) # TODO(qiao) # add dependency track and move this config before optimizer crf_decode = fluid.layers.crf_decoding( input=feature_out, param_attr=fluid.ParamAttr(name='crfw')) chunk_evaluator = fluid.evaluator.ChunkEvaluator( input=crf_decode, label=target, chunk_scheme="IOB", num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0))) train_data = paddle.batch( paddle.reader.shuffle( paddle.dataset.conll05.test(), buf_size=8192), batch_size=BATCH_SIZE) place = fluid.CPUPlace() feeder = fluid.DataFeeder( feed_list=[ word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target ], place=place) exe = fluid.Executor(place) t = fluid.DistributeTranspiler() pserver_endpoints = os.getenv("PSERVERS") # server endpoint for current node current_endpoint = os.getenv("SERVER_ENDPOINT") # run as trainer or parameter server training_role = os.getenv( "TRAINING_ROLE", "TRAINER") # get the training role: trainer/pserver t.transpile( optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) if training_role == "PSERVER": if not current_endpoint: print("need env SERVER_ENDPOINT") exit(1) pserver_prog = t.get_pserver_program(current_endpoint) pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) exe.run(pserver_startup) exe.run(pserver_prog) elif training_role == "TRAINER": trainer_prog = t.get_trainer_program() start_time = time.time() batch_id = 0 exe.run(fluid.default_startup_program()) embedding_param = fluid.global_scope().find_var( embedding_name).get_tensor() embedding_param.set( load_parameter(conll05.get_embedding(), word_dict_len, word_dim), place) for pass_id in xrange(PASS_NUM): chunk_evaluator.reset(exe) for data in train_data(): cost, precision, recall, f1_score = exe.run( trainer_prog, feed=feeder.feed(data), fetch_list=[avg_cost] + chunk_evaluator.metrics) pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval( exe) if batch_id % 10 == 0: print("avg_cost:" + str(cost) + " precision:" + str( precision) + " recall:" + str(recall) + " f1_score:" + str(f1_score) + " pass_precision:" + str( pass_precision) + " pass_recall:" + str( pass_recall) + " pass_f1_score:" + str( pass_f1_score)) if batch_id != 0: print("second per batch: " + str((time.time( ) - start_time) / batch_id)) batch_id = batch_id + 1 if __name__ == '__main__': main()