diff --git a/demo/semantic_role_labeling/api_train_v2.py b/demo/semantic_role_labeling/api_train_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..e946a792b5f51ab54355aac0b6e9aef51ae815fb --- /dev/null +++ b/demo/semantic_role_labeling/api_train_v2.py @@ -0,0 +1,175 @@ +import sys +import math +import numpy as np +import paddle.v2 as paddle +import paddle.v2.dataset.conll05 as conll05 + + +def db_lstm(): + word_dict, verb_dict, label_dict = conll05.get_dict() + word_dict_len = len(word_dict) + label_dict_len = len(label_dict) + pred_len = len(verb_dict) + + mark_dict_len = 2 + word_dim = 32 + mark_dim = 5 + hidden_dim = 512 + depth = 8 + + #8 features + def d_type(size): + return paddle.data_type.integer_value_sequence(size) + + word = paddle.layer.data(name='word_data', type=d_type(word_dict_len)) + predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len)) + + ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len)) + ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len)) + ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len)) + ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len)) + ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len)) + mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len)) + + target = paddle.layer.data(name='target', type=d_type(label_dict_len)) + + default_std = 1 / math.sqrt(hidden_dim) / 3.0 + + emb_para = paddle.attr.Param(name='emb', initial_std=0., learning_rate=0.) + std_0 = paddle.attr.Param(initial_std=0.) + std_default = paddle.attr.Param(initial_std=default_std) + + predicate_embedding = paddle.layer.embedding( + size=word_dim, + input=predicate, + param_attr=paddle.attr.Param( + name='vemb', initial_std=default_std)) + mark_embedding = paddle.layer.embedding( + size=mark_dim, input=mark, param_attr=std_0) + + word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] + emb_layers = [ + paddle.layer.embedding( + size=word_dim, input=x, param_attr=emb_para) for x in word_input + ] + emb_layers.append(predicate_embedding) + emb_layers.append(mark_embedding) + + hidden_0 = paddle.layer.mixed( + size=hidden_dim, + bias_attr=std_default, + input=[ + paddle.layer.full_matrix_projection( + input=emb, param_attr=std_default) for emb in emb_layers + ]) + + mix_hidden_lr = 1e-3 + lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0) + hidden_para_attr = paddle.attr.Param( + initial_std=default_std, learning_rate=mix_hidden_lr) + + lstm_0 = paddle.layer.lstmemory( + input=hidden_0, + act=paddle.activation.Relu(), + gate_act=paddle.activation.Sigmoid(), + state_act=paddle.activation.Sigmoid(), + bias_attr=std_0, + param_attr=lstm_para_attr) + + #stack L-LSTM and R-LSTM with direct edges + input_tmp = [hidden_0, lstm_0] + + for i in range(1, depth): + mix_hidden = paddle.layer.mixed( + size=hidden_dim, + bias_attr=std_default, + input=[ + paddle.layer.full_matrix_projection( + input=input_tmp[0], param_attr=hidden_para_attr), + paddle.layer.full_matrix_projection( + input=input_tmp[1], param_attr=lstm_para_attr) + ]) + + lstm = paddle.layer.lstmemory( + input=mix_hidden, + act=paddle.activation.Relu(), + gate_act=paddle.activation.Sigmoid(), + state_act=paddle.activation.Sigmoid(), + reverse=((i % 2) == 1), + bias_attr=std_0, + param_attr=lstm_para_attr) + + input_tmp = [mix_hidden, lstm] + + feature_out = paddle.layer.mixed( + size=label_dict_len, + bias_attr=std_default, + input=[ + paddle.layer.full_matrix_projection( + input=input_tmp[0], param_attr=hidden_para_attr), + paddle.layer.full_matrix_projection( + input=input_tmp[1], param_attr=lstm_para_attr) + ], ) + + crf_cost = paddle.layer.crf(size=label_dict_len, + input=feature_out, + label=target, + param_attr=paddle.attr.Param( + name='crfw', + initial_std=default_std, + learning_rate=mix_hidden_lr)) + + crf_dec = paddle.layer.crf_decoding( + name='crf_dec_l', + size=label_dict_len, + input=feature_out, + label=target, + param_attr=paddle.attr.Param(name='crfw')) + + return crf_cost, crf_dec + + +def load_parameter(file_name, h, w): + with open(file_name, 'rb') as f: + f.read(16) # skip header. + return np.fromfile(f, dtype=np.float32).reshape(h, w) + + +def main(): + paddle.init(use_gpu=False, trainer_count=1) + + # define network topology + crf_cost, crf_dec = db_lstm() + + # create parameters + parameters = paddle.parameters.create([crf_cost, crf_dec]) + + # create optimizer + optimizer = paddle.optimizer.Momentum( + momentum=0, + learning_rate=2e-2, + regularization=paddle.optimizer.L2Regularization(rate=8e-4), + model_average=paddle.optimizer.ModelAverage( + average_window=0.5, max_average_window=10000), ) + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + + trainer = paddle.trainer.SGD(cost=crf_cost, + parameters=parameters, + update_equation=optimizer) + parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32)) + + trn_reader = paddle.reader.batched( + paddle.reader.shuffle( + conll05.test(), buf_size=8192), batch_size=10) + + trainer.train( + reader=trn_reader, event_handler=event_handler, num_passes=10000) + + +if __name__ == '__main__': + main() diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py index 7c999752af22a58aca4ee9a504a909d97c53498d..a1b21bab3bac8b304abb4ae292b1c1e9f3e719de 100644 --- a/python/paddle/v2/dataset/__init__.py +++ b/python/paddle/v2/dataset/__init__.py @@ -17,5 +17,6 @@ import imikolov import imdb import cifar import movielens +import conll05 -__all__ = ['mnist', 'imikolov', 'imdb', 'cifar', 'movielens'] +__all__ = ['mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05'] diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py index 7874161a05968921a8e6789a7ff6a229d6cc6c01..e96a701c1a944e2d6d84f897157cb357c5aa0824 100644 --- a/python/paddle/v2/dataset/conll05.py +++ b/python/paddle/v2/dataset/conll05.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.v2.dataset.common import tarfile import gzip import itertools +from common import download __all__ = ['test, get_dict', 'get_embedding'] """ @@ -160,7 +160,6 @@ def reader_creator(corpus_reader, ctx_p2 = 'eos' word_idx = [word_dict.get(w, UNK_IDX) for w in sentence] - pred_idx = [predicate_dict.get(predicate)] * sen_len ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len @@ -168,38 +167,30 @@ def reader_creator(corpus_reader, ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len + pred_idx = [predicate_dict.get(predicate)] * sen_len label_idx = [label_dict.get(w) for w in labels] - yield word_idx, pred_idx, ctx_n2_idx, ctx_n1_idx, \ - ctx_0_idx, ctx_p1_idx, ctx_p2_idx, mark, label_idx + yield word_idx, ctx_n2_idx, ctx_n1_idx, \ + ctx_0_idx, ctx_p1_idx, ctx_p2_idx, pred_idx, mark, label_idx - return reader() + return reader def get_dict(): - word_dict = load_dict( - common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)) - verb_dict = load_dict( - common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)) - label_dict = load_dict( - common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)) + word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)) + verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)) + label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)) return word_dict, verb_dict, label_dict def get_embedding(): - return common.download(EMB_URL, 'conll05st', EMB_MD5) + return download(EMB_URL, 'conll05st', EMB_MD5) def test(): word_dict, verb_dict, label_dict = get_dict() reader = corpus_reader( - common.download(DATA_URL, 'conll05st', DATA_MD5), + download(DATA_URL, 'conll05st', DATA_MD5), words_name='conll05st-release/test.wsj/words/test.wsj.words.gz', props_name='conll05st-release/test.wsj/props/test.wsj.props.gz') return reader_creator(reader, word_dict, verb_dict, label_dict) - - -if __name__ == '__main__': - print get_embedding() - for f in test(): - print f