From 43b2bccd07230691f268fdf24296024bce162807 Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Tue, 2 May 2017 21:30:52 +0800
Subject: [PATCH] add network configuration and train script

---
 word_embedding/network_conf.py | 71 ++++++++++++++++++++++++++++++++++
 word_embedding/train_v2.py     | 44 +++++++++++++++++++++
 2 files changed, 115 insertions(+)
 create mode 100644 word_embedding/network_conf.py
 create mode 100644 word_embedding/train_v2.py

diff --git a/word_embedding/network_conf.py b/word_embedding/network_conf.py
new file mode 100644
index 00000000..9cc39ebf
--- /dev/null
+++ b/word_embedding/network_conf.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import math
+import paddle.v2 as paddle
+
+
+def network_conf(hidden_size, embed_size, dict_size):
+    def word_embed(in_layer):
+        ''' word embedding layer '''
+        word_embed = paddle.layer.table_projection(
+            input=in_layer,
+            size=embed_size,
+            param_attr=paddle.attr.Param(
+                name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0))
+        return word_embed
+
+    first_word = paddle.layer.data(
+        name='firstw', type=paddle.data_type.integer_value(dict_size))
+    second_word = paddle.layer.data(
+        name='secondw', type=paddle.data_type.integer_value(dict_size))
+    third_word = paddle.layer.data(
+        name='thirdw', type=paddle.data_type.integer_value(dict_size))
+    fourth_word = paddle.layer.data(
+        name='fourthw', type=paddle.data_type.integer_value(dict_size))
+    target_word = paddle.layer.data(
+        name='fifthw', type=paddle.data_type.integer_value(dict_size))
+
+    first_word_embed = word_embed(first_word)
+    second_word_embed = word_embed(second_word)
+    third_word_embed = word_embed(third_word)
+    fourth_word_embed = word_embed(fourth_word)
+
+    context_embed = paddle.layer.concat(input=[
+        first_word_embed, second_word_embed, third_word_embed, fourth_word_embed
+    ])
+
+    hidden_layer = paddle.layer.fc(
+        input=context_embed,
+        size=hidden_size,
+        act=paddle.activation.Sigmoid(),
+        layer_attr=paddle.attr.Extra(drop_rate=0.5),
+        bias_attr=paddle.attr.Param(learning_rate=2),
+        param_attr=paddle.attr.Param(
+            initial_std=1. / math.sqrt(embed_size * 8), learning_rate=1))
+
+    with paddle.layer.mixed(
+            size=dict_size - 1,
+            act=paddle.activation.Sigmoid(),
+            bias_attr=paddle.attr.Param(name='sigmoid_b')) as prediction:
+        prediction += paddle.layer.trans_full_matrix_projection(
+            input=hidden_layer, param_attr=paddle.attr.Param(name='sigmoid_w'))
+
+    cost = paddle.layer.hsigmoid(
+        input=hidden_layer,
+        label=target_word,
+        num_classes=dict_size,
+        param_attr=paddle.attr.Param(name='sigmoid_w'),
+        bias_attr=paddle.attr.Param(name='sigmoid_b'))
+
+    parameters = paddle.parameters.create([cost, prediction])
+
+    adam_optimizer = paddle.optimizer.Adam(
+        learning_rate=3e-3,
+        regularization=paddle.optimizer.L2Regularization(8e-4))
+
+    input_data_lst = ['firstw', 'secondw', 'thirdw', 'fourthw', 'fifthw']
+
+    trainer = paddle.trainer.SGD(cost, parameters, adam_optimizer)
+
+    return input_data_lst, trainer, prediction, parameters
diff --git a/word_embedding/train_v2.py b/word_embedding/train_v2.py
new file mode 100644
index 00000000..1c14f4fa
--- /dev/null
+++ b/word_embedding/train_v2.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import paddle.v2 as paddle
+from network_conf import network_conf
+import gzip
+
+
+def main():
+    paddle.init(use_gpu=False, trainer_count=1)
+    word_dict = paddle.dataset.imikolov.build_dict()
+    dict_size = len(word_dict)
+    input_data_lst, trainer, _, parameters = network_conf(
+        hidden_size=256, embed_size=32, dict_size=dict_size)
+
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndPass):
+            model_name = './models/model_pass_%05d.tar.gz' % event.pass_id
+            print("Save model into %s ..." % model_name)
+            with gzip.open(model_name, 'w') as f:
+                parameters.to_tar(f)
+
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                result = trainer.test(
+                    paddle.batch(
+                        paddle.dataset.imikolov.test(word_dict, 5), 32))
+                print "Pass %d, Batch %d, Cost %f, Test Cost %f" % (
+                    event.pass_id, event.batch_id, event.cost, result.cost)
+
+    feeding = dict(zip(input_data_lst, xrange(len(input_data_lst))))
+
+    trainer.train(
+        paddle.batch(
+            paddle.reader.shuffle(
+                lambda: paddle.dataset.imikolov.train(word_dict, 5)(),
+                buf_size=1000), 64),
+        num_passes=30,
+        event_handler=event_handler,
+        feeding=feeding)
+
+
+if __name__ == '__main__':
+    main()
-- 
GitLab