From fcd74e06b8f8ed1e7cd13a0255f207f25e638992 Mon Sep 17 00:00:00 2001 From: QI JUN Date: Mon, 23 Oct 2017 12:45:17 -0700 Subject: [PATCH] add book04.word2vec train test (#5002) * init * ensure ids in lookup table op must be a column vector * add book4 configuration in test_layers * debug test_book4 * add test_word2vec * follow comments * follow comments --- paddle/framework/var_desc.cc | 4 + paddle/framework/var_desc.h | 4 +- paddle/pybind/protobuf.cc | 1 + python/paddle/v2/framework/framework.py | 7 +- python/paddle/v2/framework/layer_helper.py | 5 +- python/paddle/v2/framework/layers.py | 35 +++- .../paddle/v2/framework/tests/test_layers.py | 71 ++++++++ .../v2/framework/tests/test_word2vec.py | 165 ++++++++++++++++++ 8 files changed, 282 insertions(+), 10 deletions(-) create mode 100644 python/paddle/v2/framework/tests/test_word2vec.py diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc index c302217e5aa..8e92c81d113 100644 --- a/paddle/framework/var_desc.cc +++ b/paddle/framework/var_desc.cc @@ -18,6 +18,10 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +VarDesc::VarType VarDescBind::GetType() const { return desc_.type(); } + +void VarDescBind::SetType(VarDesc::VarType type) { desc_.set_type(type); } + void VarDescBind::SetShape(const std::vector &dims) { VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims()); } diff --git a/paddle/framework/var_desc.h b/paddle/framework/var_desc.h index af4c26ca0a7..929de1f836f 100644 --- a/paddle/framework/var_desc.h +++ b/paddle/framework/var_desc.h @@ -75,9 +75,9 @@ class VarDescBind { int32_t GetLodLevel() const; - VarDesc::VarType GetType() const { return desc_.type(); } + VarDesc::VarType GetType() const; - void SetType(VarDesc::VarType type) { desc_.set_type(type); } + void SetType(VarDesc::VarType type); bool Persistable() const { return desc_.persistable(); } diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index 405ac544e10..5d43ecea112 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -257,6 +257,7 @@ void BindOpDesc(py::module &m) { .def("block_attr", &OpDescBind::GetBlockAttr) .def("check_attrs", &OpDescBind::CheckAttrs) .def("infer_shape", &OpDescBind::InferShape) + .def("infer_var_type", &OpDescBind::InferVarType) .def("serialize_to_string", [](OpDescBind &op_desc) -> py::bytes { const OpDesc *desc = op_desc.Proto(); PADDLE_ENFORCE(desc->IsInitialized(), diff --git a/python/paddle/v2/framework/framework.py b/python/paddle/v2/framework/framework.py index 03a3dacf25c..1a42de3a9ba 100644 --- a/python/paddle/v2/framework/framework.py +++ b/python/paddle/v2/framework/framework.py @@ -53,8 +53,8 @@ class Variable(object): if is_new_var: self.desc.set_data_type(dtype) else: - old_dtype = self.data_type() - if dtype != old_shape: + old_dtype = self.data_type + if dtype != old_dtype: raise ValueError("Variable {0} has been created before. " "The previous data type is {1}; the new " "data type is {2}. 
They are not " @@ -191,7 +191,6 @@ class Operator(object): "`type` to initilized an Operator can not be None.") self.desc.set_type(type) proto = OpProtoHolder.instance().get_op_proto(type) - if inputs is not None: given = set() need = set() @@ -206,6 +205,7 @@ class Operator(object): str(e) for e in given))) for in_proto in proto.inputs: + in_argus = inputs[in_proto.name] if not isinstance(in_argus, list): in_argus = [in_argus] @@ -257,6 +257,7 @@ class Operator(object): self.desc.check_attrs() if type not in {'feed', 'fetch'}: + self.desc.infer_var_type(self.block.desc) self.desc.infer_shape(self.block.desc) def __str__(self): diff --git a/python/paddle/v2/framework/layer_helper.py b/python/paddle/v2/framework/layer_helper.py index 849a6f43065..5e14f39e336 100644 --- a/python/paddle/v2/framework/layer_helper.py +++ b/python/paddle/v2/framework/layer_helper.py @@ -120,10 +120,7 @@ class LayerHelper(object): if attr['name'] is None: attr['name'] = unique_name(".".join([self.name, suffix])) self.init_program.global_block().create_parameter( - name=attr['name'], - dtype=dtype, - shape=shape, - init_attr=attr['init_attr']) + dtype=dtype, shape=shape, **attr) return self.program.global_block().create_parameter( name=attr['name'], dtype=dtype, shape=shape) diff --git a/python/paddle/v2/framework/layers.py b/python/paddle/v2/framework/layers.py index ac77aefa153..b7e914d734f 100644 --- a/python/paddle/v2/framework/layers.py +++ b/python/paddle/v2/framework/layers.py @@ -3,7 +3,9 @@ import paddle.v2.framework.core as core from paddle.v2.framework.framework import OpProtoHolder, Variable import re -__all__ = ['fc', 'data', 'cross_entropy', 'conv2d', 'pool2d'] +__all__ = [ + 'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat' +] def fc(input, @@ -55,6 +57,24 @@ def fc(input, return helper.append_activation(pre_activation) +def embedding(input, + size, + data_type='float32', + param_attr=None, + program=None, + init_program=None): + helper = 
LayerHelper('embedding', **locals()) + w = helper.create_parameter( + attr=helper.param_attr, shape=size, dtype=data_type) + tmp = helper.create_tmp_variable(data_type) + helper.append_op( + type='lookup_table', + inputs={'Ids': input, + 'W': w}, + outputs={'Out': tmp}) + return tmp + + def data(name, shape, data_type='float32', @@ -122,6 +142,19 @@ _create_op_func_('mean') _create_op_func_('mul') +def concat(input, axis, program=None, init_program=None): + helper = LayerHelper('concat', **locals()) + if not isinstance(input, list) and not isinstance(input, tuple): + input = [input] + out = helper.create_tmp_variable(dtype=input[0].data_type) + helper.append_op( + type='concat', + inputs={'X': input}, + outputs={'Out': [out]}, + attrs={'axis': axis}) + return out + + def cross_entropy(input, label, **kwargs): helper = LayerHelper('cross_entropy', **kwargs) out = helper.create_tmp_variable(dtype=input.data_type) diff --git a/python/paddle/v2/framework/tests/test_layers.py b/python/paddle/v2/framework/tests/test_layers.py index 4ecc02b12d8..7aedb985f98 100644 --- a/python/paddle/v2/framework/tests/test_layers.py +++ b/python/paddle/v2/framework/tests/test_layers.py @@ -88,6 +88,77 @@ class TestBook(unittest.TestCase): print str(program) + def test_word_embedding(self): + program = Program() + dict_size = 10000 + embed_size = 32 + first_word = layers.data( + name='firstw', shape=[1], data_type='int32', program=program) + second_word = layers.data( + name='secondw', shape=[1], data_type='int32', program=program) + third_word = layers.data( + name='thirdw', shape=[1], data_type='int32', program=program) + forth_word = layers.data( + name='forthw', shape=[1], data_type='int32', program=program) + next_word = layers.data( + name='nextw', shape=[1], data_type='int32', program=program) + + embed_param_attr_1 = { + 'name': 'shared_w', + 'init_attr': { + 'max': 1.0, + 'type': 'uniform_random', + 'min': -1.0 + } + } + embed_param_attr_2 = {'name': 'shared_w'} + + embed_first = 
layers.embedding( + input=first_word, + size=[dict_size, embed_size], + data_type='float32', + param_attr=embed_param_attr_1, + program=program) + embed_second = layers.embedding( + input=second_word, + size=[dict_size, embed_size], + data_type='float32', + param_attr=embed_param_attr_2, + program=program) + + embed_third = layers.embedding( + input=third_word, + size=[dict_size, embed_size], + data_type='float32', + param_attr=embed_param_attr_2, + program=program) + embed_forth = layers.embedding( + input=forth_word, + size=[dict_size, embed_size], + data_type='float32', + param_attr=embed_param_attr_2, + program=program) + + concat_embed = layers.concat( + input=[embed_first, embed_second, embed_third, embed_forth], + axis=1, + program=program) + + hidden1 = layers.fc(input=concat_embed, + size=256, + act='sigmoid', + program=program) + predict_word = layers.fc(input=hidden1, + size=dict_size, + act='softmax', + program=program) + cost = layers.cross_entropy( + input=predict_word, label=next_word, program=program) + avg_cost = layers.mean(x=cost, program=program) + self.assertIsNotNone(avg_cost) + + print str(program) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/framework/tests/test_word2vec.py b/python/paddle/v2/framework/tests/test_word2vec.py new file mode 100644 index 00000000000..b5d98035156 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_word2vec.py @@ -0,0 +1,165 @@ +import paddle.v2 as paddle +import paddle.v2.framework.layers as layers +import paddle.v2.framework.core as core +import paddle.v2.framework.optimizer as optimizer + +from paddle.v2.framework.framework import Program, g_program +from paddle.v2.framework.executor import Executor + +import numpy as np + +init_program = Program() +program = Program() + +embed_size = 32 +hidden_size = 256 +N = 5 +batch_size = 32 + +word_dict = paddle.dataset.imikolov.build_dict() +dict_size = len(word_dict) + +first_word = layers.data( + name='firstw', + shape=[1], + 
data_type='int32', + program=program, + init_program=init_program) +second_word = layers.data( + name='secondw', + shape=[1], + data_type='int32', + program=program, + init_program=init_program) +third_word = layers.data( + name='thirdw', + shape=[1], + data_type='int32', + program=program, + init_program=init_program) +forth_word = layers.data( + name='forthw', + shape=[1], + data_type='int32', + program=program, + init_program=init_program) +next_word = layers.data( + name='nextw', + shape=[1], + data_type='int32', + program=program, + init_program=init_program) + +embed_param_attr_1 = { + 'name': 'shared_w', + 'init_attr': { + 'max': 1.0, + 'type': 'uniform_random', + 'min': -1.0 + } +} +embed_param_attr_2 = {'name': 'shared_w'} + +embed_first = layers.embedding( + input=first_word, + size=[dict_size, embed_size], + data_type='float32', + param_attr=embed_param_attr_1, + program=program, + init_program=init_program) +embed_second = layers.embedding( + input=second_word, + size=[dict_size, embed_size], + data_type='float32', + param_attr=embed_param_attr_2, + program=program, + init_program=init_program) + +embed_third = layers.embedding( + input=third_word, + size=[dict_size, embed_size], + data_type='float32', + param_attr=embed_param_attr_2, + program=program, + init_program=init_program) +embed_forth = layers.embedding( + input=forth_word, + size=[dict_size, embed_size], + data_type='float32', + param_attr=embed_param_attr_2, + program=program, + init_program=init_program) + +concat_embed = layers.concat( + input=[embed_first, embed_second, embed_third, embed_forth], + axis=1, + program=program, + init_program=init_program) + +hidden1 = layers.fc(input=concat_embed, + size=hidden_size, + act='sigmoid', + program=program, + init_program=init_program) +predict_word = layers.fc(input=hidden1, + size=dict_size, + act='softmax', + program=program, + init_program=init_program) +cost = layers.cross_entropy( + input=predict_word, + label=next_word, + program=program, 
+    init_program=init_program)
+avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
+
+sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
+opts = sgd_optimizer.minimize(avg_cost)
+
+train_reader = paddle.batch(
+    paddle.dataset.imikolov.train(word_dict, N), batch_size)
+
+place = core.CPUPlace()
+exe = Executor(place)
+
+exe.run(init_program, feed={}, fetch_list=[])  # run the init program once
+PASS_NUM = 100
+for pass_id in range(PASS_NUM):  # exit(0) below ends the run on success
+    for data in train_reader():  # data: one batch of 5-word samples
+        input_data = [[data_idx[idx] for data_idx in data] for idx in xrange(5)]
+        input_data = map(lambda x: np.array(x).astype("int32"), input_data)
+        input_data = map(lambda x: np.expand_dims(x, axis=1), input_data)
+
+        first_data = input_data[0]  # column 0 of each sample -> 'firstw'
+        first_tensor = core.LoDTensor()
+        first_tensor.set(first_data, place)
+
+        second_data = input_data[1]  # column 1 of each sample -> 'secondw'
+        second_tensor = core.LoDTensor()
+        second_tensor.set(second_data, place)
+
+        third_data = input_data[2]  # column 2 of each sample -> 'thirdw'
+        third_tensor = core.LoDTensor()
+        third_tensor.set(third_data, place)
+
+        forth_data = input_data[3]  # column 3 of each sample -> 'forthw'
+        forth_tensor = core.LoDTensor()
+        forth_tensor.set(forth_data, place)
+
+        next_data = input_data[4]  # column 4 is the label fed as 'nextw'
+        next_tensor = core.LoDTensor()
+        next_tensor.set(next_data, place)
+
+        outs = exe.run(program,
+                       feed={
+                           'firstw': first_tensor,
+                           'secondw': second_tensor,
+                           'thirdw': third_tensor,
+                           'forthw': forth_tensor,
+                           'nextw': next_tensor
+                       },
+                       fetch_list=[avg_cost])
+        out = np.array(outs[0])
+        if out[0] < 10.0:
+            exit(0)  # average cost dropped below 10.0: treat the run as passing
+exit(1)  # PASS_NUM passes finished without reaching the cost threshold
-- 
GitLab