diff --git a/CMakeLists.txt b/CMakeLists.txt
index e0db0d535b3fc661c6398f74e17d2cb048217677..861bb50a2de0249e4e5ac2e2fa1d7a8a7c61bca0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,6 +64,7 @@ include(external/python)    # download, build, install python
 include(external/openblas)  # download, build, install openblas
 include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
+include(external/any)       # download linb::any
 
 include(package)            # set paddle packages
 include(cpplint)            # set paddle c++ style
diff --git a/cmake/external/any.cmake b/cmake/external/any.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..8116f235d535917c03deb646ff4ec083a0cdadc7
--- /dev/null
+++ b/cmake/external/any.cmake
@@ -0,0 +1,20 @@
+INCLUDE(ExternalProject)
+
+SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any)
+
+INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/linb_any)
+
+ExternalProject_Add(
+    linb_any
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY    "https://github.com/thelink2012/any.git"
+    GIT_TAG           "8fef1e93710a0edf8d7658999e284a1142c4c020"
+    PREFIX            ${ANY_SOURCE_DIR}
+    UPDATE_COMMAND    ""
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND     ""
+    INSTALL_COMMAND   ""
+    TEST_COMMAND      ""
+)
+
+add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE)
diff --git a/demo/seqToseq/api_train_v2.py b/demo/seqToseq/api_train_v2.py
index 5d138a8c4f91976d90b19441781248f7b67c854a..2809054e7d3a367f441188fe7f91037cfa5f1579 100644
--- a/demo/seqToseq/api_train_v2.py
+++ b/demo/seqToseq/api_train_v2.py
@@ -1,13 +1,17 @@
 import sys
+
 import paddle.v2 as paddle
 
 
-def seqToseq_net(source_dict_dim, target_dict_dim):
+def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
     ### Network Architecture
     word_vector_dim = 512  # dimension of word vector
     decoder_size = 512  # dimension of hidden unit in GRU Decoder network
     encoder_size = 512  # dimension of hidden unit in GRU Encoder network
 
+    beam_size = 3
+    max_length = 250
+
     #### Encoder
     src_word_id = paddle.layer.data(
         name='source_language_word',
@@ -67,30 +71,57 @@ def seqToseq_net(source_dict_dim, target_dict_dim):
     group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
     group_inputs = [group_input1, group_input2]
 
-    trg_embedding = paddle.layer.embedding(
-        input=paddle.layer.data(
-            name='target_language_word',
-            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
-    group_inputs.append(trg_embedding)
-
-    # For decoder equipped with attention mechanism, in training,
-    # target embeding (the groudtruth) is the data input,
-    # while encoded source sequence is accessed to as an unbounded memory.
-    # Here, the StaticInput defines a read-only memory
-    # for the recurrent_group.
-    decoder = paddle.layer.recurrent_group(
-        name=decoder_group_name,
-        step=gru_decoder_with_attention,
-        input=group_inputs)
-
-    lbl = paddle.layer.data(
-        name='target_language_next_word',
-        type=paddle.data_type.integer_value_sequence(target_dict_dim))
-    cost = paddle.layer.classification_cost(input=decoder, label=lbl)
-
-    return cost
+    if not is_generating:
+        trg_embedding = paddle.layer.embedding(
+            input=paddle.layer.data(
+                name='target_language_word',
+                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+            size=word_vector_dim,
+            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+        group_inputs.append(trg_embedding)
+
+        # For a decoder equipped with an attention mechanism, in training,
+        # the target embedding (the ground truth) is the data input,
+        # while the encoded source sequence is accessed as an unbounded memory.
+        # Here, the StaticInput defines a read-only memory
+        # for the recurrent_group.
+        decoder = paddle.layer.recurrent_group(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs)
+
+        lbl = paddle.layer.data(
+            name='target_language_next_word',
+            type=paddle.data_type.integer_value_sequence(target_dict_dim))
+        cost = paddle.layer.classification_cost(input=decoder, label=lbl)
+
+        return cost
+    else:
+        # In generation, the decoder predicts the next target word based on
+        # the encoded source sequence and the last generated target word.
+
+        # The encoded source sequence (the encoder's output) must be specified
+        # by StaticInput, which is a read-only memory.
+        # The embedding of the last generated word is automatically retrieved
+        # by GeneratedInputs, which is initialized by a start mark, such as
+        # <s>, and must be included in generation.
+
+        trg_embedding = paddle.layer.GeneratedInputV2(
+            size=target_dict_dim,
+            embedding_name='_target_language_embedding',
+            embedding_size=word_vector_dim)
+        group_inputs.append(trg_embedding)
+
+        beam_gen = paddle.layer.beam_search(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs,
+            bos_id=0,
+            eos_id=1,
+            beam_size=beam_size,
+            max_length=max_length)
+
+        return beam_gen
 
 
 def main():
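For reference, a minimal sketch (not part of this patch) of how the new is_generating switch could be driven; the 30000-word dictionary sizes below are placeholders:

    import paddle.v2 as paddle

    # Hypothetical driver for the generation branch above; the dictionary
    # sizes are placeholders, not values taken from this patch.
    paddle.init(use_gpu=False, trainer_count=1)
    beam_gen = seqToseq_net(
        source_dict_dim=30000, target_dict_dim=30000, is_generating=True)
    # parse_network (extended later in this patch) compiles the beam-search
    # topology into a ModelConfig protobuf without running it.
    print paddle.layer.parse_network(beam_gen)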
diff --git a/paddle/function/Function.cpp b/paddle/function/Function.cpp
index f47d55a4ade97d76e0f1940a2234e34e20efade6..f71c0f681b3bc524ba96c55f1dcad30ef59478c8 100644
--- a/paddle/function/Function.cpp
+++ b/paddle/function/Function.cpp
@@ -16,66 +16,6 @@ limitations under the License. */
 
 namespace paddle {
 
-template <>
-size_t FuncConfig::get(const std::string& key) const {
-  auto it = valueMap_.find(key);
-  CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
-  return it->second.s;
-}
-
-template <>
-real FuncConfig::get(const std::string& key) const {
-  auto it = valueMap_.find(key);
-  CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
-  return it->second.r;
-}
-
-template <>
-int FuncConfig::get(const std::string& key) const {
-  auto it = valueMap_.find(key);
-  CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
-  return it->second.i;
-}
-
-template <>
-bool FuncConfig::get(const std::string& key) const {
-  auto it = valueMap_.find(key);
-  CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
-  return it->second.b;
-}
-
-template <>
-FuncConfig& FuncConfig::set(const std::string& key, size_t v) {
-  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
-                                                      << key;
-  valueMap_[key].s = v;
-  return *this;
-}
-
-template <>
-FuncConfig& FuncConfig::set(const std::string& key, real v) {
-  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
-                                                      << key;
-  valueMap_[key].r = v;
-  return *this;
-}
-
-template <>
-FuncConfig& FuncConfig::set(const std::string& key, int v) {
-  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
-                                                      << key;
-  valueMap_[key].i = v;
-  return *this;
-}
-
-template <>
-FuncConfig& FuncConfig::set(const std::string& key, bool v) {
-  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
-                                                      << key;
-  valueMap_[key].b = v;
-  return *this;
-}
-
 void BufferArgs::addArg(const Matrix& arg,
                         const TensorShape& shape,
                         ArgType argType) {
diff --git a/paddle/function/Function.h b/paddle/function/Function.h
index 3bbeb6e525f85bdde9a54c8d60146eaa30a1bb4d..9ad00c6f370cf64e9cc26f16e62c4d2ddb284003 100644
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -18,32 +18,49 @@ limitations under the License. */
 #include <map>
 #include "BufferArg.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Any.h"
 #include "paddle/utils/ClassRegistrar.h"
+#include "paddle/utils/Error.h"
 
 namespace paddle {
 
 /**
  * Function Configuration.
  * The argument type of Function::init.
- * Follow-up will consider moving this data structure to Proto inside.
 */
 class FuncConfig {
public:
-  union value {
-    size_t s;
-    real r;
-    int i;
-    bool b;
-  };
-
   template <typename T>
-  T get(const std::string& key) const;
+  T get(const std::string& key, Error* err = nullptr) const {
+    try {
+      return any_cast<T>(valueMap_.at(key));
+    } catch (std::exception& e) {  // could be cast or out of range exception.
+      if (err) {
+        *err = Error(e.what());
+      } else {
+        LOG(FATAL) << "Cannot get key " << key << " with error " << e.what();
+      }
+      return T();
+    }
+  }
 
   template <typename T>
-  FuncConfig& set(const std::string& key, T v);
+  FuncConfig& set(const std::string& key, T v, Error* err = nullptr) {
+    auto it = valueMap_.find(key);
+    if (it != valueMap_.end()) {  // already contains key.
+      if (err) {
+        *err = Error("Key %s is already set in FuncConfig", key.c_str());
+      } else {
+        LOG(FATAL) << "Key " << key << " is already set in FuncConfig.";
+      }
+      return *this;
+    }
+    valueMap_[key] = any(v);
+    return *this;
+  }
 
 protected:
-  std::map<std::string, value> valueMap_;
+  mutable std::unordered_map<std::string, any> valueMap_;
 };
 
 /**
diff --git a/paddle/function/PadOp.cpp b/paddle/function/PadOp.cpp
index f1a0d2a1a96f24ddff8cd120681a8bc8cddaf40a..adba7c92ece505eecc74edce6b393cf27fa10ccc 100644
--- a/paddle/function/PadOp.cpp
+++ b/paddle/function/PadOp.cpp
@@ -25,9 +25,9 @@ void Pad(real* outputs,
          const int inH,
          const int inW,
          const PadConf& pad) {
-  int cstart = pad.channelStart, cend = pad.channelEnd;
-  int hstart = pad.heightStart, hend = pad.heightEnd;
-  int wstart = pad.widthStart, wend = pad.widthEnd;
+  int cstart = pad.channel[0], cend = pad.channel[1];
+  int hstart = pad.height[0], hend = pad.height[1];
+  int wstart = pad.width[0], wend = pad.width[1];
   int outC = inC + cstart + cend;
   int outH = inH + hstart + hend;
   int outW = inW + wstart + wend;
@@ -51,9 +51,9 @@ void PadGrad(real* inGrad,
              const int inH,
              const int inW,
              const PadConf& pad) {
-  int cstart = pad.channelStart, cend = pad.channelEnd;
-  int hstart = pad.heightStart, hend = pad.heightEnd;
-  int wstart = pad.widthStart, wend = pad.widthEnd;
+  int cstart = pad.channel[0], cend = pad.channel[1];
+  int hstart = pad.height[0], hend = pad.height[1];
+  int wstart = pad.width[0], wend = pad.width[1];
   int outC = inC + cstart + cend;
   int outH = inH + hstart + hend;
   int outW = inW + wstart + wend;
@@ -71,6 +71,12 @@ void PadGrad(real* inGrad,
     }
   }
 }
 
+static inline PadConf castToPadConf(const FuncConfig& conf) {
+  return {conf.get<std::vector<uint32_t>>("channel"),
+          conf.get<std::vector<uint32_t>>("height"),
+          conf.get<std::vector<uint32_t>>("width")};
+}
+
 /**
  * \brief Padding zeros to input according to the specify dimension.
  *        The struct pad_ contains the padding size in each dimension.
@@ -127,14 +133,7 @@ template <DeviceType Device>
 class PadFunc : public FunctionBase {
 public:
-  void init(const FuncConfig& config) override {
-    pad_.channelStart = config.get<int>("cstart");
-    pad_.channelEnd = config.get<int>("cend");
-    pad_.heightStart = config.get<int>("hstart");
-    pad_.heightEnd = config.get<int>("hend");
-    pad_.widthStart = config.get<int>("wstart");
-    pad_.widthEnd = config.get<int>("wend");
-  }
+  void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ(1UL, inputs.size());
@@ -175,14 +174,7 @@ private:
 template <DeviceType Device>
 class PadGradFunc : public FunctionBase {
 public:
-  void init(const FuncConfig& config) override {
-    pad_.channelStart = config.get<int>("cstart");
-    pad_.channelEnd = config.get<int>("cend");
-    pad_.heightStart = config.get<int>("hstart");
-    pad_.heightEnd = config.get<int>("hend");
-    pad_.widthStart = config.get<int>("wstart");
-    pad_.widthEnd = config.get<int>("wend");
-  }
+  void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ(1UL, inputs.size());
diff --git a/paddle/function/PadOp.h b/paddle/function/PadOp.h
index 7b5c730a6a0fa57833e63beba085cb17054ae2f5..0e226ec7370b9897ebdc697ee528b90a37e4ec56 100644
--- a/paddle/function/PadOp.h
+++ b/paddle/function/PadOp.h
@@ -19,18 +19,12 @@ limitations under the License. */
 namespace paddle {
 
 struct PadConf {
-  /// how many values to add before the data along channel dimension.
-  int channelStart;
-  /// how many values to add after the data along channel dimension.
-  int channelEnd;
-  /// how many values to add before the data along height dimension.
-  int heightStart;
-  /// how many values to add after the data along height dimension.
-  int heightEnd;
-  /// how many values to add before the data along width dimension.
-  int widthStart;
-  /// how many values to add after the data along width dimension.
-  int widthEnd;
+  /// how many values to add before/after the data along channel dimension.
+  std::vector<uint32_t> channel;
+  /// how many values to add before/after the data along height dimension.
+  std::vector<uint32_t> height;
+  /// how many values to add before/after the data along width dimension.
+  std::vector<uint32_t> width;
 };
 
 /**
diff --git a/paddle/gserver/layers/PadLayer.cpp b/paddle/gserver/layers/PadLayer.cpp
index bb618c09f9777785d93995fa7140dd4a5383cd1b..a5ed7e057aea8f065ee752f8c0f0d2d9bdddfc8b 100644
--- a/paddle/gserver/layers/PadLayer.cpp
+++ b/paddle/gserver/layers/PadLayer.cpp
@@ -36,12 +36,9 @@ bool PadLayer::init(const LayerMap& layerMap,
   CHECK_EQ(2, pad_conf.pad_c_size());
   CHECK_EQ(2, pad_conf.pad_h_size());
   CHECK_EQ(2, pad_conf.pad_w_size());
-  padc_.push_back(pad_conf.pad_c(0));
-  padc_.push_back(pad_conf.pad_c(1));
-  padh_.push_back(pad_conf.pad_h(0));
-  padh_.push_back(pad_conf.pad_h(1));
-  padw_.push_back(pad_conf.pad_w(0));
-  padw_.push_back(pad_conf.pad_w(1));
+  padc_ = {pad_conf.pad_c(0), pad_conf.pad_c(1)};
+  padh_ = {pad_conf.pad_h(0), pad_conf.pad_h(1)};
+  padw_ = {pad_conf.pad_w(0), pad_conf.pad_w(1)};
 
   outDims_ = TensorShape(4);
   setOutDims(0);
@@ -49,21 +46,15 @@ bool PadLayer::init(const LayerMap& layerMap,
   createFunction(forward_,
                  "Pad",
                  FuncConfig()
-                     .set("cstart", padc_[0])
-                     .set("cend", padc_[1])
-                     .set("hstart", padh_[0])
-                     .set("hend", padh_[1])
-                     .set("wstart", padw_[0])
-                     .set("wend", padw_[1]));
+                     .set("channel", padc_)
+                     .set("height", padh_)
+                     .set("width", padw_));
 
   createFunction(backward_,
                  "PadGrad",
                  FuncConfig()
-                     .set("cstart", padc_[0])
-                     .set("cend", padc_[1])
-                     .set("hstart", padh_[0])
-                     .set("hend", padh_[1])
-                     .set("wstart", padw_[0])
-                     .set("wend", padw_[1]));
+                     .set("channel", padc_)
+                     .set("height", padh_)
+                     .set("width", padw_));
 
   return true;
 }
diff --git a/paddle/gserver/layers/PadLayer.h b/paddle/gserver/layers/PadLayer.h
index b2bbf28082e630aeb429ee997a1d43ce7ba05d1c..fe9388d8cc260ed599af0113361f4687f3f4a18b 100644
--- a/paddle/gserver/layers/PadLayer.h
+++ b/paddle/gserver/layers/PadLayer.h
@@ -38,9 +38,9 @@ protected:
   void setOutDims(const size_t batchSize);
   void setTensorDim(const size_t batchSize);
 
-  std::vector<int> padc_;
-  std::vector<int> padh_;
-  std::vector<int> padw_;
+  std::vector<uint32_t> padc_;
+  std::vector<uint32_t> padh_;
+  std::vector<uint32_t> padw_;
   TensorShape inDims_;
   TensorShape outDims_;
 };
diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py
index 6d6a406cf61d467cb2701ca5e85e99648eea36eb..879703a00c197e62ba7e21b8e2e2dea2889c4e13 100644
--- a/paddle/py_paddle/dataprovider_converter.py
+++ b/paddle/py_paddle/dataprovider_converter.py
@@ -160,10 +160,19 @@ class SparseFloatScanner(SparseBinaryScanner):
 class IndexScanner(IScanner):
     def __init__(self, input_type, pos):
         IScanner.__init__(self, input_type, pos)
-        self.__ids__ = []
+        self.__ids__ = None
+        self.__idx__ = 0
+
+    def pre_scan(self, dat):
+        self.__idx__ += 1
+
+    def finish_pre_scan(self, argument):
+        self.__ids__ = [0] * self.__idx__
+        self.__idx__ = 0
 
     def scan(self, dat):
-        self.__ids__.append(dat)
+        self.__ids__[self.__idx__] = dat
+        self.__idx__ += 1
 
     def finish_scan(self, argument):
         ids = swig_paddle.IVector.create(self.__ids__, self.data_in_gpu)
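The change above turns IndexScanner into a two-pass protocol: pre_scan counts, finish_pre_scan allocates the buffer exactly once, and scan fills the pre-sized slots. A self-contained toy mirror of that protocol (illustration only; it uses plain attribute names instead of the mangled __ids__/__idx__ and drops the unused argument parameter):

    # Toy mirror of the two-pass protocol used by IndexScanner above.
    class ToyIndexScanner(object):
        def __init__(self):
            self._ids = None
            self._idx = 0

        def pre_scan(self, dat):      # pass 1: only count items
            self._idx += 1

        def finish_pre_scan(self):    # allocate the buffer exactly once
            self._ids = [0] * self._idx
            self._idx = 0

        def scan(self, dat):          # pass 2: write into the next slot
            self._ids[self._idx] = dat
            self._idx += 1

    batch = [3, 1, 4, 1, 5]
    scanner = ToyIndexScanner()
    for dat in batch:
        scanner.pre_scan(dat)
    scanner.finish_pre_scan()
    for dat in batch:
        scanner.scan(dat)
    assert scanner._ids == batch  # filled without any list append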
diff --git a/paddle/utils/Any.h b/paddle/utils/Any.h
new file mode 100644
index 0000000000000000000000000000000000000000..99a0139accc4988f1e4cce45eeb688a6603c2c31
--- /dev/null
+++ b/paddle/utils/Any.h
@@ -0,0 +1,35 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#if __cplusplus > 201402L
+#include <any>
+
+namespace paddle {
+// using std::any for C++ 17
+using std::any;
+using std::any_cast;
+using std::bad_any_cast;
+}  // namespace paddle
+
+#else
+#include <any.hpp>
+
+namespace paddle {
+// use linb::any for C++ 11
+using linb::any;
+using linb::any_cast;
+using linb::bad_any_cast;
+}  // namespace paddle
+#endif
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index e98b1dfc8f9ad3bebd5e58813a86e5a8928bfa49..03f9fc2707aaec7c997e7c41ded331087e181582 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -18,7 +18,7 @@ import inspect
 
 from paddle.trainer.config_parser import *
 from .activations import LinearActivation, SigmoidActivation, TanhActivation, \
-    ReluActivation, IdentityActivation, SoftmaxActivation
+    ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation
 from .evaluators import *
 from .poolings import MaxPooling, AvgPooling, BasePoolingType
 from .attrs import *
@@ -2277,8 +2277,9 @@ def img_pool_layer(input,
         pool_type.name = 'avg'
 
     type_name = pool_type.name + '-projection' \
-        if (isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
-        else pool_type.name
+        if (
+        isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
+        else pool_type.name
 
     pool_size_y = pool_size if pool_size_y is None else pool_size_y
     stride_y = stride if stride_y is None else stride_y
@@ -3318,8 +3319,8 @@ def recurrent_group(step,
     assert (targetInlink == None or targetInlink_in_inlinks())
     targetInlinkName = None if targetInlink == None \
-        else targetInlink.name if isinstance(targetInlink, LayerOutput) \
-        else targetInlink.input.name
+                       else targetInlink.name if isinstance(targetInlink, LayerOutput) \
+                       else targetInlink.input.name
 
     contains_sub_seq = [False]
@@ -4831,12 +4832,14 @@ def crf_decoding_layer(input,
     return LayerOutput(name, LayerType.CRF_DECODING_LAYER, parents, size=1)
 
 
+@wrap_act_default(act=SigmoidActivation())
 @wrap_bias_attr_default(has_bias=True)
 @wrap_name_default()
 @layer_support()
 def nce_layer(input,
               label,
               num_classes,
+              act=None,
               weight=None,
               num_neg_samples=10,
               neg_distribution=None,
@@ -4865,6 +4868,8 @@ def nce_layer(input,
     :type weight: LayerOutput
     :param num_classes: number of classes.
     :type num_classes: int
+    :param act: Activation type. SigmoidActivation is the default.
+    :type act: BaseActivation
     :param num_neg_samples: number of negative samples. Default is 10.
     :type num_neg_samples: int
     :param neg_distribution: The distribution for generating the random negative labels.
@@ -4887,6 +4892,8 @@ def nce_layer(input,
         assert isinstance(neg_distribution, collections.Sequence)
         assert len(neg_distribution) == num_classes
         assert sum(neg_distribution) == 1
+    if not isinstance(act, BaseActivation):
+        raise TypeError('act of nce_layer should be a BaseActivation instance')
 
     ipts_for_layer = []
     parents = []
@@ -4908,12 +4915,17 @@ def nce_layer(input,
         type=LayerType.NCE_LAYER,
         num_classes=num_classes,
         neg_sampling_dist=neg_distribution,
+        active_type=act.name,
         num_neg_samples=num_neg_samples,
         inputs=ipts_for_layer,
         bias=ParamAttr.to_bias(bias_attr),
         **ExtraLayerAttribute.to_kwargs(layer_attr))
     return LayerOutput(
-        name, LayerType.NCE_LAYER, parents=parents, size=l.config.size)
+        name,
+        LayerType.NCE_LAYER,
+        parents=parents,
+        size=l.config.size,
+        activation=act)
 
 
 """
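A short sketch of the new act argument (hypothetical sizes; like other layer helpers, this is meant to run inside a network configuration that is parsed by config_parser):

    from paddle.trainer_config_helpers import *

    # Hypothetical config snippet; sizes are made up. SigmoidActivation is
    # what @wrap_act_default supplies when `act` is omitted.
    ipt = data_layer(name='input', size=256)
    lbl = data_layer(name='label', size=1)
    cls = nce_layer(
        input=ipt,
        label=lbl,
        num_classes=1000,
        act=SigmoidActivation(),
        num_neg_samples=10)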
diff --git a/python/paddle/v2/config_base.py b/python/paddle/v2/config_base.py
index cb98866d874e35b9fda8e170004d687329b7d3e3..b0e8da563e0d65d534d3f224fe5f1c39a67eeb54 100644
--- a/python/paddle/v2/config_base.py
+++ b/python/paddle/v2/config_base.py
@@ -67,7 +67,16 @@ class Layer(object):
         self.name = name
         self.__context__ = {}
         self.__parent_layers__ = parent_layers
-        self.__children_layers__ = []  # used for evaluator.
+        # some layers may have an extra parent layer
+        self.__extra_parent__ = []
+        # used for evaluator.
+        self.__children_layers__ = []
+
+    def extra_parent(self):
+        return self.__extra_parent__
+
+    def append_extra_parent(self, parent):
+        self.__extra_parent__.append(parent)
 
     def append_child(self, layer, parent_names):
         self.__children_layers__.append((layer, parent_names))
@@ -78,14 +87,20 @@ class Layer(object):
         """
         self.__context__ = context
 
-        # short cut if myself is parsed before.
+        # STEP: short cut if this layer has been parsed before.
         if self.context_name() in context:
             if self.use_context_name():
                 return context[self.context_name()]
             else:
                 return context[self.name]
 
-        # parse parent before myself
+        # STEP: parse the extra_parent layers that are not used by this layer
+        # but must be parsed before it.
+        for p in self.__extra_parent__:
+            p.to_proto(context=context)
+
+        # STEP: parse the parents that this layer uses, and insert the results
+        # into the kwargs of this layer's to_proto_impl method.
         kwargs = dict()
         for layer_name in self.__parent_layers__:
             if not isinstance(self.__parent_layers__[layer_name],
@@ -97,14 +112,13 @@ class Layer(object):
                                self.__parent_layers__[layer_name])
             kwargs[layer_name] = v1_layer
 
-        # parse myself.
+        # STEP: parse this layer and add it into context.
         ret_val = self.to_proto_impl(**kwargs)
-
-        if self.context_name() is not None and \
-                self.context_name() not in context:
+        if self.context_name() is not None \
+                and self.context_name() not in context:
             context[self.context_name()] = ret_val
 
-        # parse children.
+        # STEP: parse the children that should be parsed after this layer.
         for layer, pnames in self.__children_layers__:
             drop = False
 
@@ -117,6 +131,7 @@ class Layer(object):
                 continue
             layer.to_proto(context=context)
 
+        # STEP: return the v1 layer result
         if self.context_name() is None:
             return ret_val
         elif self.use_context_name():
diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
index 7021a6da05dec6be216534112c2df2586e73390f..2eb018b8d60e9a8bd0091836ab56c35b05786fca 100644
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
@@ -66,13 +66,6 @@ def download(url, module_name, md5sum):
     return filename
 
 
-def dict_add(a_dict, ele):
-    if ele in a_dict:
-        a_dict[ele] += 1
-    else:
-        a_dict[ele] = 1
-
-
 def fetch_all():
     for module_name in filter(lambda x: not x.startswith("__"),
                               dir(paddle.v2.dataset)):
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
index 5284017ce08de8beb559f58fb6006639f40f5580..9a7ccff4d5cd2563053adb0aae95fc6d10ad2a50 100644
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@@ -18,6 +18,7 @@ TODO(yuyang18): Complete comments.
 """
 
 import paddle.v2.dataset.common
+import collections
 import tarfile
 import Queue
 import re
@@ -48,10 +49,10 @@ def tokenize(pattern):
 
 
 def build_dict(pattern, cutoff):
-    word_freq = {}
+    word_freq = collections.defaultdict(int)
     for doc in tokenize(pattern):
         for word in doc:
-            paddle.v2.dataset.common.dict_add(word_freq, word)
+            word_freq[word] += 1
 
     # Not sure if we should prune less-frequent words here.
     word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
index 2931d06e7eb65bde887c56a8bc20e7a9c5e4d4e4..5d7e0282b4db639e6590ade66241328d6ab8b5e3 100644
--- a/python/paddle/v2/dataset/imikolov.py
+++ b/python/paddle/v2/dataset/imikolov.py
@@ -17,6 +17,7 @@ imikolov's simple dataset: http://www.fit.vutbr.cz/~imikolov/rnnlm/
 Complete comments.
 """
 import paddle.v2.dataset.common
+import collections
 import tarfile
 
 __all__ = ['train', 'test', 'build_dict']
@@ -26,15 +27,14 @@ MD5 = '30177ea32e27c525793142b6bf2c8e2d'
 
 
 def word_count(f, word_freq=None):
-    add = paddle.v2.dataset.common.dict_add
-    if word_freq == None:
-        word_freq = {}
+    if word_freq is None:
+        word_freq = collections.defaultdict(int)
 
     for l in f:
         for w in l.strip().split():
-            add(word_freq, w)
-        add(word_freq, '<s>')
-        add(word_freq, '<e>')
+            word_freq[w] += 1
+        word_freq['<s>'] += 1
+        word_freq['<e>'] += 1
 
     return word_freq
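The defaultdict(int) rewrite makes the removed dict_add helper unnecessary; a standalone illustration:

    import collections

    # A defaultdict(int) yields 0 for missing keys, so counting needs no
    # membership test (this is what replaced common.dict_add).
    word_freq = collections.defaultdict(int)
    for w in 'the cat sat on the mat'.split():
        word_freq[w] += 1
    assert word_freq['the'] == 2
    assert word_freq['unseen'] == 0  # missing keys default to 0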
diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py
index 1e4efedde363f20fde168941adcb6e8a594b533a..384de9b9d57f88e84ab6067846174bb037502dc0 100644
--- a/python/paddle/v2/layer.py
+++ b/python/paddle/v2/layer.py
@@ -33,40 +33,52 @@ The primary usage shows below.
 
 import collections
 import inspect
-from config_base import Layer, __convert_to_v2__
+import re
+
 import paddle.trainer_config_helpers as conf_helps
+from paddle.trainer.config_parser import \
+    RecurrentLayerGroupWithoutOutLinksBegin, RecurrentLayerGroupSetOutLink, \
+    RecurrentLayerGroupEnd, model_type
 from paddle.trainer_config_helpers.config_parser_utils import \
     parse_network_config as __parse__
 from paddle.trainer_config_helpers.default_decorators import wrap_act_default
 from paddle.trainer_config_helpers.default_decorators import \
     wrap_bias_attr_default
 from paddle.trainer_config_helpers.default_decorators import wrap_name_default
+from paddle.trainer_config_helpers.layers import RecurrentLayerGroupSetGenerator, Generator
 from paddle.trainer_config_helpers.layers import layer_support
-from paddle.trainer.config_parser import \
-    RecurrentLayerGroupWithoutOutLinksBegin, RecurrentLayerGroupSetOutLink, \
-    RecurrentLayerGroupEnd, model_type
 
 import activation
-import re
+import attr
 import data_type
+from config_base import Layer, __convert_to_v2__
 
 __all__ = ['parse_network', 'data']
 
 
-def parse_network(*outputs):
+def parse_network(output_layers, extra_layers=None):
     """
-    Parse all output layers and then generate a ModelConfig object.
+    Parse all layers in the neural network graph and
+    then generate a ModelConfig object.
 
     ..  note::
 
         This function is used internally in paddle.v2 module. User should never
         invoke this method.
 
-    :param outputs: Output layers.
-    :type outputs: Layer
+    :param output_layers: Output layers.
+    :type output_layers: Layer
+    :param extra_layers: Layers in the neural network graph that are not in
+        the path of output_layers but still need to be parsed.
+    :type extra_layers: Layer
     :return: A ModelConfig object instance.
     :rtype: ModelConfig
     """
+    if not isinstance(output_layers, collections.Sequence):
+        output_layers = [output_layers]
+    if extra_layers is not None and not isinstance(extra_layers,
+                                                   collections.Sequence):
+        extra_layers = [extra_layers]
 
     def __real_func__():
         """
@@ -74,7 +86,11 @@ def parse_network(output_layers, extra_layers=None):
         the plain old paddle configuration function.
         """
         context = dict()
-        real_output = [each.to_proto(context=context) for each in outputs]
+        real_output = [each.to_proto(context=context) for each in output_layers]
+        if extra_layers is not None:
+            extra_output = [
+                each.to_proto(context=context) for each in extra_layers
+            ]
         conf_helps.outputs(real_output)
 
     return __parse__(__real_func__)
@@ -119,54 +135,23 @@ class DataLayerV2(Layer):
         return doc
 
 
-class WithExtraParent(Layer):
-    def extra_parent(self):
-        return self.__extra_parent__
-
-    def __init__(self, name=None, parent_layers=None):
-        self.__extra_parent__ = []
-        super(WithExtraParent, self).__init__(
-            name=name, parent_layers=parent_layers)
-
-    def append_extra_parent(self, parent):
-        self.__extra_parent__.append(parent)
-
-    def to_proto(self, context):
+class MemoryV2(Layer):
+    def __init__(self, name, extra_input=None, **kwargs):
         """
-        function to set proto attribute
+        Init a memory object. If the memory is initialized inside a
+        recurrent_group step function, it may depend on a boot_layer that
+        should be initialized outside recurrent_group, so we:
+        1. add RecurrentLayerInput to the extra_parent of self.
+        2. add boot_layer to the extra_parent of RecurrentLayerInput.
+
+        :param extra_input: list of RecurrentLayerInput
+        :type extra_input: [RecurrentLayerInput]
         """
-        kwargs = dict()
-        for p in self.__extra_parent__:
-            p.to_proto(context=context)
-
-        for layer_name in self.__parent_layers__:
-            if not isinstance(self.__parent_layers__[layer_name],
-                              collections.Sequence):
-                v1_layer = self.__parent_layers__[layer_name].to_proto(
-                    context=context)
-            else:
-                v1_layer = map(lambda x: x.to_proto(context=context),
-                               self.__parent_layers__[layer_name])
-            kwargs[layer_name] = v1_layer
-
-        if self.context_name() is None:
-            return self.to_proto_impl(context=context, **kwargs)
-        elif self.context_name() not in context:
-            context[self.context_name()] = self.to_proto_impl(
-                context=context, **kwargs)
-
-        if self.use_context_name():
-            return context[self.context_name()]
-        else:
-            return context[self.name]
-
-
-class MemoryV2(WithExtraParent):
-    def __init__(self, name, **kwargs):
         self.name = name
         super(MemoryV2, self).__init__(name=name, parent_layers=dict())
         self.__kwargs__ = kwargs
         self.__boot_layer_name__ = None
+
         if 'boot_layer' in kwargs:
             begin_of_current_rnn = []
             # TODO(yuyang18): Fix inspect, it could be wrong when user invoke a
@@ -189,11 +174,10 @@ class MemoryV2(Layer):
             assert begin_of_current_rnn is not None
             for extra in begin_of_current_rnn:
                 self.append_extra_parent(extra)
-                assert isinstance(extra, WithExtraParent)
                 extra.append_extra_parent(kwargs['boot_layer'])
                 self.__boot_layer_name__ = kwargs['boot_layer'].name
 
-    def to_proto_impl(self, context, **kwargs):
+    def to_proto_impl(self, **kwargs):
         args = dict()
         for each in kwargs:
             args[each] = kwargs[each]
@@ -201,7 +185,7 @@ class MemoryV2(Layer):
             args[each] = self.__kwargs__[each]
 
         if self.__boot_layer_name__ is not None:
-            args['boot_layer'] = context[self.__boot_layer_name__]
+            args['boot_layer'] = self.__context__[self.__boot_layer_name__]
 
         size = args.get('size', None)
         if size is not None:
@@ -223,22 +207,6 @@ class MemoryV2(Layer):
         return True
 
 
-class LayerOutputV2(Layer):
-    """
-    LayerOutputV2 is used to store the result of LayerOutput in v1 api.
-    It will not store it's parents because layer_output has been parsed already.
- """ - - def __init__(self, layer_output): - assert isinstance(layer_output, conf_helps.LayerOutput) - self.layer_output = layer_output - super(LayerOutputV2, self).__init__( - name=layer_output.name, parent_layers=dict()) - - def to_proto_impl(self): - return self.layer_output - - class StaticInputV2(object): def __init__(self, input, is_seq=False, size=None): assert isinstance(input, LayerV2) @@ -250,6 +218,66 @@ class StaticInputV2(object): # assert input.size is not None or size is not None +class BaseGeneratedInputV2(object): + def __init__(self): + self.bos_id = None + self.eos_id = None + + def before_real_step(self): + raise NotImplementedError() + + def after_real_step(self, *args): + raise NotImplementedError() + + +class GeneratedInputV2(BaseGeneratedInputV2): + def __init__(self, size, embedding_name, embedding_size): + super(GeneratedInputV2, self).__init__() + self.size = size + self.embedding_name = embedding_name + self.embedding_size = embedding_size + + def after_real_step(self, input): + return max_id(input=input, name='__beam_search_predict__') + + def before_real_step(self): + predict_id = memory( + name='__beam_search_predict__', + size=self.size, + boot_with_const_id=self.bos_id) + + trg_emb = embedding( + input=predict_id, + size=self.embedding_size, + param_attr=attr.ParamAttr(name=self.embedding_name)) + return trg_emb + + +class RecurrentLayerGroupSetGeneratorV2(Layer): + def __init__(self, eos_name, max_length, beam_size, num_results_per_sample): + self.eos_name = eos_name + self.max_length = max_length + self.beam_size = beam_size + self.num_results_per_sample = num_results_per_sample + super(RecurrentLayerGroupSetGeneratorV2, self).__init__( + name=eos_name, parent_layers={}) + + def to_proto_impl(self, **kwargs): + RecurrentLayerGroupSetGenerator( + Generator( + eos_layer_name=self.eos_name, + max_num_frames=self.max_length, + beam_size=self.beam_size, + num_results_per_sample=self.num_results_per_sample)) + return self + + def context_name(self): + return self.eos_name + ".fake" + + def use_context_name(self): + return True + + class MixedLayerV2(Layer): """ This class is use to support `with` grammar. 
@@ -328,18 +356,24 @@ def mixed(size=0,
     return MixedLayerV2(size, input, name, act, bias_attr, layer_attr)
 
 
-class RecurrentLayerInput(WithExtraParent):
+class RecurrentLayerInput(Layer):
     def __init__(self, recurrent_name, index, parent_layers):
-        assert len(parent_layers) == 1
-        self.__parents__ = parent_layers.values()[0]
-        super(RecurrentLayerInput, self).__init__(
-            name=self.__parents__[index].name, parent_layers=parent_layers)
+        parents_len = len(parent_layers)
+        assert parents_len <= 1
+        if parents_len == 0:
+            self.__parents__ = []
+        else:
+            self.__parents__ = parent_layers.values()[0]
         self.__recurrent_name__ = recurrent_name
+        name = self.__parents__[
+            index].name if index >= 0 else self.context_name()
+        super(RecurrentLayerInput, self).__init__(
+            name=name, parent_layers=parent_layers)
 
     def context_name(self):
         return self.__recurrent_name__ + ".begin"
 
-    def to_proto_impl(self, context, **kwargs):
+    def to_proto_impl(self, **kwargs):
         model_type('recurrent_nn')
         RecurrentLayerGroupWithoutOutLinksBegin(
             name=self.__recurrent_name__,
@@ -436,6 +470,11 @@ def recurrent_group(step, input, name=None):
         for i in xrange(len(non_static_inputs))
     ]
 
+    extra_input = None
+    if len(non_static_inputs) == 0:
+        extra_input = RecurrentLayerInput(
+            recurrent_name=name, index=-1, parent_layers={})
+
     def __real_step__(*args):
         rnn_input = list(args)
         static_inputs = filter(lambda x: isinstance(x, StaticInputV2), input)
@@ -443,6 +482,7 @@ def recurrent_group(step, input, name=None):
             mem_name = "__%s_memory__" % static_input.input.name
             mem = memory(
                 name=mem_name,
+                extra_input=extra_input,
                 is_seq=static_input.is_seq,
                 size=static_input.input.calculate_size,
                 boot_layer=static_input.input)
@@ -472,6 +512,65 @@ def recurrent_group(step, input, name=None):
     return retv
 
 
+@wrap_name_default()
+def beam_search(step,
+                input,
+                bos_id,
+                eos_id,
+                beam_size,
+                max_length=500,
+                name=None,
+                num_results_per_sample=None):
+    if num_results_per_sample is None:
+        num_results_per_sample = beam_size
+    assert num_results_per_sample <= beam_size
+    # logger.warning("num_results_per_sample should be less than beam_size")
+
+    if isinstance(input, StaticInputV2) or isinstance(input,
+                                                      BaseGeneratedInputV2):
+        input = [input]
+
+    generated_input_index = -1
+
+    real_input = []
+    for i, each_input in enumerate(input):
+        assert isinstance(each_input, StaticInputV2) or isinstance(
+            each_input, BaseGeneratedInputV2)
+        if isinstance(each_input, BaseGeneratedInputV2):
+            assert generated_input_index == -1
+            generated_input_index = i
+        else:
+            real_input.append(each_input)
+
+    assert generated_input_index != -1
+
+    gipt = input[generated_input_index]
+    assert isinstance(gipt, BaseGeneratedInputV2)
+
+    gipt.bos_id = bos_id
+    gipt.eos_id = eos_id
+
+    def __real_step__(*args):
+        eos_name = "__%s_eos_layer__" % name
+        generator = RecurrentLayerGroupSetGeneratorV2(
+            eos_name, max_length, beam_size, num_results_per_sample)
+
+        args = list(args)
+        before_step_layer = gipt.before_real_step()
+        before_step_layer.append_child(
+            layer=generator, parent_names=[before_step_layer.name])
+        args.insert(generated_input_index, before_step_layer)
+
+        predict = gipt.after_real_step(step(*args))
+
+        eos_layer = eos(input=predict, eos_id=eos_id, name=eos_name)
+        predict.append_child(layer=eos_layer, parent_names=[predict.name])
+
+        return predict
+
+    return recurrent_group(step=__real_step__, input=real_input, name=name)
+
+
 __projection_names__ = filter(lambda x: x.endswith('_projection'),
                               dir(conf_helps))
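With beam_search in place, the extra_layers argument added to parse_network above lets an off-path generation graph be compiled alongside the training cost. A hedged sketch; `cost` and `beam_gen` are assumed layers, e.g. produced by the seqToseq demo's network definition:

    import paddle.v2 as paddle

    # Sketch only: `cost` and `beam_gen` are assumed to come from
    # seqToseq_net(..., is_generating=False/True) in the demo above.
    model_config = paddle.layer.parse_network(cost, extra_layers=[beam_gen])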
diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py
index 05dc5c68dd97b00fb15b74564a32313430c45345..d686d09f220671fce50be0784e354f97cb109f32 100644
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
@@ -159,7 +159,8 @@ class Parameters(object):
         if not self.has_key(key):
             raise ValueError("No such parameter %s" % key)
         conf = self.__param_conf__[key]
-        return tuple(map(int, conf.dims))
+        dims = conf.dims if conf.dims else (1, conf.size)
+        return tuple(map(int, dims))
 
     def __setitem__(self, key, value):
         """
diff --git a/python/paddle/v2/tests/test_layer.py b/python/paddle/v2/tests/test_layer.py
index 2378f68ea89aa0268eb2482359e1fde71442258d..c67f3b84d96eb92d94ad80cc54c5e056103c1a1a 100644
--- a/python/paddle/v2/tests/test_layer.py
+++ b/python/paddle/v2/tests/test_layer.py
@@ -59,13 +59,13 @@ class ImageLayerTest(unittest.TestCase):
             num_channels=16,
             pool_type=pooling.Max())
         maxout = layer.maxout(input=conv, num_channels=16, groups=4)
-        print layer.parse_network(maxpool, spp, maxout)
+        print layer.parse_network([maxpool, spp, maxout])
 
     def test_norm_layer(self):
         norm1 = layer.img_cmrnorm(input=conv, size=5)
         norm2 = layer.batch_norm(input=conv)
         norm3 = layer.sum_to_one_norm(input=conv)
-        print layer.parse_network(norm1, norm2, norm3)
+        print layer.parse_network([norm1, norm2, norm3])
 
 
 class AggregateLayerTest(unittest.TestCase):
@@ -78,7 +78,8 @@ class AggregateLayerTest(unittest.TestCase):
         first_seq = layer.first_seq(input=pixel)
         concat = layer.concat(input=[last_seq, first_seq])
         seq_concat = layer.seq_concat(a=last_seq, b=first_seq)
-        print layer.parse_network(pool, last_seq, first_seq, concat, seq_concat)
+        print layer.parse_network(
+            [pool, last_seq, first_seq, concat, seq_concat])
 
 
 class MathLayerTest(unittest.TestCase):
@@ -95,8 +96,10 @@ class MathLayerTest(unittest.TestCase):
         tensor = layer.tensor(a=pixel, b=pixel, size=1000)
         cos_sim = layer.cos_sim(a=pixel, b=pixel)
         trans = layer.trans(input=tensor)
-        print layer.parse_network(addto, linear_comb, interpolation, power,
-                                  scaling, slope, tensor, cos_sim, trans)
+        print layer.parse_network([
+            addto, linear_comb, interpolation, power, scaling, slope, tensor,
+            cos_sim, trans
+        ])
 
 
 class ReshapeLayerTest(unittest.TestCase):
@@ -110,7 +113,8 @@ class ReshapeLayerTest(unittest.TestCase):
         repeat = layer.repeat(input=pixel, num_repeats=4)
         reshape = layer.seq_reshape(input=pixel, reshape_size=4)
         rotate = layer.rotate(input=pixel, height=16, width=49)
-        print layer.parse_network(block_expand, expand, repeat, reshape, rotate)
+        print layer.parse_network(
+            [block_expand, expand, repeat, reshape, rotate])
 
 
 class RecurrentLayerTest(unittest.TestCase):
@@ -119,7 +123,7 @@ class RecurrentLayerTest(unittest.TestCase):
         recurrent = layer.recurrent(input=word)
         lstm = layer.lstmemory(input=word)
         gru = layer.grumemory(input=word)
-        print layer.parse_network(recurrent, lstm, gru)
+        print layer.parse_network([recurrent, lstm, gru])
 
 
 class CostLayerTest(unittest.TestCase):
@@ -139,10 +143,10 @@ class CostLayerTest(unittest.TestCase):
         cost10 = layer.sum_cost(input=inference)
         cost11 = layer.huber_cost(input=score, label=label)
 
-        print layer.parse_network(cost1, cost2)
-        print layer.parse_network(cost3, cost4)
-        print layer.parse_network(cost5, cost6)
-        print layer.parse_network(cost7, cost8, cost9, cost10, cost11)
+        print layer.parse_network([cost1, cost2])
+        print layer.parse_network([cost3, cost4])
+        print layer.parse_network([cost5, cost6])
+        print layer.parse_network([cost7, cost8, cost9, cost10, cost11])
 
         crf = layer.crf(input=inference, label=label)
         crf_decoding = layer.crf_decoding(input=inference, size=3)
@@ -151,8 +155,8 @@ class CostLayerTest(unittest.TestCase):
         nce = layer.nce(input=inference, label=label, num_classes=3)
         hsigmoid = layer.hsigmoid(input=inference, label=label, num_classes=3)
 
-        print layer.parse_network(crf, crf_decoding, ctc, warp_ctc, nce,
-                                  hsigmoid)
+        print layer.parse_network(
+            [crf, crf_decoding, ctc, warp_ctc, nce, hsigmoid])
 
 
 class OtherLayerTest(unittest.TestCase):
@@ -160,7 +164,7 @@ class OtherLayerTest(unittest.TestCase):
         maxid = layer.max_id(input=inference)
         sampling_id = layer.sampling_id(input=inference)
         eos = layer.eos(input=maxid, eos_id=5)
-        print layer.parse_network(maxid, sampling_id, eos)
+        print layer.parse_network([maxid, sampling_id, eos])
 
     def test_slicing_joining_layer(self):
         pad = layer.pad(input=conv, pad_c=[2, 3], pad_h=[1, 2], pad_w=[3, 1])
diff --git a/python/paddle/v2/topology.py b/python/paddle/v2/topology.py
index f0679c5675b0c0f24f28f3df22efd4eb51ccbb3a..737b6bf1e2eb60281d4d6e92667d9fe91e243704 100644
--- a/python/paddle/v2/topology.py
+++ b/python/paddle/v2/topology.py
@@ -17,7 +17,6 @@ import collections
 from paddle.proto.ModelConfig_pb2 import ModelConfig
 
 import layer as v2_layer
-from layer import WithExtraParent
 
 __all__ = ['Topology']
 
@@ -41,9 +40,8 @@ def __bfs_travel__(callback, *layers):
         __break__ = callback(each_layer)
         if __break__:
             return
-        __layers__ = each_layer.__parent_layers__.values()
-        if isinstance(each_layer, WithExtraParent):
-            __layers__ = __layers__ + each_layer.extra_parent()
+        __layers__ = each_layer.__parent_layers__.values() + \
+            each_layer.extra_parent()
         __bfs_travel__(callback, *__layers__)
 
 
@@ -53,14 +51,26 @@ class Topology(object):
     and network configs.
     """
 
-    def __init__(self, layers):
-        if not isinstance(layers, collections.Sequence):
-            __check_layer_type__(layers)
-            layers = [layers]
-        for layer in layers:
-            __check_layer_type__(layer)
+    def __init__(self, layers, extra_layers=None):
+        def __check__(layers):
+            if not isinstance(layers, collections.Sequence):
+                __check_layer_type__(layers)
+                layers = [layers]
+            for layer in layers:
+                __check_layer_type__(layer)
+            return layers
+
+        layers = __check__(layers)
         self.layers = layers
-        self.__model_config__ = v2_layer.parse_network(*layers)
+        if extra_layers is not None:
+            extra_layers = __check__(extra_layers)
+
+        self.__model_config__ = v2_layer.parse_network(
+            layers, extra_layers=extra_layers)
+
+        if extra_layers is not None:
+            self.layers.extend(extra_layers)
+
         assert isinstance(self.__model_config__, ModelConfig)
 
     def proto(self):
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
index 4e432a52b209c825ca1b74393cd607db8f884f4f..a207beb548f0fda6d7aacf5c55e53d937e18a924 100644
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -37,9 +37,12 @@ class SGD(object):
     :type cost: paddle.v2.config_base.Layer
     :param parameters: The parameters dictionary.
     :type parameters: paddle.v2.parameters.Parameters
+    :param extra_layers: Layers in the neural network graph that are not in
+        the path of the cost layer.
+    :type extra_layers: paddle.v2.config_base.Layer
     """
 
-    def __init__(self, cost, parameters, update_equation):
+    def __init__(self, cost, parameters, update_equation, extra_layers=None):
         if not isinstance(parameters, v2_parameters.Parameters):
             raise TypeError('parameters should be parameters')
 
@@ -47,7 +50,7 @@ class SGD(object):
         if not isinstance(update_equation, v2_optimizer.Optimizer):
             raise TypeError("update equation parameter must be "
                             "paddle.v2.optimizer.Optimizer")
-        topology = Topology(cost)
+        topology = Topology(cost, extra_layers=extra_layers)
         self.__optimizer__ = update_equation
         self.__topology__ = topology
         self.__parameters__ = parameters
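Putting the pieces together, a hedged sketch of how SGD's new extra_layers argument might be used so that off-path generation layers are parsed into the same topology as the cost (names follow the seqToseq demo above; the optimizer settings are placeholders):

    import paddle.v2 as paddle

    paddle.init(use_gpu=False, trainer_count=1)
    # Illustrative only: both topologies come from the demo's network
    # definition and share parameter names.
    cost = seqToseq_net(source_dict_dim=30000, target_dict_dim=30000)
    beam_gen = seqToseq_net(
        source_dict_dim=30000, target_dict_dim=30000, is_generating=True)

    parameters = paddle.parameters.create(cost)
    optimizer = paddle.optimizer.Adam(learning_rate=5e-5)  # placeholder
    trainer = paddle.trainer.SGD(
        cost=cost,
        parameters=parameters,
        update_equation=optimizer,
        extra_layers=beam_gen)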