diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md index 040f5ffa41968cbf93a817faa1db86c18956341e..ab0be77324450521fee02b7bd7ea12fb9eacf86a 100644 --- a/benchmark/IntelOptimizedPaddle.md +++ b/benchmark/IntelOptimizedPaddle.md @@ -12,11 +12,11 @@ Machine: System: CentOS release 6.3 (Final), Docker 1.12.1. -PaddlePaddle: paddlepaddle/paddle:latest (TODO: will rerun after 0.11.0) - -- MKL-DNN tag v0.10 -- MKLML 2018.0.20170720 +PaddlePaddle: paddlepaddle/paddle:latest (for MKLML and MKL-DNN), paddlepaddle/paddle:latest-openblas (for OpenBLAS) +- MKL-DNN tag v0.11 +- MKLML 2018.0.1.20171007 - OpenBLAS v0.2.20 +(TODO: will rerun after 0.11.0) On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively. @@ -31,15 +31,26 @@ Input image size - 3 * 224 * 224, Time: images/second | BatchSize | 64 | 128 | 256 | |--------------|-------| -----| --------| -| OpenBLAS | 7.82 | 8.62 | 10.34 | -| MKLML | 11.02 | 12.86 | 15.33 | -| MKL-DNN | 27.69 | 28.8 | 29.27 | +| OpenBLAS | 7.80 | 9.00 | 10.80 | +| MKLML | 12.12 | 13.70 | 16.18 | +| MKL-DNN | 28.46 | 29.83 | 30.44 | + + +chart on batch size 128 +TBD + + - ResNet-50 + +| BatchSize | 64 | 128 | 256 | +|--------------|-------| ------| -------| +| OpenBLAS | 25.22 | 25.68 | 27.12 | +| MKLML | 32.52 | 31.89 | 33.12 | +| MKL-DNN | 81.69 | 82.35 | 84.08 | chart on batch size 128 TBD - - ResNet - GoogLeNet ### Laptop diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp index 28969d01a13b7831794cef856af11ad2ec01c31e..6fbf3c7fdec2f537769adb660c67c5a597beb609 100644 --- a/paddle/gserver/layers/MKLDNNLayer.cpp +++ b/paddle/gserver/layers/MKLDNNLayer.cpp @@ -294,22 +294,8 @@ void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) { srcs.push_back(*src); } - // TODO(TJ): remove me when mkldnn sum support different formats - for (size_t i = 1; i < srcPDs.size(); ++i) { - CHECK(srcPDs[0] == srcPDs[i]); - } - tmpOutGrad_ = out; - tmpCvt_ = nullptr; - if (out->getPrimitiveDesc() != srcPDs[0]) { - tmpOutGrad_ = MKLDNNMatrix::create(srcPDs[0]); - tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out); - CHECK(tmpCvt_); - pipelineMergeGrad_.push_back(*tmpCvt_); - } - - auto sumPD = - sum::primitive_desc(tmpOutGrad_->getMemoryDesc(), scales, srcPDs); - mergeGrad_.reset(new sum(sumPD, srcs, *tmpOutGrad_)); + auto sumPD = sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs); + mergeGrad_.reset(new sum(sumPD, srcs, *out)); pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_); } diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h index 907927f984f1a7cd4a72038515569251df48d56f..e48b9b5a91f7f17cb3f31e9140f1428ba8954a20 100644 --- a/paddle/gserver/layers/MKLDNNLayer.h +++ b/paddle/gserver/layers/MKLDNNLayer.h @@ -36,7 +36,7 @@ class MKLDNNLayer : public Layer { protected: // batch size int bs_; - // they sizes are always from the first input layer + // their sizes are always from the first input layer // input image channel, height and width int ic_, ih_, iw_; // output image channel, height and width @@ -94,11 +94,6 @@ protected: std::vector pipelineMergeGrad_; // tmp input argument to save input grad, only used to merge grad Argument tmpInArg_; - // since mkldnn sum do not support different formats: - // can refer to https://github.com/01org/mkl-dnn/issues/134 - // so need create reorder manually and save tmp MKLDNNMatrix - MKLDNNMatrixPtr tmpOutGrad_; - std::shared_ptr tmpCvt_; public: explicit MKLDNNLayer(const LayerConfig& config) diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp index 42644e9601a82ea81c417adc6441edeb036998e2..56b523f220c2a405851b89db5f63e9aa50bfaaf7 100644 --- a/paddle/gserver/tests/test_MKLDNN.cpp +++ b/paddle/gserver/tests/test_MKLDNN.cpp @@ -315,7 +315,7 @@ TEST(MKLDNNLayer, AddtoLayer) { static void getMKLDNNConcatConfig(TestConfig& cfg, const std::vector& inputs) { - CHECK_GE(inputs.size(), 2) << "at least two inputs"; + CHECK_GE(inputs.size(), 2UL) << "at least two inputs"; int oc = inputs[0].ic; for (size_t i = 1; i < inputs.size(); ++i) { CHECK_EQ(inputs[i].bs, inputs[0].bs); diff --git a/paddle/operators/beam_search_op.cc b/paddle/operators/beam_search_op.cc index 17926a813d5b0b8ace6a1b20066cd0007703c696..8c3e2a303fb8f12a8886c11cf112b859a6db7bcf 100644 --- a/paddle/operators/beam_search_op.cc +++ b/paddle/operators/beam_search_op.cc @@ -139,7 +139,7 @@ bool BeamSearch::NextItemSet(std::vector *items) { items->reserve(framework::product(ids.dims())); for (size_t offset = abs_lod[lod_level_][sent_offset_]; offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) { - for (int d = 0; d < instance_dim; d++) { + for (size_t d = 0; d < instance_dim; d++) { const size_t dim_offset = offset * instance_dim + d; items->emplace_back(offset, ids_data[dim_offset], scores_data[dim_offset]); diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp index 88e684849df6fbfe4042b92bdb76ef98159eecea..3e4a2b5fa8a3981f6362edc1dc61ae1616e257ef 100644 --- a/paddle/trainer/Trainer.cpp +++ b/paddle/trainer/Trainer.cpp @@ -138,7 +138,7 @@ void Trainer::init(const std::shared_ptr& config, } if (FLAGS_use_mkldnn) { - CHECK_EQ(FLAGS_trainer_count, 1UL) << "MKLDNN only need 1 trainer"; + CHECK_EQ(FLAGS_trainer_count, 1) << "MKLDNN only need 1 trainer"; } if (testing) { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 0f97c279a0d4d22cb434d63dd39bf16cfb7174ed..0941f10cf1ef337ac0e0225aea250dcdd8a27614 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -2037,13 +2037,20 @@ class ParameterReluLayer(LayerBase): def __init__(self, name, inputs, partial_sum=1, **args): super(ParameterReluLayer, self).__init__( name, self.layer_type, 0, inputs=inputs, **args) + input_layer = self.get_input_layer(0) config_assert(len(self.inputs) == 1, "prelu layer has only one input.") config_assert(input_layer.size % partial_sum == 0, "a wrong setting for partial_sum") + + dims = [1, input_layer.size / partial_sum] self.set_layer_size(input_layer.size) self.config.partial_sum = partial_sum - self.create_input_parameter(0, input_layer.size / partial_sum) + self.create_input_parameter(0, input_layer.size / partial_sum, dims) + + self.set_layer_height_width(self.get_input_layer(0).height, \ + self.get_input_layer(0).width) + self.set_layer_depth(self.get_input_layer(0).depth) @config_layer('conv') diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py index 57979db4de08989ab583b0ab41589c09789a0921..95797fba8f67bacb421f5c2813ad6332bc53cbc9 100644 --- a/python/paddle/trainer_config_helpers/evaluators.py +++ b/python/paddle/trainer_config_helpers/evaluators.py @@ -297,7 +297,7 @@ def auc_evaluator( def pnpair_evaluator( input, label, - info, + query_id, weight=None, name=None, ): """ @@ -308,16 +308,20 @@ def pnpair_evaluator( .. code-block:: python - eval = pnpair_evaluator(input, label, info) + eval = pnpair_evaluator(input, label, query_id) :param input: Input Layer name. The output prediction of network. :type input: LayerOutput :param label: Label layer name. :type label: LayerOutput - :param info: Info layer name. (TODO, explaination) - :type info: LayerOutput + :param query_id: Query_id layer name. Query_id indicates that which query + each sample belongs to. Its shape should be + the same as output of Label layer. + :type query_id: LayerOutput :param weight: Weight Layer name. It should be a matrix with size - [sample_num, 1]. (TODO, explaination) + [sample_num, 1] which indicates the weight of each sample. + The default weight of sample is 1 if the weight layer is None. + And the pair weight is the mean of the two samples' weight. :type weight: LayerOutput :param name: Evaluator name. :type name: None|basestring @@ -326,8 +330,8 @@ def pnpair_evaluator( input = [input] if label: input.append(label) - if info: - input.append(info) + if query_id: + input.append(query_id) evaluator_base( input=input, type="pnpair", diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index a76bc97264b6ba806482d7b226d1d170064a6589..6bd5ce4fe2f70946befb388986dff603bdae0b8e 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -6604,10 +6604,11 @@ def row_conv_layer(input, @layer_support() @wrap_name_default() -@wrap_param_attr_default() def prelu_layer(input, name=None, partial_sum=1, + channel_shared=None, + num_channels=None, param_attr=None, layer_attr=None): """ @@ -6638,6 +6639,14 @@ def prelu_layer(input, - partial_sum = number of outputs, indicates all elements share the same weight. :type partial_sum: int + :param channel_shared: whether or not the parameter are shared across channels. + + - channel_shared = True, we set the partial_sum to the number of outputs. + - channel_shared = False, we set the partial_sum to the number of elements in one channel. + + :type channel_shared: bool + :param num_channels: number of input channel. + :type num_channels: int :param param_attr: The parameter attribute. See ParameterAttribute for details. :type param_attr: ParameterAttribute :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for @@ -6648,7 +6657,25 @@ def prelu_layer(input, """ assert isinstance(input, LayerOutput), 'prelu_layer accepts only one input.' - assert isinstance(param_attr, ParameterAttribute) + + if not param_attr: + param_attr = ParamAttr(initial_mean=0.25, initial_std=0.0) + else: + assert isinstance(param_attr, ParameterAttribute) + + if num_channels is None: + assert input.num_filters is not None, \ + 'the input channel cannot be detected, please specify the num_channels parameter' + num_channels = input.num_filters + + if channel_shared is not None: + assert isinstance(channel_shared, bool) + assert (input.height != 0 and input.width != 0), \ + 'input height and widht must be setted' + if channel_shared: + partial_sum = input.height * input.width * num_channels + else: + partial_sum = input.height * input.width l = Layer( name=name, @@ -6660,6 +6687,7 @@ def prelu_layer(input, name=name, layer_type=LayerType.PRELU, parents=input, + num_filters=num_channels, size=l.config.size) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr index 94ad56cab063df9e6a11bb1c293727fb9dec810f..63fb38c6508675d379f577b965ea17ad4c3b4942 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr @@ -4,6 +4,8 @@ layers { type: "data" size: 300 active_type: "" + height: 10 + width: 10 } layers { name: "__prelu_layer_0__" @@ -15,6 +17,9 @@ layers { input_parameter_name: "___prelu_layer_0__.w0" } partial_sum: 1 + height: 10 + width: 10 + depth: 1 } layers { name: "__prelu_layer_1__" @@ -26,6 +31,9 @@ layers { input_parameter_name: "___prelu_layer_1__.w0" } partial_sum: 1 + height: 10 + width: 10 + depth: 1 } layers { name: "__prelu_layer_2__" @@ -37,41 +45,100 @@ layers { input_parameter_name: "___prelu_layer_2__.w0" } partial_sum: 5 + height: 10 + width: 10 + depth: 1 +} +layers { + name: "__prelu_layer_3__" + type: "prelu" + size: 300 + active_type: "" + inputs { + input_layer_name: "input" + input_parameter_name: "___prelu_layer_3__.w0" + } + partial_sum: 300 + height: 10 + width: 10 + depth: 1 +} +layers { + name: "__prelu_layer_4__" + type: "prelu" + size: 300 + active_type: "" + inputs { + input_layer_name: "input" + input_parameter_name: "___prelu_layer_4__.w0" + } + partial_sum: 100 + height: 10 + width: 10 + depth: 1 } parameters { name: "___prelu_layer_0__.w0" size: 300 - initial_mean: 0.0 - initial_std: 0.057735026919 + initial_mean: 0.25 + initial_std: 0.0 + dims: 1 + dims: 300 initial_strategy: 0 - initial_smart: true + initial_smart: false } parameters { name: "___prelu_layer_1__.w0" size: 300 - initial_mean: 0.0 - initial_std: 0.057735026919 + initial_mean: 0.25 + initial_std: 0.0 + dims: 1 + dims: 300 initial_strategy: 0 - initial_smart: true + initial_smart: false } parameters { name: "___prelu_layer_2__.w0" size: 60 - initial_mean: 0.0 - initial_std: 0.129099444874 + initial_mean: 0.25 + initial_std: 0.0 + dims: 1 + dims: 60 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___prelu_layer_3__.w0" + size: 1 + initial_mean: 0.25 + initial_std: 0.0 + dims: 1 + dims: 1 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___prelu_layer_4__.w0" + size: 3 + initial_mean: 0.25 + initial_std: 0.0 + dims: 1 + dims: 3 initial_strategy: 0 - initial_smart: true + initial_smart: false } input_layer_names: "input" -output_layer_names: "__prelu_layer_2__" +output_layer_names: "__prelu_layer_4__" sub_models { name: "root" layer_names: "input" layer_names: "__prelu_layer_0__" layer_names: "__prelu_layer_1__" layer_names: "__prelu_layer_2__" + layer_names: "__prelu_layer_3__" + layer_names: "__prelu_layer_4__" input_layer_names: "input" - output_layer_names: "__prelu_layer_2__" + output_layer_names: "__prelu_layer_4__" is_recurrent_layer_group: false } diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py index aae90fab32db78a70c2169ed8fafb930433f4136..45b02fbf325bb63b057bbbf64d59af8debf0bc9d 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py @@ -1,8 +1,10 @@ from paddle.trainer_config_helpers import * -data = data_layer(name='input', size=300) -prelu = prelu_layer(input=data) -prelu = prelu_layer(input=data, partial_sum=1) -prelu = prelu_layer(input=data, partial_sum=5) +data = data_layer(name='input', size=300, height=10, width=10) +prelu = prelu_layer(input=data, num_channels=3) +prelu = prelu_layer(input=data, partial_sum=1, num_channels=3) +prelu = prelu_layer(input=data, partial_sum=5, num_channels=3) +prelu = prelu_layer(input=data, channel_shared=True, num_channels=3) +prelu = prelu_layer(input=data, channel_shared=False, num_channels=3) outputs(prelu) diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index 7bbe3eaaa67a117bc53571e6571365c3a26814c1..4edc96437f8490012cd58526d8f8b23983074048 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -62,21 +62,15 @@ __all__ = [ cp.begin_parse() -def init(**kwargs): - import py_paddle.swig_paddle as api - args = [] - args_dict = {} - # NOTE: append arguments if they are in ENV - for ek, ev in os.environ.iteritems(): - if ek.startswith("PADDLE_INIT_"): - args_dict[ek.replace("PADDLE_INIT_", "").lower()] = str(ev) +def set_omp_mkl_env_vars(trainer_count): + '''Auto set CPU environment if have not set before. + export KMP_AFFINITY, OMP_DYNAMIC according to the Hyper Threading status. + export OMP_NUM_THREADS, MKL_NUM_THREADS according to trainer_count. + ''' + import platform + if not platform.system() in ['Linux', 'Darwin']: + return - args_dict.update(kwargs) - # NOTE: overwrite arguments from ENV if it is in kwargs - for key in args_dict.keys(): - args.append('--%s=%s' % (key, str(args_dict[key]))) - - # auto set cpu environment def set_env(key, value): '''If the key has not been set in the environment, set it with value.''' assert isinstance(key, str) @@ -85,22 +79,59 @@ def init(**kwargs): if envset is None: os.environ[key] = value - ht = os.popen("lscpu |grep \"per core\"|awk -F':' '{print $2}'|xargs") - ht = int(ht.read()) - if ht == 1: # ht is off - set_env("OMP_DYNAMIC", "false") - set_env("KMP_AFFINITY", "granularity=fine,compact,0,0") - else: + def num_physical_cores(): + '''Get the number of physical cores''' + if platform.system() == "Linux": + num_sockets = int( + os.popen("lscpu |grep \"Socket\" |awk -F':' '{print $2}'|xargs") + .read()) + num_cores_per_socket = int( + os.popen( + "lscpu |grep \"per socket\" |awk -F':' '{print $2}'|xargs") + .read()) + return num_sockets * num_cores_per_socket + else: + cmds = {"Darwin": "sysctl hw.physicalcpu"} + return int(os.popen(cmds.get(platform.system(), "expr 1")).read()) + + def num_logical_processors(): + '''Get the number of logical processors''' + cmds = { + "Linux": "grep \"processor\" /proc/cpuinfo|sort -u|wc -l", + "Darwin": "sysctl hw.logicalcpu" + } + return int(os.popen(cmds.get(platform.system(), "expr 1")).read()) + + num_cores = num_physical_cores() + num_processors = num_logical_processors() + if num_processors > num_cores: # Hyper Threading is enabled set_env("OMP_DYNAMIC", "true") set_env("KMP_AFFINITY", "granularity=fine,compact,1,0") - processors = os.popen("grep \"processor\" /proc/cpuinfo|sort -u|wc -l") - processors = int(processors.read()) - trainers = kwargs.get('trainer_count', 1) - threads = processors / trainers + else: + set_env("OMP_DYNAMIC", "false") + set_env("KMP_AFFINITY", "granularity=fine,compact,0,0") + threads = num_processors / trainer_count threads = '1' if threads < 1 else str(threads) set_env("OMP_NUM_THREADS", threads) set_env("MKL_NUM_THREADS", threads) + +def init(**kwargs): + import py_paddle.swig_paddle as api + args = [] + args_dict = {} + # NOTE: append arguments if they are in ENV + for ek, ev in os.environ.iteritems(): + if ek.startswith("PADDLE_INIT_"): + args_dict[ek.replace("PADDLE_INIT_", "").lower()] = str(ev) + + args_dict.update(kwargs) + # NOTE: overwrite arguments from ENV if it is in kwargs + for key in args_dict.keys(): + args.append('--%s=%s' % (key, str(args_dict[key]))) + + set_omp_mkl_env_vars(kwargs.get('trainer_count', 1)) + if 'use_gpu' in kwargs: cp.g_command_config_args['use_gpu'] = kwargs['use_gpu'] if 'use_mkldnn' in kwargs: diff --git a/python/paddle/v2/fluid/initializer.py b/python/paddle/v2/fluid/initializer.py index ded144ecd5db83ce50ca0dc6243fdc52ac0b7a2f..1a9d804ee7ee8e6463d42fefb809fb45888fd064 100644 --- a/python/paddle/v2/fluid/initializer.py +++ b/python/paddle/v2/fluid/initializer.py @@ -285,3 +285,86 @@ class XavierInitializer(Initializer): }) var.op = op return op + + +class MSRAInitializer(Initializer): + """Implements the MSRA initializer a.k.a. Kaiming Initializer + + This class implements the weight initialization from the paper + Delving Deep into Rectifiers: Surpassing Human-Level Performance on + ImageNet Classification[1] by Kaiming He, Xiangyu Zhang, Shaoqing Ren + and Jian Sun. This is a robust initialization method that particularly + considers the rectifier nonlinearities. In case of Uniform distribution, + the range is [-x, x], where x = sqrt(6 / fan_in). In case of Normal + distribution, the mean is 0 and the standard deviation + is sqrt(2/ fan_in). + + References: + [1] Delving Deep into Rectifiers: Surpassing Human-Level Performance + on ImageNet Classification + (https://arxiv.org/abs/1502.01852) + """ + + def __init__(self, uniform=True, fan_in=None, seed=0): + """Constructor for MSRAInitializer + + Args: + uniform: whether to use uniform or normal distribution + fan_in: fan_in for MSRAInitializer. If None, it is + inferred from the variable. + seed: random seed + + Note: It is recommended to set fan_in to None for most cases. + """ + assert uniform is not None + assert seed is not None + super(MSRAInitializer, self).__init__() + self._uniform = uniform + self._fan_in = fan_in + self._seed = seed + + def __call__(self, var, block): + """Add MSRA initialization ops for a variable + + Args: + var: Variable that needs to be initialized + block: The block in which initialization ops + should be added + + Returns: + the initialization op + """ + assert isinstance(var, framework.Variable) + assert isinstance(block, framework.Block) + f_in, f_out = self._compute_fans(var) + + # If fan_in is passed, use it + fan_in = f_in if self._fan_in is None else self._fan_in + + if self._uniform: + limit = np.sqrt(6.0 / float(fan_in)) + op = block.prepend_op( + type="uniform_random", + outputs={"Out": var}, + attrs={ + "shape": var.shape, + "data_type": int(var.data_type), + "min": -limit, + "max": limit, + "seed": self._seed + }) + + else: + std = np.sqrt(2.0 / float(fan_in)) + op = block.prepend_op( + type="gaussian_random", + outputs={"Out": var}, + attrs={ + "shape": var.shape, + "data_type": int(var.data_type), + "mean": 0.0, + "std": std, + "seed": self._seed + }) + var.op = op + return op diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index 26a10ae766c9c37b68951613c494029a6f162084..abd4b22e8b68f3b5c3e961df83db34e419e7f4d5 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -17,13 +17,13 @@ __all__ = [ def fc(input, size, + num_flatten_dims=1, param_attr=None, param_initializer=None, bias_attr=None, bias_initializer=None, - name=None, act=None, - num_flatten_dims=1, + name=None, main_program=None, startup_program=None): """ @@ -32,15 +32,15 @@ def fc(input, Args: input: The input tensor to the function size: The size of the layer + num_flatten_dims: Number of columns in input param_attr: The parameters/weights to the FC Layer param_initializer: Initializer used for the weight/parameter. If None, XavierInitializer() is used bias_attr: The bias parameter for the FC layer bias_initializer: Initializer used for the bias. If None, then ConstantInitializer() is used - name: Name/alias of the function act: Activation to be applied to the output of FC layer - num_flatten_dims: Number of columns in input + name: Name/alias of the function main_program: Name of the main program that calls this startup_program: Name of the startup program @@ -111,9 +111,9 @@ def fc(input, def embedding(input, size, - data_type='float32', is_sparse=False, param_attr=None, + data_type='float32', main_program=None, startup_program=None): """ @@ -122,9 +122,9 @@ def embedding(input, Args: input: The input to the function size: The size of the layer - data_type: The type of data : float32, float_16, int etc is_sparse: A flag that decleares whether the input is sparse param_attr: Parameters for this layer + data_type: The type of data : float32, float_16, int etc main_program: Name of the main program that calls this startup_program: Name of the startup program @@ -152,7 +152,6 @@ def embedding(input, # TODO(qijun): expose H0 and C0 def dynamic_lstm(input, size, - data_type='float32', param_attr=None, bias_attr=None, use_peepholes=True, @@ -160,6 +159,7 @@ def dynamic_lstm(input, gate_activation='sigmoid', cell_activation='tanh', candidate_activation='tanh', + data_type='float32', main_program=None, startup_program=None): helper = LayerHelper('lstm', **locals()) @@ -200,9 +200,9 @@ def dynamic_lstm(input, def data(name, shape, + append_batch_size=True, data_type='float32', type=core.VarDesc.VarType.LOD_TENSOR, - append_batch_size=True, main_program=None, startup_program=None, stop_gradient=True): @@ -212,9 +212,9 @@ def data(name, Args: name: The name/alias of the function shape: Tuple declaring the shape. + append_batch_size: Whether or not to append the data as a batch. data_type: The type of data : float32, float_16, int etc type: The output type. By default it is LOD_TENSOR. - append_batch_size: Whether or not to append the data as a batch. main_program: Name of the main program that calls this startup_program: Name of the startup program stop_gradient: A boolean that mentions whether gradient should flow. @@ -600,12 +600,12 @@ def sequence_conv(input, num_filters, filter_size=3, filter_stride=1, - act=None, padding=None, bias_attr=None, bias_initializer=None, param_attr=None, param_initializer=None, + act=None, main_program=None, startup_program=None): """ @@ -658,16 +658,16 @@ def sequence_conv(input, def conv2d(input, num_filters, - name=None, - filter_size=[1, 1], - act=None, - groups=None, + filter_size, stride=[1, 1], padding=None, - bias_attr=None, - bias_initializer=None, + groups=None, param_attr=None, param_initializer=None, + bias_attr=None, + bias_initializer=None, + act=None, + name=None, main_program=None, startup_program=None): """ diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py index 280f6e902c34512735a27586221c2be68963ef2b..9a51a2f207ebed340b8e5c60e7ebeb82a611dbc5 100644 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py +++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py @@ -54,17 +54,17 @@ def to_lodtensor(data, place): return res -def chop_data(data, chop_len=80, batch_len=50): +def chop_data(data, chop_len=80, batch_size=50): data = [(x[0][:chop_len], x[1]) for x in data if len(x[0]) >= chop_len] - return data[:batch_len] + return data[:batch_size] def prepare_feed_data(data, place): tensor_words = to_lodtensor(map(lambda x: x[0], data), place) label = np.array(map(lambda x: x[1], data)).astype("int64") - label = label.reshape([50, 1]) + label = label.reshape([len(label), 1]) tensor_label = core.LoDTensor() tensor_label.set(label, place) @@ -72,33 +72,41 @@ def prepare_feed_data(data, place): def main(): - word_dict = paddle.dataset.imdb.word_dict() - cost, acc = lstm_net(dict_dim=len(word_dict), class_dim=2) + BATCH_SIZE = 100 + PASS_NUM = 5 - batch_size = 100 - train_data = paddle.batch( - paddle.reader.buffered( - paddle.dataset.imdb.train(word_dict), size=batch_size * 10), - batch_size=batch_size) + word_dict = paddle.dataset.imdb.word_dict() + print "load word dict successfully" + dict_dim = len(word_dict) + class_dim = 2 - data = chop_data(next(train_data())) + cost, acc = lstm_net(dict_dim=dict_dim, class_dim=class_dim) + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.imdb.train(word_dict), buf_size=BATCH_SIZE * 10), + batch_size=BATCH_SIZE) place = core.CPUPlace() - tensor_words, tensor_label = prepare_feed_data(data, place) exe = Executor(place) + exe.run(framework.default_startup_program()) - while True: - outs = exe.run(framework.default_main_program(), - feed={"words": tensor_words, - "label": tensor_label}, - fetch_list=[cost, acc]) - cost_val = np.array(outs[0]) - acc_val = np.array(outs[1]) - - print("cost=" + str(cost_val) + " acc=" + str(acc_val)) - if acc_val > 0.9: - break + for pass_id in xrange(PASS_NUM): + for data in train_data(): + chopped_data = chop_data(data) + tensor_words, tensor_label = prepare_feed_data(chopped_data, place) + + outs = exe.run(framework.default_main_program(), + feed={"words": tensor_words, + "label": tensor_label}, + fetch_list=[cost, acc]) + cost_val = np.array(outs[0]) + acc_val = np.array(outs[1]) + + print("cost=" + str(cost_val) + " acc=" + str(acc_val)) + if acc_val > 0.7: + exit(0) + exit(1) if __name__ == '__main__': diff --git a/python/paddle/v2/fluid/tests/test_initializer.py b/python/paddle/v2/fluid/tests/test_initializer.py index f2eb79b209627f5814847db6d96c0a17300d9b5a..6c20203f8eca02b3f68ed2aa8664bed29551c070 100644 --- a/python/paddle/v2/fluid/tests/test_initializer.py +++ b/python/paddle/v2/fluid/tests/test_initializer.py @@ -223,5 +223,109 @@ class TestXavierInitializer(unittest.TestCase): self.assertEqual(init_op.attr('seed'), 134) +class TestMSRAInitializer(unittest.TestCase): + def test_uniform_msra_initializer(self): + """Test MSRA initializer with uniform distribution on + for matrix multiply. + """ + program = framework.Program() + block = program.global_block() + param = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.MSRAInitializer()) + self.assertEqual(len(block.ops), 1) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'uniform_random') + limit = np.sqrt(6.0 / param.shape[0]) + self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA) + self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA) + self.assertEqual(init_op.attr('seed'), 0) + + def test_uniform_msra_initializer_conv(self): + """Test MSRA initializer with uniform distribution on + for convolutions. + """ + program = framework.Program() + block = program.global_block() + param = block.create_parameter( + dtype="float32", + shape=[5, 10, 15, 20], + lod_level=0, + name="param", + initializer=initializer.MSRAInitializer()) + self.assertEqual(len(block.ops), 1) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'uniform_random') + receptive_field_size = float(15 * 20) + limit = np.sqrt(6.0 / (param.shape[1] * receptive_field_size)) + self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA) + self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA) + self.assertEqual(init_op.attr('seed'), 0) + + def test_normal_msra_initializer(self): + """Test MSRA initializer with normal distribution on + for matrix multiply. + """ + program = framework.Program() + block = program.global_block() + param = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.MSRAInitializer(uniform=False)) + self.assertEqual(len(block.ops), 1) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'gaussian_random') + std = np.sqrt(2.0 / param.shape[0]) + self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA) + self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA) + self.assertEqual(init_op.attr('seed'), 0) + + def test_normal_msra_initializer_conv(self): + """Test MSRA initializer with normal distribution on + for convolutions. + """ + program = framework.Program() + block = program.global_block() + param = block.create_parameter( + dtype="float32", + shape=[5, 10, 15, 20], + lod_level=0, + name="param", + initializer=initializer.MSRAInitializer(uniform=False)) + self.assertEqual(len(block.ops), 1) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'gaussian_random') + receptive_field_size = float(15 * 20) + std = np.sqrt(2.0 / (param.shape[1] * receptive_field_size)) + self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA) + self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA) + self.assertEqual(init_op.attr('seed'), 0) + + def test_msra_initializer_supplied_arguments(self): + """Test the MSRA initializer with supplied arguments + """ + program = framework.Program() + block = program.global_block() + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.MSRAInitializer( + fan_in=12, seed=134)) + self.assertEqual(len(block.ops), 1) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'uniform_random') + limit = np.sqrt(6.0 / 12) + self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA) + self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA) + self.assertEqual(init_op.attr('seed'), 134) + + if __name__ == '__main__': unittest.main()