diff --git a/benchmark/paddle/image/alexnet.py b/benchmark/paddle/image/alexnet.py index 3358d43a4b08c6a9b89d59e1a8be53ee1f12bbe0..77d130ae34059d1e87040d00346ac1dadd86b0d8 100644 --- a/benchmark/paddle/image/alexnet.py +++ b/benchmark/paddle/image/alexnet.py @@ -6,8 +6,18 @@ height = 227 width = 227 num_class = 1000 batch_size = get_config_arg('batch_size', int, 128) +gp = get_config_arg('layer_num', int, 1) +is_infer = get_config_arg("is_infer", bool, False) +num_samples = get_config_arg('num_samples', int, 2560) -args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} +args = { + 'height': height, + 'width': width, + 'color': True, + 'num_class': num_class, + 'is_infer': is_infer, + 'num_samples': num_samples +} define_py_data_sources2( "train.list", None, module="provider", obj="process", args=args) @@ -31,7 +41,7 @@ net = img_pool_layer(input=net, pool_size=3, stride=2) # conv2 net = img_conv_layer( - input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1) + input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp) net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75) net = img_pool_layer(input=net, pool_size=3, stride=2) @@ -40,11 +50,11 @@ net = img_conv_layer( input=net, filter_size=3, num_filters=384, stride=1, padding=1) # conv4 net = img_conv_layer( - input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1) + input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp) # conv5 net = img_conv_layer( - input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1) + input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp) net = img_pool_layer(input=net, pool_size=3, stride=2) net = fc_layer( @@ -59,6 +69,9 @@ net = fc_layer( layer_attr=ExtraAttr(drop_rate=0.5)) net = fc_layer(input=net, size=1000, act=SoftmaxActivation()) -lab = data_layer('label', num_class) -loss = cross_entropy(input=net, label=lab) -outputs(loss) +if is_infer: + outputs(net) +else: + lab = data_layer('label', num_class) + loss = cross_entropy(input=net, label=lab) + outputs(loss) diff --git a/benchmark/paddle/image/googlenet.py b/benchmark/paddle/image/googlenet.py index 7059c13bd2c2b98eb3fbcf633a6f7064e54d5402..2a850ccb7f2c75b467554181fc5f4aa8f2b97a09 100644 --- a/benchmark/paddle/image/googlenet.py +++ b/benchmark/paddle/image/googlenet.py @@ -7,13 +7,15 @@ num_class = 1000 batch_size = get_config_arg('batch_size', int, 128) use_gpu = get_config_arg('use_gpu', bool, True) is_infer = get_config_arg("is_infer", bool, False) +num_samples = get_config_arg('num_samples', int, 2560) args = { 'height': height, 'width': width, 'color': True, 'num_class': num_class, - 'is_infer': is_infer + 'is_infer': is_infer, + 'num_samples': num_samples } define_py_data_sources2( "train.list" if not is_infer else None, diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py index 927b1759941f362ef4b5ffe84dd01332986d9306..1018ec9ce1e529f618ddd7b7afa72a84c5e876a1 100644 --- a/benchmark/paddle/image/provider.py +++ b/benchmark/paddle/image/provider.py @@ -14,6 +14,7 @@ def initHook(settings, height, width, color, num_class, **kwargs): else: settings.data_size = settings.height * settings.width settings.is_infer = kwargs.get('is_infer', False) + settings.num_samples = kwargs.get('num_samples', 2560) if settings.is_infer: settings.slots = [dense_vector(settings.data_size)] else: @@ -23,7 +24,7 @@ def initHook(settings, height, width, color, num_class, **kwargs): @provider( init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM) def process(settings, file_list): - for i in xrange(2560 if settings.is_infer else 1024): + for i in xrange(settings.num_samples): img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten() if settings.is_infer: yield img.astype('float32') diff --git a/benchmark/paddle/image/resnet.py b/benchmark/paddle/image/resnet.py index 4a14363ff1db48a5072cbb5f5eb3bc9241ffca8f..2846e4763f1cda4602f03af5ec649d57ee6cf0d8 100644 --- a/benchmark/paddle/image/resnet.py +++ b/benchmark/paddle/image/resnet.py @@ -7,13 +7,15 @@ num_class = 1000 batch_size = get_config_arg('batch_size', int, 64) layer_num = get_config_arg("layer_num", int, 50) is_infer = get_config_arg("is_infer", bool, False) +num_samples = get_config_arg('num_samples', int, 2560) args = { 'height': height, 'width': width, 'color': True, 'num_class': num_class, - 'is_infer': is_infer + 'is_infer': is_infer, + 'num_samples': num_samples } define_py_data_sources2( "train.list" if not is_infer else None, diff --git a/benchmark/paddle/image/run_mkl_infer.sh b/benchmark/paddle/image/run_mkl_infer.sh index d795bcab1b7d098295066f79189d17e8299d28fb..62c9bf6efd3810f506fd4592b2ba3a21b1b7f0e7 100755 --- a/benchmark/paddle/image/run_mkl_infer.sh +++ b/benchmark/paddle/image/run_mkl_infer.sh @@ -37,7 +37,7 @@ function infer() { --trainer_count=1 \ --num_passes=1 \ --save_dir="models/${topology}-${layer_num}" \ - --config_args="batch_size=128,layer_num=${layer_num}" \ + --config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \ > /dev/null 2>&1 echo "Done" fi @@ -79,8 +79,9 @@ fi # inference benchmark for use_mkldnn in True False; do for batchsize in 1 2 4 8 16; do - infer googlenet v1 $batchsize $use_mkldnn - infer resnet 50 $batchsize $use_mkldnn infer vgg 19 $batchsize $use_mkldnn + infer resnet 50 $batchsize $use_mkldnn + infer googlenet v1 $batchsize $use_mkldnn + infer alexnet 2 $batchsize $use_mkldnn done done diff --git a/benchmark/paddle/image/run_mkl_train.sh b/benchmark/paddle/image/run_mkl_train.sh index 5335af5ac1b9a4a48ec107b8b6386b50ead8284c..03d2d378fb72e36f765d89af788f6ee96fe21d4e 100755 --- a/benchmark/paddle/image/run_mkl_train.sh +++ b/benchmark/paddle/image/run_mkl_train.sh @@ -47,5 +47,6 @@ for use_mkldnn in True False; do train vgg 19 $batchsize $use_mkldnn train resnet 50 $batchsize $use_mkldnn train googlenet v1 $batchsize $use_mkldnn + train alexnet 2 $batchsize $use_mkldnn done done diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh index c1001d3a7c95a293d0b2b5b78fb7415e167b3e9f..da034f3b9dff794e22086a5295ad2b0c2361c356 100755 --- a/benchmark/paddle/image/run_openblas_infer.sh +++ b/benchmark/paddle/image/run_openblas_infer.sh @@ -23,24 +23,25 @@ function infer() { echo "./run_mkl_infer.sh to save the model first" exit 0 fi - log_period=$((256 / bs)) + log_period=$((32 / bs)) paddle train --job=test \ --config="${topology}.py" \ + --use_mkldnn=False \ --use_gpu=False \ --trainer_count=$thread \ --log_period=$log_period \ - --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \ + --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \ --init_model_path=$models_in \ 2>&1 | tee ${log} - # calculate the last 5 logs period time of 1280 samples, + # calculate the last 5 logs period time of 160(=32*5) samples, # the time before are burning time. start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs` end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs` start_sec=`clock_to_seconds $start` end_sec=`clock_to_seconds $end` - fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'` - echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log} + fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'` + echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log} echo "FPS: $fps images/sec" 2>&1 | tee -a ${log} } @@ -56,7 +57,8 @@ fi # inference benchmark for batchsize in 1 2 4 8 16; do - infer googlenet v1 $batchsize - infer resnet 50 $batchsize infer vgg 19 $batchsize + infer resnet 50 $batchsize + infer googlenet v1 $batchsize + infer alexnet 2 $batchsize done diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh index b9494ce119523953a3360b2b67e2cb6f3e0f1643..e9df83fee2a3f796b7234b39619364f6ee4d5dc9 100755 --- a/benchmark/paddle/image/run_openblas_train.sh +++ b/benchmark/paddle/image/run_openblas_train.sh @@ -12,10 +12,11 @@ function train() { config="${topology}.py" paddle train --job=time \ --config=$config \ + --use_mkldnn=False \ --use_gpu=False \ --trainer_count=$thread \ - --log_period=10 \ - --test_period=100 \ + --log_period=3 \ + --test_period=30 \ --config_args=$args \ 2>&1 | tee ${log} @@ -36,4 +37,5 @@ for batchsize in 64 128 256; do train vgg 19 $batchsize train resnet 50 $batchsize train googlenet v1 $batchsize + train alexnet 2 $batchsize done diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py index 8d0a1e97a451cd52ef17e4e326673cc90059ef3c..ca0a6798fb8c35b68cf84d263855955eb93ba0b0 100644 --- a/benchmark/paddle/image/vgg.py +++ b/benchmark/paddle/image/vgg.py @@ -7,13 +7,15 @@ num_class = 1000 batch_size = get_config_arg('batch_size', int, 64) layer_num = get_config_arg('layer_num', int, 19) is_infer = get_config_arg("is_infer", bool, False) +num_samples = get_config_arg('num_samples', int, 2560) args = { 'height': height, 'width': width, 'color': True, 'num_class': num_class, - 'is_infer': is_infer + 'is_infer': is_infer, + 'num_samples': num_samples } define_py_data_sources2( "train.list" if not is_infer else None, diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index fab2af362bb070a54987b6499748056f3d12a56b..ff5855052dabaa0b63099cd219f3f04e22f1aa85 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -253,9 +253,9 @@ IF(NOT PROTOBUF_FOUND) IF(WITH_C_API) INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf) IF(ANDROID) - INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI}) + INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI}) ELSE() - INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib) + INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib) ENDIF() ENDIF() diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index c3f9c18d0663a7a24880b441981875c1e4f015aa..d81481ca819c13ee0e299c204f998f3915c34bd4 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -467,7 +467,7 @@ lambda_cost :noindex: square_error_cost --------- +----------------- .. autoclass:: paddle.v2.layer.square_error_cost :noindex: @@ -533,7 +533,7 @@ Miscs ===== dropout --------------- +-------- .. autoclass:: paddle.v2.layer.dropout :noindex: diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst index fc29795d1241f86d32c441114323162fdc4b50ba..ef9febe0aa9d1f65e6608495f6ad7d4f502acf59 100644 --- a/doc/api/v2/fluid/layers.rst +++ b/doc/api/v2/fluid/layers.rst @@ -19,17 +19,17 @@ dynamic_lstm :noindex: data ---------- +---- .. autofunction:: paddle.v2.fluid.layers.data :noindex: mean ---------- +---- .. autofunction:: paddle.v2.fluid.layers.mean :noindex: mul ---------- +--- .. autofunction:: paddle.v2.fluid.layers.mul :noindex: @@ -45,13 +45,13 @@ elementwise_div dropout ---------- +------- .. autofunction:: paddle.v2.fluid.layers.dropout :noindex: reshape ---------- +-------- .. autofunction:: paddle.v2.fluid.layers.reshape :noindex: @@ -81,67 +81,67 @@ transpose sigmoid_cross_entropy_with_logits ---------- +--------------------------------- .. autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits :noindex: cast ---------- +---- .. autofunction:: paddle.v2.fluid.layers.cast :noindex: concat ---------- +------- .. autofunction:: paddle.v2.fluid.layers.concat :noindex: sums ---------- +---- .. autofunction:: paddle.v2.fluid.layers.sums :noindex: linear_chain_crf ---------- +---------------- .. autofunction:: paddle.v2.fluid.layers.linear_chain_crf :noindex: assign ---------- +------- .. autofunction:: paddle.v2.fluid.layers.embedding :noindex: split_lod_tensor ---------- +---------------- .. autofunction:: paddle.v2.fluid.layers.split_lod_tensor :noindex: merge_lod_tensor ---------- +---------------- .. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor :noindex: cos_sim ---------- +-------- .. autofunction:: paddle.v2.fluid.layers.cos_sim :noindex: cross_entropy ---------- +------------- .. autofunction:: paddle.v2.fluid.layers.cross_entropy :noindex: square_error_cost ---------- +----------------- .. autofunction:: paddle.v2.fluid.layers.square_error_cost :noindex: @@ -153,19 +153,19 @@ accuracy sequence_conv ---------- +------------- .. autofunction:: paddle.v2.fluid.layers.sequence_conv :noindex: conv2d ---------- +------ .. autofunction:: paddle.v2.fluid.layers.conv2d :noindex: sequence_pool ---------- +------------- .. autofunction:: paddle.v2.fluid.layers.sequence_pool :noindex: @@ -183,50 +183,50 @@ sequence_last_step pool2d ---------- +------ .. autofunction:: paddle.v2.fluid.layers.pool2d :noindex: batch_norm ---------- +---------- .. autofunction:: paddle.v2.fluid.layers.batch_norm :noindex: beam_search_decode ---------- +------------------ .. autofunction:: paddle.v2.fluid.layers.beam_search_decode :noindex: lod_rank_table ---------- +-------------- .. autofunction:: paddle.v2.fluid.layers.lod_rank_table :noindex: max_sequence_len ---------- +---------------- .. autofunction:: paddle.v2.fluid.layers.max_sequence_len :noindex: topk ---------- +----- .. autofunction:: paddle.v2.fluid.layers.topk :noindex: lod_tensor_to_array ---------- +------------------- .. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array :noindex: array_to_lod_tensor ---------- +------------------- .. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor :noindex: @@ -234,26 +234,26 @@ array_to_lod_tensor fill_constant ---------- +------------- .. autofunction:: paddle.v2.fluid.layers.fill_constant :noindex: fill_constant_batch_size_like ---------- +----------------------------- .. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like :noindex: ones ---------- +---- .. autofunction:: paddle.v2.fluid.layers.ones :noindex: zeros ---------- +----- .. autofunction:: paddle.v2.fluid.layers.zeros :noindex: @@ -265,14 +265,14 @@ increment array_write ---------- +----------- .. autofunction:: paddle.v2.fluid.layers.array_write :noindex: create_array ---------- +------------ .. autofunction:: paddle.v2.fluid.layers.create_array :noindex: @@ -284,31 +284,31 @@ less_than array_read ---------- +---------- .. autofunction:: paddle.v2.fluid.layers.array_read :noindex: shrink_memory ---------- +-------------- .. autofunction:: paddle.v2.fluid.layers.shrink_memory :noindex: array_length ---------- +------------- .. autofunction:: paddle.v2.fluid.layers.array_length :noindex: conv2d_transpose ---------- +---------------- .. autofunction:: paddle.v2.fluid.layers.conv2d_transpose :noindex: sequence_expand ---------- +--------------- .. autofunction:: paddle.v2.fluid.layers.sequence_expand :noindex: @@ -320,13 +320,19 @@ lstm_unit sequence_softmax ---------- +---------------- .. autofunction:: paddle.v2.fluid.layers.sequence_softmax :noindex: reduce_sum ---------- +---------- .. autofunction:: paddle.v2.fluid.layers.reduce_sum :noindex: + +reduce_mean +--------- +.. autofunction:: paddle.v2.fluid.layers.reduce_mean + :noindex: + diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/v2/fluid/nets.rst index 2c3d075422de29c96e25458e831133a30270dd39..b792efb71f85ae643df655568da69c82414e9d5d 100644 --- a/doc/api/v2/fluid/nets.rst +++ b/doc/api/v2/fluid/nets.rst @@ -3,19 +3,19 @@ Nets =========== simple_img_conv_pool ------------ +-------------------- .. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool :noindex: img_conv_group ------------ +--------------- .. autofunction:: paddle.v2.fluid.nets.img_conv_group :noindex: sequence_conv_pool ------------ +------------------ .. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool :noindex: diff --git a/doc/api/v2/fluid/optimizer.rst b/doc/api/v2/fluid/optimizer.rst index 233762fcdfb39e592740adef6721a556fae3feef..19b4940f08de3e2f7dc177f2961e538946d10a78 100644 --- a/doc/api/v2/fluid/optimizer.rst +++ b/doc/api/v2/fluid/optimizer.rst @@ -18,7 +18,7 @@ SGDOptimizer MomentumOptimizer ------------ +----------------- .. automodule:: paddle.v2.fluid.optimizer :members: MomentumOptimizer :noindex: @@ -26,14 +26,14 @@ MomentumOptimizer AdagradOptimizer ------------ +---------------- .. automodule:: paddle.v2.fluid.optimizer :members: AdagradOptimizer :noindex: AdamOptimizer ------------ +------------- .. automodule:: paddle.v2.fluid.optimizer :members: AdamOptimizer :noindex: @@ -47,7 +47,7 @@ AdamaxOptimizer DecayedAdagradOptimizer ------------ +----------------------- .. automodule:: paddle.v2.fluid.optimizer :members: DecayedAdagradOptimizer :noindex: diff --git a/doc/api/v2/fluid/regularizer.rst b/doc/api/v2/fluid/regularizer.rst index 3af2b07d2ae55d99df705fbf1ad2402eee05c435..868e225ed3d59e79aeb217fb88081ea25f80fa2c 100644 --- a/doc/api/v2/fluid/regularizer.rst +++ b/doc/api/v2/fluid/regularizer.rst @@ -3,14 +3,14 @@ Regularizer =========== WeightDecayRegularizer ------------ +---------------------- .. automodule:: paddle.v2.fluid.regularizer :members: WeightDecayRegularizer :noindex: L2DecayRegularizer ------------ +------------------ .. automodule:: paddle.v2.fluid.regularizer :members: L2DecayRegularizer :noindex: @@ -18,7 +18,7 @@ L2DecayRegularizer L1DecayRegularizer ------------ +------------------- .. automodule:: paddle.v2.fluid.regularizer :members: L1DecayRegularizer diff --git a/doc/design/kernel_hint_design.md b/doc/design/kernel_hint_design.md new file mode 100644 index 0000000000000000000000000000000000000000..a54b7da045e1a362626ef066f9ebb56af2c3181a --- /dev/null +++ b/doc/design/kernel_hint_design.md @@ -0,0 +1,57 @@ +## Problem +In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this. + +In the current design, we use KernelType to describe one kernel. + +```cpp +struct KernelType { + Place place_; + DataType data_type_; + LayoutType layout_; +}; +``` + `place_` `data_type_` and `layout_` can be got from the input tensors of the operator, `GetActualKernelType(inputs)` use inputs to infer the proper kernel key that fit the incoming data, but users can not directly configure it. + +The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md) also provides a virtual method `GetExpectedKernelType` that user can overload and use to choose the KernelType they want to use. + +So we should send the information user defined in proto to `GetExpectedKernelType` for choosing a kernel. + +The problem is, how should we define and send the information for `GetExpectedKernelType` to use? + +## Solution + +### Potential choice +1. Do nothing, let the user add the information they want to operator‘s attribute and get them inside `GetExpectedKernelType`, this can work properly. But there is a little problem that users may define many kinds of hints for the same purpose, such as `force_cpu`, `use_cpu`, `cpu_kernel` to choose CPU kernel, and `use_cudnn`, `force_cudnn`, `cudnn_kernel` to choose CUDNN kernel. + +2. Pre-define all the needed option and use a single attr key such as `kernel_hint` for the user, this is not so flexible if the user wants to define some more kind of hint. + +### Final choice +To provide enough flexibility while avoiding confusion definition, we can define some global constants for these attribute names, such as `force_cpu`, `use_cudnn`, `use_mkldnn` for a user to choose. + +In C++ + +```cpp +const std::string kForceCPU = "force_cpu"; +const std::string kUseCUDNN = "use_cudnn"; +const std::string kUseMKLDNN = "use_mkldnn"; + +KernelType GetExpectedKernelType() { + if (Attr(kForceCPU)) { + return KernelType(CPUPlace, ...) + } else { + ... + } +} +``` + +In Python code + +```python +FORCE_CPU = core.kForceCPU() + +def xx_layer(..., force_cpu=false): + layer_helper = LayerHelper(...) + layer_helper.append_op( + type="xx", + attr={FORCE_CPU: force_cpu}) +``` diff --git a/doc/design/operator_kernel_type.md b/doc/design/operator_kernel_type.md new file mode 100644 index 0000000000000000000000000000000000000000..aa82e96bf79319f1a57e2ad58aa9826e57be6470 --- /dev/null +++ b/doc/design/operator_kernel_type.md @@ -0,0 +1,91 @@ +# Design Doc: The Keys of Operator Kernel Type +## Problem +An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique Kernel. Before an operator runs, an certain kernel must be chosen by a key of `OpKernelType`. Currently, `OpKernelType` is defined as follows: + +```cpp +struct OpKernelType { + platform::Place place_; + proto::DataType data_type_; +}; +``` +For more details, please refer to [codes](https://github.com/PaddlePaddle/Paddle/blob/2d5ec16bc8a09fb8e0f62c89b116b0cd1d333907/paddle/framework/operator.h#L348-L374) in github. + +It contains two keys, `Place` and `DataType`. And these two keys will be hashed to a unique key to represent a certain type of kernel. However, these two keys are not enough. We need a more complete representation of `OpKernelType`. + +We often implement a kernel of an operator with some computing library in certain device(place). Please remind that computing library and device are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices. + +For example, Eigen library can support Nvidia GPU/AMD GPU/CPU. And MKLDNN library can support Intel CPU/Intel FPGA. Both `Place` and `Library` should be a key of `OpKernelType`. + +It's obvious that different DataTypes, like fp64/fp32/int8 will have different kernels. But the data layout of a Tensor will also lead to different implementation. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209). Data Layout should also be taken into consideration. + +## Solution + +There are four keys to determine a kernel type of an operator: `Place`/`Library`/`DataType`/`Layout`. + +```cpp +struct OpKernelType { + platform::Place place_; + platform::Library library_; + proto::DataType data_type_; + framework::Layout layout_; +}; +``` + +Following is the details: + +### Place + +`Place` is defined as follows: + +```cpp +typedef boost::variant Place; +``` + +`Place` is to represent the device memory where data is locating. + + +### Library + +One operator kernel is usually implemented based on one library. `Library` is defined as a enum variable: + +```cpp +enum Library { Plain, MKLDNN, CUDNN }; +``` + +We use `Plain` enumerator to represent default library. Since most operators in Fluid are implemented based on `Eigen` library, we take `Eigen` library as the `Plain` enumerator. +A library usually has a corresponding `DeviceContext` which contains some handles needed by computation. Fluid now have two default DeviceContexts in CPU and CUDA, `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains a Eigen library handle and `CDUADeviceContext` contains a Eigen library handle and cuBLAS handle. + +If we want to support new Library, a new enumerator need to be added to `Library` and a new corresponding `LibraryDeviceContext` will be created. + + +### DataType + + +`DataType` is defined in [framework.proto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). Currently, int32/int64/fp32/fp64 are supported. + +### Layout + +Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also have to get some other descriptions of this block of memory, such as shape(ddim), stride, and layout. + +Different layout leads to different implementation of operator kernel. There are mainly 4 principles we have to follow to support layout in our fluid framework. + +- We take layout as a data member of Tensor. Layout is actually a enum variable. If fluid is built with MKLDNN, then, the memory format in MKLDNN will be added into this enum variable too. + +- Users have to set layout for input data. And some operators like fill_constant/random, also have to set layout of generating data. Of course, we can have some default layout, like NCHW. + +- The inference of Layout is at run-time, not compile-time. + +- Every operator have to implement different kernels for different layouts. Let's take MKLDNN as an example, if we want to implement a MKLDNN convolution operator, we have to realize all the kernels for different layout, list at [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to do registering kernels for MKLDNN operators. + +`Layout` is also defined as a enum variable: + +```cpp +enum Layout { + kNCHW, + kNHWC, +#ifdef PADDLE_WITH_MKLDNN + knChw8c + ... +#endif +}; +``` diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst index c875c807b8ab2e420dec189ef32d41533f58fa6d..41ac07ca5674d2c121baba77c58226ad328cd681 100644 --- a/doc/getstarted/build_and_install/build_from_source_cn.rst +++ b/doc/getstarted/build_and_install/build_from_source_cn.rst @@ -70,13 +70,13 @@ PaddlePaddle编译需要使用到下面的依赖(包含但不限于),其 :header: "依赖", "版本", "说明" :widths: 10, 15, 30 - "CMake", ">=3.5", "" + "CMake", ">=3.2", "" "GCC", "4.8.2", "推荐使用CentOS的devtools2" - "Python", "2.7.x", "依赖libpython2.7.so" - "pip", ">=9.0", "" - "numpy", "", "" + "Python", "2.7.x", "依赖libpython2.7.so" + "pip", ">=9.0", "" + "numpy", "", "" "SWIG", ">=2.0", "" - "Go", ">=1.8", "可选" + "Go", ">=1.8", "可选" .. _build_options: diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst index f194f84ce7c961bb8644d7c077a7c71730220ea2..92211aee8c3bc0ae6e1a38311d40ddf92117cac7 100644 --- a/doc/getstarted/build_and_install/build_from_source_en.rst +++ b/doc/getstarted/build_and_install/build_from_source_en.rst @@ -76,13 +76,13 @@ will be downloaded automatically. :header: "Dependency", "Version", "Description" :widths: 10, 15, 30 - "CMake", ">=3.5", "" + "CMake", ">=3.2", "" "GCC", "4.8.2", "Recommend devtools2 for CentOS" - "Python", "2.7.x", "Need libpython2.7.so" - "pip", ">=9.0", "" - "numpy", "", "" + "Python", "2.7.x", "Need libpython2.7.so" + "pip", ">=9.0", "" + "numpy", "", "" "SWIG", ">=2.0", "" - "Go", ">=1.8", "Optional" + "Go", ">=1.8", "Optional" .. _build_options: diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/getstarted/build_and_install/pip_install_cn.rst index b270e2c2f0b0cbfd6fb4b9b0750d207952f84d76..a4587f82a984acf243f49834e707fcd66d5b1252 100644 --- a/doc/getstarted/build_and_install/pip_install_cn.rst +++ b/doc/getstarted/build_and_install/pip_install_cn.rst @@ -37,11 +37,11 @@ PaddlePaddle可以使用常用的Python包管理工具 :header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API" :widths: 1, 3, 3, 3 - "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "暂无" - "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "暂无" + "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" .. _pip_dependency: diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/getstarted/build_and_install/pip_install_en.rst index 70f601a11c610e0a2b5dcc8b73d2c3ea19e195e1..55e31560a0f5087ab69966a6281c6c8573c04204 100644 --- a/doc/getstarted/build_and_install/pip_install_en.rst +++ b/doc/getstarted/build_and_install/pip_install_en.rst @@ -40,11 +40,11 @@ If the links below shows up the login form, just click "Log in as guest" to star :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API" :widths: 1, 3, 3, 3 - "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "Not Available" - "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "Not Available" + "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" .. _pip_dependency: diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index f1a577325f1b1ead6b1e08ee35a0ea30ce003bb6..222aee5974ca82c12a9c3b18f7fac4f96ee251bd 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -42,7 +42,7 @@ static std::unordered_set& CtrlFlowOps() { static inline std::unique_ptr CreateGradOp( const OperatorBase& op, const std::unordered_set& no_grad_set, std::unordered_map* grad_to_var) { - OpDescBind op_desc; + OpDesc op_desc; op_desc.SetInputMap(op.Inputs()); op_desc.SetOutputMap(op.Outputs()); op_desc.SetType(op.Type()); @@ -53,7 +53,7 @@ static inline std::unique_ptr CreateGradOp( grad_ops.reserve(grad_descs.size()); std::transform(grad_descs.begin(), grad_descs.end(), std::back_inserter(grad_ops), - [](const std::unique_ptr& grad_desc) { + [](const std::unique_ptr& grad_desc) { return OpRegistry::CreateOp(*grad_desc); }); PADDLE_ENFORCE(!grad_ops.empty()); @@ -217,7 +217,7 @@ static std::unique_ptr BackwardRecursive( // If part of input gradient of that operator is not calculated, fill // zero variables to that input gradient. net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}}, - {{"Y", {grad_input}}}, + {{"Out", {grad_input}}}, AttributeMap{})); } return false; @@ -296,7 +296,7 @@ static std::string FwdName(const std::string& grad_name) { static void CreateGradVarInBlock( size_t grad_op_start_index, const std::unordered_map& param_name_map, - BlockDescBind* block_desc, + BlockDesc* block_desc, std::unordered_map* grad_var_record) { auto ops = block_desc->AllOps(); for (size_t op_index = grad_op_start_index; op_index < ops.size(); @@ -350,12 +350,11 @@ static void CreateGradVarInBlock( } } -std::vector> MakeOpGrad( - const OpDescBind* op_desc, std::unordered_set* no_grad_vars, +std::vector> MakeOpGrad( + const OpDesc* op_desc, std::unordered_set* no_grad_vars, std::unordered_map* grad_to_var, - const std::vector& grad_block = - std::vector()) { - std::vector> grad_op_descs; + const std::vector& grad_block = std::vector()) { + std::vector> grad_op_descs; // All input gradients of forwarding operator do not need to calculate. const std::vector& inputs = op_desc->InputArgumentNames(); if (AllGradInSet(inputs, *no_grad_vars)) { @@ -386,7 +385,7 @@ std::vector> MakeOpGrad( .Get(op_desc->Type()) .GradOpMaker()(*op_desc, *no_grad_vars, grad_to_var, grad_block); - std::list> pending_fill_zeros_ops; + std::list> pending_fill_zeros_ops; for (auto& desc : grad_op_descs) { for (const std::string& in_name : desc->InputArgumentNames()) { if (no_grad_vars->count(in_name)) { @@ -394,9 +393,9 @@ std::vector> MakeOpGrad( 0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1); std::string new_name = prefix + kZeroVarSuffix; desc->Rename(in_name, new_name); - std::unique_ptr fill_zeros_op( - new OpDescBind("fill_zeros_like", {{"X", {prefix}}}, - {{"Y", {new_name}}}, AttributeMap{})); + std::unique_ptr fill_zeros_op( + new OpDesc("fill_zeros_like", {{"X", {prefix}}}, + {{"Out", {new_name}}}, AttributeMap{})); pending_fill_zeros_ops.push_back(std::move(fill_zeros_op)); } } @@ -408,34 +407,33 @@ std::vector> MakeOpGrad( return grad_op_descs; } -static BlockDescBind* CreateStepBlock( - ProgramDescBind& program_desc, - std::unordered_set* no_grad_vars, +static BlockDesc* CreateStepBlock( + ProgramDesc& program_desc, std::unordered_set* no_grad_vars, std::unordered_map* grad_to_var, int step_block_idx); -std::vector> MakeBlockBackward( - ProgramDescBind& program_desc, int block_idx, +std::vector> MakeBlockBackward( + ProgramDesc& program_desc, int block_idx, std::unordered_set* no_grad_vars, std::unordered_map* grad_to_var) { VLOG(5) << "MakeBlockBackward"; - BlockDescBind* cur_block = program_desc.MutableBlock(block_idx); - std::vector op_descs = cur_block->AllOps(); + BlockDesc* cur_block = program_desc.MutableBlock(block_idx); + std::vector op_descs = cur_block->AllOps(); std::unordered_map> dup_out_ops; size_t grad_desc_idx = 0; - std::vector> backward_descs; + std::vector> backward_descs; for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) { VLOG(5) << "Making backward " << (*it)->Type() << " op"; - std::vector> op_grads; + std::vector> op_grads; if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") { int step_block_idx = (*it)->GetBlockAttr("sub_block"); - BlockDescBind* backward_block = CreateStepBlock( - program_desc, no_grad_vars, grad_to_var, step_block_idx); + BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars, + grad_to_var, step_block_idx); op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block}); } else if ((*it)->Type() == "conditional_block") { - BlockDescBind* backward_block = + BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars, grad_to_var, (*it)->GetBlockAttr("sub_block")); op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block}); @@ -463,14 +461,14 @@ std::vector> MakeBlockBackward( } ++grad_desc_idx; } - std::transform( - op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs), - [](std::unique_ptr& ptr) { return std::move(ptr); }); + std::transform(op_grads.begin(), op_grads.end(), + std::back_inserter(backward_descs), + [](std::unique_ptr& ptr) { return std::move(ptr); }); } VLOG(5) << "Appending Sums"; // Check whether some variables are written more than once - std::list>> pending_sum_ops; + std::list>> pending_sum_ops; for (const auto& dup : dup_out_ops) { const std::string& out_name = dup.first; const std::vector dup_op = dup.second; @@ -486,18 +484,17 @@ std::vector> MakeBlockBackward( sum_op_inputs.emplace_back(new_name); next_g_name = sum_op_inputs.back(); } - std::unique_ptr sum_op( - new OpDescBind("sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, - AttributeMap{})); + std::unique_ptr sum_op(new OpDesc("sum", {{"X", sum_op_inputs}}, + {{"Out", {out_name}}}, + AttributeMap{})); pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)}); } } - pending_sum_ops.sort( - [](const std::pair>& a, - const std::pair>& b) { - return a.first > b.first; - }); + pending_sum_ops.sort([](const std::pair>& a, + const std::pair>& b) { + return a.first > b.first; + }); for (auto& p : pending_sum_ops) { backward_descs.insert(backward_descs.begin() + p.first + 1, std::move(p.second)); @@ -508,14 +505,13 @@ std::vector> MakeBlockBackward( return backward_descs; } -static BlockDescBind* CreateStepBlock( - ProgramDescBind& program_desc, - std::unordered_set* no_grad_vars, +static BlockDesc* CreateStepBlock( + ProgramDesc& program_desc, std::unordered_set* no_grad_vars, std::unordered_map* grad_to_var, int step_block_idx) { auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx, no_grad_vars, grad_to_var); - BlockDescBind* backward_block = + BlockDesc* backward_block = program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx)); for (auto& ptr : backward_block_op_descs) { backward_block->AppendAllocatedOp(move(ptr)); @@ -524,7 +520,7 @@ static BlockDescBind* CreateStepBlock( } ParamGradInfoMap AppendBackward( - ProgramDescBind& program_desc, const VarDescBind& target, + ProgramDesc& program_desc, const VarDesc& target, const std::unordered_set& no_grad_vars) { std::unordered_set no_grad_var_names; no_grad_var_names.reserve(no_grad_vars.size() + 1); @@ -541,11 +537,11 @@ ParamGradInfoMap AppendBackward( PADDLE_ENFORCE(is_scalar, "target should be scalar"); VLOG(3) << "backward from loss=" << target.Name() << " data_type=" << target.GetDataType(); - std::unique_ptr fill_one_op( - new OpDescBind("fill_constant", {}, {{"Out", {fill_one_op_out}}}, - {{"shape", std::vector{1}}, - {"value", static_cast(1.0)}, - {"dtype", target.GetDataType()}})); + std::unique_ptr fill_one_op( + new OpDesc("fill_constant", {}, {{"Out", {fill_one_op_out}}}, + {{"shape", std::vector{1}}, + {"value", static_cast(1.0)}, + {"dtype", target.GetDataType()}})); // infer var type of fill_one_op fill_one_op->InferVarType(root_block); diff --git a/paddle/framework/backward.h b/paddle/framework/backward.h index 96154fa82cb7a486aa4762ae633982ed6735220b..2d3b75fe6966cb5dad32dc185a3973e92b23e26e 100644 --- a/paddle/framework/backward.h +++ b/paddle/framework/backward.h @@ -49,7 +49,7 @@ using ParamGradInfoMap = std::unordered_map; ParamGradInfoMap AppendBackward( - ProgramDescBind& program_desc, const VarDescBind& target, + ProgramDesc& program_desc, const VarDesc& target, const std::unordered_set& no_grad_vars); } // namespace framework diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc index 1099fffab3129876b3fd3a93cbc4d8a5bd29bea1..0957646b5642cd9afce5d88b2c638679cb01f198 100644 --- a/paddle/framework/backward_test.cc +++ b/paddle/framework/backward_test.cc @@ -58,13 +58,13 @@ class RowWiseAddGradMaker : public SingleGradOpDescMaker { using SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto grad_op = new OpDescBind(); + std::unique_ptr Apply() const override { + auto grad_op = new OpDesc(); grad_op->SetInput(GradVarName("Out"), OutputGrad("Out")); grad_op->SetOutput(GradVarName("X"), InputGrad("X")); grad_op->SetOutput(GradVarName("b"), InputGrad("b")); grad_op->SetType("rowwise_add_grad"); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; @@ -159,7 +159,7 @@ class FillZeroOpMaker : public OpProtoAndCheckerMaker { FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "x"); - AddOutput("Y", "out"); + AddOutput("Out", "out"); AddComment(""); } }; @@ -190,11 +190,11 @@ class MinusGradOpDescMaker : public GradOpDescMakerBase { public: using GradOpDescMakerBase::GradOpDescMakerBase; - std::vector> operator()() const override { - std::vector> retv; + std::vector> operator()() const override { + std::vector> retv; auto x_g = InputGrad("X"); if (!x_g.empty()) { - auto *op_desc = new OpDescBind(); + auto *op_desc = new OpDesc(); op_desc->SetType("scale"); op_desc->SetInput("X", OutputGrad("Out")); op_desc->SetOutput("Out", x_g); @@ -204,7 +204,7 @@ class MinusGradOpDescMaker : public GradOpDescMakerBase { auto y_g = InputGrad("Y"); if (!y_g.empty()) { - auto *op_desc = new OpDescBind(); + auto *op_desc = new OpDesc(); op_desc->SetType("scale"); op_desc->SetInput("X", OutputGrad("Out")); op_desc->SetOutput("Out", y_g); @@ -430,8 +430,8 @@ TEST(Backward, op_part_of_output_are_not_need) { ASSERT_EQ("fill_zeros_like", fill_zero.Type()); ASSERT_EQ(1UL, fill_zero.Inputs("X").size()); ASSERT_EQ("Z", fill_zero.Input("X")); - ASSERT_EQ(1UL, fill_zero.Outputs("Y").size()); - ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Y")); + ASSERT_EQ(1UL, fill_zero.Outputs("Out").size()); + ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Out")); auto &d_many_out = *net->ops_[1]; ASSERT_EQ("many_output_op_grad", d_many_out.Type()); @@ -505,25 +505,25 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) { } TEST(Backward, simple_single_op) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); - f::OpDescBind *op = block->AppendOp(); + f::OpDesc *op = block->AppendOp(); op->SetType("rowwise_add"); op->SetInput("X", {"x"}); op->SetInput("b", {"b"}); op->SetOutput("Out", {"out"}); - auto target = f::VarDescBind("out"); + auto target = f::VarDesc("out"); target.SetShape({1}); auto var_to_grad = AppendBackward(program, target, std::unordered_set{}); ASSERT_EQ(block->AllOps().size(), 3UL); - f::OpDescBind *fill_op = block->AllOps()[1]; + f::OpDesc *fill_op = block->AllOps()[1]; EXPECT_EQ(fill_op->Type(), "fill_constant"); - f::OpDescBind *grad_op = block->AllOps()[2]; + f::OpDesc *grad_op = block->AllOps()[2]; EXPECT_EQ(grad_op->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op->InputNames().size(), 1UL); ASSERT_EQ(grad_op->OutputNames().size(), 2UL); @@ -543,16 +543,16 @@ TEST(Backward, simple_single_op) { } TEST(Backward, default_attribute) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); - f::OpDescBind *op = block->AppendOp(); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op = block->AppendOp(); op->SetType("mul"); op->SetInput("X", {"x"}); op->SetInput("Y", {"y"}); op->SetOutput("Out", {"out"}); op->CheckAttrs(); - auto target = f::VarDescBind("out"); + auto target = f::VarDesc("out"); target.SetShape({1}); AppendBackward(program, target, std::unordered_set{}); @@ -560,47 +560,47 @@ TEST(Backward, default_attribute) { EXPECT_EQ(boost::get(op->GetAttr("x_num_col_dims")), 1); EXPECT_EQ(boost::get(op->GetAttr("y_num_col_dims")), 1); - f::OpDescBind *fill_op = block->AllOps()[1]; + f::OpDesc *fill_op = block->AllOps()[1]; EXPECT_EQ(fill_op->Type(), "fill_constant"); - f::OpDescBind *grad_op = block->AllOps()[2]; + f::OpDesc *grad_op = block->AllOps()[2]; ASSERT_EQ(grad_op->Type(), "mul_grad"); EXPECT_EQ(boost::get(grad_op->GetAttr("x_num_col_dims")), 1); EXPECT_EQ(boost::get(grad_op->GetAttr("y_num_col_dims")), 1); } TEST(Backward, simple_mult_op) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); - f::OpDescBind *op1 = block->AppendOp(); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); op1->SetType("rowwise_add"); op1->SetInput("X", {"x1"}); op1->SetInput("b", {"b1"}); op1->SetOutput("Out", {"out1"}); - f::OpDescBind *op2 = block->AppendOp(); + f::OpDesc *op2 = block->AppendOp(); op2->SetType("mul"); op2->SetInput("X", {"out1"}); op2->SetInput("Y", {"y2"}); op2->SetOutput("Out", {"out2"}); - f::OpDescBind *op3 = block->AppendOp(); + f::OpDesc *op3 = block->AppendOp(); op3->SetType("rowwise_add"); op3->SetInput("X", {"out2"}); op3->SetInput("b", {"b3"}); op3->SetOutput("Out", {"out3"}); - auto target = f::VarDescBind("out3"); + auto target = f::VarDesc("out3"); target.SetShape({1}); size_t forward_len = block->AllOps().size(); auto var_to_grad = AppendBackward(program, target, std::unordered_set{}); ASSERT_EQ(block->AllOps().size(), 6UL + 1); - f::OpDescBind *fill_op = block->AllOps()[forward_len]; + f::OpDesc *fill_op = block->AllOps()[forward_len]; EXPECT_EQ(fill_op->Type(), "fill_constant"); - f::OpDescBind *grad_op1 = block->AllOps()[6]; + f::OpDesc *grad_op1 = block->AllOps()[6]; EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op1->InputNames().size(), 1UL); ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); @@ -611,7 +611,7 @@ TEST(Backward, simple_mult_op) { EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), std::vector({f::GradVarName("b1")})); - f::OpDescBind *grad_op2 = block->AllOps()[5]; + f::OpDesc *grad_op2 = block->AllOps()[5]; EXPECT_EQ(grad_op2->Type(), "mul_grad"); ASSERT_EQ(grad_op2->InputNames().size(), 4UL); ASSERT_EQ(grad_op2->OutputNames().size(), 2UL); @@ -625,7 +625,7 @@ TEST(Backward, simple_mult_op) { EXPECT_EQ(grad_op2->Output(f::GradVarName("Y")), std::vector({f::GradVarName("y2")})); - f::OpDescBind *grad_op3 = block->AllOps()[4]; + f::OpDesc *grad_op3 = block->AllOps()[4]; EXPECT_EQ(grad_op3->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op3->InputNames().size(), 1UL); ASSERT_EQ(grad_op3->OutputNames().size(), 2UL); @@ -655,42 +655,42 @@ TEST(Backward, simple_mult_op) { } TEST(Backward, intermedia_var_no_grad) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); - f::OpDescBind *op1 = block->AppendOp(); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); op1->SetType("rowwise_add"); op1->SetInput("X", {"x1"}); op1->SetInput("b", {"b1"}); op1->SetOutput("Out", {"out1"}); - f::OpDescBind *op2 = block->AppendOp(); + f::OpDesc *op2 = block->AppendOp(); op2->SetType("mul"); op2->SetInput("X", {"x2"}); op2->SetInput("Y", {"y2"}); op2->SetOutput("Out", {"out2"}); - f::OpDescBind *op3 = block->AppendOp(); + f::OpDesc *op3 = block->AppendOp(); op3->SetType("rowwise_add"); op3->SetInput("X", {"out2"}); op3->SetInput("b", {"b3"}); op3->SetOutput("Out", {"out3"}); - f::OpDescBind *op4 = block->AppendOp(); + f::OpDesc *op4 = block->AppendOp(); op4->SetType("mul"); op4->SetInput("X", {"out1"}); op4->SetInput("Y", {"out3"}); op4->SetOutput("Out", {"out4"}); - auto target = f::VarDescBind("out4"); + auto target = f::VarDesc("out4"); target.SetShape({1}); size_t forward_len = block->AllOps().size(); auto var_to_grad = AppendBackward(program, target, {"out3"}); ASSERT_EQ(block->AllOps().size(), 7UL); - f::OpDescBind *fill_op = block->AllOps()[forward_len]; + f::OpDesc *fill_op = block->AllOps()[forward_len]; EXPECT_EQ(fill_op->Type(), "fill_constant"); - f::OpDescBind *grad_op1 = block->AllOps()[6]; + f::OpDesc *grad_op1 = block->AllOps()[6]; EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op1->InputNames().size(), 1UL); ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); @@ -701,7 +701,7 @@ TEST(Backward, intermedia_var_no_grad) { EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), std::vector({f::GradVarName("b1")})); - f::OpDescBind *grad_op4 = block->AllOps()[5]; + f::OpDesc *grad_op4 = block->AllOps()[5]; EXPECT_EQ(grad_op4->Type(), "mul_grad"); ASSERT_EQ(grad_op4->InputNames().size(), 4UL); ASSERT_EQ(grad_op4->OutputNames().size(), 2UL); @@ -726,32 +726,32 @@ TEST(Backward, intermedia_var_no_grad) { } TEST(Backward, var_no_grad) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); - f::OpDescBind *op1 = block->AppendOp(); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); op1->SetType("mult_in_out"); op1->SetInput("X", {"x1"}); op1->SetInput("H", {"h1"}); op1->SetOutput("Y", {"y1"}); op1->SetOutput("Z", {"z1"}); - f::OpDescBind *op2 = block->AppendOp(); + f::OpDesc *op2 = block->AppendOp(); op2->SetType("mult_in_out"); op2->SetInput("X", {"y1"}); op2->SetInput("H", {"z1"}); op2->SetOutput("Y", {"y2"}); op2->SetOutput("Z", {"z2"}); - auto target = f::VarDescBind("z2"); + auto target = f::VarDesc("z2"); target.SetShape({1}); size_t forward_len = block->AllOps().size(); auto var_to_grad = AppendBackward(program, target, {"z1"}); ASSERT_EQ(block->AllOps().size(), 6UL); - f::OpDescBind *fill_op = block->AllOps()[forward_len]; + f::OpDesc *fill_op = block->AllOps()[forward_len]; EXPECT_EQ(fill_op->Type(), "fill_constant"); - f::OpDescBind *grad_op2 = block->AllOps()[3]; + f::OpDesc *grad_op2 = block->AllOps()[3]; ASSERT_EQ(grad_op2->Type(), "mult_in_out_grad"); ASSERT_EQ(grad_op2->InputNames().size(), 6UL); ASSERT_EQ(grad_op2->OutputNames().size(), 2UL); @@ -767,15 +767,15 @@ TEST(Backward, var_no_grad) { std::vector({f::GradVarName("y1")})); EXPECT_EQ(grad_op2->Output(f::GradVarName("H")), std::vector()); - f::OpDescBind *fill_zero_op = block->AllOps()[4]; + f::OpDesc *fill_zero_op = block->AllOps()[4]; ASSERT_EQ(fill_zero_op->Type(), "fill_zeros_like"); ASSERT_EQ(fill_zero_op->InputNames().size(), 1UL); ASSERT_EQ(fill_zero_op->OutputNames().size(), 1UL); EXPECT_EQ(fill_zero_op->Input("X"), std::vector({"z1"})); - EXPECT_EQ(fill_zero_op->Output("Y"), + EXPECT_EQ(fill_zero_op->Output("Out"), std::vector({std::string("z1") + f::kZeroVarSuffix})); - f::OpDescBind *grad_op1 = block->AllOps()[5]; + f::OpDesc *grad_op1 = block->AllOps()[5]; ASSERT_EQ(grad_op1->Type(), "mult_in_out_grad"); ASSERT_EQ(grad_op1->InputNames().size(), 6UL); ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); @@ -803,37 +803,37 @@ TEST(Backward, var_no_grad) { } TEST(Backward, shared_var) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); - f::OpDescBind *op1 = block->AppendOp(); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); op1->SetType("rowwise_add"); op1->SetInput("X", {"x1"}); op1->SetInput("b", {"b1"}); op1->SetOutput("Out", {"out1"}); - f::OpDescBind *op2 = block->AppendOp(); + f::OpDesc *op2 = block->AppendOp(); op2->SetType("mul"); op2->SetInput("X", {"out1"}); op2->SetInput("Y", {"y2"}); op2->SetOutput("Out", {"out2"}); - f::OpDescBind *op3 = block->AppendOp(); + f::OpDesc *op3 = block->AppendOp(); op3->SetType("rowwise_add"); op3->SetInput("X", {"out1"}); op3->SetInput("b", {"b3"}); op3->SetOutput("Out", {"out3"}); - auto target = f::VarDescBind("out3"); + auto target = f::VarDesc("out3"); target.SetShape({1}); size_t forward_len = block->AllOps().size(); auto var_to_grad = AppendBackward(program, target, std::unordered_set{}); ASSERT_EQ(block->AllOps().size(), 8UL); - f::OpDescBind *fill_op = block->AllOps()[forward_len]; + f::OpDesc *fill_op = block->AllOps()[forward_len]; EXPECT_EQ(fill_op->Type(), "fill_constant"); - f::OpDescBind *grad_op3 = block->AllOps()[4]; + f::OpDesc *grad_op3 = block->AllOps()[4]; ASSERT_EQ(grad_op3->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op3->InputNames().size(), 1UL); ASSERT_EQ(grad_op3->OutputNames().size(), 2UL); @@ -844,7 +844,7 @@ TEST(Backward, shared_var) { EXPECT_EQ(grad_op3->Output(f::GradVarName("b")), std::vector({f::GradVarName("b3")})); - f::OpDescBind *grad_op4 = block->AllOps()[5]; + f::OpDesc *grad_op4 = block->AllOps()[5]; ASSERT_EQ(grad_op4->Type(), "mul_grad"); ASSERT_EQ(grad_op4->InputNames().size(), 4UL); ASSERT_EQ(grad_op4->OutputNames().size(), 2UL); @@ -858,7 +858,7 @@ TEST(Backward, shared_var) { EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), std::vector({f::GradVarName("y2")})); - f::OpDescBind *sum_op = block->AllOps()[6]; + f::OpDesc *sum_op = block->AllOps()[6]; ASSERT_EQ(sum_op->Type(), "sum"); ASSERT_EQ(sum_op->InputNames().size(), 1UL); ASSERT_EQ(sum_op->OutputNames().size(), 1UL); @@ -868,7 +868,7 @@ TEST(Backward, shared_var) { EXPECT_EQ(sum_op->Output("Out"), std::vector({f::GradVarName("out1")})); - f::OpDescBind *grad_op1 = block->AllOps()[7]; + f::OpDesc *grad_op1 = block->AllOps()[7]; ASSERT_EQ(grad_op1->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op1->InputNames().size(), 1UL); ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); @@ -895,19 +895,19 @@ TEST(Backward, shared_var) { } TEST(Backward, half_backward) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); auto *op1 = block->AppendOp(); op1->SetType("minus"); op1->SetInput("X", {"a"}); op1->SetInput("Y", {"b"}); op1->SetOutput("Out", {"out"}); - auto target = f::VarDescBind("out"); + auto target = f::VarDesc("out"); target.SetShape({1}); size_t forward_len = block->AllOps().size(); auto var_to_grad = AppendBackward(program, target, {"b"}); - f::OpDescBind *fill_op = block->AllOps()[forward_len]; + f::OpDesc *fill_op = block->AllOps()[forward_len]; EXPECT_EQ(fill_op->Type(), "fill_constant"); auto ops = block->AllOps(); ASSERT_EQ(3UL, ops.size()); diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc index 6b961caebd3c0e2af732ce5ff8f1ae5ef2b042aa..0668b08ff7ab3c8ca4f1e989fc7af45a8ec5f63c 100644 --- a/paddle/framework/block_desc.cc +++ b/paddle/framework/block_desc.cc @@ -19,18 +19,18 @@ limitations under the License. */ namespace paddle { namespace framework { -VarDescBind *BlockDescBind::Var(const std::string &name) { +VarDesc *BlockDesc::Var(const std::string &name) { auto it = vars_.find(name); if (it != vars_.end()) { return it->second.get(); } need_update_ = true; - auto *var = new VarDescBind(name); + auto *var = new VarDesc(name); vars_[name].reset(var); return var; } -VarDescBind *BlockDescBind::FindVar(const std::string &name) const { +VarDesc *BlockDesc::FindVar(const std::string &name) const { auto it = vars_.find(name); if (it == vars_.end()) { return nullptr; @@ -38,11 +38,11 @@ VarDescBind *BlockDescBind::FindVar(const std::string &name) const { return it->second.get(); } -bool BlockDescBind::HasVar(const std::string &name) const { +bool BlockDesc::HasVar(const std::string &name) const { return vars_.find(name) != vars_.end(); } -VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const { +VarDesc *BlockDesc::FindVarRecursive(const std::string &name) const { if (name == kEmptyVarName) return nullptr; auto it = vars_.find(name); @@ -53,53 +53,67 @@ VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const { return it->second.get(); } -VarDescBind *BlockDescBind::FindRecursiveOrCreateVar( - const std::string &name_bytes) { - VarDescBind *res = FindVarRecursive(name_bytes); +VarDesc *BlockDesc::FindRecursiveOrCreateVar(const std::string &name_bytes) { + VarDesc *res = FindVarRecursive(name_bytes); if (res == nullptr) { res = Var(name_bytes); } return res; } -bool BlockDescBind::HasVarRecursive(const std::string &name) const { +bool BlockDesc::HasVarRecursive(const std::string &name) const { return FindVarRecursive(name) != nullptr; } -std::vector BlockDescBind::AllVars() const { - std::vector res; +std::vector BlockDesc::AllVars() const { + std::vector res; for (const auto &p : vars_) { res.push_back(p.second.get()); } return res; } -OpDescBind *BlockDescBind::AppendOp() { +OpDesc *BlockDesc::AppendOp() { need_update_ = true; - ops_.emplace_back(new OpDescBind()); + ops_.emplace_back(new OpDesc()); return ops_.back().get(); } -void BlockDescBind::AppendAllocatedOp(std::unique_ptr &&op_desc) { +void BlockDesc::AppendAllocatedOp(std::unique_ptr &&op_desc) { need_update_ = true; ops_.emplace_back(std::move(op_desc)); } -OpDescBind *BlockDescBind::PrependOp() { +OpDesc *BlockDesc::PrependOp() { need_update_ = true; - ops_.emplace_front(new OpDescBind()); + ops_.emplace_front(new OpDesc()); return ops_.front().get(); } -std::vector BlockDescBind::AllOps() const { - std::vector res; +void BlockDesc::RemoveOp(size_t s, size_t e) { + if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) { + return; + } + need_update_ = true; + for (auto it = ops_.begin() + s; it != ops_.begin() + e; it++) { + auto names = (*it)->InputArgumentNames(); + for (auto n : names) { + // TODO(typhoonzero): delete vars if no other op use it. + VLOG(3) << "deleting var " << n; + } + } + ops_.erase(ops_.begin() + s, ops_.begin() + e); +} + +std::vector BlockDesc::AllOps() const { + std::vector res; for (const auto &op : ops_) { res.push_back(op.get()); } return res; } -void BlockDescBind::Flush() { +void BlockDesc::Flush() { for (auto &op_desc : ops_) { op_desc->Flush(); } @@ -121,43 +135,43 @@ void BlockDescBind::Flush() { } } -BlockDescBind *BlockDescBind::ParentBlock() const { +BlockDesc *BlockDesc::ParentBlock() const { if (this->desc_->parent_idx() == kNoneBlockIndex) { return nullptr; } return prog_->MutableBlock(static_cast(this->desc_->parent_idx())); } -proto::BlockDesc *BlockDescBind::Proto() { +proto::BlockDesc *BlockDesc::Proto() { Flush(); return desc_; } -BlockDescBind::BlockDescBind(ProgramDescBind *prog, proto::BlockDesc *desc) +BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc) : prog_(prog), desc_(desc), need_update_(false) { for (const proto::VarDesc &var_desc : desc_->vars()) { - vars_[var_desc.name()].reset(new VarDescBind(var_desc)); + vars_[var_desc.name()].reset(new VarDesc(var_desc)); } for (const proto::OpDesc &op_desc : desc_->ops()) { - ops_.emplace_back(new OpDescBind(op_desc, prog)); + ops_.emplace_back(new OpDesc(op_desc, prog)); } } -BlockDescBind::BlockDescBind(const BlockDescBind &other, proto::BlockDesc *desc, - ProgramDescBind *prog) +BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, + ProgramDesc *prog) : prog_(prog), desc_(desc) { need_update_ = true; for (auto &op : other.ops_) { - ops_.emplace_back(new OpDescBind(*op)); + ops_.emplace_back(new OpDesc(*op)); } for (auto &it : other.vars_) { - auto *var = new VarDescBind(*it.second); + auto *var = new VarDesc(*it.second); vars_[it.first].reset(var); } } -void BlockDescBind::ClearPBOps() { +void BlockDesc::ClearPBOps() { auto ops = this->desc_->mutable_ops(); while (!ops->empty()) { // we do not own the OpDesc, so release the ownership. @@ -165,7 +179,7 @@ void BlockDescBind::ClearPBOps() { } } -void BlockDescBind::ClearPBVars() { +void BlockDesc::ClearPBVars() { auto vars = this->desc_->mutable_vars(); while (!vars->empty()) { // we do not own the VarDesc, so release the ownership. diff --git a/paddle/framework/block_desc.h b/paddle/framework/block_desc.h index 592fe49e075a947956c6e34f03068d4c34530a46..6c8c81b332d99e52db41018e117aa837be6745bc 100644 --- a/paddle/framework/block_desc.h +++ b/paddle/framework/block_desc.h @@ -28,20 +28,19 @@ limitations under the License. */ namespace paddle { namespace framework { -class ProgramDescBind; +class ProgramDesc; // Each Protobuf Message, we provide a XXXBind class. In that class, we optimize // read/write speed. Only when we want the protobuf message, the local changes // will be synchronized (by `Sync` method). -class BlockDescBind { +class BlockDesc { public: - BlockDescBind(ProgramDescBind *prog, proto::BlockDesc *desc); + BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc); - BlockDescBind(const BlockDescBind &other, proto::BlockDesc *desc, - ProgramDescBind *prog); + BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, ProgramDesc *prog); - ~BlockDescBind() { + ~BlockDesc() { this->ClearPBVars(); this->ClearPBOps(); } @@ -50,15 +49,15 @@ class BlockDescBind { int32_t Parent() const { return desc_->parent_idx(); } - VarDescBind *Var(const std::string &name_bytes); + VarDesc *Var(const std::string &name_bytes); - VarDescBind *FindVar(const std::string &name_bytes) const; + VarDesc *FindVar(const std::string &name_bytes) const; bool HasVar(const std::string &var_name) const; - VarDescBind *FindVarRecursive(const std::string &name_bytes) const; + VarDesc *FindVarRecursive(const std::string &name_bytes) const; - VarDescBind *FindRecursiveOrCreateVar(const std::string &name_bytes); + VarDesc *FindRecursiveOrCreateVar(const std::string &name_bytes); bool HasVarRecursive(const std::string &var_name) const; @@ -70,41 +69,43 @@ class BlockDescBind { return var_names; } - std::vector AllVars() const; + std::vector AllVars() const; - BlockDescBind *ParentBlock() const; + BlockDesc *ParentBlock() const; - OpDescBind *AppendOp(); + OpDesc *AppendOp(); - void AppendAllocatedOp(std::unique_ptr &&op_desc); + void AppendAllocatedOp(std::unique_ptr &&op_desc); - OpDescBind *PrependOp(); + OpDesc *PrependOp(); - std::vector AllOps() const; + void RemoveOp(size_t s, size_t e); + + std::vector AllOps() const; size_t OpSize() const { return ops_.size(); } - OpDescBind *Op(int idx) { return ops_.at(idx).get(); } + OpDesc *Op(int idx) { return ops_.at(idx).get(); } void Flush(); proto::BlockDesc *Proto(); - ProgramDescBind *Program() { return this->prog_; } + ProgramDesc *Program() { return this->prog_; } private: void ClearPBOps(); void ClearPBVars(); private: - ProgramDescBind *prog_; // not_own + ProgramDesc *prog_; // not_own proto::BlockDesc *desc_; // not_own bool need_update_; - std::deque> ops_; - std::unordered_map> vars_; + std::deque> ops_; + std::unordered_map> vars_; - DISABLE_COPY_AND_ASSIGN(BlockDescBind); + DISABLE_COPY_AND_ASSIGN(BlockDesc); }; } // namespace framework } // namespace paddle diff --git a/paddle/framework/data_layout.h b/paddle/framework/data_layout.h new file mode 100644 index 0000000000000000000000000000000000000000..7429de7ee39297c26360984809e2451100f7b3ff --- /dev/null +++ b/paddle/framework/data_layout.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +namespace paddle { +namespace framework { + +enum DataLayout { + kNHWC = 0, + kNCHW = 1, + kAnyLayout = 2, +}; + +inline DataLayout StringToDataLayout(const std::string& str) { + if (str == "NHWC" || str == "nhwc") { + return DataLayout::kNHWC; + } else if (str == "NCHW" || str == "nchw") { + return DataLayout::kNCHW; + } else { + PADDLE_THROW("Unknown storage order string: %s", str); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/details/op_registry.h b/paddle/framework/details/op_registry.h index 435f0b6b78b19dc433f0b761d9ac800565684b7c..7f5151c41d6046f21f7a9707e45de85ec50219ad 100644 --- a/paddle/framework/details/op_registry.h +++ b/paddle/framework/details/op_registry.h @@ -106,10 +106,10 @@ template struct OpInfoFiller { void operator()(const char* op_type, OpInfo* info) const { info->grad_op_maker_ = []( - const OpDescBind& fwd_op, + const OpDesc& fwd_op, const std::unordered_set& no_grad_set, std::unordered_map* grad_to_var, - const std::vector& grad_block) { + const std::vector& grad_block) { T maker(fwd_op, no_grad_set, grad_to_var, grad_block); return maker(); }; @@ -119,7 +119,7 @@ struct OpInfoFiller { template struct OpInfoFiller { void operator()(const char* op_type, OpInfo* info) const { - info->infer_var_type_ = [](const OpDescBind& fwd_op, BlockDescBind* block) { + info->infer_var_type_ = [](const OpDesc& fwd_op, BlockDesc* block) { T inference; inference(fwd_op, block); }; diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index ea6b259c09012ab1e5576cedeac54e46d21e40dc..14ae37ec49c12203381e74b3f9174a460e41c18e 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -64,8 +64,8 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { } } -void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id, - bool create_local_scope) { +void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, + bool create_local_scope, bool create_vars) { // TODO(tonyyang-svail): // - only runs on the first device (i.e. no interdevice communication) // - will change to use multiple blocks for RNN op and Cond Op @@ -74,33 +74,35 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id, auto& device = device_contexts_[0]; Scope* local_scope = scope; - if (create_local_scope) { - local_scope = &scope->NewScope(); - for (auto& var : block.AllVars()) { - if (var->Name() == framework::kEmptyVarName) { - continue; + if (create_vars) { + if (create_local_scope) { + local_scope = &scope->NewScope(); + for (auto& var : block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var->Persistable()) { + auto* ptr = scope->Var(var->Name()); + CreateTensor(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " global, which pointer is " << ptr; + } else { + auto* ptr = local_scope->Var(var->Name()); + CreateTensor(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " locally, which pointer is " << ptr; + } } - - if (var->Persistable()) { - auto* ptr = scope->Var(var->Name()); - CreateTensor(ptr, var->GetType()); - VLOG(3) << "Create Variable " << var->Name() - << " global, which pointer is " << ptr; - } else { + } else { + for (auto& var : block.AllVars()) { auto* ptr = local_scope->Var(var->Name()); CreateTensor(ptr, var->GetType()); - VLOG(3) << "Create Variable " << var->Name() - << " locally, which pointer is " << ptr; + VLOG(3) << "Create variable " << var->Name() << ", which pointer is " + << ptr; } - } - } else { - for (auto& var : block.AllVars()) { - auto* ptr = local_scope->Var(var->Name()); - CreateTensor(ptr, var->GetType()); - VLOG(3) << "Create variable " << var->Name() << ", which pointer is " - << ptr; - } - } + } // if (create_local_scope) + } // if (create_vars) for (auto& op_desc : block.AllOps()) { auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h index 073e04729b1166f1cabd16709d161fda0d580f1c..a3d1609293a0d687c33447ca7a0df95c6aac3bc5 100644 --- a/paddle/framework/executor.h +++ b/paddle/framework/executor.h @@ -40,6 +40,16 @@ class DeviceContextPool { return *pool; } + const platform::DeviceContext* Borrow(const platform::Place& place) { + auto range = device_contexts_.equal_range(place); + if (range.first == range.second) { + PADDLE_THROW( + "'Place' is not supported, Please re-compile with WITH_GPU " + "option"); + } + return range.first->second; + } + std::vector Borrow( const std::vector& places) { PADDLE_ENFORCE_GT(places.size(), 0); @@ -114,7 +124,8 @@ class Executor { * ProgramDesc * Scope */ - void Run(const ProgramDescBind&, Scope*, int, bool create_local_scope = true); + void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true, + bool create_vars = true); private: std::vector device_contexts_; diff --git a/paddle/framework/grad_op_desc_maker.h b/paddle/framework/grad_op_desc_maker.h index 998186e33915a11f2864eb5387d19ed1bfbab51c..cf411fa710103350713342b43946697a8dd2aa46 100644 --- a/paddle/framework/grad_op_desc_maker.h +++ b/paddle/framework/grad_op_desc_maker.h @@ -22,21 +22,27 @@ namespace paddle { namespace framework { +/* + This functor class is responsible for creating the gradient ops for the given + operator fwd_op. After it is called (through operator()), the pairs of + (gradient variable, corresponding input variable of fwd_op) will be added to + grad_to_var. If an input variable of fwd_op is contained in no_grad_set, its + gradient varialbe will be ignored or kEmptyVarName depending on the template + argument DropEmptyIG in the derived classes. + */ class GradOpDescMakerBase { public: explicit GradOpDescMakerBase( - const OpDescBind& fwd_op, - const std::unordered_set& no_grad_set, + const OpDesc& fwd_op, const std::unordered_set& no_grad_set, std::unordered_map* grad_to_var, - const std::vector& grad_block = - std::vector()) + const std::vector& grad_block = std::vector()) : fwd_op_(fwd_op), no_grad_set_(no_grad_set), grad_to_var_(grad_to_var), grad_block_(grad_block) {} virtual ~GradOpDescMakerBase() = default; - virtual std::vector> operator()() const = 0; + virtual std::vector> operator()() const = 0; protected: std::vector InputGrad(const std::string& name, @@ -58,6 +64,16 @@ class GradOpDescMakerBase { if (!drop_empty_grad) { return ret_val; } + PADDLE_ENFORCE_LE(var_names.size(), 1UL, + "BUG from operator developer:" + " for input argument with a list of variables, " + " drop_empty_grad is not allowed because it makes" + " the correspondence bewteen a variable and its gradient" + " ambiguous. Use REGISTER_OP_EX to register the op" + " or call InputGrad(?,false) in GradOpDescMaker." + " Op type %s", + fwd_op_.Type()); + std::vector dropped_ret_val; dropped_ret_val.reserve(ret_val.size()); std::copy_if(ret_val.begin(), ret_val.end(), @@ -105,26 +121,26 @@ class GradOpDescMakerBase { std::string ForwardOpType() const { return this->fwd_op_.Type(); } private: - const OpDescBind& fwd_op_; + const OpDesc& fwd_op_; const std::unordered_set& no_grad_set_; std::unordered_map* grad_to_var_; protected: - std::vector grad_block_; + std::vector grad_block_; }; class SingleGradOpDescMaker : public GradOpDescMakerBase { public: using GradOpDescMakerBase::GradOpDescMakerBase; - std::vector> operator()() const { - std::vector> retv; + std::vector> operator()() const { + std::vector> retv; retv.emplace_back(this->Apply()); return retv; } protected: - virtual std::unique_ptr Apply() const = 0; + virtual std::unique_ptr Apply() const = 0; }; template @@ -133,8 +149,8 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker { using SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - virtual std::unique_ptr Apply() const { - auto* grad = new OpDescBind(); + virtual std::unique_ptr Apply() const { + auto* grad = new OpDesc(); grad->SetType(this->GradOpType()); for (auto& input_param : this->InputNames()) { @@ -150,7 +166,7 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker { grad->SetAttrMap(this->Attrs()); - return std::unique_ptr(grad); + return std::unique_ptr(grad); } virtual std::string GradOpType() const { @@ -161,7 +177,7 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker { class EmptyGradOpMaker : public GradOpDescMakerBase { public: using GradOpDescMakerBase::GradOpDescMakerBase; - std::vector> operator()() const override { + std::vector> operator()() const override { return {}; } }; diff --git a/paddle/framework/library_type.h b/paddle/framework/library_type.h new file mode 100644 index 0000000000000000000000000000000000000000..68e9cabb667a5b2421fad8333d8e1be7bfa57002 --- /dev/null +++ b/paddle/framework/library_type.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +namespace paddle { +namespace framework { + +// For more details about the design of LibraryType, Please refer to +// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md#library + +enum LibraryType { kPlain = 0; kMKLDNN = 1; kCUDNN = 2; } + +} // namespace +} // framework diff --git a/paddle/framework/lod_rank_table.cc b/paddle/framework/lod_rank_table.cc index 1c2fba70c8ab0827ba6d1563f08cd0820650822e..17d524c09276fc0eb166925bd79bc0bdfcead195 100644 --- a/paddle/framework/lod_rank_table.cc +++ b/paddle/framework/lod_rank_table.cc @@ -46,4 +46,13 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) { } } // namespace framework + +std::ostream& operator<<(std::ostream& out, + const framework::LoDRankTable& table) { + out << "NumOfSequence " << table.items().size() << "\n"; + for (auto& each_item : table.items()) { + out << "\tSeq #" << each_item.index << ", Len=" << each_item.length << "\n"; + } + return out; +} } // namespace paddle diff --git a/paddle/framework/lod_rank_table.h b/paddle/framework/lod_rank_table.h index 9faa3a4d7bdc55ab7b24e31f5e5434dacc0a4b36..d3007d3d7379a59b32465cbd55780c6268e0e4a8 100644 --- a/paddle/framework/lod_rank_table.h +++ b/paddle/framework/lod_rank_table.h @@ -13,6 +13,7 @@ limitations under the License. */ #pragma once +#include #include "paddle/framework/lod_tensor.h" namespace paddle { @@ -52,4 +53,8 @@ class LoDRankTable { }; } // namespace framework + +std::ostream& operator<<(std::ostream& out, + const framework::LoDRankTable& table); + } // namespace paddle diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h index 9411c96aea4c10ebf921cc3e3b442769c8acbefa..0923c52a0ad2fe10cea760df20c99021984ad39d 100644 --- a/paddle/framework/lod_tensor.h +++ b/paddle/framework/lod_tensor.h @@ -184,6 +184,18 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level, return tensor; } +// Get the absolute offset of a lod[start_level][start_idx:end_idx] and +// relative length of details for every levels(i.e., [start_level: ]). +// +// For example, +// lod = [[0, 3, 4, 8], [0, 9, 10, 11, 13, 17, 19, 22, 24]] +// start_level = 0 +// start_idx = 1 +// end_idx = 3 +// +// Returns: +// LoD = [[1, 4], [2, 4, 2, 3, 2]] +// pair = {11, 24} std::pair> GetSubLoDAndAbsoluteOffset( const LoD& lod, size_t start_idx, size_t end_idx, size_t start_level); diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index 7af5b687273d846628e9e7e2c07107a50d47acb3..b361e64438251c1df827667fb825e7f5909fb09e 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -25,12 +25,11 @@ limitations under the License. */ namespace paddle { namespace framework { -class OpDescBind; -class BlockDescBind; +class OpDesc; +class BlockDesc; class CompileTimeInferShapeContext : public InferShapeContext { public: - CompileTimeInferShapeContext(const OpDescBind &op, - const BlockDescBind &block); + CompileTimeInferShapeContext(const OpDesc &op, const BlockDesc &block); bool HasInput(const std::string &name) const override; @@ -76,13 +75,12 @@ class CompileTimeInferShapeContext : public InferShapeContext { void SetDim(const std::string &name, const DDim &dim) override; - const OpDescBind &op_; - const BlockDescBind &block_; + const OpDesc &op_; + const BlockDesc &block_; }; -OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs) { +OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs) { desc_.set_type(type); inputs_ = inputs; outputs_ = outputs; @@ -90,7 +88,7 @@ OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs, need_update_ = true; } -OpDescBind::OpDescBind(const proto::OpDesc &desc, ProgramDescBind *prog) +OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog) : desc_(desc), need_update_(false) { // restore inputs_ int input_size = desc_.inputs_size(); @@ -126,20 +124,19 @@ OpDescBind::OpDescBind(const proto::OpDesc &desc, ProgramDescBind *prog) } } -proto::OpDesc *OpDescBind::Proto() { +proto::OpDesc *OpDesc::Proto() { Flush(); return &desc_; } -const std::vector &OpDescBind::Input( - const std::string &name) const { +const std::vector &OpDesc::Input(const std::string &name) const { auto it = inputs_.find(name); PADDLE_ENFORCE(it != inputs_.end(), "Input %s cannot be found in Op %s", name, Type()); return it->second; } -std::vector OpDescBind::InputArgumentNames() const { +std::vector OpDesc::InputArgumentNames() const { std::vector retv; for (auto &ipt : this->inputs_) { retv.insert(retv.end(), ipt.second.begin(), ipt.second.end()); @@ -147,21 +144,20 @@ std::vector OpDescBind::InputArgumentNames() const { return retv; } -void OpDescBind::SetInput(const std::string ¶m_name, - const std::vector &args) { +void OpDesc::SetInput(const std::string ¶m_name, + const std::vector &args) { need_update_ = true; inputs_[param_name] = args; } -const std::vector &OpDescBind::Output( - const std::string &name) const { +const std::vector &OpDesc::Output(const std::string &name) const { auto it = outputs_.find(name); PADDLE_ENFORCE(it != outputs_.end(), "Output %s cannot be found in Op %s", name, Type()); return it->second; } -std::vector OpDescBind::OutputArgumentNames() const { +std::vector OpDesc::OutputArgumentNames() const { std::vector retv; for (auto &ipt : this->outputs_) { retv.insert(retv.end(), ipt.second.begin(), ipt.second.end()); @@ -169,19 +165,19 @@ std::vector OpDescBind::OutputArgumentNames() const { return retv; } -void OpDescBind::SetOutput(const std::string ¶m_name, - const std::vector &args) { +void OpDesc::SetOutput(const std::string ¶m_name, + const std::vector &args) { need_update_ = true; this->outputs_[param_name] = args; } -proto::AttrType OpDescBind::GetAttrType(const std::string &name) const { +proto::AttrType OpDesc::GetAttrType(const std::string &name) const { auto it = attrs_.find(name); PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); return static_cast(it->second.which() - 1); } -std::vector OpDescBind::AttrNames() const { +std::vector OpDesc::AttrNames() const { std::vector retv; retv.reserve(attrs_.size()); for (auto &attr : attrs_) { @@ -190,41 +186,39 @@ std::vector OpDescBind::AttrNames() const { return retv; } -void OpDescBind::SetAttr(const std::string &name, const Attribute &v) { +void OpDesc::SetAttr(const std::string &name, const Attribute &v) { this->attrs_[name] = v; need_update_ = true; } -void OpDescBind::SetBlockAttr(const std::string &name, BlockDescBind &block) { +void OpDesc::SetBlockAttr(const std::string &name, BlockDesc &block) { this->attrs_[name] = █ need_update_ = true; } -void OpDescBind::SetAttrMap( +void OpDesc::SetAttrMap( const std::unordered_map &attr_map) { attrs_ = attr_map; need_update_ = true; } -Attribute OpDescBind::GetAttr(const std::string &name) const { +Attribute OpDesc::GetAttr(const std::string &name) const { auto it = attrs_.find(name); PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); return it->second; } -int OpDescBind::GetBlockAttr(const std::string &name) const { +int OpDesc::GetBlockAttr(const std::string &name) const { auto it = attrs_.find(name); PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); - return boost::get(it->second)->ID(); + return boost::get(it->second)->ID(); } -const std::unordered_map &OpDescBind::GetAttrMap() - const { +const std::unordered_map &OpDesc::GetAttrMap() const { return attrs_; } -void OpDescBind::Rename(const std::string &old_name, - const std::string &new_name) { +void OpDesc::Rename(const std::string &old_name, const std::string &new_name) { for (auto &input : inputs_) { std::replace(input.second.begin(), input.second.end(), old_name, new_name); } @@ -235,8 +229,8 @@ void OpDescBind::Rename(const std::string &old_name, need_update_ = true; } -void OpDescBind::RenameOutput(const std::string &old_name, - const std::string &new_name) { +void OpDesc::RenameOutput(const std::string &old_name, + const std::string &new_name) { for (auto &output : outputs_) { std::replace(output.second.begin(), output.second.end(), old_name, new_name); @@ -244,8 +238,8 @@ void OpDescBind::RenameOutput(const std::string &old_name, need_update_ = true; } -void OpDescBind::RenameInput(const std::string &old_name, - const std::string &new_name) { +void OpDesc::RenameInput(const std::string &old_name, + const std::string &new_name) { for (auto &input : inputs_) { std::replace(input.second.begin(), input.second.end(), old_name, new_name); } @@ -278,7 +272,7 @@ struct SetAttrDescVisitor : public boost::static_visitor { void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } }; -void OpDescBind::Flush() { +void OpDesc::Flush() { if (need_update_) { this->desc_.mutable_inputs()->Clear(); for (auto &ipt : inputs_) { @@ -330,7 +324,7 @@ static void InitInferShapeFuncs() { }); } -void OpDescBind::CheckAttrs() { +void OpDesc::CheckAttrs() { PADDLE_ENFORCE(!Type().empty(), "CheckAttr() can not be called before type is setted."); auto *checker = OpInfoMap::Instance().Get(Type()).Checker(); @@ -342,7 +336,7 @@ void OpDescBind::CheckAttrs() { checker->Check(attrs_); } -void OpDescBind::InferShape(const BlockDescBind &block) const { +void OpDesc::InferShape(const BlockDesc &block) const { VLOG(3) << "CompileTime infer shape on " << Type(); InitInferShapeFuncs(); auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_; @@ -365,7 +359,7 @@ void OpDescBind::InferShape(const BlockDescBind &block) const { infer_shape(&ctx); } -void OpDescBind::InferVarType(BlockDescBind *block) const { +void OpDesc::InferVarType(BlockDesc *block) const { auto &info = OpInfoMap::Instance().Get(this->Type()); if (info.infer_var_type_) { info.infer_var_type_(*this, block); @@ -384,7 +378,7 @@ void OpDescBind::InferVarType(BlockDescBind *block) const { } CompileTimeInferShapeContext::CompileTimeInferShapeContext( - const OpDescBind &op, const BlockDescBind &block) + const OpDesc &op, const BlockDesc &block) : op_(op), block_(block) {} bool CompileTimeInferShapeContext::HasInput(const std::string &name) const { diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h index 0f0f126f9859e729aaf83f8bd13c60f551f0c1d5..93d4a88f3c390551ab41e42ec2f6f30f52e306db 100644 --- a/paddle/framework/op_desc.h +++ b/paddle/framework/op_desc.h @@ -23,17 +23,17 @@ limitations under the License. */ namespace paddle { namespace framework { -class BlockDescBind; -class ProgramDescBind; +class BlockDesc; +class ProgramDesc; -class OpDescBind { +class OpDesc { public: - OpDescBind() {} + OpDesc() {} - OpDescBind(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs); + OpDesc(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs); - OpDescBind(const proto::OpDesc &desc, ProgramDescBind *prog); + OpDesc(const proto::OpDesc &desc, ProgramDesc *prog); proto::OpDesc *Proto(); @@ -65,7 +65,7 @@ class OpDescBind { void SetAttr(const std::string &name, const Attribute &v); - void SetBlockAttr(const std::string &name, BlockDescBind &block); + void SetBlockAttr(const std::string &name, BlockDesc &block); Attribute GetAttr(const std::string &name) const; @@ -107,9 +107,9 @@ class OpDescBind { void CheckAttrs(); - void InferShape(const BlockDescBind &block) const; + void InferShape(const BlockDesc &block) const; - void InferVarType(BlockDescBind *block) const; + void InferVarType(BlockDesc *block) const; void MarkAsTarget() { desc_.set_is_target(true); } @@ -127,7 +127,9 @@ class OpDescBind { } proto::OpDesc desc_; + // input arg name => output variable names VariableNameMap inputs_; + // output arg name => output variable names VariableNameMap outputs_; AttributeMap attrs_; diff --git a/paddle/framework/op_registry.cc b/paddle/framework/op_registry.cc index f202c0b27a7d4d7e7d8fe7b578d2d98bfa978f9b..dfa151316daeccfe92e26818165a694b78b5df62 100644 --- a/paddle/framework/op_registry.cc +++ b/paddle/framework/op_registry.cc @@ -47,7 +47,7 @@ static VariableNameMap ConvertOpDescVarsToVarNameMap( std::unique_ptr OpRegistry::CreateOp( const proto::OpDesc& op_desc) { VLOG(1) << "CreateOp directly from OpDesc is deprecated. It should only be" - "used in unit tests. Use CreateOp(const OpDescBind& op_desc) " + "used in unit tests. Use CreateOp(const OpDesc& op_desc) " "instead."; VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs()); VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs()); @@ -59,7 +59,7 @@ std::unique_ptr OpRegistry::CreateOp( return CreateOp(op_desc.type(), inputs, outputs, attrs); } -std::unique_ptr OpRegistry::CreateOp(const OpDescBind& op_desc) { +std::unique_ptr OpRegistry::CreateOp(const OpDesc& op_desc) { return CreateOp(op_desc.Type(), op_desc.Inputs(), op_desc.Outputs(), op_desc.GetAttrMap()); } diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 7367e0e637a6d308043f35de628913d060fb379c..7f0155b61f44b676825b84667d5bebb798cae8a3 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -79,7 +79,7 @@ class OpRegistry { static std::unique_ptr CreateOp(const proto::OpDesc& op_desc); - static std::unique_ptr CreateOp(const OpDescBind& op_desc); + static std::unique_ptr CreateOp(const OpDesc& op_desc); }; template @@ -126,6 +126,14 @@ class OpKernelRegistrar : public Registrar { __test_global_namespace_##uniq_name##__>::value, \ msg) +/* + The variadic arguments should be class types derived from one of the + following classes: + OpProtoAndCheckerMaker + GradOpDescMakerBase + VarTypeInference + InferShapeBase +*/ #define REGISTER_OPERATOR(op_type, op_class, ...) \ STATIC_ASSERT_GLOBAL_NAMESPACE( \ __reg_op__##op_type, \ @@ -144,20 +152,29 @@ class OpKernelRegistrar : public Registrar { } /** - * Macro to register Operator. + * Macro to register Operator. When the input is duplicable, you should + * use REGISTER_OP_EX with deop_empty_grad=false instead. */ -#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \ - grad_op_class) \ - REGISTER_OPERATOR(grad_op_type, grad_op_class); \ - class _GradOpDescMaker_##grad_op_type##_ \ - : public ::paddle::framework::DefaultGradOpDescMaker { \ - using ::paddle::framework::DefaultGradOpDescMaker< \ - true>::DefaultGradOpDescMaker; \ - \ - protected: \ - virtual std::string GradOpType() const { return #grad_op_type; } \ - }; \ - REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \ +#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \ + grad_op_class) \ + REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \ + grad_op_class, true) + +// When an argument is duplicable, we need to use this version. +// Perhaps we can omit DropEmptyIG template parameter and +// only have one version of REGISTER_OP. +#define REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \ + grad_op_class, drop_empty_grad) \ + REGISTER_OPERATOR(grad_op_type, grad_op_class); \ + class _GradOpDescMaker_##grad_op_type##_ \ + : public ::paddle::framework::DefaultGradOpDescMaker { \ + using ::paddle::framework::DefaultGradOpDescMaker< \ + drop_empty_grad>::DefaultGradOpDescMaker; \ + \ + protected: \ + virtual std::string GradOpType() const { return #grad_op_type; } \ + }; \ + REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \ op_maker_class); #define REGISTER_OP_WITH_KERNEL(op_type, ...) \ diff --git a/paddle/framework/program_desc.cc b/paddle/framework/program_desc.cc index 30a265ccac1d4287d5cb0d83386860425a1af4c0..b5d9e5e385c1ba57169ef885824fc23b0f130692 100644 --- a/paddle/framework/program_desc.cc +++ b/paddle/framework/program_desc.cc @@ -18,49 +18,49 @@ limitations under the License. */ namespace paddle { namespace framework { -BlockDescBind *ProgramDescBind::AppendBlock(const BlockDescBind &parent) { +BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) { auto *b = desc_.add_blocks(); b->set_parent_idx(parent.ID()); b->set_idx(desc_.blocks_size() - 1); - blocks_.emplace_back(new BlockDescBind(this, b)); + blocks_.emplace_back(new BlockDesc(this, b)); return blocks_.back().get(); } -proto::ProgramDesc *ProgramDescBind::Proto() { +proto::ProgramDesc *ProgramDesc::Proto() { for (auto &block : blocks_) { block->Flush(); } return &desc_; } -ProgramDescBind::ProgramDescBind() { +ProgramDesc::ProgramDesc() { auto *block = desc_.mutable_blocks()->Add(); block->set_idx(kRootBlockIndex); block->set_parent_idx(kNoneBlockIndex); - blocks_.emplace_back(new BlockDescBind(this, block)); + blocks_.emplace_back(new BlockDesc(this, block)); } -ProgramDescBind::ProgramDescBind(const ProgramDescBind &o) { +ProgramDesc::ProgramDesc(const ProgramDesc &o) { desc_ = o.desc_; for (int i = 0; i < desc_.blocks_size(); ++i) { auto *block = desc_.mutable_blocks(i); - blocks_.emplace_back(new BlockDescBind(*o.blocks_[i], block, this)); + blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this)); } } -ProgramDescBind::ProgramDescBind(const proto::ProgramDesc &desc) { +ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) { desc_ = desc; for (auto &block_desc : *desc_.mutable_blocks()) { - blocks_.emplace_back(new BlockDescBind(this, &block_desc)); + blocks_.emplace_back(new BlockDesc(this, &block_desc)); } } -ProgramDescBind::ProgramDescBind(const std::string &binary_str) { +ProgramDesc::ProgramDesc(const std::string &binary_str) { PADDLE_ENFORCE(desc_.ParseFromString(binary_str), "Fail to parse program_desc from binary string."); for (auto &block_desc : *desc_.mutable_blocks()) { - blocks_.emplace_back(new BlockDescBind(this, &block_desc)); + blocks_.emplace_back(new BlockDesc(this, &block_desc)); } } diff --git a/paddle/framework/program_desc.h b/paddle/framework/program_desc.h index affec491ca598a66129f224a35cbf6019859738c..15a962bb696d6172acd1a83cf9bb1ffd0846d449 100644 --- a/paddle/framework/program_desc.h +++ b/paddle/framework/program_desc.h @@ -23,23 +23,23 @@ limitations under the License. */ namespace paddle { namespace framework { -class BlockDescBind; +class BlockDesc; -class ProgramDescBind { +class ProgramDesc { public: - ProgramDescBind(); + ProgramDesc(); - explicit ProgramDescBind(const proto::ProgramDesc &desc); + explicit ProgramDesc(const proto::ProgramDesc &desc); - ProgramDescBind(const ProgramDescBind &o); + ProgramDesc(const ProgramDesc &o); - explicit ProgramDescBind(const std::string &binary_str); + explicit ProgramDesc(const std::string &binary_str); - BlockDescBind *AppendBlock(const BlockDescBind &parent); + BlockDesc *AppendBlock(const BlockDesc &parent); - BlockDescBind *MutableBlock(size_t idx) { return blocks_[idx].get(); } + BlockDesc *MutableBlock(size_t idx) { return blocks_[idx].get(); } - const BlockDescBind &Block(size_t idx) const { return *blocks_[idx]; } + const BlockDesc &Block(size_t idx) const { return *blocks_[idx]; } size_t Size() const { return blocks_.size(); } @@ -48,7 +48,7 @@ class ProgramDescBind { private: proto::ProgramDesc desc_; - std::vector> blocks_; + std::vector> blocks_; }; } // namespace framework } // namespace paddle diff --git a/paddle/framework/program_desc_test.cc b/paddle/framework/program_desc_test.cc index c4fb28f2cc9bd35439690b1a97692bd13671d3e1..a49886f7ea56bc57459202dba65e3f76a902cd70 100644 --- a/paddle/framework/program_desc_test.cc +++ b/paddle/framework/program_desc_test.cc @@ -19,7 +19,7 @@ namespace paddle { namespace framework { TEST(ProgramDesc, copy_ctor) { - ProgramDescBind program; + ProgramDesc program; auto* global_block = program.MutableBlock(0); auto* x = global_block->Var("X"); x->SetType(proto::VarDesc_VarType_LOD_TENSOR); @@ -42,12 +42,12 @@ TEST(ProgramDesc, copy_ctor) { out->SetType(proto::VarDesc_VarType_LOD_TENSOR); op->SetOutput("Y", {out->Name()}); - ProgramDescBind program_copy(program); + ProgramDesc program_copy(program); auto* global_block_copy = program_copy.MutableBlock(0); ASSERT_NE(global_block, global_block_copy); - auto assert_same_var = [&](const std::string& name, VarDescBind* var_before) { + auto assert_same_var = [&](const std::string& name, VarDesc* var_before) { ASSERT_TRUE(global_block_copy->HasVar(name)); auto* copy = global_block_copy->Var(name); ASSERT_NE(copy, var_before); @@ -81,7 +81,7 @@ TEST(ProgramDesc, copy_ctor) { } TEST(ProgramDescBind, serialize_and_deserialize) { - ProgramDescBind program_origin; + ProgramDesc program_origin; auto* global_block = program_origin.MutableBlock(0); auto* x = global_block->Var("X"); x->SetType(proto::VarDesc_VarType_LOD_TENSOR); @@ -107,11 +107,11 @@ TEST(ProgramDescBind, serialize_and_deserialize) { std::string binary_str; program_origin.Proto()->SerializeToString(&binary_str); - ProgramDescBind program_restored(binary_str); + ProgramDesc program_restored(binary_str); auto* global_block_restored = program_restored.MutableBlock(0); ASSERT_NE(global_block, global_block_restored); - auto assert_same_var = [&](const std::string& name, VarDescBind* var_before) { + auto assert_same_var = [&](const std::string& name, VarDesc* var_before) { ASSERT_TRUE(global_block_restored->HasVar(name)); auto* restored = global_block_restored->Var(name); ASSERT_NE(restored, var_before); diff --git a/paddle/framework/prune_test.cc b/paddle/framework/prune_test.cc index 47fe4b0636c14c5a4f0d4e000a4da8f8951c5d62..bdd57659432ea4f9bdd05425a802110b0c202fb8 100644 --- a/paddle/framework/prune_test.cc +++ b/paddle/framework/prune_test.cc @@ -29,7 +29,7 @@ namespace ops = paddle::operators; void AddOp(const std::string &type, const f::VariableNameMap &inputs, const f::VariableNameMap &outputs, f::AttributeMap attrs, - paddle::framework::BlockDescBind *block) { + paddle::framework::BlockDesc *block) { // insert output for (auto kv : outputs) { for (auto v : kv.second) { @@ -51,8 +51,8 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs, } TEST(Prune, one_operator) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{}, block); @@ -69,8 +69,8 @@ TEST(Prune, one_operator) { } TEST(Prune, forward) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{}, block); @@ -92,8 +92,8 @@ TEST(Prune, forward) { } TEST(Prune, multi_input_op) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, f::AttributeMap{}, block); @@ -113,8 +113,8 @@ TEST(Prune, multi_input_op) { } TEST(Prune, multi_output_op) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, f::AttributeMap{}, block); @@ -132,8 +132,8 @@ TEST(Prune, multi_output_op) { } TEST(Prune, multi_target) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, f::AttributeMap{}, block); diff --git a/paddle/framework/type_defs.h b/paddle/framework/type_defs.h index baeb98c9bd49ec65da5931bcbe33ab788f86f3e8..da152e8b9d23490e2e69c2dd215c45355e1c1e44 100644 --- a/paddle/framework/type_defs.h +++ b/paddle/framework/type_defs.h @@ -25,11 +25,9 @@ namespace paddle { namespace framework { class OperatorBase; -class OpDescBind; -class BlockDescBind; -class BlockDesc; +class OpDesc; class InferShapeContext; -class BlockDescBind; +class BlockDesc; using VariableNameMap = std::map>; @@ -37,7 +35,7 @@ using VariableNameMap = std::map>; using Attribute = boost::variant, std::vector, std::vector, bool, - std::vector, BlockDescBind*>; + std::vector, BlockDesc*>; using AttributeMap = std::unordered_map; @@ -45,13 +43,13 @@ using OpCreator = std::function; -using GradOpMakerFN = std::function>( - const OpDescBind&, const std::unordered_set& /*no_grad_set*/, +using GradOpMakerFN = std::function>( + const OpDesc&, const std::unordered_set& /*no_grad_set*/, std::unordered_map* /*grad_to_var*/, - const std::vector& grad_block)>; + const std::vector& grad_block)>; -using InferVarTypeFN = std::function; +using InferVarTypeFN = + std::function; using InferShapeFN = std::function; diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc index 2180827767e73b7ce51e05e402de8b0dd68571bd..bd8973eeb369aabd2c52d4fccf799657c564ee78 100644 --- a/paddle/framework/var_desc.cc +++ b/paddle/framework/var_desc.cc @@ -18,29 +18,27 @@ limitations under the License. */ namespace paddle { namespace framework { -proto::VarDesc::VarType VarDescBind::GetType() const { return desc_.type(); } +proto::VarDesc::VarType VarDesc::GetType() const { return desc_.type(); } -void VarDescBind::SetType(proto::VarDesc::VarType type) { - desc_.set_type(type); -} +void VarDesc::SetType(proto::VarDesc::VarType type) { desc_.set_type(type); } -void VarDescBind::SetShape(const std::vector &dims) { +void VarDesc::SetShape(const std::vector &dims) { VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims()); } -void VarDescBind::SetDataType(proto::DataType data_type) { +void VarDesc::SetDataType(proto::DataType data_type) { mutable_tensor_desc()->set_data_type(data_type); } -std::vector VarDescBind::Shape() const { +std::vector VarDesc::Shape() const { return RepeatedToVector(tensor_desc().dims()); } -proto::DataType VarDescBind::GetDataType() const { +proto::DataType VarDesc::GetDataType() const { return tensor_desc().data_type(); } -void VarDescBind::SetLoDLevel(int32_t lod_level) { +void VarDesc::SetLoDLevel(int32_t lod_level) { switch (desc_.type()) { case proto::VarDesc::LOD_TENSOR: desc_.mutable_lod_tensor()->set_lod_level(lod_level); @@ -54,7 +52,7 @@ void VarDescBind::SetLoDLevel(int32_t lod_level) { } } -int32_t VarDescBind::GetLodLevel() const { +int32_t VarDesc::GetLodLevel() const { switch (desc_.type()) { case proto::VarDesc::LOD_TENSOR: return desc_.lod_tensor().lod_level(); @@ -66,7 +64,7 @@ int32_t VarDescBind::GetLodLevel() const { } } -const proto::TensorDesc &VarDescBind::tensor_desc() const { +const proto::TensorDesc &VarDesc::tensor_desc() const { PADDLE_ENFORCE(desc_.has_type(), "invoke TensorDesc must after set type"); switch (desc_.type()) { case proto::VarDesc::SELECTED_ROWS: @@ -80,7 +78,7 @@ const proto::TensorDesc &VarDescBind::tensor_desc() const { } } -proto::TensorDesc *VarDescBind::mutable_tensor_desc() { +proto::TensorDesc *VarDesc::mutable_tensor_desc() { PADDLE_ENFORCE(desc_.has_type(), "invoke MutableTensorDesc must after set type"); switch (desc_.type()) { diff --git a/paddle/framework/var_desc.h b/paddle/framework/var_desc.h index 335a864cabfe575e153cd5c44b5cd30c312914a2..4fd2abe7fb215c3ac454de3e30754685111eb570 100644 --- a/paddle/framework/var_desc.h +++ b/paddle/framework/var_desc.h @@ -53,14 +53,14 @@ inline void VectorToRepeated(const std::vector &vec, } } -class VarDescBind { +class VarDesc { public: - explicit VarDescBind(const std::string &name) { + explicit VarDesc(const std::string &name) { desc_.set_name(name); desc_.set_type(proto::VarDesc::LOD_TENSOR); } - explicit VarDescBind(const proto::VarDesc &desc) : desc_(desc) {} + explicit VarDesc(const proto::VarDesc &desc) : desc_(desc) {} proto::VarDesc *Proto() { return &desc_; } diff --git a/paddle/framework/var_type_inference.h b/paddle/framework/var_type_inference.h index 32abbeb33479444c5e7a9889f4211f59af07f98f..1a4dca05f741f33d58eeccda9d1f800aadb8c01f 100644 --- a/paddle/framework/var_type_inference.h +++ b/paddle/framework/var_type_inference.h @@ -21,8 +21,7 @@ namespace framework { class VarTypeInference { public: virtual ~VarTypeInference() {} - virtual void operator()(const OpDescBind& op_desc, - BlockDescBind* block) const = 0; + virtual void operator()(const OpDesc& op_desc, BlockDesc* block) const = 0; }; } // namespace framework diff --git a/paddle/framework/var_type_inference_test.cc b/paddle/framework/var_type_inference_test.cc index 8b465cbc59c50fbbe48036eb7dc09847d21d9d0b..92f333c558413ac3253c0fb8a20d6f0cfa33f99c 100644 --- a/paddle/framework/var_type_inference_test.cc +++ b/paddle/framework/var_type_inference_test.cc @@ -33,8 +33,7 @@ class SumOpMaker : public OpProtoAndCheckerMaker { class SumOpVarTypeInference : public VarTypeInference { public: - void operator()(const OpDescBind &op_desc, - BlockDescBind *block) const override { + void operator()(const OpDesc &op_desc, BlockDesc *block) const override { auto &inputs = op_desc.Input("X"); auto default_var_type = proto::VarDesc::SELECTED_ROWS; @@ -62,7 +61,7 @@ namespace paddle { namespace framework { TEST(InferVarType, sum_op) { - ProgramDescBind prog; + ProgramDesc prog; auto *op = prog.MutableBlock(0)->AppendOp(); op->SetType("sum"); op->SetInput("X", {"test_a", "test_b", "test_c"}); @@ -85,7 +84,7 @@ TEST(InferVarType, sum_op) { } TEST(InferVarType, sum_op_without_infer_var_type) { - ProgramDescBind prog; + ProgramDesc prog; auto *op = prog.MutableBlock(0)->AppendOp(); op->SetType("sum_without_infer_var_type"); op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc index 1df88a6da9fb0c50d0d7ecd083c0533d8a886a67..5c629dc3d2aca2705e439df836214c1284b31c8f 100644 --- a/paddle/memory/memcpy.cc +++ b/paddle/memory/memcpy.cc @@ -62,33 +62,6 @@ void Copy(platform::GPUPlace dst_place, } } -template <> -void Copy(platform::CPUPlace dst_place, - void* dst, - platform::GPUPlace src_place, - const void* src, size_t num) { - platform::SetDeviceId(src_place.device); - platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); -} - -template <> -void Copy(platform::GPUPlace dst_place, - void* dst, - platform::CPUPlace src_place, - const void* src, size_t num) { - platform::SetDeviceId(dst_place.device); - platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); -} - -template <> -void Copy(platform::GPUPlace dst_place, - void* dst, - platform::GPUPlace src_place, - const void* src, size_t num) { - platform::SetDeviceId(dst_place.device); - platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice); -} - #endif } // namespace memory diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu index 539a93530206c93a37791a9ccb2fb104af17f940..dd51aad105fecf4e3118f03e2f1868abb5523bc8 100644 --- a/paddle/operators/accuracy_op.cu +++ b/paddle/operators/accuracy_op.cu @@ -26,7 +26,7 @@ template __global__ void AccuracyCudaKernel(const int N, const int D, const int64_t* Xdata, const int64_t* labeldata, int* correct_data, - float* accuracy) { + float* accuracy, int* total_data) { int count = 0; __shared__ int total[BlockSize]; @@ -47,6 +47,7 @@ __global__ void AccuracyCudaKernel(const int N, const int D, if (threadIdx.x == 0) { *correct_data = result; *accuracy = static_cast(result) / static_cast(N); + *total_data = N; } } @@ -80,22 +81,11 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { if (num_samples == 0) { return; } - platform::GpuMemcpyAsync(total_data, &num_samples, sizeof(int), - cudaMemcpyHostToDevice, stream); AccuracyCudaKernel< PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( num_samples, infer_width, indices_data, label_data, correct_data, - accuracy_data); - - int d_num_samples, d_num_correct; - float d_accuracy; - platform::GpuMemcpyAsync(&d_num_correct, correct_data, sizeof(int), - cudaMemcpyDeviceToHost, stream); - platform::GpuMemcpyAsync(&d_num_samples, total_data, sizeof(int), - cudaMemcpyDeviceToHost, stream); - platform::GpuMemcpyAsync(&d_accuracy, accuracy_data, sizeof(float), - cudaMemcpyDeviceToHost, stream); + accuracy_data, total_data); } }; diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/operators/array_to_lod_tensor_op.cc index aafdb8fb248397d78360745270d390ff4d60e7eb..b6ca3cad94425207629160a4c7d715f685b23a09 100644 --- a/paddle/operators/array_to_lod_tensor_op.cc +++ b/paddle/operators/array_to_lod_tensor_op.cc @@ -149,14 +149,14 @@ class ArrayToLoDTensorGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("lod_tensor_to_array"); grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetInput("RankTable", Input("RankTable")); grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttrMap(Attrs()); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/assign_op.cc b/paddle/operators/assign_op.cc index 0d98755aa07e4b655e74211ffecb039254e50608..a914ff4ba92318c75326bd7945bb73bcb93b6fc3 100644 --- a/paddle/operators/assign_op.cc +++ b/paddle/operators/assign_op.cc @@ -121,12 +121,12 @@ class AssignGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *op = new framework::OpDesc(); op->SetType("assign"); op->SetInput("X", OutputGrad("Out")); op->SetOutput("Out", InputGrad("X")); - return std::unique_ptr(op); + return std::unique_ptr(op); } }; diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc index f545da22d74f4758a099d249db922de28c926ec2..1c14acbe11fbad9654bd0309f5674176ebdb5e6f 100644 --- a/paddle/operators/batch_norm_op.cc +++ b/paddle/operators/batch_norm_op.cc @@ -13,12 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/batch_norm_op.h" +#include "paddle/framework/data_layout.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; template using EigenArrayMap = @@ -60,15 +62,15 @@ class BatchNormOp : public framework::OperatorWithKernel { "Variance and VarianceOut should share the same memory"); const auto x_dims = ctx->GetInputDim("X"); - const TensorFormat tensor_format = - StringToTensorFormat(ctx->Attrs().Get("tensor_format")); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, "Input X must have 2 to 5 dimensions."); const int C = - (tensor_format == TensorFormat::NCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C); @@ -90,7 +92,7 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("is_test", "").SetDefault(false); AddAttr("momentum", "").SetDefault(0.9); AddAttr("epsilon", "").SetDefault(1e-5); - AddAttr("tensor_format", "").SetDefault("NCHW"); + AddAttr("data_layout", "").SetDefault("NCHW"); AddInput("X", "The input tensor"); AddInput("Scale", "Scale is a 1-dimensional tensor of size C " @@ -141,9 +143,9 @@ class BatchNormKernel const float epsilon = ctx.Attr("epsilon"); const float momentum = ctx.Attr("momentum"); const bool is_test = ctx.Attr("is_test"); - const std::string tensor_format_str = - ctx.Attr("tensor_format"); - const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); @@ -151,8 +153,8 @@ class BatchNormKernel "The Input dim size should be between 2 and 5"); const int N = x_dims[0]; const int C = - (tensor_format == TensorFormat::NCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); const int sample_size = x->numel() / N / C; auto *y = ctx.Output("Y"); @@ -177,8 +179,8 @@ class BatchNormKernel saved_mean_e.setZero(); saved_variance_e.setZero(); - switch (tensor_format) { - case TensorFormat::NCHW: { + switch (data_layout) { + case DataLayout::kNCHW: { ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); for (int nc = 0; nc < N * C; ++nc) { saved_mean_e(nc % C) += x_arr.col(nc).sum(); @@ -191,7 +193,7 @@ class BatchNormKernel saved_variance_e /= N * sample_size; break; } - case TensorFormat::NHWC: { + case DataLayout::kNHWC: { ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); for (int i = 0; i < N * sample_size; ++i) { saved_mean_e += x_arr.col(i); @@ -205,7 +207,7 @@ class BatchNormKernel break; } default: - PADDLE_THROW("Unknown storage order: %s", tensor_format_str); + PADDLE_THROW("Unknown storage order: %s", data_layout_str); } EigenVectorArrayMap running_mean_arr( @@ -247,8 +249,8 @@ class BatchNormKernel Eigen::Array new_bias = bias_arr - mean_arr * inv_std * scale_arr; - switch (tensor_format) { - case TensorFormat::NCHW: { + switch (data_layout) { + case DataLayout::kNCHW: { EigenArrayMap y_arr(y->mutable_data(ctx.GetPlace()), sample_size, N * C); ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); @@ -257,7 +259,7 @@ class BatchNormKernel } break; } - case TensorFormat::NHWC: { + case DataLayout::kNHWC: { EigenArrayMap(y->mutable_data(ctx.GetPlace()), C, N * sample_size) = (ConstEigenArrayMap(x->data(), C, N * sample_size).colwise() * @@ -267,7 +269,7 @@ class BatchNormKernel break; } default: - PADDLE_THROW("Unknown storage order: %d", tensor_format); + PADDLE_THROW("Unknown storage order: %d", data_layout); } } }; @@ -290,11 +292,11 @@ class BatchNormGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), ""); const auto x_dims = ctx->GetInputDim("X"); - const TensorFormat tensor_format = - StringToTensorFormat(ctx->Attrs().Get("tensor_format")); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); const int C = - (tensor_format == TensorFormat::NCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); @@ -333,9 +335,9 @@ class BatchNormGradKernel const auto *saved_mean = ctx.Input("SavedMean"); // SavedVariance have been reverted in forward operator const auto *saved_inv_variance = ctx.Input("SavedVariance"); - const std::string tensor_format_str = - ctx.Attr("tensor_format"); - const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); // Get the size for each dimension. // NCHW [batch_size, in_channels, in_height, in_width] @@ -344,8 +346,8 @@ class BatchNormGradKernel "The Input dim size should be between 2 and 5"); const int N = x_dims[0]; const int C = - (tensor_format == TensorFormat::NCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); const int sample_size = x->numel() / N / C; ConstEigenVectorArrayMap scale_arr(scale->data(), C); @@ -376,8 +378,8 @@ class BatchNormGradKernel const auto scale_inv_var_nhw = scale_arr * inv_var_arr / (N * sample_size); - switch (tensor_format) { - case TensorFormat::NCHW: { + switch (data_layout) { + case DataLayout::kNCHW: { ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); ConstEigenArrayMap d_y_arr(d_y->data(), sample_size, N * C); EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), @@ -400,7 +402,7 @@ class BatchNormGradKernel } break; } - case TensorFormat::NHWC: { + case DataLayout::kNHWC: { ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); ConstEigenArrayMap d_y_arr(d_y->data(), C, N * sample_size); EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), C, @@ -425,7 +427,7 @@ class BatchNormGradKernel break; } default: - PADDLE_THROW("Unknown storage order: %s", tensor_format_str); + PADDLE_THROW("Unknown storage order: %s", data_layout_str); } } }; diff --git a/paddle/operators/batch_norm_op.cu.cc b/paddle/operators/batch_norm_op.cu.cc index c7adc3d80ed25d129cec41a0fd3d22fd42aba363..55d0736a4c8e09eea637e5ab7e49af9a618e7fd8 100644 --- a/paddle/operators/batch_norm_op.cu.cc +++ b/paddle/operators/batch_norm_op.cu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/batch_norm_op.h" +#include "paddle/framework/data_layout.h" #include #include "paddle/operators/math/math_function.h" @@ -22,12 +23,12 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; +using DataLayout = framework::DataLayout; template using CudnnDataType = platform::CudnnDataType; -void ExtractNCWHD(const framework::DDim &dims, - const TensorFormat &tensor_format, int *N, int *C, int *H, - int *W, int *D) { +void ExtractNCWHD(const framework::DDim &dims, const DataLayout &data_layout, + int *N, int *C, int *H, int *W, int *D) { *N = dims[0]; if (dims.size() == 2) { *C = dims[1]; @@ -35,13 +36,13 @@ void ExtractNCWHD(const framework::DDim &dims, *W = 1; *D = 1; } else { - *C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1]; - *H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1]; + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; + *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; *W = dims.size() > 3 - ? (tensor_format == TensorFormat::NCHW ? dims[3] : dims[2]) + ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2]) : 1; *D = dims.size() > 4 - ? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3]) + ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3]) : 1; } } @@ -56,9 +57,9 @@ class BatchNormKernel double epsilon = static_cast(ctx.Attr("epsilon")); const float momentum = ctx.Attr("momentum"); const bool is_test = ctx.Attr("is_test"); - const std::string tensor_format_str = - ctx.Attr("tensor_format"); - const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); // Get the size for each dimension. // NCHW [batch_size, in_channels, in_height, in_width] @@ -67,7 +68,7 @@ class BatchNormKernel PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, "The Input dim size should be between 2 and 5"); int N, C, H, W, D; - ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D); + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); // ------------------- cudnn descriptors --------------------- cudnnTensorDescriptor_t data_desc_; @@ -93,7 +94,7 @@ class BatchNormKernel VLOG(1) << "Setting descriptors."; std::vector dims; std::vector strides; - if (tensor_format == TensorFormat::NCHW) { + if (data_layout == DataLayout::kNCHW) { dims = {N, C, H, W, D}; strides = {C * H * W * D, H * W * D, W * D, D, 1}; } else { @@ -180,9 +181,9 @@ class BatchNormGradKernel PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "It must use GPUPlace."); double epsilon = static_cast(ctx.Attr("epsilon")); - const std::string tensor_format_str = - ctx.Attr("tensor_format"); - const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); const auto *x = ctx.Input("X"); const auto *d_y = ctx.Input(framework::GradVarName("Y")); const auto *scale = ctx.Input("Scale"); @@ -192,7 +193,7 @@ class BatchNormGradKernel PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, "The Input dim size should be between 2 and 5"); int N, C, H, W, D; - ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D); + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL); PADDLE_ENFORCE_EQ(scale->dims()[0], C); @@ -219,7 +220,7 @@ class BatchNormGradKernel std::vector dims; std::vector strides; - if (tensor_format == TensorFormat::NCHW) { + if (data_layout == DataLayout::kNCHW) { dims = {N, C, H, W, D}; strides = {C * H * W * D, H * W * D, W * D, D, 1}; } else { diff --git a/paddle/operators/batch_norm_op.h b/paddle/operators/batch_norm_op.h index 8d99b6864776e81b30e87c09028b336309cf2838..a817ef41fc87da33ad87923c99a75ee7c3c7bbfe 100644 --- a/paddle/operators/batch_norm_op.h +++ b/paddle/operators/batch_norm_op.h @@ -19,21 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -enum TensorFormat { - NHWC = 0, - NCHW = 1, -}; - -inline TensorFormat StringToTensorFormat(const std::string& str) { - if (str == "NHWC" || str == "nhwc") { - return TensorFormat::NHWC; - } else if (str == "NCHW" || str == "nchw") { - return TensorFormat::NCHW; - } else { - PADDLE_THROW("Unknown storage order string: %s", str); - } -} - template class BatchNormKernel : public framework::OpKernel { public: diff --git a/paddle/operators/beam_search_decode_op.cc b/paddle/operators/beam_search_decode_op.cc index ceb20cbe18445cc6fbaea6ba1c64f143e88fb9c9..32756faac5324cfb3b5366857d2c8176665fb3ec 100644 --- a/paddle/operators/beam_search_decode_op.cc +++ b/paddle/operators/beam_search_decode_op.cc @@ -119,8 +119,8 @@ class BeamSearchDecodeInferShape : public framework::InferShapeBase { class BeamSearchDecodeInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDescBind& op_desc, - framework::BlockDescBind* block) const override { + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { for (auto& o : op_desc.Output("SentenceIds")) { block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR); } diff --git a/paddle/operators/cast_op.cc b/paddle/operators/cast_op.cc index 927a32645ccb6cd8aa225f3ce1ba78045bd8fd19..fc6da064904610f5c9c140a6328858d697dd954e 100644 --- a/paddle/operators/cast_op.cc +++ b/paddle/operators/cast_op.cc @@ -52,14 +52,14 @@ class CastOpGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto grad = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto grad = new framework::OpDesc(); grad->SetType("cast"); grad->SetInput("X", OutputGrad("Out")); grad->SetOutput("Out", InputGrad("X")); grad->SetAttr("out_dtype", GetAttr("in_dtype")); grad->SetAttr("in_dtype", GetAttr("out_dtype")); - return std::unique_ptr(grad); + return std::unique_ptr(grad); } }; diff --git a/paddle/operators/concat_op.cc b/paddle/operators/concat_op.cc index 6151e2e73fb33f01794f81bd176fde7e5579a5c8..32b61edfd0dd163e5ef8f3d1de133c55314458b5 100644 --- a/paddle/operators/concat_op.cc +++ b/paddle/operators/concat_op.cc @@ -98,8 +98,8 @@ class ConcatOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad, - ops::ConcatOpGrad) +REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad, + ops::ConcatOpGrad, false) REGISTER_OP_CPU_KERNEL(concat, ops::ConcatKernel) REGISTER_OP_CPU_KERNEL(concat_grad, diff --git a/paddle/operators/conditional_block_op.cc b/paddle/operators/conditional_block_op.cc index 5fe362c1b630867d68b566cb9660b57860368d70..204be7d1e5385b7fdab54914bec216543e360cd3 100644 --- a/paddle/operators/conditional_block_op.cc +++ b/paddle/operators/conditional_block_op.cc @@ -65,7 +65,7 @@ class ConditionalBlockOp : public ConditionalOp { scopes->front() = &scope.NewScope(); auto &cur_scope = *scopes->front(); - auto *block = Attr("sub_block"); + auto *block = Attr("sub_block"); framework::Executor exec(dev_ctx); exec.Run(*block->Program(), &cur_scope, block->ID(), false); } @@ -86,7 +86,7 @@ class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker { "(std::vector) The step scope of conditional block. To " "unify the conditional block, rnn and while op, the type of " "scope is std::vector"); - AddAttr( + AddAttr( "sub_block", "The step block of conditional block operator"); AddComment(R"DOC(Conditional block operator @@ -116,7 +116,7 @@ class ConditionalBlockGradOp : public ConditionalOp { auto &scopes = scope_var->Get>(); framework::Scope &cur_scope = *scopes[0]; - auto *block = Attr("sub_block"); + auto *block = Attr("sub_block"); framework::Executor exec(dev_ctx); exec.Run(*block->Program(), &cur_scope, block->ID(), false); @@ -170,18 +170,19 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto grad_op = new framework::OpDesc(); grad_op->SetType("conditional_block_grad"); grad_op->SetInput("X", Input("X")); grad_op->SetInput("Params", Input("Params")); grad_op->SetInput("Out", Output("Out")); grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); grad_op->SetInput("Scope", Output("Scope")); - grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X")); - grad_op->SetOutput(framework::GradVarName("Params"), InputGrad("Params")); + grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X", false)); + grad_op->SetOutput(framework::GradVarName("Params"), + InputGrad("Params", false)); grad_op->SetBlockAttr("sub_block", *this->grad_block_[0]); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/conv_transpose_cudnn_op.cc b/paddle/operators/conv_transpose_cudnn_op.cc index 2348bed4ff1c658e2b5614a3bdd94dfb554ad705..8980ff91f5d7c585d9ce0ce62cfb90f47ea86ec6 100644 --- a/paddle/operators/conv_transpose_cudnn_op.cc +++ b/paddle/operators/conv_transpose_cudnn_op.cc @@ -21,8 +21,6 @@ class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker { public: CudnnConv2DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker) : Conv2DTransposeOpMaker(proto, op_checker) { - AddAttr>("dilations", "dilations of convolution operator.") - .SetDefault({1, 1}); AddAttr("workspace_size_MB", "workspace size for cudnn, in MB, " "workspace is a section of GPU memory which will be " @@ -37,8 +35,6 @@ class CudnnConv3DTransposeOpMaker : public Conv3DTransposeOpMaker { public: CudnnConv3DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker) : Conv3DTransposeOpMaker(proto, op_checker) { - AddAttr>("dilations", "dilations of convolution operator.") - .SetDefault({1, 1, 1}); AddAttr("workspace_size_MB", "workspace size for cudnn, in MB, " "workspace is a section of GPU memory which will be " diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc index cae0e2ca2b472818463e109cb700da7f62180926..5e24fc4b2c8b479caac417c957033f7552e1c3f0 100644 --- a/paddle/operators/conv_transpose_op.cc +++ b/paddle/operators/conv_transpose_op.cc @@ -29,6 +29,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { auto filter_dims = ctx->GetInputDim("Filter"); std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); + std::vector dilations = ctx->Attrs().Get>("dilations"); PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, "ConvTransposeOp intput should be 4-D or 5-D tensor."); @@ -41,14 +42,18 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ(paddings.size(), strides.size(), "ConvTransposeOp paddings dimension and strides " "dimension should be the same."); + PADDLE_ENFORCE_EQ(paddings.size(), dilations.size(), + "ConvTransposeOp paddings dimension and dilations " + "dimension should be the same."); PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0], "In ConvTransposeOp, The input channel should be the same " "as the number of filters."); std::vector output_shape({in_dims[0], filter_dims[1]}); for (size_t i = 0; i < strides.size(); ++i) { + auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] + - filter_dims[i + 2]); + filter_extent); } ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); } @@ -73,6 +78,12 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(OpProto* proto, AddOutput("Output", "(Tensor) The output tensor of convolution transpose operator. " "The format of output tensor is also NCHW."); + + AddAttr>("dilations", + "(vector default:{1, 1}), the " + "dilations(h_dilation, w_dilation) of convolution " + "transpose operator.") + .SetDefault({1, 1}); AddAttr>( "strides", "(vector default:{1, 1}), the strides(h_stride, w_stride) of " @@ -87,7 +98,7 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(OpProto* proto, Convolution2D Transpose Operator. The convolution transpose operation calculates the output based on the input, filter -and strides, paddings, groups parameters. The size of each dimension of the +and dilations, strides, paddings, groups parameters. The size of each dimension of the parameters is checked in the infer-shape. Input(Input) and output(Output) are in NCHW format. Where N is batchsize, C is the number of channels, H is the height of the feature, and W is the width of the feature. @@ -136,6 +147,13 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(OpProto* proto, "Where N is batch size, C is " "the number of channels, D is the depth of the feature, H is the " "height of the feature, and W is the width of the feature."); + + AddAttr>( + "dilations", + "(vector default:{1, 1, 1}), the " + "dilations(d_dilation,h_dilation, w_dilation) of convolution " + "transpose operator.") + .SetDefault({1, 1, 1}); AddAttr>("strides", "(vector default:{1, 1, 1}), the " "strides{d_stride, h_stride, w_stride} of " @@ -149,7 +167,7 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(OpProto* proto, Convolution3D Transpose Operator. The convolution transpose operation calculates the output based on the input, filter -and strides, paddings, groups parameters. The size of each dimension of the +and dilations, strides, paddings, groups parameters. The size of each dimension of the parameters is checked in the infer-shape. Input(Input) and output(Output) are in NCDHW format. Where N is batch size, C is the number of channels, D is the depth of the feature, H is the height of the feature, diff --git a/paddle/operators/conv_transpose_op.h b/paddle/operators/conv_transpose_op.h index e81651f417a5b96a1f25ae9ca9228d332a16bc6d..4c8f8a80672788e8b2919e500d3627adec1ad035 100644 --- a/paddle/operators/conv_transpose_op.h +++ b/paddle/operators/conv_transpose_op.h @@ -61,6 +61,7 @@ class GemmConvTransposeKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); // groups will alway be disabled in conv2dtranspose. const int batch_size = static_cast(input->dims()[0]); @@ -113,7 +114,6 @@ class GemmConvTransposeKernel : public framework::OpKernel { math::Col2ImFunctor col2im; math::Col2VolFunctor col2vol; - std::vector dilations({1, 1, 1}); // convolution transpose: gemm + col2im or col2vol (similar to conv-backward // on input) @@ -165,6 +165,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); const int batch_size = static_cast(input->dims()[0]); @@ -219,7 +220,6 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { math::Im2ColFunctor im2col; math::Vol2ColFunctor vol2col; - std::vector dilations({1, 1, 1}); if (input_grad) { input_grad->mutable_data(context.GetPlace()); diff --git a/paddle/operators/detail/recv_impl.cc b/paddle/operators/detail/recv_impl.cc index 89dc5045221156eed7aa9411bc96ad86f91136d2..517a1946a0c25081b20c320f4104c81503a4249e 100644 --- a/paddle/operators/detail/recv_impl.cc +++ b/paddle/operators/detail/recv_impl.cc @@ -20,25 +20,57 @@ namespace detail { Status SendRecvServerImpl::SendVariable(ServerContext *context, const VariableMessage *in_var, - VariableMessage *out_var) { - framework::LoDTensor t; - // TODO(typhoonzero): desirealize in_tensor and run pserver network. + VoidMessage *out_var) { + // TODO(typhoonzero): support different variable types. std::istringstream iss(in_var->serialized()); + framework::LoDTensor t; framework::DeserializeFromStream(iss, &t); - lodtensor_queue_.Push(std::move(t)); - // Block util the sub graph is done. - t = lodtensor_return_queue_.Pop(); + TensorWithName tensor_with_name = + std::make_pair(in_var->varname(), std::move(t)); + + var_recv_queue_.Push(std::move(tensor_with_name)); + return Status::OK; +} + +Status SendRecvServerImpl::GetVariable(ServerContext *context, + const VariableMessage *in_var, + VariableMessage *out_var) { + std::string get_var_name = in_var->varname(); + auto *var = scope_->FindVar(get_var_name); + auto tensor = var->Get(); std::ostringstream oss; - // FIXME(typhoonzero): get context from op. - framework::SerializeToStream(oss, t, platform::CPUDeviceContext()); + framework::SerializeToStream(oss, tensor, platform::CPUDeviceContext()); + std::string *varname = out_var->mutable_varname(); - *varname = in_var->varname(); + *varname = get_var_name; std::string *serialized = out_var->mutable_serialized(); *serialized = oss.str(); + return Status::OK; +} +Status SendRecvServerImpl::Wait(ServerContext *context, + const VoidMessage *in_var, + VoidMessage *out_var) { + { + std::unique_lock lock(this->mutex_); + condition_.wait(lock, [=] { return this->done_ == true; }); + } return Status::OK; } +void SendRecvServerImpl::Reset() { + std::lock_guard lock(this->mutex_); + done_ = false; +} + +void SendRecvServerImpl::Done() { + { + std::lock_guard lock(this->mutex_); + done_ = true; + } + condition_.notify_all(); +} + } // namespace detail } // namespace operators } // namespace paddle diff --git a/paddle/operators/detail/send_impl.cc b/paddle/operators/detail/send_impl.cc index da1ddf75d2afb85670c5ea0c9884376415f28208..d7165e13db961f7719139b1a40211d6a2ca67a9f 100644 --- a/paddle/operators/detail/send_impl.cc +++ b/paddle/operators/detail/send_impl.cc @@ -19,10 +19,10 @@ namespace operators { namespace detail { bool RPCClient::SendVariable(const framework::Scope& scope, - const std::string& inname, - const std::string& outname) { + const std::string& inname) { ClientContext context; - VariableMessage msg, out_msg; + VariableMessage msg; + VoidMessage out_msg; // FIXME(typhoonzero): pass device context to here. auto ctx = platform::CPUDeviceContext(); auto* var = scope.FindVar(inname); @@ -37,9 +37,26 @@ bool RPCClient::SendVariable(const framework::Scope& scope, msg.set_serialized(oss.str()); Status status = stub_->SendVariable(&context, msg, &out_msg); if (!status.ok()) { + LOG(ERROR) << "gRPC error: " << status.error_message(); return false; } - std::istringstream iss(out_msg.serialized()); + return true; +} + +bool RPCClient::GetVariable(const framework::Scope& scope, + const std::string& outname) { + ClientContext context; + VariableMessage call_msg, ret_msg; + call_msg.set_varname(outname); + auto ctx = platform::CPUDeviceContext(); + Status status = stub_->GetVariable(&context, call_msg, &ret_msg); + if (!status.ok()) { + LOG(ERROR) << "gRPC error: " << status.error_message(); + return false; + } + + std::istringstream iss(ret_msg.serialized()); + framework::LoDTensor ret_tensor; framework::DeserializeFromStream(iss, &ret_tensor); auto* outvar = scope.FindVar(outname); @@ -49,6 +66,12 @@ bool RPCClient::SendVariable(const framework::Scope& scope, return true; } +void RPCClient::Wait() { + ClientContext context; + VoidMessage call_msg, ret_msg; + stub_->Wait(&context, call_msg, &ret_msg); +} + } // namespace detail } // namespace operators } // namespace paddle diff --git a/paddle/operators/detail/send_recv.proto b/paddle/operators/detail/send_recv.proto index 07ff9d2c621a2dfb51792821a0d3fc398c315835..ce729908062ad442e66cc00001e14ceb6f268560 100644 --- a/paddle/operators/detail/send_recv.proto +++ b/paddle/operators/detail/send_recv.proto @@ -19,7 +19,12 @@ package sendrecv; service SendRecvService { // For parameter server round-robin like hashing, do not split tensors. // Send and recv only one tensor - rpc SendVariable(VariableMessage) returns (VariableMessage) {} + // TODO(typhoonzero): add streaming API + rpc SendVariable(VariableMessage) returns (VoidMessage) {} + // Argument VariableMessage for GetVariable should only contain varname. + rpc GetVariable(VariableMessage) returns (VariableMessage) {} + // wait for one execution of the program + rpc Wait(VoidMessage) returns (VoidMessage) {} } // VariableMessage is serialized paddle variable message. diff --git a/paddle/operators/detail/send_recv_impl.h b/paddle/operators/detail/send_recv_impl.h index b9a5340a8636db7b5d6ec7b21368632d3916b4aa..eec9dd38d188247cba4da2a377038a28c847e40e 100644 --- a/paddle/operators/detail/send_recv_impl.h +++ b/paddle/operators/detail/send_recv_impl.h @@ -20,10 +20,6 @@ #include "paddle/framework/selected_rows.h" #include "paddle/operators/detail/simple_block_queue.h" -// #include -// #include -// #include -// #include #include "paddle/operators/detail/send_recv.grpc.pb.h" #include "paddle/operators/detail/send_recv.pb.h" @@ -48,24 +44,32 @@ namespace paddle { namespace operators { namespace detail { +typedef std::pair TensorWithName; + class SendRecvServerImpl final : public SendRecvService::Service { public: explicit SendRecvServerImpl() {} Status SendVariable(ServerContext *context, const VariableMessage *in_var, - VariableMessage *out_var) override; - - const framework::LoDTensor Get() { return this->lodtensor_queue_.Pop(); } + VoidMessage *out_var) override; + Status GetVariable(ServerContext *context, const VariableMessage *in_var, + VariableMessage *out_var) override; + Status Wait(ServerContext *context, const VoidMessage *in_var, + VoidMessage *out_var) override; + void Reset(); + void Done(); + void SetScope(framework::Scope *scope) { scope_ = scope; }; - void Push(const framework::LoDTensor &tensor) { - this->lodtensor_return_queue_.Push(tensor); - } + const TensorWithName Get() { return this->var_recv_queue_.Pop(); } private: - SimpleBlockQueue lodtensor_queue_; - SimpleBlockQueue lodtensor_return_queue_; - SimpleBlockQueue selected_rows_queue_; - SimpleBlockQueue selected_rows_return_queue_; + // received variable from RPC, operators fetch variable from this queue. + SimpleBlockQueue var_recv_queue_; + framework::Scope *scope_; + // condition of the sub program + std::mutex mutex_; + bool done_; + std::condition_variable condition_; }; // RPCClient is a class to send tensors to pserver sub-network @@ -75,8 +79,9 @@ class RPCClient { RPCClient(std::shared_ptr channel) : stub_(SendRecvService::NewStub(channel)) {} - bool SendVariable(const framework::Scope &scope, const std::string &inname, - const std::string &outname); + bool SendVariable(const framework::Scope &scope, const std::string &inname); + bool GetVariable(const framework::Scope &scope, const std::string &outname); + void Wait(); private: std::unique_ptr stub_; diff --git a/paddle/operators/dropout_op.cu b/paddle/operators/dropout_op.cu index 10c670751d026ef92e01aad7da31a8f59b8514c0..c31d2195e95b116451b0f620f6582f65c0dae706 100644 --- a/paddle/operators/dropout_op.cu +++ b/paddle/operators/dropout_op.cu @@ -71,7 +71,7 @@ class GPUDropoutKernel : public framework::OpKernel { auto M = EigenMatrix::Reshape(*mask, 1); Y.device(place) = X * M; } else { - Y.device(place) = X * dropout_prob; + Y.device(place) = X * (1.0f - dropout_prob); } } }; diff --git a/paddle/operators/dropout_op.h b/paddle/operators/dropout_op.h index 84ad39f0bb639975365d427aa205411ef79ecd46..9f6c4212d4f834bbb2d1c65c836f3d3d0f3e0c96 100644 --- a/paddle/operators/dropout_op.h +++ b/paddle/operators/dropout_op.h @@ -57,7 +57,7 @@ class CPUDropoutKernel : public framework::OpKernel { auto Y = EigenMatrix::Reshape(*y, 1); auto& place = *context.template device_context().eigen_device(); - Y.device(place) = X * dropout_prob; + Y.device(place) = X * (1.0f - dropout_prob); } } }; diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index 7ebfc7df8c117edd7bcf14cc5ae6ba3dc1302c03..9edfacd6dfb506e12a0d82772d8de301bb8506e2 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -103,10 +103,12 @@ class MidWiseTransformIterator { MidWiseTransformIterator& operator++() { ++j_; - i_ = j_ / post_; - if (UNLIKELY(i_ == n_)) { + if (UNLIKELY(j_ == post_)) { + ++i_; j_ = 0; - i_ = 0; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } } return *this; } @@ -125,10 +127,10 @@ class MidWiseTransformIterator { private: const T* ptr_; - int i_; + int64_t i_; int64_t j_; int64_t n_; - int post_; + int64_t post_; }; #ifdef __NVCC__ diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc index 3e828f84d076f1afc841db2c1664f5f5d9c90dc6..b4ae1de876010effff6bf577a4e33043f6760a4f 100644 --- a/paddle/operators/fill_zeros_like_op.cc +++ b/paddle/operators/fill_zeros_like_op.cc @@ -24,10 +24,10 @@ class FillZerosLikeOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of FillZerosLikeOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Y"), - "Output(Y) of FillZerosLikeOp should not be null."); - ctx->SetOutputDim("Y", ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*->*/ "Y"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of FillZerosLikeOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); } }; @@ -36,7 +36,7 @@ class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker { FillZerosLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of fill-zeros-like op."); - AddOutput("Y", "The variable will be filled up with zeros."); + AddOutput("Out", "The variable will be filled up with zeros."); AddComment(R"DOC( FillZerosLike Operator. diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h index a6e2941f52150de7886717303d2cb2f10b7eef7b..351ecf8b2f1d945fabdd1d6c5ed56f76f3caae61 100644 --- a/paddle/operators/fill_zeros_like_op.h +++ b/paddle/operators/fill_zeros_like_op.h @@ -23,7 +23,7 @@ template class FillZerosLikeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Y"); + auto* out = context.Output("Out"); out->mutable_data(context.GetPlace()); math::SetConstant setter; diff --git a/paddle/operators/increment_op.cc b/paddle/operators/increment_op.cc index 3a53ea89dc9a73d170ca5a9fe6cffde02a9f283b..789c92102d63355a80c3330f2107c731206397f4 100644 --- a/paddle/operators/increment_op.cc +++ b/paddle/operators/increment_op.cc @@ -93,13 +93,13 @@ class IncrementGradOpMaker : public framework::SingleGradOpDescMaker { public: using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("increment"); grad_op->SetInput("X", Output("Out")); grad_op->SetOutput("Out", Input("X")); grad_op->SetAttr("step", -boost::get(GetAttr("step"))); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/lod_rank_table_op.cc b/paddle/operators/lod_rank_table_op.cc index 3e281c8d1e292cd61336c50c013ca9c05e12c5f4..2d67046bfee01d8d148da1c8b705d3ad959a4839 100644 --- a/paddle/operators/lod_rank_table_op.cc +++ b/paddle/operators/lod_rank_table_op.cc @@ -30,6 +30,7 @@ class LoDRankTableOp : public framework::OperatorBase { scope.FindVar(Output("Out"))->GetMutable(); VLOG(10) << "Level = " << static_cast(Attr("level")); out->Reset(x.lod(), static_cast(Attr("level"))); + VLOG(10) << Input("X") << "'s lod information is " << *out; } }; @@ -62,8 +63,8 @@ class LoDRankTableInferShape : public framework::InferShapeBase { class LoDRankTableInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDescBind &op_desc, - framework::BlockDescBind *block) const override { + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { for (auto &o : op_desc.Output("Out")) { block->FindRecursiveOrCreateVar(o)->SetType( framework::proto::VarDesc::LOD_RANK_TABLE); diff --git a/paddle/operators/lod_tensor_to_array_op.cc b/paddle/operators/lod_tensor_to_array_op.cc index 33af0e819f757b574b1a4cc3426a8375bdd702c6..643f8859f3d0d44c0b5be922bd786ab04093df94 100644 --- a/paddle/operators/lod_tensor_to_array_op.cc +++ b/paddle/operators/lod_tensor_to_array_op.cc @@ -127,8 +127,8 @@ class LoDTensorToArrayInferShape : public framework::InferShapeBase { class LoDTensorToArrayInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDescBind &op_desc, - framework::BlockDescBind *block) const override { + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { for (auto &out_var : op_desc.Output("Out")) { block->Var(out_var)->SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY); } @@ -140,14 +140,14 @@ class LoDTensorToArrayGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("array_to_lod_tensor"); grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetInput("RankTable", Input("RankTable")); grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttrMap(Attrs()); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc index 606b44808edf11a72eb154cbdc8356bb1fc9b9d5..0a9defa8c50453abf3eefdcb89126b1349d6d756 100644 --- a/paddle/operators/lookup_table_op.cc +++ b/paddle/operators/lookup_table_op.cc @@ -108,8 +108,8 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDescBind& op_desc, - framework::BlockDescBind* block) const override { + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { auto out_var_name = op_desc.Output(framework::GradVarName("W")).front(); auto attr = op_desc.GetAttr("is_sparse"); bool is_sparse = boost::get(attr); diff --git a/paddle/operators/math/im2col.cc b/paddle/operators/math/im2col.cc index 707ebf05962fb65892c2adbbf41a0a3449763d31..c2633b2e16434558d16f699a701e7b8cf1de8342 100644 --- a/paddle/operators/math/im2col.cc +++ b/paddle/operators/math/im2col.cc @@ -61,14 +61,13 @@ class Im2ColFunctor(); T* col_data = col->data(); - for (int c = 0; c < channels_col; ++c) { int w_offset = c % filter_width; int h_offset = (c / filter_width) % filter_height; - int c_im = c / filter_width / filter_height; + int c_im = c / (filter_width * filter_height); for (int h = 0; h < col_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; for (int w = 0; w < col_width; ++w) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; int col_idx = (c * col_height + h) * col_width + w; int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; @@ -130,16 +129,14 @@ class Col2ImFunctor= 0 && (im_row_idx) < im_height && (im_col_idx) >= 0 && (im_col_idx) < im_width) { - im_row_idx += c_im * im_height; - im_data[im_row_idx * im_width + im_col_idx] += + im_data[(im_row_idx + c_im * im_height) * im_width + im_col_idx] += col_data[(c * col_height + h) * col_width + w]; } } @@ -199,12 +196,13 @@ class Im2ColFunctor= 0 && im_row_offset < im_height && im_col_offset >= 0 && im_col_offset < im_width) { int im_offset = diff --git a/paddle/operators/math/math_function_impl.h b/paddle/operators/math/math_function_impl.h index 3e6d83386589a02c7d8f62394c1c2becb606504c..aced2690bce9a4c2db6309f18e23a8f6cd0211f3 100644 --- a/paddle/operators/math/math_function_impl.h +++ b/paddle/operators/math/math_function_impl.h @@ -67,18 +67,45 @@ void RowwiseAdd::operator()(const DeviceContext& context, template void ColwiseSum::operator()(const DeviceContext& context, const framework::Tensor& input, - framework::Tensor* vector) { + framework::Tensor* out) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector->numel(), size); + PADDLE_ENFORCE_EQ(out->numel(), size); - auto vec = framework::EigenMatrix::From(*vector); auto in = framework::EigenMatrix::From(input); - Eigen::array shape({{1, static_cast(size)}}); - vec.reshape(shape).device(*context.eigen_device()) = - in.sum(Eigen::array({{0}})).reshape(shape); + auto vec = framework::EigenVector::Flatten(*out); + + vec.device(*context.eigen_device()) = in.sum(Eigen::array({{0}})); } +// Specialize for CPU, since Eigen implement a general reduce. However, +// colwise-sum can be easily implemented. General reduce has a huge overhead in +// CPU +template +class ColwiseSum { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* out) { + auto& in_dims = input.dims(); + auto height = in_dims[0]; + auto size = in_dims[1]; + PADDLE_ENFORCE_EQ(out->numel(), size); + + T* out_buf = out->mutable_data(out->place()); + const T* in_buf = input.data(); + + for (size_t i = 0; i < height; ++i) { + for (size_t j = 0; j < size; ++j) { + if (i == 0) { + out_buf[j] = in_buf[i * size + j]; + } else { + out_buf[j] += in_buf[i * size + j]; + } + } + } + } +}; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc index e27f9eeac6e7cdc0615f1368b21df252866d9b7e..411f4d14efbfa5a8ee6dd7da645a044b191bf006 100644 --- a/paddle/operators/mean_op.cc +++ b/paddle/operators/mean_op.cc @@ -60,13 +60,13 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto* grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto* grad_op = new framework::OpDesc(); grad_op->SetType("mean_grad"); grad_op->SetInput("X", Input("X")); grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X")); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/merge_lod_tensor_op.cc b/paddle/operators/merge_lod_tensor_op.cc index ec76cfdf279c93f28cf54c744eafe07b3957c06b..5edf29c3af958f5a939fdb830d46aca4b8d3dbe0 100644 --- a/paddle/operators/merge_lod_tensor_op.cc +++ b/paddle/operators/merge_lod_tensor_op.cc @@ -161,15 +161,15 @@ class MergeLoDTensorGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("split_lod_tensor"); grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetInput("Mask", Input("Mask")); grad_op->SetOutput("OutTrue", InputGrad("InTrue")); grad_op->SetOutput("OutFalse", InputGrad("InFalse")); grad_op->SetAttrMap(Attrs()); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc index eb65fededfd633de51b0c418b78aab0726e70320..2e9cc9d29d8c92ac56b451834f930758216e6a44 100644 --- a/paddle/operators/minus_op.cc +++ b/paddle/operators/minus_op.cc @@ -70,12 +70,11 @@ class MinusGradMaker : public framework::GradOpDescMakerBase { public: using framework::GradOpDescMakerBase::GradOpDescMakerBase; - std::vector> operator()() - const override { - std::vector> ops; + std::vector> operator()() const override { + std::vector> ops; auto x_g = InputGrad("X"); if (!x_g.empty()) { - auto *x_g_op = new framework::OpDescBind(); + auto *x_g_op = new framework::OpDesc(); x_g_op->SetType("scale"); x_g_op->SetInput("X", OutputGrad("Out")); x_g_op->SetOutput("Out", x_g); @@ -85,7 +84,7 @@ class MinusGradMaker : public framework::GradOpDescMakerBase { auto y_g = InputGrad("Y"); if (!y_g.empty()) { - auto *y_g_op = new framework::OpDescBind(); + auto *y_g_op = new framework::OpDesc(); y_g_op->SetType("scale"); y_g_op->SetInput("X", OutputGrad("Out")); y_g_op->SetOutput("Out", y_g); diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index a4bf0711de0efe0967b1211b7f32e5e2245860bc..599df9c3df58db6444d7cb729e1a2c1f9f628b5b 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -73,39 +73,50 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker { public: MulOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "The first input of mul op"); - AddInput("Y", "The second input of mul op"); - AddOutput("Out", "The output of mul op"); + AddInput("X", "(Tensor), The first input tensor of mul op."); + AddInput("Y", "(Tensor), The second input tensor of mul op."); + AddOutput("Out", "(Tensor), The output tensor of mul op."); AddAttr( "x_num_col_dims", - "(int, default 1) " - R"DOC(mul_op can take tensors with more than two dimensions as input `X`, - in that case, tensors will be reshaped to a matrix. The matrix's first - dimension(column length) will be the product of tensor's last - `num_col_dims` dimensions, and the matrix's second dimension(row length) - will be the product of tensor's first `rank - num_col_dims` dimensions. + R"DOC((int, default 1), The mul_op can take tensors with more than two + dimensions as its inputs. If the input $X$ is a tensor with more + than two dimensions, $X$ will be flattened into a two-dimensional + matrix first. The flattening rule is: the first `num_col_dims` + will be flattened to form the first dimension of the final matrix + (the height of the matrix), and the rest `rank(X) - num_col_dims` + dimensions are flattened to form the second dimension of the final + matrix (the width of the matrix). As a result, height of the + flattened matrix is equal to the product of $X$'s first + `x_num_col_dims` dimensions' sizes, and width of the flattened + matrix is equal to the product of $X$'s last `rank(x) - num_col_dims` + dimensions' size. For example, suppose $X$ is a 6-dimensional + tensor with the shape [2, 3, 4, 5, 6], and `x_num_col_dims` = 3. + Thus, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = + [24, 30]. )DOC") .SetDefault(1) .EqualGreaterThan(1); AddAttr( "y_num_col_dims", - "(int, default 1) " - R"DOC(mul_op can take tensors with more than two dimensions as input `Y`, - in that case, tensors will be reshaped to a matrix. Just like input `X`. + R"DOC((int, default 1), The mul_op can take tensors with more than two, + dimensions as its inputs. If the input $Y$ is a tensor with more + than two dimensions, $Y$ will be flattened into a two-dimensional + matrix first. The attribute `y_num_col_dims` determines how $Y$ is + flattened. See comments of `x_num_col_dims` for more details. )DOC") .SetDefault(1) .EqualGreaterThan(1); AddComment(R"DOC( -Mul Operator. +Mul Operator. -This operator is used to perform matrix multiplication for input X and Y. +This operator is used to perform matrix multiplication for input $X$ and $Y$. The equation is: $$Out = X * Y$$ -Both the input `X` and `Y` can carry the LoD (Level of Details) information, -or not. But the output only shares the LoD information with input `X`. +Both the input $X$ and $Y$ can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD information with input $X$. )DOC"); } diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc index d747cc0cf5f74b886bbd40549673e7d64de952e9..c1046aadafbde303a3a8b12f2377018396b9adb8 100644 --- a/paddle/operators/nccl_op_test.cu.cc +++ b/paddle/operators/nccl_op_test.cu.cc @@ -65,7 +65,7 @@ class NCCLTester : public ::testing::Test { } void NCCLInitOp() { - std::unique_ptr op1(new f::OpDescBind); + std::unique_ptr op1(new f::OpDesc); op1->SetType("ncclInit"); op1->SetOutput("Communicator", {"comm"}); @@ -81,10 +81,9 @@ class NCCLTester : public ::testing::Test { } template - void PerThreadProgram(int gpu_id, const f::OpDescBind &op_desc, - f::Scope *scope) { + void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) { std::unique_lock lk(mu); - const f::OpDescBind *op1 = &op_desc; + const f::OpDesc *op1 = &op_desc; p::GPUPlace place(gpu_id); auto &ctx = dev_ctxs.at(gpu_id); @@ -125,7 +124,7 @@ class NCCLTester : public ::testing::Test { // ncclInitOp with desc TEST(NCCL, ncclInitOp) { - std::unique_ptr op_desc(new f::OpDescBind); + std::unique_ptr op_desc(new f::OpDesc); op_desc->SetType("ncclInit"); op_desc->SetOutput("Communicator", {"x1"}); @@ -145,7 +144,7 @@ TEST(NCCL, ncclInitOp) { // ncclAllReduceOp with desc TEST_F(NCCLTester, ncclAllReduceOp) { - std::unique_ptr op2(new f::OpDescBind); + std::unique_ptr op2(new f::OpDesc); op2->SetType("ncclAllReduce"); op2->SetInput("X", {"st"}); op2->SetInput("Communicator", {"comm"}); @@ -192,7 +191,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) { // ncclReduceOp with desc TEST_F(NCCLTester, ncclReduceOp) { - std::unique_ptr op2(new f::OpDescBind); + std::unique_ptr op2(new f::OpDesc); const int kRoot = 0; op2->SetType("ncclReduce"); op2->SetInput("X", {"st"}); @@ -240,7 +239,7 @@ TEST_F(NCCLTester, ncclReduceOp) { // ncclBcastOp with desc TEST_F(NCCLTester, ncclBcastOp) { - std::unique_ptr op2(new f::OpDescBind); + std::unique_ptr op2(new f::OpDesc); const int kRoot = 5; op2->SetType("ncclBcast"); op2->SetInput("X", {"st"}); diff --git a/paddle/operators/batch_norm_op.md b/paddle/operators/op_documentation/batch_norm_op.md similarity index 100% rename from paddle/operators/batch_norm_op.md rename to paddle/operators/op_documentation/batch_norm_op.md diff --git a/paddle/operators/name_convention.md b/paddle/operators/op_documentation/name_convention.md similarity index 100% rename from paddle/operators/name_convention.md rename to paddle/operators/op_documentation/name_convention.md diff --git a/paddle/operators/net_op_design.md b/paddle/operators/op_documentation/net_op_design.md similarity index 100% rename from paddle/operators/net_op_design.md rename to paddle/operators/op_documentation/net_op_design.md diff --git a/paddle/operators/op_documentation/op_markdown_format.md b/paddle/operators/op_documentation/op_markdown_format.md new file mode 100644 index 0000000000000000000000000000000000000000..0ee804d592252c727622cbe59b0644813db3c4fd --- /dev/null +++ b/paddle/operators/op_documentation/op_markdown_format.md @@ -0,0 +1,64 @@ +# Standard Markdown Format for Operators +The following should be the standard format for documentation for all the operators that will get rendered in the `html`: + +``` +Operator Name (In PaddlePaddle) + +Operator Name (Standard) + +Operator description. + +LaTeX equation of how the operator performs an update. + +The signature of the operator. +``` + +Each section mentioned above has been covered in further detail in the rest of the document. + +# PaddlePaddle Operator Name +This should be in all small letters, in case of multiple words, we separate them with an underscore. For example: +`array to lod tensor` should be written as `array_to_lod_tensor`. + +This naming convention should be standard across all PaddlePaddle operators. + +# Standard Operator Name +This is the standard name of the operator as used in the community. The general standard is usually: +- Standard abbreviations like `SGD` are written in all capital letters. +- Operator names that have multiple words inside a single word use `camelCase` (capitalize word boundaries inside of a word). +- Keep numbers inside a word as is, with no boundary delimiters. +- Follow the name of the operator with the keyword: `Activation Operator.` + +# Operator description +This section should contain the description of what the operator does, including the operation performed, the literature from where it comes and was introduced first, and other important details. The relevant paper/article including the hyperlink should be cited in this section. + +# LaTeX equation +This section should contain an overall equation of the update or operation that the operator performs. The variables used in the equation should follow the naming convention of operators as described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). Two words in the same word should be separated by an underscore (`_`). + +# The signature +This section describes the signature of the operator. A list of Inputs and Outputs, each of which have a small description of what the variable represents and the type of variable. The variable names follow the `CamelCase` naming convention. The proposed format for this is: +`Section : +VariableName : (VariableType) VariableDescription +... +... +` + + +The following example for an `sgd` operator covers the above mentioned sections as they would ideally look like in the `html`: + +``` +sgd + +SGD operator + +This operator implements one step of the stochastic gradient descent algorithm. + +param_out = param_learning_rate * grad + +Inputs: +Param : (Tensor) Input parameter +LearningRate : (Tensor) Learning rate of SGD +Grad : (Tensor) Input gradient + +Outputs: +ParamOut : (Tensor) Output parameter +``` diff --git a/paddle/operators/rnn_design.md b/paddle/operators/op_documentation/rnn_design.md similarity index 100% rename from paddle/operators/rnn_design.md rename to paddle/operators/op_documentation/rnn_design.md diff --git a/paddle/operators/pad_op.cc b/paddle/operators/pad_op.cc index 8d2d031fcdb6bc4ee955b6db6df2892aa7ee5d53..40f7a7eed53354fa65373830b0972c0e72ef54da 100644 --- a/paddle/operators/pad_op.cc +++ b/paddle/operators/pad_op.cc @@ -116,14 +116,14 @@ class PadOpGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto* bind = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto* bind = new framework::OpDesc(); bind->SetInput("X", Input("X")); bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); bind->SetOutput(framework::GradVarName("X"), InputGrad("X")); bind->SetAttrMap(Attrs()); bind->SetType("pad_grad"); - return std::unique_ptr(bind); + return std::unique_ptr(bind); } }; diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc index ca3a063553d5c8eae3d62d63061bb13a4b3f8365..5981d5745d24e0b2fe68bf8b9852cb8a6094885f 100644 --- a/paddle/operators/recurrent_op.cc +++ b/paddle/operators/recurrent_op.cc @@ -234,7 +234,7 @@ class RecurrentOp : public RecurrentBase { auto reverse = Attr(kReverse); framework::Executor executor(dev_ctx); - auto *block = Attr(kStepBlock); + auto *block = Attr(kStepBlock); auto *program = block->Program(); for (size_t i = 0; i < seq_len; ++i) { @@ -317,7 +317,7 @@ class RecurrentGradOp : public RecurrentBase { auto reverse = Attr(kReverse); framework::Executor executor(dev_ctx); - auto *block = Attr(kStepBlock); + auto *block = Attr(kStepBlock); auto *program = block->Program(); for (size_t step_id = 0; step_id < seq_len; ++step_id) { @@ -522,8 +522,7 @@ The ex-state means the state value in the ex-timestep or the previous time step string::Sprintf( "The state variable names. [%s, %s, %s] must be the same order", kExStates, kStates, kInitStateGrads)); - AddAttr(kStepBlock, - "The step block inside RNN"); + AddAttr(kStepBlock, "The step block inside RNN"); AddAttr(kReverse, R"DOC(Calculate RNN reversely or not. By default reverse=False @@ -565,13 +564,13 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - virtual std::unique_ptr Apply() const { - auto *grad = new framework::OpDescBind(); + virtual std::unique_ptr Apply() const { + auto *grad = new framework::OpDesc(); grad->SetType("recurrent_grad"); for (auto &input_param : this->InputNames()) { grad->SetInput(input_param, this->Input(input_param)); grad->SetOutput(framework::GradVarName(input_param), - this->InputGrad(input_param)); + this->InputGrad(input_param, false)); } for (auto &output_param : this->OutputNames()) { @@ -588,7 +587,7 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker { grad->SetAttrMap(this->Attrs()); grad->SetBlockAttr(kStepBlock, *grad_block_[0]); - return std::unique_ptr(grad); + return std::unique_ptr(grad); } }; diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc index 2cc6cf6947b601e326ed563082800946b3ff221d..4e91d1151ebf7a0cd520f2f1fa58a0cc4a0d1bef 100644 --- a/paddle/operators/recv_op.cc +++ b/paddle/operators/recv_op.cc @@ -24,6 +24,7 @@ #include "paddle/framework/framework.pb.h" #include "paddle/framework/lod_tensor.h" #include "paddle/framework/op_registry.h" +#include "paddle/framework/proto_desc.h" #include "paddle/operators/detail/send_recv_impl.h" #include "paddle/operators/detail/simple_block_queue.h" @@ -61,29 +62,76 @@ class RecvOp : public framework::OperatorBase { server_thread_->join(); } + std::string GetGradVarNameForTrainer(const std::string &varname) const { + if (grads_counter_.find(varname) == grads_counter_.end()) { + grads_counter_[varname] = 0; + } + char ret[256]; + snprintf(ret, sizeof(ret), "%s.trainer_%d", varname.c_str(), + grads_counter_[varname]++); + return std::string(ret); + } + void Run(const framework::Scope &scope, const platform::DeviceContext &dev_ctx) const override { - // blocking get one var from client. - const framework::LoDTensor &t = rpc_service_->Get(); + // FIXME(typhoonzero): no new scopes for every run. framework::Scope &recv_scope = scope.NewScope(); - // set graph input var - auto *var = recv_scope.Var(Input("RX")); - auto *tensor = var->GetMutable(); - // FIXME(typhoonzero): do not copy - framework::CopyFrom(t, dev_ctx.GetPlace(), dev_ctx, tensor); - - std::string program_str = Attr("OptimizeProgram"); - framework::ProgramDesc program_desc; - program_desc.ParseFromString(program_str); - framework::ProgramDescBind program(program_desc); - framework::Executor executor(dev_ctx); - // Run sub graph to get optimized tensor - executor.Run(program, &recv_scope, 0, /*global_block*/ - false /*create_local_scope*/); - - auto *out_var = recv_scope.FindVar("Out"); - // push back - rpc_service_->Push(out_var->Get()); + rpc_service_->SetScope(&recv_scope); + auto param_list = Attr>("ParamList"); + auto grad_list = Attr>("GradList"); + auto trainer_count = Attr("Trainers"); + size_t param_count = param_list.size(); + rpc_service_->Reset(); + // TODO(typhoonzero): change this to a while_op for every cluster-batch. + while (true) { + // Get from multiple trainers, we don't care about order in which + // the gradient arrives, just add suffix 0~n then average the gradient. + for (size_t i = 0; i < param_count * trainer_count; ++i) { + // blocking get one var from client. + const detail::TensorWithName &v = rpc_service_->Get(); + auto grad_var_name = v.first; + auto it = std::find(grad_list.begin(), grad_list.end(), grad_var_name); + std::string param_var_name; + if (it != grad_list.end()) { + param_var_name = param_list[it - grad_list.begin()]; + } else { + LOG(ERROR) << "grad have no paired param found!"; + } + VLOG(3) << "recved grad: " << grad_var_name + << " updating param: " << param_var_name; + auto *merged_grad = recv_scope.FindVar(grad_var_name); + if (merged_grad == nullptr) { + // create output of merged var. + auto merged_var = recv_scope.Var(grad_var_name); + merged_var->GetMutable(); + } + + if (trainer_count > 1) { + grad_var_name = this->GetGradVarNameForTrainer(grad_var_name); + } + + auto *var = recv_scope.Var(grad_var_name); + auto *tensor = var->GetMutable(); + // FIXME(typhoonzero): do not copy + framework::CopyFrom(v.second, dev_ctx.GetPlace(), dev_ctx, tensor); + } + rpc_service_->Reset(); + + std::string program_str = Attr("OptimizeProgram"); + framework::proto::ProgramDesc program_desc; + program_desc.ParseFromString(program_str); + framework::ProgramDesc program(program_desc); + framework::Executor executor(dev_ctx); + // Run sub graph to get optimized tensor + try { + executor.Run(program, &recv_scope, 0, /*global_block*/ + false /*create_local_scope*/, false /*create_vars*/); + } catch (std::exception &e) { + LOG(ERROR) << "run sub program error " << e.what(); + } + rpc_service_->Done(); + grads_counter_.clear(); + } // while(true) } protected: @@ -93,13 +141,14 @@ class RecvOp : public framework::OperatorBase { // grpc send/recv service implement to register. std::shared_ptr rpc_service_; std::shared_ptr server_thread_; + mutable std::unordered_map grads_counter_; }; class RecvOpMaker : public framework::OpProtoAndCheckerMaker { public: RecvOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("RX", "(Tensor) Input tensor to be saved"); + AddInput("RX", "(Tensor) Input tensor to be optimized").AsDuplicable(); AddComment(R"DOC( Recv operator @@ -112,6 +161,17 @@ This operator will recv tensor from send_op .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); AddAttr("OptimizeProgram", "type string", "Serialized ProgramDesc string for recv to run."); + AddAttr>( + "ParamList", "type list of string", + "grad->param name mapping to find which param to optimize.") + .SetDefault({}); + AddAttr>( + "GradList", "type list of string", + "grad->param name mapping to find which param to optimize.") + .SetDefault({}); + AddAttr("Trainers", "type int", + "Number of trainers in the current cluster job") + .SetDefault(1); } }; diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc index 98170c0d1b22fd2cbc57c29295e6887c1bb1fa68..ee39888713544703ee8d305b2c04e4b03deeceab 100644 --- a/paddle/operators/scale_op.cc +++ b/paddle/operators/scale_op.cc @@ -58,13 +58,13 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker { public: using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("scale"); grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttr("scale", GetAttr("scale")); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc index 0d121fb48dc2dc8bd0312aa2091f4e058caa4515..a5681910708bae584bdf4217b61ca63a3988e1ff 100644 --- a/paddle/operators/send_op.cc +++ b/paddle/operators/send_op.cc @@ -34,45 +34,56 @@ class SendOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) { // init client when the operator is created at runtime. - if (!client_) { - std::string endpoint = Attr("endpoint"); - client_.reset(new detail::RPCClient( - grpc::CreateChannel(endpoint, grpc::InsecureChannelCredentials()))); - // TODO(typhoonzero): how to call InitVariables + std::vector endpoints = + Attr>("endpoints"); + for (auto ep : endpoints) { + client_map_[ep].reset(new detail::RPCClient( + grpc::CreateChannel(ep, grpc::InsecureChannelCredentials()))); } } void Run(const framework::Scope &scope, const platform::DeviceContext &dev_ctx) const override { - auto iname = Input("X"); - auto oname = Output("Out"); - // TODO(typhoonzero): currently it's non-blocking, - // should block until server responds. - bool ret = client_->SendVariable(scope, iname, oname); - if (!ret) { - LOG(ERROR) << "send variable error"; + auto ins = Inputs("X"); + std::vector epmap = Attr>("epmap"); + // TODO(typhoonzero): use async calls to send multiple variable asyncly. + for (size_t i = 0; i < ins.size(); ++i) { + bool ret = client_map_[epmap[i]]->SendVariable(scope, ins[i]); + if (!ret) { + LOG(ERROR) << "send variable error: " << ins[i]; + } + } + // TODO(typhoonzero): support async optimization + client_map_[epmap[0]]->Wait(); + for (size_t i = 0; i < ins.size(); ++i) { + bool ret = client_map_[epmap[i]]->GetVariable(scope, ins[i]); + if (!ret) { + LOG(ERROR) << "GetVariable error: " << ins[i]; + } } } protected: - std::shared_ptr client_{nullptr}; + mutable std::unordered_map> + client_map_; }; class SendOpMaker : public framework::OpProtoAndCheckerMaker { public: SendOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "(Tensor) Input tensor to be saved"); - AddOutput("Out", "(Tensor) Output fetched from server"); + AddInput("X", "(Tensor) Input tensor to be send").AsDuplicable(); AddComment(R"DOC( Recv operator This operator will recv tensor from send_op )DOC"); - AddAttr("endpoint", - "(string, default 127.0.0.1:6164)" - "IP address to listen on.") - .SetDefault("127.0.0.1:6164") - .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); + AddAttr>("endpoints", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints to send variables to."); + AddAttr>("epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input " + "variables for mapping"); } }; diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/operators/send_recv_op_test.cc index 3e2e2051afacb748877e3b0c3dec8d6662ac4e72..d899d8154cce57a88de0e1d19e3393528ce367e2 100644 --- a/paddle/operators/send_recv_op_test.cc +++ b/paddle/operators/send_recv_op_test.cc @@ -16,12 +16,14 @@ // a RemoteOptimizer. #include +#include #include #include "gtest/gtest.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" #include "paddle/framework/program_desc.h" +#include "paddle/string/printf.h" USE_NO_KERNEL_OP(send); USE_NO_KERNEL_OP(recv); @@ -33,30 +35,33 @@ std::unique_ptr recv_op; void InitTensorsInScope(paddle::framework::Scope &scope, paddle::platform::CPUPlace &place) { paddle::platform::CPUDeviceContext ctx(place); - auto var = scope.Var("X"); - auto tensor = var->GetMutable(); - tensor->Resize({10, 10}); - float *expect = tensor->mutable_data(place); - for (int64_t i = 0; i < tensor->numel(); ++i) { - expect[i] = static_cast(i); + for (int i = 0; i < 2; ++i) { + auto var_name = paddle::string::Sprintf("x%d", i); + auto var = scope.Var(var_name); + auto tensor = var->GetMutable(); + tensor->Resize({10, 10}); + float *expect = tensor->mutable_data(place); + for (int64_t i = 0; i < tensor->numel(); ++i) { + expect[i] = static_cast(i); + } } auto out_var = scope.Var("Out"); auto out_tensor = out_var->GetMutable(); out_tensor->Resize({10, 10}); - tensor->mutable_data(place); // allocate + out_tensor->mutable_data(place); // allocate } void AddOp(const std::string &type, const paddle::framework::VariableNameMap &inputs, const paddle::framework::VariableNameMap &outputs, paddle::framework::AttributeMap attrs, - paddle::framework::BlockDescBind *block) { + paddle::framework::BlockDesc *block) { // insert output for (auto kv : outputs) { for (auto v : kv.second) { auto var = block->Var(v); - var->SetDataType(paddle::framework::DataType::FP32); + var->SetDataType(paddle::framework::proto::DataType::FP32); } } @@ -78,10 +83,10 @@ void StartServerNet() { InitTensorsInScope(scope, place); // sub program run in recv_op, for simple test we use sum - paddle::framework::ProgramDescBind program; - paddle::framework::BlockDescBind *block = program.MutableBlock(0); + paddle::framework::ProgramDesc program; + paddle::framework::BlockDesc *block = program.MutableBlock(0); // X for server side tensors, RX for received tensers, must be of same shape. - AddOp("sum", {{"X", {"X", "RX"}}}, {{"Out", {"Out"}}}, {}, block); + AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, block); paddle::framework::AttributeMap attrs; attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); @@ -89,8 +94,8 @@ void StartServerNet() { PADDLE_ENFORCE(program.Proto()->SerializeToString(&program_proto)); attrs.insert({"OptimizeProgram", program_proto}); - recv_op = paddle::framework::OpRegistry::CreateOp("recv", {{"RX", {"RX"}}}, - {{"Out", {"Out"}}}, attrs); + recv_op = paddle::framework::OpRegistry::CreateOp( + "recv", {{"RX", {"x0", "x1"}}}, {{"Out", {"Out"}}}, attrs); paddle::platform::CPUDeviceContext ctx(place); recv_op->Run(scope, ctx); } @@ -107,11 +112,11 @@ TEST(SendRecvOp, CPU) { attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); auto send_op = paddle::framework::OpRegistry::CreateOp( - "send", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); + "send", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, attrs); paddle::platform::CPUDeviceContext ctx(place); send_op->Run(scope, ctx); - auto in_var = scope.Var("X"); + auto in_var = scope.Var("x0"); auto tensor = in_var->GetMutable(); float *expected = tensor->data(); diff --git a/paddle/operators/sequence_concat_op.cc b/paddle/operators/sequence_concat_op.cc index 54e8989f256e61ce9d4f76b631a3906773d04a2e..2f0aad2003e48952ca26ca27573bc45386a4e585 100644 --- a/paddle/operators/sequence_concat_op.cc +++ b/paddle/operators/sequence_concat_op.cc @@ -67,12 +67,12 @@ class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker { "The level should be less than the level number of inputs.") .SetDefault(0); AddComment(R"DOC( -The sequence_concat operator concatenates multiple LoDTensors. -It only supports sequence (LoD Tensor with level number is 1) +The sequence_concat operator concatenates multiple LoDTensors. +It only supports sequence (LoD Tensor with level number is 1) or a nested sequence (LoD tensor with level number is 2) as its input. - Case1: If the axis is other than 0(here, axis is 1 and level is 1), - each input should have the same LoD information and the LoD + each input should have the same LoD information and the LoD information of the output keeps the same as the input. LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4) @@ -80,7 +80,7 @@ or a nested sequence (LoD tensor with level number is 2) as its input. LoD(Out) = {{0,2,4}, {0,1,2,3,4}}; Dims(Out) = (4,7,4) - Case2: - If the axis is 0(here, leve is 0), the inputs are concatenated along + If the axis is 0(here, leve is 0), the inputs are concatenated along time steps, the LoD information of the output need to re-compute. The LoD information of level-1 should be same. @@ -124,8 +124,9 @@ class SequenceConcatGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(sequence_concat, ops::SequenceConcatOp, ops::SequenceConcatOpMaker, - sequence_concat_grad, ops::SequenceConcatGradOp); +REGISTER_OP_EX(sequence_concat, ops::SequenceConcatOp, + ops::SequenceConcatOpMaker, sequence_concat_grad, + ops::SequenceConcatGradOp, false); REGISTER_OP_CPU_KERNEL( sequence_concat, ops::SequenceConcatOpKernel); diff --git a/paddle/operators/sequence_softmax_op.cc b/paddle/operators/sequence_softmax_op.cc index fe1832a36fa7fd91cf1fccddfc09918e4698e09c..b74766f012e333cc2a317e6efe17c5b60238924a 100644 --- a/paddle/operators/sequence_softmax_op.cc +++ b/paddle/operators/sequence_softmax_op.cc @@ -50,10 +50,14 @@ input Tensor can be either [N, 1] or [N], where N is the sum of the length of all sequences. The algorithm works as follows: + for i-th sequence in a mini-batch: - $$Out(X[lod[i]:lod[i+1]], :) = - \frac{\exp(X[lod[i]:lod[i+1], :])} - {\sum(\exp(X[lod[i]:lod[i+1], :]))}$$ + +$$ +Out(X[lod[i]:lod[i+1]], :) = \ +\frac{\exp(X[lod[i]:lod[i+1], :])} \ +{\sum(\exp(X[lod[i]:lod[i+1], :]))} +$$ For example, for a mini-batch of 3 sequences with variable-length, each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7], diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc index 92dbe126bc084a4e1ead48f2e2334447480362bb..48194a547bbea5ddda7c5f3e2421431d1d81042d 100644 --- a/paddle/operators/shrink_rnn_memory_op.cc +++ b/paddle/operators/shrink_rnn_memory_op.cc @@ -136,14 +136,14 @@ class ShrinkRNNGradOpMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *op = new framework::OpDesc(); op->SetType("shrink_rnn_memory_grad"); op->SetInput("X", Input("X")); op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), InputGrad("X")); op->SetAttrMap(Attrs()); - return std::unique_ptr(op); + return std::unique_ptr(op); } }; diff --git a/paddle/operators/sign_op.cc b/paddle/operators/sign_op.cc index b2bfce71a6c3b7153aa10521cbb55cedbc0d7940..b2459fb2f53939b3131af1540044ce361b87d08a 100644 --- a/paddle/operators/sign_op.cc +++ b/paddle/operators/sign_op.cc @@ -50,13 +50,13 @@ class SignGradMaker : public framework::SingleGradOpDescMaker { public: using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("scale"); grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttr("scale", 0.0f); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc index bca3ff1562d881be1d31e56270d22252e5e491e1..d9911a6901447d8900c3881a60c7a0852dcbf429 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/operators/softmax_with_cross_entropy_op.cc @@ -173,8 +173,8 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto* grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto* grad_op = new framework::OpDesc(); grad_op->SetType("softmax_with_cross_entropy_grad"); grad_op->SetInput("Label", Input("Label")); grad_op->SetInput("Softmax", Output("Softmax")); @@ -183,7 +183,7 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker { grad_op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits")); grad_op->SetAttrMap(Attrs()); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/split_lod_tensor_op.cc b/paddle/operators/split_lod_tensor_op.cc index c83b0cbad7f7e9115e7ce47a1823987830c0c086..3542d8624fec49f75314f046434cbcadf307497e 100644 --- a/paddle/operators/split_lod_tensor_op.cc +++ b/paddle/operators/split_lod_tensor_op.cc @@ -163,8 +163,8 @@ class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("merge_lod_tensor"); grad_op->SetInput("InTrue", OutputGrad("OutTrue")); grad_op->SetInput("InFalse", OutputGrad("OutFalse")); @@ -172,7 +172,7 @@ class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker { grad_op->SetInput("X", Input("X")); grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttrMap(Attrs()); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/split_op.cc b/paddle/operators/split_op.cc index e8c5fffcd2cdf9f1b31a6c343e4820f155d595f7..4dfae043cb1091c9491d89aec4d1415d4741e013 100644 --- a/paddle/operators/split_op.cc +++ b/paddle/operators/split_op.cc @@ -108,13 +108,13 @@ class SplitGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto op = new framework::OpDesc(); op->SetType("concat"); op->SetInput("X", OutputGrad("Out")); op->SetOutput("Out", InputGrad("X")); op->SetAttrMap(Attrs()); - return std::unique_ptr(op); + return std::unique_ptr(op); } }; diff --git a/paddle/operators/strided_memcpy_test.cc b/paddle/operators/strided_memcpy_test.cc index 68f064eaee5851333ddf9767b7138da83a28503d..230cc1ab0bbd5ac57eb7494795e3fbcdf02c3cc8 100644 --- a/paddle/operators/strided_memcpy_test.cc +++ b/paddle/operators/strided_memcpy_test.cc @@ -85,8 +85,10 @@ TEST(StridedMemcpy, GPUCrop) { platform::GPUPlace gpu0(0); platform::CPUPlace cpu; + platform::CUDADeviceContext ctx(gpu0); + int* gpu_src = reinterpret_cast(memory::Alloc(gpu0, sizeof(src))); - memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src)); + memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream()); framework::DDim src_stride({5, 1}); @@ -96,7 +98,6 @@ TEST(StridedMemcpy, GPUCrop) { framework::DDim dst_dim({2, 2}); framework::DDim dst_stride({2, 1}); - platform::CUDADeviceContext ctx(gpu0); StridedMemcpy(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst); @@ -122,9 +123,10 @@ TEST(StridedMemcpy, GPUConcat) { platform::GPUPlace gpu0(0); platform::CPUPlace cpu; + platform::CUDADeviceContext ctx(gpu0); int* gpu_src = reinterpret_cast(memory::Alloc(gpu0, sizeof(src))); - memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src)); + memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream()); int dst[8]; int* gpu_dst = reinterpret_cast(memory::Alloc(gpu0, sizeof(dst))); @@ -132,7 +134,6 @@ TEST(StridedMemcpy, GPUConcat) { framework::DDim src_stride({2, 1}); framework::DDim dst_dim({2, 2}); framework::DDim dst_stride({4, 1}); - platform::CUDADeviceContext ctx(gpu0); StridedMemcpy(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst); StridedMemcpy(ctx, gpu_src, src_stride, dst_dim, dst_stride, diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc index c56fc1f10b58c1678743aff277f28064dff0f28d..891839bf9cd991e15d96b86e24ea61b09e35a7c7 100644 --- a/paddle/operators/sum_op.cc +++ b/paddle/operators/sum_op.cc @@ -106,8 +106,8 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Sum operator. -This operators sums the input tensors. All the inputs can carry the -LoD (Level of Details) information. However, the output only shares +This operators sums the input tensors. All the inputs can carry the +LoD (Level of Details) information. However, the output only shares the LoD information with the first input. )DOC"); } @@ -115,8 +115,8 @@ the LoD information with the first input. class SumOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDescBind& op_desc, - framework::BlockDescBind* block) const override { + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { auto& inputs = op_desc.Input("X"); auto var_type = framework::proto::VarDesc::SELECTED_ROWS; @@ -169,20 +169,19 @@ class SumGradMaker : public framework::GradOpDescMakerBase { public: using framework::GradOpDescMakerBase::GradOpDescMakerBase; - std::vector> operator()() - const override { - auto x_grads = InputGrad("X"); - std::vector> grad_ops; + std::vector> operator()() const override { + auto x_grads = InputGrad("X", false); + std::vector> grad_ops; grad_ops.reserve(x_grads.size()); auto og = OutputGrad("Out"); std::transform(x_grads.begin(), x_grads.end(), std::back_inserter(grad_ops), [&og](const std::string& x_grad) { - auto* grad_op = new framework::OpDescBind(); + auto* grad_op = new framework::OpDesc(); grad_op->SetType("scale"); grad_op->SetInput("X", og); grad_op->SetOutput("Out", {x_grad}); grad_op->SetAttr("scale", 1.0f); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); }); return grad_ops; } diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc index 337b7555c7f077bdebed46ef7b5d8b6fa0ce0363..90cbc19d1b1bab2e639e3d6d5b28cd13b30542f6 100644 --- a/paddle/operators/tensor_array_read_write_op.cc +++ b/paddle/operators/tensor_array_read_write_op.cc @@ -96,8 +96,8 @@ class WriteToArrayInferShape : public framework::InferShapeBase { class WriteToArrayInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDescBind &op_desc, - framework::BlockDescBind *block) const override { + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { auto x_name = op_desc.Input("X")[0]; auto out_name = op_desc.Output("Out")[0]; VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY"; @@ -175,14 +175,14 @@ class WriteToArrayGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("read_from_array"); grad_op->SetInput("I", Input("I")); grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttrMap(Attrs()); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; @@ -191,14 +191,14 @@ class ReadFromArrayGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("write_to_array"); grad_op->SetInput("I", Input("I")); grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttrMap(Attrs()); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc index 56a01e56d75a125a86e3d3a3702bf0bb64552b65..324c8b98c4811328b2a89eadc3e3420c080bd7d1 100644 --- a/paddle/operators/while_op.cc +++ b/paddle/operators/while_op.cc @@ -46,7 +46,7 @@ class WhileOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1})); framework::Executor executor(dev_ctx); - auto *block = Attr(kStepBlock); + auto *block = Attr(kStepBlock); auto *program = block->Program(); auto step_scopes = @@ -82,8 +82,8 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "(StepScopeVar) A vector of local scope, which size equals the " "step number of While Op. The i'th scope storages temporary " "variables generated in the i'th step."); - AddAttr(kStepBlock, - "The step block inside WhileOp"); + AddAttr(kStepBlock, + "The step block inside WhileOp"); AddComment(R"DOC( )DOC"); } @@ -99,7 +99,7 @@ class WhileGradOp : public framework::OperatorBase { void Run(const framework::Scope &scope, const platform::DeviceContext &dev_ctx) const override { framework::Executor executor(dev_ctx); - auto *block = Attr(kStepBlock); + auto *block = Attr(kStepBlock); auto *program = block->Program(); auto *step_scopes = @@ -209,8 +209,8 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *grad = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad = new framework::OpDesc(); grad->SetType("while_grad"); grad->SetInput(kParameters, Input(kParameters)); @@ -279,14 +279,14 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed. grad->SetAttr("original_output_grad", extra_inputs_list); - return std::unique_ptr(grad); + return std::unique_ptr(grad); } }; class WhileGradOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDescBind &op_desc, - framework::BlockDescBind *block) const override { + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { auto p_names = op_desc.Input(kParameters); auto pg_names = op_desc.Output(framework::GradVarName(kParameters)); diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index 8cdc5f43403b0c54d3f1f01a3e97405fd5b2f434..dacee74fff369586c7ca2ff62cfe6aeebd8f39c7 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -19,7 +19,7 @@ CPUDeviceContext::CPUDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); } -CPUDeviceContext::CPUDeviceContext(CPUPlace place) { +CPUDeviceContext::CPUDeviceContext(CPUPlace place) : place_(place) { eigen_device_.reset(new Eigen::DefaultDevice()); } @@ -27,7 +27,7 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const { return eigen_device_.get(); } -Place CPUDeviceContext::GetPlace() const { return CPUPlace(); } +Place CPUDeviceContext::GetPlace() const { return place_; } #ifdef PADDLE_WITH_CUDA diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 56813a1d5b3c2a7f4ff7b4eddc6fa47ed861700c..6cc0508522a97f3097b30e3340e7413a7093714a 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -45,6 +45,7 @@ class CPUDeviceContext : public DeviceContext { Place GetPlace() const override; private: + CPUPlace place_; std::unique_ptr eigen_device_; }; diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index 541eca5f39c2e6a4b464aec79fd8a920ab4c7732..7037551d7544d6fea54e2f4bf887309b7dc5a52e 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -97,17 +97,6 @@ void GpuMemcpyAsync(void *dst, const void *src, size_t count, "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync"); } -void GpuMemcpySync(void *dst, const void *src, size_t count, - enum cudaMemcpyKind kind) { - PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind), - "cudaMemcpy failed in paddle::platform::GpuMemcpySync"); - // note: cudaMemcpy may actually be asynchronous with respect to the caller, - // block on stream 0 to make sure the copy has completed - PADDLE_ENFORCE( - cudaStreamSynchronize(0), - "cudaStreamSynchronize failed in paddle::platform::GpuMemcpySync"); -} - void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, size_t count, cudaStream_t stream) { PADDLE_ENFORCE( diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h index db961f3838af73855312d4cf6a80e2355306e08f..d05131fa4196057d19a8ae57bf4574c666e409cf 100644 --- a/paddle/platform/gpu_info.h +++ b/paddle/platform/gpu_info.h @@ -52,10 +52,6 @@ size_t GpuMaxChunkSize(); void GpuMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream); -//! Copy memory from address src to dst synchronously. -void GpuMemcpySync(void *dst, const void *src, size_t count, - enum cudaMemcpyKind kind); - //! Copy memory from one device to another device. void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, size_t count, cudaStream_t stream); diff --git a/paddle/platform/transform_test.cu b/paddle/platform/transform_test.cu index d36eac8379ebedb284b36012a46186cd3ac43b91..464096111e4a85b8d64d9223bfb85a1d1d42fad4 100644 --- a/paddle/platform/transform_test.cu +++ b/paddle/platform/transform_test.cu @@ -53,11 +53,11 @@ TEST(Transform, GPUUnary) { CUDADeviceContext ctx(gpu0); float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; float* gpu_buf = static_cast(Alloc(gpu0, sizeof(float) * 4)); - Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf)); + Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream()); Transform trans; trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale(10)); ctx.Wait(); - Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf)); + Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream()); Free(gpu0, gpu_buf); for (int i = 0; i < 4; ++i) { ASSERT_NEAR(cpu_buf[i], static_cast(i + 1), 1e-5); @@ -83,11 +83,11 @@ TEST(Transform, GPUBinary) { GPUPlace gpu0(0); CUDADeviceContext ctx(gpu0); int* gpu_buf = static_cast(Alloc(gpu0, sizeof(buf))); - Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf)); + Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream()); Transform trans; trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply()); ctx.Wait(); - Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf)); + Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream()); Free(gpu0, gpu_buf); for (int i = 0; i < 4; ++i) { ASSERT_EQ((i + 1) * (i + 1), buf[i]); diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index de26184d0102574164a0e06da5fd58ef052d2559..f105370f226e2cceaac685f280d55134d4291028 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -108,21 +108,21 @@ static py::bytes SerializeMessage(T &self) { // Bind Methods void BindProgramDesc(py::module &m) { - py::class_(m, "ProgramDesc", "") + py::class_(m, "ProgramDesc", "") .def(py::init<>()) .def("__init__", - [](ProgramDescBind &self, const ProgramDescBind &other) { - new (&self) ProgramDescBind(other); + [](ProgramDesc &self, const ProgramDesc &other) { + new (&self) ProgramDesc(other); }) .def("__init__", - [](ProgramDescBind &self, const py::bytes &binary_str) { + [](ProgramDesc &self, const py::bytes &binary_str) { std::string str(binary_str); - new (&self) ProgramDescBind(str); + new (&self) ProgramDesc(str); }) - .def("append_block", &ProgramDescBind::AppendBlock, + .def("append_block", &ProgramDesc::AppendBlock, py::return_value_policy::reference) .def("append_backward", - [](ProgramDescBind &program_desc, const VarDescBind &target, + [](ProgramDesc &program_desc, const VarDesc &target, const std::unordered_set &no_grad_vars) { ParamGradInfoMap param_grad_map = AppendBackward(program_desc, target, no_grad_vars); @@ -138,12 +138,12 @@ void BindProgramDesc(py::module &m) { } return retv; }) - .def("block", &ProgramDescBind::MutableBlock, + .def("block", &ProgramDesc::MutableBlock, py::return_value_policy::reference) - .def("num_blocks", &ProgramDescBind::Size) - .def("serialize_to_string", SerializeMessage) + .def("num_blocks", &ProgramDesc::Size) + .def("serialize_to_string", SerializeMessage) .def("parse_from_string", - [](ProgramDescBind &program_desc, const std::string &data) { + [](ProgramDesc &program_desc, const std::string &data) { proto::ProgramDesc *desc = program_desc.Proto(); PADDLE_ENFORCE(desc->ParseFromString(data), "Fail to parse ProgramDesc from string. This could " @@ -152,35 +152,35 @@ void BindProgramDesc(py::module &m) { } void BindBlockDesc(py::module &m) { - py::class_(m, "BlockDesc", "") - .def_property_readonly("id", &BlockDescBind::ID) - .def_property_readonly("parent", &BlockDescBind::Parent) - .def("append_op", &BlockDescBind::AppendOp, + py::class_(m, "BlockDesc", "") + .def_property_readonly("id", &BlockDesc::ID) + .def_property_readonly("parent", &BlockDesc::Parent) + .def("append_op", &BlockDesc::AppendOp, py::return_value_policy::reference) - .def("prepend_op", &BlockDescBind::PrependOp, + .def("prepend_op", &BlockDesc::PrependOp, py::return_value_policy::reference) + .def("remove_op", &BlockDesc::RemoveOp) .def("var", - [](BlockDescBind &self, py::bytes byte_name) { + [](BlockDesc &self, py::bytes byte_name) { std::string name = byte_name; return self.Var(name); }, py::return_value_policy::reference) .def("has_var", - [](BlockDescBind &self, py::bytes byte_name) { + [](BlockDesc &self, py::bytes byte_name) { std::string name = byte_name; return self.HasVar(name); }) .def("find_var", - [](BlockDescBind &self, py::bytes byte_name) { + [](BlockDesc &self, py::bytes byte_name) { std::string name = byte_name; return self.FindVar(name); }, py::return_value_policy::reference) - .def("all_vars", &BlockDescBind::AllVars, - py::return_value_policy::reference) - .def("op_size", &BlockDescBind::OpSize) - .def("op", &BlockDescBind::Op, py::return_value_policy::reference) - .def("serialize_to_string", SerializeMessage); + .def("all_vars", &BlockDesc::AllVars, py::return_value_policy::reference) + .def("op_size", &BlockDesc::OpSize) + .def("op", &BlockDesc::Op, py::return_value_policy::reference) + .def("serialize_to_string", SerializeMessage); } void BindVarDsec(py::module &m) { @@ -193,25 +193,25 @@ void BindVarDsec(py::module &m) { .value("FP32", proto::DataType::FP32) .value("FP64", proto::DataType::FP64); - py::class_ var_desc(m, "VarDesc", ""); + py::class_ var_desc(m, "VarDesc", ""); var_desc .def("name", - [](const VarDescBind &self) { + [](const VarDesc &self) { py::bytes name = self.Name(); return name; }, py::return_value_policy::reference) - .def("set_shape", &VarDescBind::SetShape) - .def("set_dtype", &VarDescBind::SetDataType) - .def("shape", &VarDescBind::Shape, py::return_value_policy::reference) - .def("dtype", &VarDescBind::GetDataType) - .def("lod_level", &VarDescBind::GetLodLevel) - .def("set_lod_level", &VarDescBind::SetLoDLevel) - .def("type", &VarDescBind::GetType) - .def("set_type", &VarDescBind::SetType) - .def("serialize_to_string", SerializeMessage) - .def("persistable", &VarDescBind::Persistable) - .def("set_persistable", &VarDescBind::SetPersistable); + .def("set_shape", &VarDesc::SetShape) + .def("set_dtype", &VarDesc::SetDataType) + .def("shape", &VarDesc::Shape, py::return_value_policy::reference) + .def("dtype", &VarDesc::GetDataType) + .def("lod_level", &VarDesc::GetLodLevel) + .def("set_lod_level", &VarDesc::SetLoDLevel) + .def("type", &VarDesc::GetType) + .def("set_type", &VarDesc::SetType) + .def("serialize_to_string", SerializeMessage) + .def("persistable", &VarDesc::Persistable) + .def("set_persistable", &VarDesc::SetPersistable); py::enum_(var_desc, "VarType", "") .value("LOD_TENSOR", proto::VarDesc::LOD_TENSOR) @@ -235,26 +235,32 @@ void BindOpDesc(py::module &m) { .value("BOOLS", proto::AttrType::BOOLEANS) .value("BLOCK", proto::AttrType::BLOCK); - py::class_ op_desc(m, "OpDesc", ""); - op_desc.def("type", &OpDescBind::Type) - .def("set_type", &OpDescBind::SetType) - .def("input", &OpDescBind::Input) - .def("input_names", &OpDescBind::InputNames) - .def("set_input", &OpDescBind::SetInput) - .def("output", &OpDescBind::Output) - .def("output_names", &OpDescBind::OutputNames) - .def("set_output", &OpDescBind::SetOutput) - .def("has_attr", &OpDescBind::HasAttr) - .def("attr_type", &OpDescBind::GetAttrType) - .def("attr_names", &OpDescBind::AttrNames) - .def("set_attr", &OpDescBind::SetAttr) - .def("attr", &OpDescBind::GetAttr) - .def("set_block_attr", &OpDescBind::SetBlockAttr) - .def("block_attr", &OpDescBind::GetBlockAttr) - .def("check_attrs", &OpDescBind::CheckAttrs) - .def("infer_shape", &OpDescBind::InferShape) - .def("infer_var_type", &OpDescBind::InferVarType) - .def("serialize_to_string", SerializeMessage); + py::class_ op_desc(m, "OpDesc", ""); + op_desc.def("type", &OpDesc::Type) + .def("set_type", &OpDesc::SetType) + .def("input", &OpDesc::Input) + .def("input_names", &OpDesc::InputNames) + .def("set_input", &OpDesc::SetInput) + .def("output", &OpDesc::Output) + .def("output_names", &OpDesc::OutputNames) + .def("set_output", &OpDesc::SetOutput) + .def("has_attr", &OpDesc::HasAttr) + .def("attr_type", &OpDesc::GetAttrType) + .def("attr_names", &OpDesc::AttrNames) + .def("set_attr", &OpDesc::SetAttr) + .def("attr", &OpDesc::GetAttr) + .def("set_block_attr", &OpDesc::SetBlockAttr) + .def("set_serialized_attr", + [](OpDesc &self, const std::string &name, + const py::bytes &seriralized) { + std::string ser(seriralized); + self.SetAttr(name, ser); + }) + .def("block_attr", &OpDesc::GetBlockAttr) + .def("check_attrs", &OpDesc::CheckAttrs) + .def("infer_shape", &OpDesc::InferShape) + .def("infer_var_type", &OpDesc::InferVarType) + .def("serialize_to_string", SerializeMessage); } } // namespace pybind diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 31f802d4d2489dda7fbe1d773abfc14433db3d45..2d7fe251416dce629dd0a2318aaa020ec9668d9b 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -266,36 +266,36 @@ All parameter, weight, gradient are variables in Paddle. return ret_values; }); m.def("get_grad_op_descs", - [](const OpDescBind &op_desc, + [](const OpDesc &op_desc, const std::unordered_set &no_grad_set, std::unordered_map &grad_to_var, - const std::vector &grad_sub_block) { - std::vector> grad_op_descs = + const std::vector &grad_sub_block) { + std::vector> grad_op_descs = framework::OpInfoMap::Instance() .Get(op_desc.Type()) .GradOpMaker()(op_desc, no_grad_set, &grad_to_var, grad_sub_block); - std::vector grad_op_desc_ptrs(grad_op_descs.size()); + std::vector grad_op_desc_ptrs(grad_op_descs.size()); std::transform( grad_op_descs.begin(), grad_op_descs.end(), grad_op_desc_ptrs.begin(), - [](std::unique_ptr &p) { return p.release(); }); + [](std::unique_ptr &p) { return p.release(); }); return grad_op_desc_ptrs; }); - m.def("prune", [](const ProgramDescBind &origin, + m.def("prune", [](const ProgramDesc &origin, const std::vector> &targets) { - ProgramDescBind prog_with_targets(origin); + ProgramDesc prog_with_targets(origin); for (const auto &t : targets) { prog_with_targets.MutableBlock(t[0])->Op(t[1])->MarkAsTarget(); } proto::ProgramDesc pruned_desc; Prune(*prog_with_targets.Proto(), &pruned_desc); - return new ProgramDescBind(pruned_desc); + return new ProgramDesc(pruned_desc); }); - m.def("inference_optimize", [](ProgramDescBind &origin) { + m.def("inference_optimize", [](ProgramDesc &origin) { proto::ProgramDesc pruned_desc; InferenceOptimize(*(origin.Proto()), &pruned_desc); - return new ProgramDescBind(pruned_desc); + return new ProgramDesc(pruned_desc); }); m.def_submodule( "var_names", diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h index 41fa658502d341fe9653a3e99b58498fcaeada47..268a0f2fa386adf99f7ea1589ff1f301f943a68b 100644 --- a/paddle/pybind/tensor_py.h +++ b/paddle/pybind/tensor_py.h @@ -14,6 +14,7 @@ #pragma once #include +#include "paddle/framework/executor.h" #include "paddle/framework/tensor.h" #include "paddle/memory/memcpy.h" #include "pybind11/numpy.h" @@ -61,11 +62,15 @@ struct CastToPyBufferImpl { auto *src_ptr = static_cast(tensor.data()); auto *dst_ptr = static_cast(dst_tensor.mutable_data( tensor.dims(), platform::CPUPlace())); - // TODO(qijun): Here we use default CUDA stream to set GPU Tensor to - // a Python numpy array. It's better to manage CDUA stream unifiedly. - paddle::platform::GpuMemcpySync(dst_ptr, src_ptr, - sizeof(CUR_TYPE) * tensor.numel(), - cudaMemcpyDeviceToHost); + + framework::DeviceContextPool &pool = + framework::DeviceContextPool::Get(); + auto dev_ctx = static_cast( + pool.Borrow(tensor.place())); + + paddle::platform::GpuMemcpyAsync( + dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(), + cudaMemcpyDeviceToHost, dev_ctx->stream()); #else PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); #endif @@ -132,10 +137,12 @@ void PyCUDATensorSetFromArray( self.Resize(framework::make_ddim(dims)); auto *dst = self.mutable_data(place); - // TODO(qijun): Here we use default CUDA stream to set a Python numpy - // array to a GPU Tensor. It's better to manage CDUA stream unifiedly. - paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(), - cudaMemcpyHostToDevice); + + framework::DeviceContextPool &pool = framework::DeviceContextPool::Get(); + auto dev_ctx = + static_cast(pool.Borrow(place)); + paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(), + cudaMemcpyHostToDevice, dev_ctx->stream()); } #endif diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py index 9b3792ee9e3e4c6f319b3e2b13c4aa3a05cce8be..471255ef50a3be4739a89efbd978cdb4304d992d 100644 --- a/python/paddle/v2/fluid/__init__.py +++ b/python/paddle/v2/fluid/__init__.py @@ -16,13 +16,14 @@ import regularizer from param_attr import ParamAttr from data_feeder import DataFeeder from core import LoDTensor, CPUPlace, GPUPlace +from distribute_transpiler import DistributeTranspiler import clip Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + [ 'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward', 'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr' - 'DataFeeder', 'clip' + 'DataFeeder', 'clip', 'DistributeTranspiler' ] diff --git a/python/paddle/v2/fluid/distribute_transpiler.py b/python/paddle/v2/fluid/distribute_transpiler.py new file mode 100644 index 0000000000000000000000000000000000000000..50364c64bec46aabc7be4f4b4370a3ad5b0eb07c --- /dev/null +++ b/python/paddle/v2/fluid/distribute_transpiler.py @@ -0,0 +1,238 @@ +import framework +from framework import Program, default_main_program, Parameter, Variable +import optimizer +from layer_helper import LayerHelper + + +def hash_name_to_server(params_grads, pserver_endpoints): + """ + :param param_grads: + :return: a map of pserver endpoint -> + params -> [param list] + grads -> [grad list] + """ + + def _hash_param(param_name, total): + return hash(param_name) % total + + param_grad_map = dict() + for param, grad in params_grads: + if param.trainable is True and grad is not None: + server_id = _hash_param(param.name, len(pserver_endpoints)) + server_for_param = pserver_endpoints[server_id] + if not param_grad_map.has_key(server_for_param): + param_grad_map[server_for_param] = {"params": [], "grads": []} + param_grad_map[server_for_param]["params"].append(param) + param_grad_map[server_for_param]["grads"].append(grad) + + return param_grad_map + + +def round_robin(params_grads, pserver_endpoints): + assert (len(params_grads) > len(pserver_endpoints)) + + param_grad_map = dict() + pserver_idx = 0 + for param, grad in params_grads: + if param.trainable is True: + server_for_param = pserver_endpoints[pserver_idx] + if not param_grad_map.has_key(server_for_param): + param_grad_map[server_for_param] = {"params": [], "grads": []} + + param_grad_map[server_for_param]["params"].append(param) + param_grad_map[server_for_param]["grads"].append(grad) + + pserver_idx += 1 + if pserver_idx >= len(pserver_endpoints): + pserver_idx = 0 + return param_grad_map + + +class DistributeTranspiler: + def transpile(self, + optimize_ops, + params_grads, + program=None, + pservers="127.0.0.1:6174", + trainers=1, + split_method=round_robin): + """ + Transpile the program to a distributed data-parallelism programs. + + The main_program will be transform to use a remote parameter server + to do parameter optimization. And the optimization graph will be put + in to a parameter server program. + + Use different methods to split trainable varialbles to different + parameter servers. + + Example to run: + + exe = fluid.Executor(place) + t = fluid.DistributeTranspiler() + t.transpile(optimize_ops, params_grads, pservers="127.0.0.1:6174", trainers=1) + + pserver_endpoint = os.getenv("PSERVER") + if pserver_endpoint: + pserver_prog = t.get_pserver_program(pserver_endpoint, optimize_ops) + exe.run(fluid.default_startup_program()) + exe.run(pserver_prog) + else: + feeder = fluid.DataFeeder(feed_list=[images, label], place=place) + exe.run(fluid.default_startup_program()) + + for pass_id in range(PASS_NUM): + ... + + :param optimize_ops: op list of optimization, should be the + return value of Optimizer.minimize + :type optimize_ops: list + :param program: program to optimize, default default_main_program + :param pservers: parameter server endpoints like "m1:6174,m2:6174" + :type pservers: string + + :return: return a list of programs + """ + if program is None: + program = default_main_program() + self.trainers = trainers + self._optimize_distributed( + optimize_ops, + program, + params_grads, + pservers=pservers, + trainers=trainers, + split_method=split_method) + + def _clone_param(self, block, v): + assert isinstance(v, Parameter) + new_p = Parameter( + block=block, + shape=v.shape, + dtype=v.dtype, + type=v.type, + lod_level=v.lod_level, + stop_gradient=v.stop_gradient, + trainable=v.trainable, + optimize_attr=v.optimize_attr, + regularizer=v.regularizer, + name=v.name) + block.vars[new_p.name] = new_p + + def _clone_var(self, block, var): + assert isinstance(var, Variable) + return block.create_var( + name=var.name, + shape=var.shape, + dtype=var.dtype, + type=var.type, + lod_level=var.lod_level, + persistable=var.persistable) + + def _optimize_distributed(self, optimize_ops, program, params_and_grads, + **kwargs): + if kwargs.has_key("split_method"): + split_method = kwargs["split_method"] + else: + split_method = round_robin + + assert (callable(split_method)) + pserver_endpoints = kwargs["pservers"].split(",") + self.param_grad_map = split_method(params_and_grads, pserver_endpoints) + + send_op_ordered_inputs = [] + epmap = [] + for ep, v in self.param_grad_map.iteritems(): + send_op_ordered_inputs.extend(v["grads"]) + for i in v["grads"]: + epmap.append(ep) + send_op = program.global_block().append_op( + type="send", + inputs={"X": send_op_ordered_inputs + }, # inputs is a list of tensors to be send + outputs={}, + attrs={"endpoints": pserver_endpoints, + "epmap": epmap}) + + def get_trainer_program(optimize_ops, program): + # remove optimize ops and add a send op to main_program + program.global_block().delete_ops(optimize_ops) + + def _create_var_for_trainers(self, block, var, trainers): + var_list = [] + for i in xrange(trainers): + var_each = block.create_var( + name="%s.trainer_%d" % (var.name, i), + psersistable=var.persistable, + dtype=var.dtype, + shape=var.shape) + var_list.append(var_each) + return var_list + + def get_pserver_program(self, endpoint, optimize_ops): + pserver_program = Program() + for v in self.param_grad_map[endpoint]["params"]: + self._clone_param(pserver_program.global_block(), v) + + optimize_sub_program = Program() + grad_var_names = [ + var.name for var in self.param_grad_map[endpoint]["grads"] + ] + for opt_op in optimize_ops: + for _, var in opt_op.inputs.iteritems(): + # NOTE: append operators to merge gradients from multiple + # trainers. If trainers == 1, this is not needed. + if self.trainers > 1 and var.name in grad_var_names: + vars2merge = self._create_var_for_trainers( + optimize_sub_program.global_block(), var, self.trainers) + merged_var = optimize_sub_program.global_block().create_var( + name=var.name, + persistable=var.persistable, + dtype=var.dtype, + shape=var.shape) + optimize_sub_program.global_block().append_op( + type="sum", + inputs={"X": vars2merge}, + outputs={"Out": merged_var}) + optimize_sub_program.global_block().append_op( + type="scale", + inputs={"X": merged_var}, + outputs={"Out": merged_var}, + attrs={"scale": 1.0 / float(self.trainers)}) + else: + optimize_sub_program.global_block().create_var( + name=var.name, + persistable=var.persistable, + dtype=var.dtype, + shape=var.shape) + + if opt_op.inputs.has_key("Grad"): + if opt_op.inputs["Grad"].name in grad_var_names: + print "appending ", opt_op.type, opt_op.inputs + optimize_sub_program.global_block().append_op( + type=opt_op.type, + inputs=opt_op.inputs, + outputs=opt_op.outputs, + attrs=opt_op.attrs) + else: + optimize_sub_program.global_block().append_op( + type=opt_op.type, + inputs=opt_op.inputs, + outputs=opt_op.outputs, + attrs=opt_op.attrs) + pserver_program.global_block().append_op( + type="recv", + inputs={"RX": + self.param_grad_map[endpoint]["grads"]}, # grads to recv + outputs={}, + attrs={ + "OptimizeProgram": optimize_sub_program.desc, + "endpoint": endpoint, + "ParamList": + [p.name for p in self.param_grad_map[endpoint]["params"]], + "GradList": + [p.name for p in self.param_grad_map[endpoint]["grads"]], + "Trainers": self.trainers + }) + pserver_program.sync_with_cpp() + return pserver_program diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py index 9a99b045dc70a9e4662a6f4da141183ffc8f1846..4b4a0820abb9a85f3e9936190c835c2f186107b3 100644 --- a/python/paddle/v2/fluid/executor.py +++ b/python/paddle/v2/fluid/executor.py @@ -1,6 +1,6 @@ import numpy as np from . import core -from framework import Program, default_main_program +from framework import Program, default_main_program, Parameter, Variable __all__ = ['Executor', 'g_scope'] @@ -148,7 +148,7 @@ class Executor(object): outputs={'Out': [fetch_var]}, attrs={'col': i}) - self.executor.run(program.desc, scope, 0, True) + self.executor.run(program.desc, scope, 0, True, True) outs = [ core.get_fetch_variable(scope, fetch_var_name, i) for i in xrange(len(fetch_list)) diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index d1b12a8f097ed674b6b6384fe8ba5db950a94da5..920a49b69c77ccb98214ab76b4ee2aa125e01f02 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -359,6 +359,10 @@ class Operator(object): """ self.block = block self.desc = desc + # for clone a new operator + self.inputs = inputs + self.outputs = outputs + self.attrs = attrs if len(self.desc.type()) != 0: return if type is None: @@ -430,13 +434,18 @@ class Operator(object): continue if isinstance(attrs[attr_name], Block): self.desc.set_block_attr(attr_name, attrs[attr_name].desc) + elif isinstance(attrs[attr_name], core.BlockDesc) or \ + isinstance(attrs[attr_name], core.ProgramDesc): + self.desc.set_serialized_attr( + attr_name, attrs[attr_name].serialize_to_string()) else: self.desc.set_attr(attr_name, attrs[attr_name]) self.desc.check_attrs() no_kernel_op_set = { 'feed', 'fetch', 'save', 'load', 'recurrent', - 'rnn_memory_helper_grad', 'conditional_block', 'while' + 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', + 'recv' } if type not in no_kernel_op_set: self.desc.infer_var_type(self.block.desc) @@ -582,6 +591,7 @@ class Block(object): self.vars = dict() # var_name --> var self.ops = collections.deque() # operator list self.program = program + self.removed_vars = dict() def __str__(self): return self.to_string(True) @@ -638,6 +648,16 @@ class Block(object): self.ops.append(op) return op + def delete_ops(self, ops): + # remove from cpp + # FIXME(typhoonzero): remove only the first occuracy. + try: + start = list(self.ops).index(ops[0]) + end = list(self.ops).index(ops[-1]) + except Exception, e: + raise e + self.desc.remove_op(start, end) + def prepend_op(self, *args, **kwargs): op_desc = self.desc.prepend_op() op = Operator(self, op_desc, *args, **kwargs) diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py index dc6c0e7f518ee47b332a501df803a2364e0cffc0..5b7979f39ff9a661e1f127413e505b6499d01e7b 100644 --- a/python/paddle/v2/fluid/layers/control_flow.py +++ b/python/paddle/v2/fluid/layers/control_flow.py @@ -440,9 +440,25 @@ def topk(input, k): def lod_tensor_to_array(x, table): - """ - This function creates an operator to convert an LOD_Tensor to - an array. + """This function performs the operation that converts an LOD_Tensor to + an array. + + Args: + x (Variable|list): The tensor that needs to be converted to an array. + table (ParamAttr|list): The variable that stores the level of lod + which is ordered by sequence length in + descending order. + + Returns: + Variable: The variable of type array that has been converted from a + tensor. + + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[10]) + table = fluid.layers.lod_rank_table(x, level=0) + array = fluid.layers.lod_tensor_to_array(x, table) """ helper = LayerHelper("lod_tensor_to_array", **locals()) array = helper.create_variable( @@ -458,9 +474,26 @@ def lod_tensor_to_array(x, table): def array_to_lod_tensor(x, table): - """ - This function creates an operator to convert an array to a - LOD_Tensor. + """This function performs the operations that converts an array to + an LOD_Tensor. + + Args: + x (Variable|list): The array that needs to be converted to a tensor. + table (ParamAttr|list): The variable that stores the level of lod + which is ordered by sequence length in + descending order. + + Returns: + Variable: The variable of type tensor that has been converted + from an array. + + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[10]) + table = fluid.layers.lod_rank_table(x, level=0) + array = fluid.layers.lod_tensor_to_array(x, table) + lod_tensor = fluid.layers.array_to_lod_tensor(array, table) """ helper = LayerHelper("array_to_lod_tensor", **locals()) tmp = helper.create_tmp_variable(dtype=x.dtype) @@ -473,10 +506,24 @@ def array_to_lod_tensor(x, table): def increment(x, value=1.0, in_place=True): - """ - This function creates an operator to increment each value in the input - `x` by an amount: `value` as mentioned in the input parameter. This - operation is performed in-place by default. + """This function performs an operation that increments each value in the + input :math:`x` by an amount: :math:`value` as mentioned in the input + parameter. This operation is performed in-place by default. + + Args: + x (Variable|list): The tensor that has the input values. + value (float): The amount by which the values should be incremented. + in_place (bool): If the increment should be performed in-place. + + Returns: + Variable: The tensor variable storing the transformation of + element-wise increment of each value in the input. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name='data', shape=[32, 32], dtype='float32') + data = fluid.layers.increment(x=data, value=3.0, in_place=True) """ helper = LayerHelper("increment", **locals()) if not in_place: @@ -492,9 +539,24 @@ def increment(x, value=1.0, in_place=True): def array_write(x, i, array=None): - """ - This function creates an operator to write the data out as a + """This function performs the operation to write the data out as an LOD_TENSOR_ARRAY. + + Args: + x (Variable|list): The input tensor from which the data will be read. + i (Variable|list): The subscript index in tensor array, that points the + place from which data will be read. + array (Variable|list): The data can be read into this variable if + this is assigned. + Returns: + Variable: The tensor type variable that has the data written to it. + + Examples: + .. code-block::python + + tmp = fluid.layers.zeros(shape=[10], dtype='int32') + i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) + arr = layers.array_write(tmp, i=i) """ helper = LayerHelper('array_write', **locals()) if array is None: @@ -511,6 +573,21 @@ def array_write(x, i, array=None): def create_array(dtype): + """This function creates an array of type :math:`LOD_TENSOR_ARRAY` using the + LayerHelper. + + Args: + dtype (int|float): The data type of the elements in the array. + + Returns: + Variable: The tensor variable storing the elements of data type. + + Examples: + .. code-block:: python + + data = fluid.layers.create_array(dtype='float32') + + """ helper = LayerHelper("array", **locals()) return helper.create_variable( name="{0}.out".format(helper.name), @@ -519,6 +596,24 @@ def create_array(dtype): def less_than(x, y, cond=None, **ignored): + """ + **Less than** + + This layer returns the truth value of :math:`x < y` elementwise. + + Args: + x(Variable): First operand of *less_than* + y(Variable): Second operand of *less_than* + cond(Variable|None): Optional output variable to store the result of *less_than* + + Returns: + Variable: The tensor variable storing the output of *less_than*. + + Examples: + .. code-block:: python + + less = fluid.layers.less_than(x=label, y=limit) + """ helper = LayerHelper("less_than", **locals()) if cond is None: cond = helper.create_tmp_variable(dtype='bool') @@ -531,9 +626,19 @@ def less_than(x, y, cond=None, **ignored): def array_read(array, i): - """ - This function creates an operator to read the data in as a + """This function performs the operation to read the data in as an LOD_TENSOR_ARRAY. + Args: + array (Variable|list): The input tensor that will be written to an array. + i (Variable|list): The subscript index in tensor array, that points the + place where data will be written to. + Returns: + Variable: The tensor type variable that has the data written to it. + Examples: + .. code-block::python + tmp = fluid.layers.zeros(shape=[10], dtype='int32') + i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) + arr = layers.array_read(tmp, i=i) """ helper = LayerHelper('array_read', **locals()) if not isinstance( @@ -567,9 +672,23 @@ def shrink_memory(x, i, table): def array_length(array): - """ - This function creates an operator to find the length of the + """This function performs the operation to find the length of the input LOD_TENSOR_ARRAY. + + Args: + array (LOD_TENSOR_ARRAY): The input array that will be used + to compute the length. + + Returns: + Variable: The length of the input LoDTensorArray. + + Examples: + .. code-block::python + + tmp = fluid.layers.zeros(shape=[10], dtype='int32') + i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) + arr = fluid.layers.array_write(tmp, i=i) + arr_len = fluid.layers.array_length(arr) """ helper = LayerHelper('array_length', **locals()) tmp = helper.create_tmp_variable(dtype='int64') diff --git a/python/paddle/v2/fluid/layers/io.py b/python/paddle/v2/fluid/layers/io.py index f4c5907f48b46ee5d9bcaba48370e5baf036c615..56c3f7b7b7f174338bb56bc5785423ca634650a6 100644 --- a/python/paddle/v2/fluid/layers/io.py +++ b/python/paddle/v2/fluid/layers/io.py @@ -12,20 +12,9 @@ def data(name, type=core.VarDesc.VarType.LOD_TENSOR, stop_gradient=True): """ - Data Layer. + **Data Layer** - Args: - name: The name/alias of the function - shape: Tuple declaring the shape. - append_batch_size: Whether or not to append the data as a batch. - dtype: The type of data : float32, float_16, int etc - type: The output type. By default it is LOD_TENSOR. - lod_level(int): The LoD Level. 0 means the input data is not a sequence. - main_program: Name of the main program that calls this - startup_program: Name of the startup program - stop_gradient: A boolean that mentions whether gradient should flow. - - This function takes in input and based on whether data has + This function takes in the input and based on whether data has to be returned back as a minibatch, it creates the global variable using the helper functions. The global variables can be accessed by all the following operations and layers in the graph. @@ -33,6 +22,24 @@ def data(name, All the input variables of this function are passed in as local variables to the LayerHelper constructor. + Args: + name(str): The name/alias of the function + shape(list): Tuple declaring the shape. + append_batch_size(bool): Whether or not to append the data as a batch. + dtype(int|float): The type of data : float32, float_16, int etc + type(VarType): The output type. By default it is LOD_TENSOR. + lod_level(int): The LoD Level. 0 means the input data is not a sequence. + main_program(Program): Name of the main program that calls this + startup_program(Program): Name of the startup program + stop_gradient(bool): A boolean that mentions whether gradient should flow. + + Returns: + Variable: The global variable that gives access to the data. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name='x', shape=[784], dtype='float32') """ helper = LayerHelper('data', **locals()) shape = list(shape) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 3536a7e3909a88fc048dbb5ae11aeb9a4b696b4e..2adce99d052639ec7d9063b1c234c623e7cdb9c6 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -13,7 +13,8 @@ __all__ = [ 'crf_decoding', 'cos_sim', 'cross_entropy', 'square_error_cost', 'accuracy', 'chunk_eval', 'sequence_conv', 'conv2d', 'sequence_pool', 'pool2d', 'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'sequence_expand', - 'lstm_unit', 'reduce_sum', 'sequence_first_step', 'sequence_last_step' + 'lstm_unit', 'reduce_sum', 'reduce_mean', 'sequence_first_step', + 'sequence_last_step' ] @@ -25,34 +26,83 @@ def fc(input, act=None, name=None): """ - Fully Connected Layer. + **Fully Connected Layer** + + The fully connected layer can take multiple tensors as its inputs. It + creates a variable (one for each input tensor) called weights for each input + tensor, which represents a fully connected weight matrix from each input + unit to each output unit. The fully connected layer multiplies each input + tensor with its coresponding weight to produce an output Tensor. If + multiple input tensors are given, the results of multiple multiplications + will be sumed up. If bias_attr is not None, a biases variable will be + created and added to the output. Finally, if activation is not None, + it will be applied to the output as well. + + This process can be formulated as follows: + + .. math:: + + Out = Act({\sum_{i=0}^{N-1}W_iX_i + b}) + + In the above equation: + + * :math:`N`: Number of the input. + * :math:`X_i`: The input tensor. + * :math:`W`: The weights created by this layer. + * :math:`b`: The bias parameter created by this layer (if needed). + * :math:`Act`: The activation funtion. + * :math:`Out`: The output tensor. Args: - input: The input tensor to the function - size: The size of the layer - num_flatten_dims: Number of columns in input - param_attr: The parameters/weights to the FC Layer - param_initializer: Initializer used for the weight/parameter. If None, XavierInitializer() is used - bias_attr: The bias parameter for the FC layer - bias_initializer: Initializer used for the bias. If None, then ConstantInitializer() is used - act: Activation to be applied to the output of FC layer - name: Name/alias of the function - main_program: Name of the main program that calls this - startup_program: Name of the startup program - - This function can take in multiple inputs and performs the Fully Connected - function (linear transformation) on top of each of them. - So for input x, the output will be : Wx + b. Where W is the parameter, - b the bias and x is the input. - - The function also applies an activation (non-linearity) on top of the - output, if activation is passed in the input. - - All the input variables of this function are passed in as local variables - to the LayerHelper constructor. + input(Variable|list): The input tensor(s) to the fully connected layer. + size(int): The number of output units in the fully connected layer. + num_flatten_dims(int): The fc layer can accept an input tensor with more + than two dimensions. If this happens, the + multidimensional tensor will first be flattened + into a 2-dimensional matrix. The parameter + `num_flatten_dims` determines how the input tensor + is flattened: the first `num_flatten_dims` + dimensions will be flatten to form the first + dimension of the final matrix (height of the + matrix), and the rest `rank(X) - num_col_dims` + dimensions are flattened to form the second + dimension of the final matrix (width of the matrix). + For example, suppose `X` is a 6-dimensional tensor + with a shape [2, 3, 4, 5, 6], and + `x_num_col_dims` = 3. Then, the flattened matrix + will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. + By default, `x_num_col_dims` is set to 1. + param_attr(ParamAttr|list): The parameter attribute for learnable + parameters/weights of the fully connected + layer. + param_initializer(ParamAttr|list): The initializer used for the + weight/parameter. If set None, + XavierInitializer() will be used. + bias_attr(ParamAttr|list): The parameter attribute for the bias parameter + for this layer. If set None, no bias will be + added to the output units. + bias_initializer(ParamAttr|list): The initializer used for the bias. + If set None, then ConstantInitializer() + will be used. + act(str): Activation to be applied to the output of the fully connected + layer. + name(str): Name/alias of the fully connected layer. + + Returns: + Variable: The output tensor variable. + + Raises: + ValueError: If rank of the input tensor is less than 2. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") + fc = fluid.layers.fc(input=data, size=1000, act="tanh") """ - helper = LayerHelper('fc', **locals()) + + helper = LayerHelper("fc", **locals()) dtype = helper.input_dtype() @@ -72,8 +122,8 @@ def fc(input, "Y": w, }, outputs={"Out": tmp}, - attrs={'x_num_col_dims': num_flatten_dims, - 'y_num_col_dims': 1}) + attrs={"x_num_col_dims": num_flatten_dims, + "y_num_col_dims": 1}) mul_results.append(tmp) # sum @@ -91,25 +141,30 @@ def fc(input, def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'): """ - Embedding Layer. + **Embedding Layer** + + This layer is used to lookup a vector of IDs, provided by *input*, in a lookup table. + The result of this lookup is the embedding of each ID in the *input*. + + All the input variables are passed in as local variables to the LayerHelper + constructor. Args: - param_initializer: - input: The input to the function - size: The size of the layer - is_sparse: A flag that decleares whether the input is sparse - param_attr: Parameters for this layer - dtype: The type of data : float32, float_16, int etc - main_program: Name of the main program that calls this - startup_program: Name of the startup program - - This function can take in the input (which is a vector of IDs) and - performs a lookup in the lookup_table using these IDs, to result into - the embedding of each ID in the input. - - All the input variables of this function are passed in as local variables - to the LayerHelper constructor. + input(Variable): Input to the function + size(tuple|list|None): Shape of the look up table parameter + is_sparse(bool): Boolean flag that specifying whether the input is sparse + param_attr(ParamAttr): Parameters for this layer + dtype(np.dtype|core.DataType|str): The type of data : float32, float_16, int etc + + Returns: + Variable: The tensor variable storing the embeddings of the \ + supplied inputs. + + Examples: + .. code-block:: python + data = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32') + fc = fluid.layers.embedding(input=data, size=16) """ helper = LayerHelper('embedding', **locals()) @@ -793,6 +848,7 @@ def conv2d_transpose(input, filter_size=None, padding=None, stride=None, + dilation=None, param_attr=None): """ The transpose of conv2d layer. @@ -816,6 +872,9 @@ def conv2d_transpose(input, stride(int|tuple): The stride size. If stride is a tuple, it must contain two integers, (stride_H, stride_W). Otherwise, the stride_H = stride_W = stride. + dilation(int|tuple): The dilation size. If dilation is a tuple, it must + contain two integers, (dilation_H, dilation_W). Otherwise, the + dilation_H = dilation_W = dilation. param_attr: Parameter Attribute. main_program(Program): the main program startup_program(Program): the startup program @@ -836,10 +895,15 @@ def conv2d_transpose(input, op_attr['paddings'] = padding if isinstance(stride, int): - op_attr['strides'] = stride + op_attr['strides'] = [stride, stride] elif stride is not None: op_attr['strides'] = stride + if isinstance(dilation, int): + op_attr['dilations'] = [dilation, dilation] + elif dilation is not None: + op_attr['dilations'] = dilation + if filter_size is None: if output_size is None: raise ValueError("output_size must be set when filter_size is None") @@ -848,14 +912,17 @@ def conv2d_transpose(input, padding = op_attr.get('paddings', [0, 0]) stride = op_attr.get('strides', [1, 1]) + dilation = op_attr.get('dilations', [1, 1]) h_in = input.shape[2] w_in = input.shape[3] - filter_size_h = output_size[0] - \ - (h_in - 1) * stride[0] + 2 * padding[0] - filter_size_w = output_size[1] - \ - (w_in - 1) * stride[1] + 2 * padding[1] + + filter_size_h = (output_size[0] - (h_in - 1) * stride[0] + 2 * + padding[0] - 1) / dilation[0] + 1 + filter_size_w = (output_size[1] - (w_in - 1) * stride[1] + 2 * + padding[1] - 1) / dilation[1] + 1 filter_size = [filter_size_h, filter_size_w] + elif isinstance(filter_size, int): filter_size = [filter_size, filter_size] @@ -1089,3 +1156,47 @@ def reduce_sum(input, dim=None, keep_dim=False): 'reduce_all': True if dim == None else False }) return out + + +def reduce_mean(input, dim=None, keep_dim=False): + """ + Computes the mean of tensor elements over the given dimension. + + Args: + input (Variable): The input variable which is a Tensor or LoDTensor. + dim (int|None): The dimension along which the mean is computed. If + :attr:`None`, compute the mean over all elements of :attr:`input` + and return a Tensor variable with a single element, otherwise + must be in the range :math:`[-rank(input), rank(input))`. If + :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`. + keep_dim (bool): Whether to reserve the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension + than the :attr:`input` unless :attr:`keep_dim` is true. + + Returns: + Variable: The reduced Tensor variable. + + Examples: + .. code-block:: python + + # x is a Tensor variable with following elements: + # [[0.2, 0.3, 0.5, 0.9] + # [0.1, 0.2, 0.6, 0.7]] + # Each example is followed by the correspending output tensor. + fluid.layers.reduce_mean(x) # [0.4375] + fluid.layers.reduce_mean(x, dim=0) # [0.15, 0.25, 0.55, 0.8] + fluid.layers.reduce_mean(x, dim=-1) # [0.475, 0.4] + fluid.layers.reduce_mean(x, dim=1, keep_dim=True) # [[0.475], [0.4]] + """ + helper = LayerHelper('reduce_mean', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype()) + helper.append_op( + type='reduce_mean', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None else 0, + 'keep_dim': keep_dim, + 'reduce_all': True if dim == None else False + }) + return out diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py index bda017b141dcba5ac268c34388742c433a533337..e5820d24cd2b34ef53cbb91e2be66efc1b74d315 100644 --- a/python/paddle/v2/fluid/layers/tensor.py +++ b/python/paddle/v2/fluid/layers/tensor.py @@ -27,10 +27,23 @@ def cast(x, dtype): return out -def concat(input, axis): +def concat(input, axis=0): """ - This function concats the input along the axis mentioned + **Concat** + + This function concatenates the input along the axis mentioned and returns that as the output. + + Args: + input(list): List of tensors to be concatenated + axis(int): Integer axis along which the tensors will be concatenated + + Returns: + Variable: Output variable of the concatenation + + Examples: + .. code-block:: python + out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth]) """ helper = LayerHelper('concat', **locals()) out = helper.create_tmp_variable(dtype=helper.input_dtype()) @@ -43,9 +56,28 @@ def concat(input, axis): def sums(input, out=None): - """ - This function takes in the input and performs the sum operation on it - and returns that as the output. + """This function performs the sum operation on the input and returns the + result as the output. + + Args: + input (Variable|list): The input tensor that has the elements + that need to be summed up. + + Returns: + Variable: The tensor type variable that has the sum of input + written to it. + + Examples: + .. code-block::python + + tmp = fluid.layers.zeros(shape=[10], dtype='int32') + i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) + a0 = layers.array_read(array=tmp, i=i) + i = layers.increment(x=i) + a1 = layers.array_read(array=tmp, i=i) + mean_a0 = layers.mean(x=a0) + mean_a1 = layers.mean(x=a1) + a_sum = layers.sums(input=[mean_a0, mean_a1]) """ helper = LayerHelper('sum', **locals()) if out is None: @@ -55,6 +87,24 @@ def sums(input, out=None): def assign(input, output): + """ + **Assign** + + This function copies the *input* Variable to the *output* Variable. + + Args: + input(Variable): The source variable + output(Variable): The destination variable + + Returns: + Variable: The destination variable that was supplied as the *output*. + + Examples: + .. code-block:: python + out = fluid.layers.create_tensor(dtype='float32') + hidden = fluid.layers.fc(input=data, size=10) + fluid.layers.assign(hidden, out) + """ helper = LayerHelper('assign', **locals()) helper.append_op( type='scale', @@ -66,9 +116,26 @@ def assign(input, output): def fill_constant(shape, dtype, value, out=None): """ - This function creates a tensor , with shape as mentioned in the input and - specified dtype and fills this up with a constant value that - comes in the input. It also sets the stop_gradient to be True. + **fill_constant** + + This function creates a tensor of specified *shape* and + *dtype*, and initializes this with a constant supplied in *value*. + + It also sets *stop_gradient* to True. + + Args: + shape(tuple|list|None): Shape of output tensor + dtype(np.dtype|core.DataType|str): Data type of output tensor + value(float): Constant value to initialize the output tensor + out(Variable): Output Variable to initialize + + Returns: + Variable: The tensor variable storing the output + + Examples: + .. code-block:: python + + data = fluid.layers.fill_constant(shape=[1], value=0, dtype='int64') """ helper = LayerHelper("fill_constant", **locals()) if out is None: @@ -90,6 +157,31 @@ def fill_constant_batch_size_like(input, value, input_dim_idx=0, output_dim_idx=0): + """ + **fill_constant_batch_size_like** + + This function creates a tensor of specified *shape*, *dtype* and batch size, + and initializes this with a constant supplied in *value*. The batch size is + obtained from the `input` tensor. + + It also sets *stop_gradient* to True. + + Args: + input(Variable): Tensor whose dimensions will be used to get batch size + shape(tuple|list|None): Shape of output tensor + dtype(np.dtype|core.DataType|str): Data type of output tensor + value(float): Constant value to initialize the output tensor + input_dim_idx(int): Index of input's batch size dimension + output_dim_idx(int): Index of output's batch size dimension + + Returns: + Variable: The tensor variable storing the output + + Examples: + .. code-block:: python + + data = fluid.layers.fill_constant(shape=[1], value=0, dtype='int64') + """ helper = LayerHelper("fill_constant_batch_size_like", **locals()) out = helper.create_tmp_variable(dtype=dtype) helper.append_op( diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py index 84fcbcdc2f2868a84bad5e145a934e33485b1fef..c56a531ed531cf0219e94854ba66c7399e003292 100644 --- a/python/paddle/v2/fluid/optimizer.py +++ b/python/paddle/v2/fluid/optimizer.py @@ -207,7 +207,7 @@ class Optimizer(object): optimize_ops = self.create_optimization_pass(params_grads, loss, startup_program) - return optimize_ops + return optimize_ops, params_grads class SGDOptimizer(Optimizer): diff --git a/python/paddle/v2/fluid/param_attr.py b/python/paddle/v2/fluid/param_attr.py index f6f320c788e7e08d44df8aff5ad3792b237e103a..ab4561b0423dd73c8c0d529cbf34b52876b1b77c 100644 --- a/python/paddle/v2/fluid/param_attr.py +++ b/python/paddle/v2/fluid/param_attr.py @@ -58,7 +58,9 @@ class ParamAttr(object): def to_kwargs(self, with_initializer=False): kwargs = { 'name': self.name, - 'learning_rate': self.learning_rate, + 'optimize_attr': { + 'learning_rate': self.learning_rate + }, 'regularizer': self.regularizer, 'trainable': self.trainable, 'clip_attr': self.clip diff --git a/python/paddle/v2/fluid/tests/book/notest_recognize_digits_conv_dist.py b/python/paddle/v2/fluid/tests/book/notest_recognize_digits_conv_dist.py new file mode 100644 index 0000000000000000000000000000000000000000..2680502efb91061be37a77fbe5b451960fdd15f7 --- /dev/null +++ b/python/paddle/v2/fluid/tests/book/notest_recognize_digits_conv_dist.py @@ -0,0 +1,72 @@ +from __future__ import print_function +import numpy as np +import paddle.v2 as paddle +import paddle.v2.fluid as fluid +import os + +images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32') +label = fluid.layers.data(name='label', shape=[1], dtype='int64') +conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=images, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") +conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + +predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax") +cost = fluid.layers.cross_entropy(input=predict, label=label) +avg_cost = fluid.layers.mean(x=cost) +optimizer = fluid.optimizer.Adam(learning_rate=0.01) +optimize_ops, params_grads = optimizer.minimize(avg_cost) + +accuracy = fluid.evaluator.Accuracy(input=predict, label=label) + +BATCH_SIZE = 50 +PASS_NUM = 3 +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=BATCH_SIZE) + +place = fluid.CPUPlace() +exe = fluid.Executor(place) +t = fluid.DistributeTranspiler() +pserver_endpoints = os.getenv("PSERVERS") +training_role = os.getenv("TRAINING_ROLE", + "TRAINER") # get the training role: trainer/pserver +t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=1) + +if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(pserver_endpoints, optimize_ops) + exe.run(fluid.default_startup_program()) + exe.run(pserver_prog) +elif training_role == "TRAINER": + feeder = fluid.DataFeeder(feed_list=[images, label], place=place) + exe.run(fluid.default_startup_program()) + + for pass_id in range(PASS_NUM): + accuracy.reset(exe) + for data in train_reader(): + loss, acc = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost] + accuracy.metrics) + pass_acc = accuracy.eval(exe) + # print loss, acc + if loss < 10.0 and pass_acc > 0.9: + # if avg cost less than 10.0 and accuracy is larger than 0.9, we think our code is good. + exit(0) + + pass_acc = accuracy.eval(exe) + print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc)) +else: + print("environment var TRAINER_ROLE should be TRAINER os PSERVER") + +exit(1) diff --git a/python/paddle/v2/fluid/tests/test_batch_norm_op.py b/python/paddle/v2/fluid/tests/test_batch_norm_op.py index dee2febb83d171ed4a13921e3b7d37322ead2786..a9c0b1cfd3417d0583fa9d4e15550e7543a6bd19 100644 --- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py @@ -208,7 +208,7 @@ class TestBatchNormOp(OpTest): print 'python: NHWC, NCHW, backward checking passed' def test_forward_backward(self): - def test_with_place(place, tensor_format, shape): + def test_with_place(place, data_layout, shape): # attr epsilon = 0.00001 momentum = 0.9 @@ -292,7 +292,7 @@ class TestBatchNormOp(OpTest): SavedVariance="saved_variance", # attrs is_test=False, - tensor_format=tensor_format, + data_layout=data_layout, momentum=momentum, epsilon=epsilon) @@ -311,7 +311,7 @@ class TestBatchNormOp(OpTest): atol = 1e-4 self.__assert_close(variance_out_tensor, variance_out, "variance_out", atol) - print "op test forward passed: ", str(place), tensor_format + print "op test forward passed: ", str(place), data_layout # run backward batch_norm_op_grad = get_backward_op(scope, batch_norm_op, set()) @@ -336,11 +336,15 @@ class TestBatchNormOp(OpTest): self.__assert_close(x_grad_tensor, x_grad_ref, "x_grad") self.__assert_close(scale_grad_tensor, scale_grad_ref, "scale_grad") self.__assert_close(bias_grad_tensor, bias_grad_ref, "bias_grad") - print "op test backward passed: ", str(place), tensor_format + print "op test backward passed: ", str(place), data_layout places = [core.CPUPlace()] if core.is_compile_gpu() and core.op_support_gpu("batch_norm"): places.append(core.GPUPlace(0)) + + core.init_devices(["CPU", "GPU:0"]) + else: + core.init_devices(["CPU"]) for place in places: for data_format in ["NCHW", "NHWC"]: test_with_place(place, data_format, [2, 3, 4, 5]) diff --git a/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py index d7b1f2f2a3abf6335998742dbbef8e17794170fa..d59537b924d57d40f7d740d99eb814c95f528e5f 100644 --- a/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py +++ b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py @@ -3,14 +3,17 @@ import numpy as np from op_test import OpTest -def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param): +def conv2dtranspose_forward_naive(input_, filter_, attrs): in_n, in_c, in_h, in_w = input_.shape f_c, out_c, f_h, f_w = filter_.shape assert in_c == f_c - stride, pad = conv2dtranspose_param['stride'], conv2dtranspose_param['pad'] - out_h = (in_h - 1) * stride[0] + f_h - out_w = (in_w - 1) * stride[1] + f_w + stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[ + 'dilations'] + d_bolck_h = dilations[0] * (f_h - 1) + 1 + d_bolck_w = dilations[1] * (f_w - 1) + 1 + out_h = (in_h - 1) * stride[0] + d_bolck_h + out_w = (in_w - 1) * stride[1] + d_bolck_w out = np.zeros((in_n, out_c, out_h, out_w)) @@ -23,9 +26,9 @@ def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param): for k in range(out_c): tmp_out = np.sum(input_masked * filter_[:, k, :, :], axis=0) - i1, i2 = i * stride[0], i * stride[0] + f_h - j1, j2 = j * stride[0], j * stride[0] + f_w - out[n, k, i1:i2, j1:j2] += tmp_out + i1, i2 = i * stride[0], i * stride[0] + d_bolck_h + j1, j2 = j * stride[0], j * stride[0] + d_bolck_h + out[n, k, i1:i2:dilations[0], j1:j2:dilations[1]] += tmp_out out = out[:, :, pad[0]:out_h - pad[0], pad[1]:out_w - pad[1]] return out @@ -37,11 +40,8 @@ class TestConv2dTransposeOp(OpTest): self.init_op_type() self.init_test_case() - conv2dtranspose_param = {'stride': self.stride, 'pad': self.pad} input_ = np.random.random(self.input_size).astype("float32") filter_ = np.random.random(self.filter_size).astype("float32") - output = conv2dtranspose_forward_naive( - input_, filter_, conv2dtranspose_param).astype('float32') self.inputs = {'Input': input_, 'Filter': filter_} self.attrs = { @@ -49,6 +49,10 @@ class TestConv2dTransposeOp(OpTest): 'paddings': self.pad, 'dilations': self.dilations } + + output = conv2dtranspose_forward_naive(input_, filter_, + self.attrs).astype('float32') + self.outputs = {'Output': output} def test_check_output(self): @@ -104,11 +108,60 @@ class TestWithStride(TestConv2dTransposeOp): self.filter_size = [f_c, 6, 3, 3] +class TestWithDilation(TestConv2dTransposeOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + # ------------ test_cudnn ------------ class TestCudnn(TestConv2dTransposeOp): def init_op_type(self): self.op_type = "conv2d_transpose_cudnn" +class TestCudnnWithPad(TestWithPad): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + def init_op_type(self): + self.op_type = "conv2d_transpose_cudnn" + + +class TestCudnnWithStride(TestWithStride): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.dilations = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + def init_op_type(self): + self.op_type = "conv2d_transpose_cudnn" + + +# #cudnn v5 does not support dilation conv. +# class TestCudnnWithDilation(TestWithDilation): +# def init_test_case(self): +# self.pad = [1, 1] +# self.stride = [2, 2] +# self.dilations = [2, 2] +# self.input_size = [2, 3, 5, 5] # NCHW +# f_c = self.input_size[1] +# self.filter_size = [f_c, 6, 3, 3] +# +# def init_op_type(self): +# self.op_type = "conv2d_transpose_cudnn" + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py index 8fd34b87bfea91307f52fdcbb9f71f2e1a9c6c56..a353f9b4d40233de46237005138f21430f4d865a 100644 --- a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py +++ b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py @@ -3,15 +3,20 @@ import numpy as np from op_test import OpTest -def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param): +def conv3dtranspose_forward_naive(input_, filter_, attrs): in_n, in_c, in_d, in_h, in_w = input_.shape f_c, out_c, f_d, f_h, f_w = filter_.shape assert in_c == f_c - stride, pad = conv3dtranspose_param['stride'], conv3dtranspose_param['pad'] - out_d = (in_d - 1) * stride[0] + f_d - out_h = (in_h - 1) * stride[1] + f_h - out_w = (in_w - 1) * stride[2] + f_w + stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[ + 'dilations'] + + d_bolck_d = dilations[0] * (f_d - 1) + 1 + d_bolck_h = dilations[1] * (f_h - 1) + 1 + d_bolck_w = dilations[2] * (f_w - 1) + 1 + out_d = (in_d - 1) * stride[0] + d_bolck_d + out_h = (in_h - 1) * stride[1] + d_bolck_h + out_w = (in_w - 1) * stride[2] + d_bolck_w out = np.zeros((in_n, out_c, out_d, out_h, out_w)) for n in range(in_n): @@ -25,10 +30,11 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param): for k in range(out_c): tmp_out = np.sum(input_masked * filter_[:, k, :, :, :], axis=0) - d1, d2 = d * stride[0], d * stride[0] + f_d - i1, i2 = i * stride[1], i * stride[1] + f_h - j1, j2 = j * stride[2], j * stride[2] + f_w - out[n, k, d1:d2, i1:i2, j1:j2] += tmp_out + d1, d2 = d * stride[0], d * stride[0] + d_bolck_d + i1, i2 = i * stride[1], i * stride[1] + d_bolck_h + j1, j2 = j * stride[2], j * stride[2] + d_bolck_w + out[n, k, d1:d2:dilations[0], i1:i2:dilations[1], j1:j2: + dilations[2]] += tmp_out out = out[:, :, pad[0]:out_d - pad[0], pad[1]:out_h - pad[1], pad[2]:out_w - pad[2]] @@ -41,18 +47,19 @@ class TestConv3dTransposeOp(OpTest): self.init_op_type() self.init_test_case() - conv3dtranspose_param = {'stride': self.stride, 'pad': self.pad} input_ = np.random.random(self.input_size).astype("float32") filter_ = np.random.random(self.filter_size).astype("float32") - output = conv3dtranspose_forward_naive( - input_, filter_, conv3dtranspose_param).astype("float32") self.inputs = {'Input': input_, 'Filter': filter_} self.attrs = { 'strides': self.stride, 'paddings': self.pad, - # 'dilations': self.dilations + 'dilations': self.dilations } + + output = conv3dtranspose_forward_naive(input_, filter_, + self.attrs).astype("float32") + self.outputs = {'Output': output} def test_check_output(self): @@ -108,11 +115,60 @@ class TestWithStride(TestConv3dTransposeOp): self.filter_size = [f_c, 6, 3, 3, 3] +class TestWithDilation(TestConv3dTransposeOp): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [2, 2, 2] + self.input_size = [2, 3, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + # ------------ test_cudnn ------------ class TestCudnn(TestConv3dTransposeOp): def init_op_type(self): self.op_type = "conv3d_transpose_cudnn" +class TestCudnnWithPad(TestWithPad): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.input_size = [2, 3, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + def init_op_type(self): + self.op_type = "conv3d_transpose_cudnn" + + +class TestCudnnWithStride(TestWithStride): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [2, 2, 2] + self.dilations = [1, 1, 1] + self.input_size = [2, 3, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + def init_op_type(self): + self.op_type = "conv3d_transpose_cudnn" + + +# #cudnn v5 does not support dilation conv. +# class TestCudnnWithDilation(TestWithDilation): +# def init_test_case(self): +# self.pad = [1, 1, 1] +# self.stride = [2, 2, 2] +# self.dilations = [2, 2, 2] +# self.input_size = [2, 3, 5, 5, 5] # NCDHW +# f_c = self.input_size[1] +# self.filter_size = [f_c, 6, 3, 3, 3] +# +# def init_op_type(self): +# self.op_type = "conv3d_transpose_cudnn" + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_dropout_op.py b/python/paddle/v2/fluid/tests/test_dropout_op.py index 4f5ea836b44102e5599a2302efd669291ebe920b..2483200212686caf9c46f9c1129b5d8ffdcc9145 100644 --- a/python/paddle/v2/fluid/tests/test_dropout_op.py +++ b/python/paddle/v2/fluid/tests/test_dropout_op.py @@ -47,7 +47,9 @@ class TestDropoutOp4(OpTest): self.op_type = "dropout" self.inputs = {'X': np.random.random((32, 64)).astype("float32")} self.attrs = {'dropout_prob': 0.35, 'is_test': True} - self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']} + self.outputs = { + 'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob']) + } def test_check_output(self): self.check_output() @@ -58,7 +60,9 @@ class TestDropoutOp5(OpTest): self.op_type = "dropout" self.inputs = {'X': np.random.random((32, 64, 3)).astype("float32")} self.attrs = {'dropout_prob': 0.75, 'is_test': True} - self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']} + self.outputs = { + 'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob']) + } def test_check_output(self): self.check_output() diff --git a/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py b/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py index eff8fa87d9c0dafc6935604101e94ee6c8b081ce..cd91769a22f8d6af193efabd8d997913676fbba6 100644 --- a/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py +++ b/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py @@ -7,7 +7,7 @@ class TestFillZerosLikeOp(OpTest): def setUp(self): self.op_type = "fill_zeros_like" self.inputs = {'X': np.random.random((219, 232)).astype("float32")} - self.outputs = {'Y': np.zeros_like(self.inputs["X"])} + self.outputs = {'Out': np.zeros_like(self.inputs["X"])} def test_check_output(self): self.check_output() diff --git a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py index 627ab4e23562f14538d85f2e21edeb7d72d940bb..a9d943b8b7f7d9bc0dec89c5360769e0328527ba 100644 --- a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py +++ b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py @@ -1,32 +1,47 @@ import unittest +import numpy + +import paddle.v2.fluid as fluid import paddle.v2.fluid.core as core from paddle.v2.fluid.op import Operator -import numpy +from paddle.v2.fluid.executor import Executor class TestGaussianRandomOp(unittest.TestCase): + def setUp(self): + self.op_type = "gaussian_random" + self.inputs = {} + self.attrs = {"shape": [1000, 784], "mean": .0, "std": 1., "seed": 10} + + self.outputs = ["Out"] + def test_cpu(self): - self.gaussian_random_test(place=core.CPUPlace()) + self.gaussian_random_test(place=fluid.CPUPlace()) def test_gpu(self): if core.is_compile_gpu(): - self.gaussian_random_test(place=core.GPUPlace(0)) + self.gaussian_random_test(place=fluid.GPUPlace(0)) def gaussian_random_test(self, place): - scope = core.Scope() - scope.var('Out').get_tensor() - - op = Operator( - "gaussian_random", - Out='Out', - shape=[1000, 784], - mean=.0, - std=1., - seed=10) context = core.DeviceContext.create(place) - op.run(scope, context) - tensor = numpy.array(scope.find_var('Out').get_tensor()) + program = fluid.Program() + block = program.global_block() + vout = block.create_var(name="Out") + op = block.append_op( + type=self.op_type, outputs={"Out": vout}, attrs=self.attrs) + + op.desc.infer_var_type(block.desc) + op.desc.infer_shape(block.desc) + + fetch_list = [] + for var_name in self.outputs: + fetch_list.append(block.var(var_name)) + + exe = Executor(place) + outs = exe.run(program, fetch_list=fetch_list) + tensor = outs[0] + self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1) self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1) diff --git a/python/paddle/v2/fluid/tests/test_optimizer.py b/python/paddle/v2/fluid/tests/test_optimizer.py index 2459dfd664300d405edb36c4ca906c1769b5e7d2..29694be58bce0eb41b05439da35ef07a542ef12a 100644 --- a/python/paddle/v2/fluid/tests/test_optimizer.py +++ b/python/paddle/v2/fluid/tests/test_optimizer.py @@ -27,7 +27,7 @@ class TestOptimizer(unittest.TestCase): block.append_op( type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01) - opts = sgd_optimizer.minimize(mean_out, init_program) + opts, _ = sgd_optimizer.minimize(mean_out, init_program) self.assertEqual(len(opts), 1) sgd_op = opts[0] self.assertEqual(sgd_op.type, "sgd") @@ -57,7 +57,7 @@ class TestOptimizer(unittest.TestCase): learning_rate = 0.01 sgd_optimizer = optimizer.SGDOptimizer( learning_rate=learning_rate, global_step=global_step) - opts = sgd_optimizer.minimize(mean_out, init_program) + opts, _ = sgd_optimizer.minimize(mean_out, init_program) self.assertEqual(len(opts), 2) sgd_op = opts[0] self.assertEqual(sgd_op.type, "sgd") diff --git a/python/paddle/v2/fluid/tests/test_uniform_random_op.py b/python/paddle/v2/fluid/tests/test_uniform_random_op.py index f736dfb2e85552b321403c961da517f3b3efb100..00b4f196209a6414f1063a33c0e31093e33ca39d 100644 --- a/python/paddle/v2/fluid/tests/test_uniform_random_op.py +++ b/python/paddle/v2/fluid/tests/test_uniform_random_op.py @@ -1,32 +1,50 @@ import unittest +import numpy + from paddle.v2.fluid.op import Operator import paddle.v2.fluid.core as core -import numpy +import paddle.v2.fluid as fluid class TestUniformRandomOp(unittest.TestCase): - def test_uniform_random_cpu(self): + def setUp(self): + self.op_type = "uniform_random" + self.inputs = {} + self.attrs = { + "shape": [1000, 784], + "min": -5.0, + "max": 10.0, + "seed": 10 + } + self.outputs = ["Out"] + + def test_cpu(self): self.uniform_random_test(place=core.CPUPlace()) - def test_uniform_random_gpu(self): + def test_gpu(self): if core.is_compile_gpu(): self.uniform_random_test(place=core.GPUPlace(0)) def uniform_random_test(self, place): - scope = core.Scope() - scope.var('X').get_tensor() - - op = Operator( - "uniform_random", - Out='X', - shape=[1000, 784], - min=-5.0, - max=10.0, - seed=10) - - ctx = core.DeviceContext.create(place) - op.run(scope, ctx) - tensor = numpy.array(scope.find_var('X').get_tensor()) + context = core.DeviceContext.create(place) + program = fluid.Program() + block = program.global_block() + vout = block.create_var(name="Out") + op = block.append_op( + type=self.op_type, outputs={"Out": vout}, attrs=self.attrs) + + op.desc.infer_var_type(block.desc) + op.desc.infer_shape(block.desc) + + fetch_list = [] + for var_name in self.outputs: + fetch_list.append(block.var(var_name)) + + exe = fluid.Executor(place) + outs = exe.run(program, fetch_list=fetch_list) + + tensor = outs[0] + self.assertAlmostEqual(tensor.mean(), 2.5, delta=0.1)