diff --git a/.travis.yml b/.travis.yml index 9dd5f48164a3417940880f2b15bf7d9906453fb8..bf6a41d13c4eabc2d8543ab821ce0ff747a061df 100644 --- a/.travis.yml +++ b/.travis.yml @@ -56,7 +56,7 @@ script: export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh export DOCS_DIR=`pwd` cd .. - curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/v2 + curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/ notifications: email: on_success: change diff --git a/cmake/generic.cmake b/cmake/generic.cmake index d0b5eaec2e2a50acf17e5dd1d1aeb0ec3e614fbf..471e3929069d0d28105404b4f0f6baa303faf0e0 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -244,11 +244,11 @@ function(cc_test TARGET_NAME) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) # Support linking flags: --whole-archive (Linux) / -force_load (MacOS) - target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog) if("${cc_test_DEPS}" MATCHES "ARCHIVE_START") list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END) endif() - add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog) add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} ${cc_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) @@ -311,8 +311,8 @@ function(nv_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) - add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog) + add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog) add_test(${TARGET_NAME} ${TARGET_NAME}) endif() endfunction(nv_test) diff --git a/doc/design/cpp_data_feeding.md b/doc/design/cpp_data_feeding.md index 40205350f99722f0b71bfa6f390fe9d01d831966..22c2a925eb8c5e1dd8451e1d3cba261ce471ec51 100644 --- a/doc/design/cpp_data_feeding.md +++ b/doc/design/cpp_data_feeding.md @@ -20,9 +20,8 @@ class ReaderBase { PADDLE_ENFORCE(!shapes_.empty()); } // Read the next batch of data. (A 'batch' can be only one instance) + // If the next batch doesn't exist, the '*out' will be an empty std::vector. virtual void ReadNext(std::vector* out) = 0; - // Show whether the next bacth exists. - virtual bool HasNext() const = 0; // Reinitialize the reader and read the file from the begin. 
virtual void ReInit() = 0; diff --git a/doc/design/dist_refactor/distributed_architecture.md b/doc/design/fluid_dist/distributed_architecture.md similarity index 100% rename from doc/design/dist_refactor/distributed_architecture.md rename to doc/design/fluid_dist/distributed_architecture.md diff --git a/doc/design/dist_refactor/multi_cpu.md b/doc/design/fluid_dist/multi_cpu.md similarity index 100% rename from doc/design/dist_refactor/multi_cpu.md rename to doc/design/fluid_dist/multi_cpu.md diff --git a/doc/design/dist_refactor/parameter_server.md b/doc/design/fluid_dist/parameter_server.md similarity index 86% rename from doc/design/dist_refactor/parameter_server.md rename to doc/design/fluid_dist/parameter_server.md index 805dd13048d41b995d2a01cda52b2ea33e4bbe1d..6ce48dfbfce8b094684b412ebfda7e505ddc30ae 100644 --- a/doc/design/dist_refactor/parameter_server.md +++ b/doc/design/fluid_dist/parameter_server.md @@ -59,6 +59,17 @@ After converting: queue. It will block until the queue has the required number of tensors. +### Sparse Update + +For embedding layers, the gradient may have many rows containing only 0 when training, +if the gradient uses a dense tensor to do parameter optimization, +it could spend unnecessary memory, slow down the calculations and waste +the bandwidth while doing distributed training. +In Fluid, we introduce [SelectedRows](../selected_rows.md) to represent a list of rows containing +non-zero gradient data. So when we do parameter optimization both locally and remotely, +we only need to send those non-zero rows to the optimizer operators: + + ### Benefits @@ -91,6 +102,6 @@ After converting: `min_count` attribute), does our current design support it? (similar question for the *Add* OP) +### References -### References: [1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf) diff --git a/doc/design/dist_refactor/src/compiler.graffle b/doc/design/fluid_dist/src/compiler.graffle similarity index 100% rename from doc/design/dist_refactor/src/compiler.graffle rename to doc/design/fluid_dist/src/compiler.graffle diff --git a/doc/design/dist_refactor/src/compiler.png b/doc/design/fluid_dist/src/compiler.png similarity index 100% rename from doc/design/dist_refactor/src/compiler.png rename to doc/design/fluid_dist/src/compiler.png diff --git a/doc/design/dist_refactor/src/dist-graph.graffle b/doc/design/fluid_dist/src/dist-graph.graffle similarity index 100% rename from doc/design/dist_refactor/src/dist-graph.graffle rename to doc/design/fluid_dist/src/dist-graph.graffle diff --git a/doc/design/dist_refactor/src/dist-graph.png b/doc/design/fluid_dist/src/dist-graph.png similarity index 100% rename from doc/design/dist_refactor/src/dist-graph.png rename to doc/design/fluid_dist/src/dist-graph.png diff --git a/doc/design/dist_refactor/src/distributed_architecture.graffle b/doc/design/fluid_dist/src/distributed_architecture.graffle similarity index 100% rename from doc/design/dist_refactor/src/distributed_architecture.graffle rename to doc/design/fluid_dist/src/distributed_architecture.graffle diff --git a/doc/design/dist_refactor/src/distributed_architecture.png b/doc/design/fluid_dist/src/distributed_architecture.png similarity index 100% rename from doc/design/dist_refactor/src/distributed_architecture.png rename to doc/design/fluid_dist/src/distributed_architecture.png diff --git a/doc/design/dist_refactor/src/local-graph.graffle 
b/doc/design/fluid_dist/src/local-graph.graffle similarity index 100% rename from doc/design/dist_refactor/src/local-graph.graffle rename to doc/design/fluid_dist/src/local-graph.graffle diff --git a/doc/design/dist_refactor/src/local-graph.png b/doc/design/fluid_dist/src/local-graph.png similarity index 100% rename from doc/design/dist_refactor/src/local-graph.png rename to doc/design/fluid_dist/src/local-graph.png diff --git a/doc/design/dist_refactor/src/local_architecture.graffle b/doc/design/fluid_dist/src/local_architecture.graffle similarity index 100% rename from doc/design/dist_refactor/src/local_architecture.graffle rename to doc/design/fluid_dist/src/local_architecture.graffle diff --git a/doc/design/dist_refactor/src/local_architecture.png b/doc/design/fluid_dist/src/local_architecture.png similarity index 100% rename from doc/design/dist_refactor/src/local_architecture.png rename to doc/design/fluid_dist/src/local_architecture.png diff --git a/doc/design/dist_refactor/src/multi-threads.graffle b/doc/design/fluid_dist/src/multi-threads.graffle similarity index 100% rename from doc/design/dist_refactor/src/multi-threads.graffle rename to doc/design/fluid_dist/src/multi-threads.graffle diff --git a/doc/design/dist_refactor/src/multi-threads/multi-threads@3x.png b/doc/design/fluid_dist/src/multi-threads/multi-threads@3x.png similarity index 100% rename from doc/design/dist_refactor/src/multi-threads/multi-threads@3x.png rename to doc/design/fluid_dist/src/multi-threads/multi-threads@3x.png diff --git a/doc/design/dist_refactor/src/multi-threads/single-thread@3x.png b/doc/design/fluid_dist/src/multi-threads/single-thread@3x.png similarity index 100% rename from doc/design/dist_refactor/src/multi-threads/single-thread@3x.png rename to doc/design/fluid_dist/src/multi-threads/single-thread@3x.png diff --git a/doc/design/dist_refactor/src/paddle-compile.graffle b/doc/design/fluid_dist/src/paddle-compile.graffle similarity index 100% rename from doc/design/dist_refactor/src/paddle-compile.graffle rename to doc/design/fluid_dist/src/paddle-compile.graffle diff --git a/doc/design/dist_refactor/src/paddle-compile.png b/doc/design/fluid_dist/src/paddle-compile.png similarity index 100% rename from doc/design/dist_refactor/src/paddle-compile.png rename to doc/design/fluid_dist/src/paddle-compile.png diff --git a/doc/design/dist_refactor/src/remote_executor.graffle b/doc/design/fluid_dist/src/remote_executor.graffle similarity index 100% rename from doc/design/dist_refactor/src/remote_executor.graffle rename to doc/design/fluid_dist/src/remote_executor.graffle diff --git a/doc/design/dist_refactor/src/remote_executor.png b/doc/design/fluid_dist/src/remote_executor.png similarity index 100% rename from doc/design/dist_refactor/src/remote_executor.png rename to doc/design/fluid_dist/src/remote_executor.png diff --git a/doc/design/fluid_dist/src/sparse_update.graffle b/doc/design/fluid_dist/src/sparse_update.graffle new file mode 100644 index 0000000000000000000000000000000000000000..08d689a58f83698d8c1158ee3990ed8abf3a7a9a Binary files /dev/null and b/doc/design/fluid_dist/src/sparse_update.graffle differ diff --git a/doc/design/fluid_dist/src/sparse_update.png b/doc/design/fluid_dist/src/sparse_update.png new file mode 100644 index 0000000000000000000000000000000000000000..8c872e6ac479f7d1b818a4a207956c43155d0ad7 Binary files /dev/null and b/doc/design/fluid_dist/src/sparse_update.png differ diff --git a/doc/fluid/dev/use_eigen_cn.md b/doc/fluid/dev/use_eigen_cn.md index 
1367323b71277984834d9d4f0d9bea0f69478479..f36843b4408c21bdca1fa83853e5b0a40116791c 100644 --- a/doc/fluid/dev/use_eigen_cn.md +++ b/doc/fluid/dev/use_eigen_cn.md @@ -107,7 +107,7 @@ void Compute(const framework::ExecutionContext& context) const override { ### paddle::framework::Tensor到EigenTensor的转换 -如上一小节所示,在具体的计算中,我们需要先把输入Tensor和输出Tensor转换为Eigen支持的格式。我们在[eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h)中提供了一些全局函数用来实现paddle::framework::Tensor到EigenTensor/EigenMatrix/EigenVector/EigenScalar的转换。 +如上一小节所示,在具体的计算中,我们需要先把输入Tensor和输出Tensor转换为Eigen支持的格式。我们在[eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h)中提供了一些全局函数用来实现paddle::framework::Tensor到EigenTensor/EigenMatrix/EigenVector/EigenScalar的转换。 以EigenTensor为例,做一个介绍 @@ -125,7 +125,7 @@ From是EigenTensor模板提供的一个接口,可以实现从paddle::framework 在Eigen中,不同rank的Tensor是不同类型,Vector是rank为1的Tensor。需要额外注意的是,EigenVector::From方法是把paddle中的一维Tensor转为Eigen的一维Tensor,在这里用EigenVector来表示;而EigenVector::Flatten方法是把paddle中的一个Tensor进行reshape操作,压扁成为Eigen的一维Tensor,类型仍然为EigenVector。 -更多的转换方法请参考eigen_test.cc中的[单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc)。 +更多的转换方法请参考eigen_test.cc中的[单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen_test.cc)。 diff --git a/doc/fluid/dev/use_eigen_en.md b/doc/fluid/dev/use_eigen_en.md index e169106e12f5d62696f1f0e7163562793b32c18c..3a466f73d1f9b94a29b171015279c782ca50bd02 100644 --- a/doc/fluid/dev/use_eigen_en.md +++ b/doc/fluid/dev/use_eigen_en.md @@ -107,7 +107,7 @@ void Compute(const framework::ExecutionContext& context) const override { ### paddle::framework::Tensor到EigenTensor的转换 -As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor`to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`. +As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor`to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`. Using EigenTensor as an example: @@ -125,7 +125,7 @@ EigenTensor::Type et = EigenTensor::From(t); In Eigen, tensors with different ranks are different types, with `Vector` bring a rank-1 instance. Note that `EigenVector::From` uses a transformation from an 1-dimensional Paddle tensor to a 1-dimensional Eigen tensor while `EigenVector::Flatten` reshapes a paddle tensor and flattens it into a 1-dimensional Eigen tensor. Both resulting tensors are still typed EigenVector. -For more transformations, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc) in the `eigen_test.cc` file. +For more transformations, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen_test.cc) in the `eigen_test.cc` file. 
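(Editor's note, not part of the patch: the two `use_eigen_*` documents above describe converting a `paddle::framework::Tensor` into Eigen views via `EigenTensor::From` and `EigenVector::Flatten`. A minimal sketch of that usage follows; the helper function name, tensor shape, and values are illustrative assumptions only.)

```cpp
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/place.h"

namespace fw = paddle::framework;

// Hypothetical helper, only to illustrate the eigen.h conversions above.
void EigenConversionSketch() {
  fw::Tensor t;
  float* data = t.mutable_data<float>(fw::make_ddim({2, 3}),
                                      paddle::platform::CPUPlace());
  for (int i = 0; i < 6; ++i) data[i] = static_cast<float>(i);

  // Rank-2 Eigen view over the same underlying memory (no copy).
  auto et = fw::EigenTensor<float, 2>::From(t);
  // Reshape the tensor into a rank-1 Eigen view, as EigenVector::Flatten does.
  auto flat = fw::EigenVector<float>::Flatten(t);
  (void)et;
  (void)flat;
}
```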
diff --git a/doc/v2/howto/optimization/pprof_1.png b/doc/fluid/howto/optimization/pprof_1.png similarity index 100% rename from doc/v2/howto/optimization/pprof_1.png rename to doc/fluid/howto/optimization/pprof_1.png diff --git a/doc/v2/howto/optimization/pprof_2.png b/doc/fluid/howto/optimization/pprof_2.png similarity index 100% rename from doc/v2/howto/optimization/pprof_2.png rename to doc/fluid/howto/optimization/pprof_2.png diff --git a/doc/fluid/read_source.md b/doc/fluid/read_source.md index edf46aff8c6cc9fc01d26c6453b3a8123238ef91..bb6d4563f5617fb98af055bca2f6f0479bdb4393 100644 --- a/doc/fluid/read_source.md +++ b/doc/fluid/read_source.md @@ -2,17 +2,17 @@ Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/book -Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework +Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/framework -Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators +Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators -Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory +Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory -Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform +Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/platform # Compile Time -The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). +The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto). ```python x = fluid.layers.data(name='x', shape=[13], dtype='float32') @@ -29,10 +29,10 @@ sgd_optimizer.minimize(avg_cost) - Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#) - Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/layers) - Every Layer has one or more operators and variables/parameters - - All the operators are defined at [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other worth-looking files: - - Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h) - - Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h) - - Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h) + - All the operators are defined at [`paddle/fluid/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators). Other worth-looking files: + - Base class: [`paddle/fluid/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h) + - Operator Registration: [`paddle/fluid/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/op_registry.h) + - Operator Lookup: [`paddle/fluid/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/op_info.h) - Optimizer: `fluid.optimizer.SGD`. It does the following - Add backward operators. 
[[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/backward.py)] - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/optimizer.py)] @@ -55,13 +55,13 @@ exe.run(fluid.default_main_program(), fetch_list=[avg_cost]) ``` -- Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h) - - The device handle are at [paddle/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h) -- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)] +- Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h) + - The device handle are at [paddle/fluid/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h) +- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.cc)] - Feeds the data: `feed=feeder.feed(data)` - Evaluates all the operators - Fetches the result: `fetch_list=[avg_cost]` - Other worth looking files: - - Scope: [paddle/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/scope.h). Where all the variables live - - Variable: [paddle/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h). Where all the data (most likely tensors) live - - Tensor: [paddle/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h). Where we allocate memory through [`paddle/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory) + - Scope: [paddle/fluid/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/scope.h). Where all the variables live + - Variable: [paddle/fluid/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h). Where all the data (most likely tensors) live + - Tensor: [paddle/fluid/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/tensor.h). Where we allocate memory through [`paddle/fluid/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory) diff --git a/doc/v2/build_and_install/pip_install_cn.rst b/doc/v2/build_and_install/pip_install_cn.rst index ddcd42a0c6554469d702d3a9bbecd16643d6b7ed..b3d882743785e8ee301b71b696230531d2b7ba58 100644 --- a/doc/v2/build_and_install/pip_install_cn.rst +++ b/doc/v2/build_and_install/pip_install_cn.rst @@ -34,15 +34,15 @@ PaddlePaddle可以使用常用的Python包管理工具 :align: center .. 
csv-table:: 各个版本最新的whl包 - :header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API" - :widths: 1, 3, 3, 3 - - "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "暂无" - "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + :header: "版本说明", "cp27-cp27mu", "cp27-cp27m" + :widths: 1, 3, 3 + + "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" .. _pip_dependency: diff --git a/doc/v2/build_and_install/pip_install_en.rst b/doc/v2/build_and_install/pip_install_en.rst index e08c84703bfa89352a79acbddd5d7f1bc88ce82e..1e409d86b9775094998f72f92954f4bbc1013ea1 100644 --- a/doc/v2/build_and_install/pip_install_en.rst +++ b/doc/v2/build_and_install/pip_install_en.rst @@ -37,15 +37,15 @@ If the links below shows up the login form, just click "Log in as guest" to star :align: center .. 
csv-table:: whl package of each version - :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API" - :widths: 1, 3, 3, 3 - - "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "Not Available" - "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + :header: "version", "cp27-cp27mu", "cp27-cp27m" + :widths: 1, 3, 3 + + "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" .. _pip_dependency: diff --git a/doc/v2/dev/index_cn.rst b/doc/v2/dev/index_cn.rst index c488191b8174531905e44cb9443ee539d4cb1ed3..aee3c68de05de26df3cd79170fa7f4ecad4bf386 100644 --- a/doc/v2/dev/index_cn.rst +++ b/doc/v2/dev/index_cn.rst @@ -1,9 +1,24 @@ 开发标准 ======== +PaddlePaddle遵守如下三个部分的代码和文档规范。 + +PaddlePaddle使用git做版本管理,docker作为构建和测试环境。代码中包含了Cuda, C++, Python, Shell等多种编程语言。语言规范遵守Google C++ Style, Pep-8, 代码库中包含自动化检查工具做风格检查。代码注释需要遵守Doxygen规范,不满足风格要求的代码会编译失败。关于如何使用git, 构建测试及代码开发, 我们提供了如下指南。 .. toctree:: :maxdepth: 1 contribute_to_paddle_cn.md + +PaddlePaddle面向国内外用户,包含了中文和英文两部分的文档。设计文档和issue问题描述都推荐使用英文。对于设计文档,重在问题描述,背景阐述,然后才是解决方案。文档由Sphinx生成,因此代码注释也需要符合Sphinx文档标准。推荐本地使用paddlepaddle.org工具编译生成和预览文档,请参阅如下文档。 + +.. toctree:: + :maxdepth: 1 + write_docs_cn.rst + +PaddlePaddle V2 使用新增Layer方式定义新的操作。组合基础API可以实现多种复杂Layer, 满足绝大多数应用。如需要定制Layer,请参阅如下文档,欢迎提交patch。 + +.. toctree:: + :maxdepth: 1 + new_layer_cn.rst diff --git a/doc/v2/getstarted/index_cn.rst b/doc/v2/getstarted/index_cn.rst index 1dc141396b95bda776aeff87ac30fad6baf37bd2..75af7354be93a6eeabfa9ccf86903505402a7ca6 100644 --- a/doc/v2/getstarted/index_cn.rst +++ b/doc/v2/getstarted/index_cn.rst @@ -1,8 +1,19 @@ 新手入门 ============ + +如果需要快速了解PaddlePaddle的使用,可以参考以下指南。 + .. toctree:: :maxdepth: 1 quickstart_cn.rst + + +在使用PaddlePaddle构建应用时,需要了解一些基本概念。 +这里以一个线性回归为例子,详细介绍了PaddlePaddle的使用流程,包括数据格式,模型配置与训练等。 + +.. 
toctree:: + :maxdepth: 1 + concepts/use_concepts_cn.rst diff --git a/doc/v2/howto/capi/compile_paddle_lib_cn.md b/doc/v2/howto/capi/compile_paddle_lib_cn.md index fd8dec8164580b9dcb716e69f3cc5357639f17d3..e223fd33a8420abcdfdad53d1cfc5ed160a1b37e 100644 --- a/doc/v2/howto/capi/compile_paddle_lib_cn.md +++ b/doc/v2/howto/capi/compile_paddle_lib_cn.md @@ -1,22 +1,80 @@ -## 安装与编译C-API预测库 - -### 概述 - -使用 C-API 进行预测依赖于将 PaddlePaddle 核心代码编译成链接库,只需在编译时需配制下面这些编译选项: - -必须配置选项: -- `WITH_C_API`,必须配置为`ON`。 - -推荐配置选项: -- `WITH_PYTHON`,推荐配置为`OFF` -- `WITH_SWIG_PY`,推荐配置为`OFF` -- `WITH_GOLANG`,推荐设置为`OFF` - -可选配置选项: -- `WITH_GPU`,可配置为`ON/OFF` -- `WITH_MKL`,可配置为`ON/OFF` - -对推荐配置中的选项建议按照设置,以避免链接不必要的库。其它可选编译选项按需进行设定。 +## 安装、编译与链接C-API预测库 + +### 直接下载安装 + +从CI系统中下载最新的C-API开发包进行安装,用户可以从下面的表格中找到需要的版本: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
版本说明 | C-API
cpu_avx_mkl | paddle.tgz
cpu_avx_openblas | 暂无
cpu_noavx_openblas | paddle.tgz
cuda7.5_cudnn5_avx_mkl | paddle.tgz
cuda8.0_cudnn5_avx_mkl | paddle.tgz
cuda8.0_cudnn7_avx_mkl | paddle.tgz
+ +### 从源码编译 + +用户也可以从 PaddlePaddle 核心代码编译C-API链接库,只需在编译时配制下面这些编译选项: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
选项 | 值
WITH_C_API | ON
WITH_PYTHON | OFF(推荐)
WITH_SWIG_PY | OFF(推荐)
WITH_GOLANG | OFF(推荐)
WITH_GPU | ON/OFF
WITH_MKL | ON/OFF
+ +建议按照推荐值设置,以避免链接不必要的库。其它可选编译选项按需进行设定。 下面的代码片段从github拉取最新代码,配制编译选项(需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径): @@ -100,23 +158,19 @@ cmake -DCMAKE_INSTALL_PREFIX=$PADDLE_ROOT \ 目前提供三种链接方式: -1. 链接`libpaddle_capi_shared.so` 动态库 - - 使用 PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_shared.so`时,需注意: - 1. 如果编译时指定编译CPU版本,且使用`OpenBLAS`数学库,在使用C-API开发预测程序时,只需要链接`libpaddle_capi_shared.so`这一个库。 - 1. 如果是用编译时指定CPU版本,且使用`MKL`数学库,由于`MKL`库有自己独立的动态库文件,在使用PaddlePaddle C-API开发预测程序时,需要自己链接MKL链接库。 - 1. 如果编译时指定编译GPU版本,CUDA相关库会在预测程序运行时动态装载,需要将CUDA相关的库设置到`LD_LIBRARY_PATH`环境变量中。 - - 这种方式最为简便,链接相对容易,**在无特殊需求情况下,推荐使用此方式**。 - -2. 链接静态库 `libpaddle_capi_whole.a` - - 使用PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_whole.a`时,需注意: - 1. 需要指定`-Wl,--whole-archive`链接选项。 - 1. 需要显式地链接 `gflags`、`glog`、`libz`、`protobuf` 等第三方库,可在`PADDLE_ROOT/third_party`下找到。 - 1. 如果在编译 C-API 时使用OpenBLAS数学库,需要显示地链接`libopenblas.a`。 - 1. 如果在编译 C-API 是使用MKL数学库,需要显示地链接MKL的动态库。 - -3. 链接静态库 `libpaddle_capi_layers.a`和`libpaddle_capi_engine.a` - - 使用PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_whole.a`时,需注意: - 1. 这种链接方式主要用于移动端预测。 - 1. 为了减少生成链接库的大小把`libpaddle_capi_whole.a`拆成以上两个静态链接库。 - 1. 需指定`-Wl,--whole-archive -lpaddle_capi_layers` 和 `-Wl,--no-whole-archive -lpaddle_capi_engine` 进行链接。 - 1. 第三方依赖库需要按照与方式2同样方法显示地进行链接。 +1. 链接`libpaddle_capi_shared.so` 动态库(这种方式最为简便,链接相对容易,**在无特殊需求情况下,推荐使用此方式**),需注意: + 1. 如果编译时指定编译CPU版本,且使用`OpenBLAS`数学库,在使用C-API开发预测程序时,只需要链接`libpaddle_capi_shared.so`这一个库。 + 1. 如果是用编译时指定CPU版本,且使用`MKL`数学库,由于`MKL`库有自己独立的动态库文件,在使用PaddlePaddle C-API开发预测程序时,需要自己链接MKL链接库。 + 1. 如果编译时指定编译GPU版本,CUDA相关库会在预测程序运行时动态装载,需要将CUDA相关的库设置到`LD_LIBRARY_PATH`环境变量中。 + +2. 链接静态库 `libpaddle_capi_whole.a`,需注意: + 1. 需要指定`-Wl,--whole-archive`链接选项。 + 1. 需要显式地链接 `gflags`、`glog`、`libz`、`protobuf` 等第三方库,可在`PADDLE_ROOT/third_party`下找到。 + 1. 如果在编译 C-API 时使用OpenBLAS数学库,需要显示地链接`libopenblas.a`。 + 1. 如果在编译 C-API 是使用MKL数学库,需要显示地链接MKL的动态库。 + +3. 链接静态库 `libpaddle_capi_layers.a`和`libpaddle_capi_engine.a`,需注意: + 1. 这种链接方式主要用于移动端预测。 + 1. 为了减少生成链接库的大小把`libpaddle_capi_whole.a`拆成以上两个静态链接库。 + 1. 需指定`-Wl,--whole-archive -lpaddle_capi_layers` 和 `-Wl,--no-whole-archive -lpaddle_capi_engine` 进行链接。 + 1. 第三方依赖库需要按照与方式2同样方法显示地进行链接。 diff --git a/doc/v2/howto/cluster/cmd_argument_cn.md b/doc/v2/howto/cluster/cmd_argument_cn.md index 40e1dde4858b802c2e703bcca4b71730facde5ef..c0ba093cbf2eac5c3b60a0b071b31776a11998f3 100644 --- a/doc/v2/howto/cluster/cmd_argument_cn.md +++ b/doc/v2/howto/cluster/cmd_argument_cn.md @@ -71,6 +71,13 @@ paddle.init( - trainer_id:**必选,默认0**,每个trainer的唯一ID,从0开始的整数 - pservers:**必选,默认127.0.0.1**,当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开 +```python +trainer = paddle.trainer.SGD(..., is_local=False) +``` + +参数说明 + +- is_local: **必选, 默认True**, 是否使用PServer更新参数 ## 准备数据集 diff --git a/doc/v2/howto/cluster/cmd_argument_en.md b/doc/v2/howto/cluster/cmd_argument_en.md index 40179c28f83800c1c74a6045f8fac6841bdafeaa..df1381a00fa0fa129eecffe002164c489a4183aa 100644 --- a/doc/v2/howto/cluster/cmd_argument_en.md +++ b/doc/v2/howto/cluster/cmd_argument_en.md @@ -73,6 +73,14 @@ Parameter Description - trainer_id: **required, default 0**, ID for every trainer, start from 0. - pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",". +```python +trainer = paddle.trainer.SGD(..., is_local=False) +``` + +Parameter Description + +- is_local: **required, default True**, whether update parameters by PServer. 
+ ## Prepare Training Dataset Here's some example code [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py), it will download public `imikolov` dataset and split it into multiple files according to job parallelism(trainers count). Modify `SPLIT_COUNT` at the begining of `prepare.py` to change the count of output files. diff --git a/doc/v2/howto/rnn/hierarchical_layer_cn.rst b/doc/v2/howto/rnn/hierarchical_layer_cn.rst index e05173c2006ff47ecb6ca5a4fe1502de750acc59..2f8f408b40299890da694862a7b9418cf9ff07f2 100644 --- a/doc/v2/howto/rnn/hierarchical_layer_cn.rst +++ b/doc/v2/howto/rnn/hierarchical_layer_cn.rst @@ -22,7 +22,7 @@ pooling ======== -pooling 的使用示例如下,详细见 :ref:`api_v2.layer_pooling` 配置API。 +pooling 的使用示例如下。 .. code-block:: bash @@ -47,7 +47,7 @@ pooling 的使用示例如下,详细见 :ref:`api_v2.layer_pooling` 配置API last_seq 和 first_seq ===================== -last_seq 的使用示例如下( :ref:`api_v2.layer_first_seq` 类似),详细见 :ref:`api_v2.layer_last_seq` 配置API。 +last_seq 的使用示例如下(first_seq 类似)。 .. code-block:: bash @@ -68,7 +68,7 @@ last_seq 的使用示例如下( :ref:`api_v2.layer_first_seq` 类似),详 expand ====== -expand 的使用示例如下,详细见 :ref:`api_v2.layer_expand` 配置API。 +expand 的使用示例如下。 .. code-block:: bash diff --git a/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst b/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst index efdc44455ea4dc81a87b4d4fc8a81e78b15cb06a..b05b66415fbb829f471b1491b9881f65137bfe17 100644 --- a/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst +++ b/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst @@ -4,7 +4,7 @@ 单双层RNN API对比介绍 ##################### -本文以PaddlePaddle的双层RNN单元测试为示例,用多对效果完全相同的、分别使用单双层RNN作为网络配置的模型,来讲解如何使用双层RNN。本文中所有的例子,都只是介绍双层RNN的API接口,并不是使用双层RNN解决实际的问题。如果想要了解双层RNN在具体问题中的使用,请参考\ :ref:`algo_hrnn_demo`\ 。本文中示例所使用的单元测试文件是\ `test_RecurrentGradientMachine.cpp `_\ 。 +本文以PaddlePaddle的双层RNN单元测试为示例,用多对效果完全相同的、分别使用单双层RNN作为网络配置的模型,来讲解如何使用双层RNN。本文中所有的例子,都只是介绍双层RNN的API接口,并不是使用双层RNN解决实际的问题。如果想要了解双层RNN在具体问题中的使用,请参考\ :ref:`algo_hrnn_demo`\ 。本文中示例所使用的单元测试文件是\ `test_RecurrentGradientMachine.cpp `_\ 。 示例1:双层RNN,子序列间无Memory ================================ @@ -166,11 +166,6 @@ 在上面代码中,单层和双层序列的使用和示例2中的示例类似,区别是同时处理了两个输入。而对于双层序列,两个输入的子序列长度也并不相同。但是,我们使用了\ :code:`targetInlink`\ 参数设置了外层\ :code:`recurrent_group`\ 的输出格式。所以外层输出的序列形状,和\ :code:`emb2`\ 的序列形状一致。 -示例4:beam_search的生成 -======================== - -TBD - 词汇表 ====== diff --git a/doc/v2/howto/rnn/index_cn.rst b/doc/v2/howto/rnn/index_cn.rst index bcc8c2f46eb662ec3650e829a77992224dbbb8e7..2032fb9e296ab024c68da1348064580c8c88d5be 100644 --- a/doc/v2/howto/rnn/index_cn.rst +++ b/doc/v2/howto/rnn/index_cn.rst @@ -1,10 +1,34 @@ RNN模型 =========== +循环神经网络(RNN)是对序列数据建模的重要工具。PaddlePaddle提供了灵活的接口以支持复杂循环神经网络的构建。 +这里将分为以下四个部分详细介绍如何使用PaddlePaddle搭建循环神经网络。 + +第一部分由浅入深的展示了使用PaddlePaddle搭建循环神经网络的全貌:首先以简单的循环神经网络(vanilla RNN)为例, +说明如何封装配置循环神经网络组件;然后更进一步的通过序列到序列(sequence to sequence)模型,逐步讲解如何构建完整而复杂的循环神经网络模型。 .. toctree:: :maxdepth: 1 rnn_config_cn.rst + +Recurrent Group是PaddlePaddle中实现复杂循环神经网络的关键,第二部分阐述了PaddlePaddle中Recurrent Group的相关概念和原理, +对Recurrent Group接口进行了详细说明。另外,对双层RNN(对应的输入为双层序列)及Recurrent Group在其中的使用进行了介绍。 + +.. toctree:: + :maxdepth: 1 + recurrent_group_cn.md + +第三部分对双层序列进行了解释说明,列出了PaddlePaddle中支持双层序列作为输入的Layer,并对其使用进行了逐一介绍。 + +.. toctree:: + :maxdepth: 1 + hierarchical_layer_cn.rst + +第四部分以PaddlePaddle的双层RNN单元测试中的网络配置为示例,辅以效果相同的单层RNN网络配置作为对比,讲解了多种情况下双层RNN的使用。 + +.. 
toctree:: + :maxdepth: 1 + hrnn_rnn_api_compare_cn.rst diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc index d72b64700f7cf680501fd3e355d20e694f1f097d..3693bc25d81a8309df1a6ddf3d9b08d484596ea9 100644 --- a/paddle/fluid/framework/block_desc.cc +++ b/paddle/fluid/framework/block_desc.cc @@ -135,6 +135,14 @@ OpDesc *BlockDesc::PrependOp() { return ops_.front().get(); } +OpDesc *BlockDesc::InsertOp(size_t index) { + need_update_ = true; + auto it = ops_.begin() + index; + std::unique_ptr new_op(new OpDesc(this)); + it = ops_.insert(it, std::move(new_op)); + return (*it).get(); +} + void BlockDesc::RemoveOp(size_t s, size_t e) { if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) { return; diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h index 3bd90f38907c0a45ae0c9bb00706e5c127f08417..185f018ac1b5863e0ee86fdaa17df1ccbc6e030e 100644 --- a/paddle/fluid/framework/block_desc.h +++ b/paddle/fluid/framework/block_desc.h @@ -87,6 +87,8 @@ class BlockDesc { OpDesc *PrependOp(); + OpDesc *InsertOp(size_t index); + void RemoveOp(size_t s, size_t e); std::vector AllOps() const; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index f8e7d0d99074936ad81c4ddc52be6907ead4c27d..5cae38b2a857b2037f0e5ae4da50d1591da0c11a 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -34,6 +34,15 @@ DEFINE_bool(check_nan_inf, false, namespace paddle { namespace framework { +struct ExecutorPrepareContext { + ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id) + : prog_(prog), block_id_(block_id) {} + + framework::ProgramDesc prog_; + size_t block_id_; + std::vector> ops_; +}; + Executor::Executor(const platform::Place& place) : place_(place) {} static void CreateTensor(Variable* var, proto::VarType::Type var_type) { @@ -85,73 +94,9 @@ static void CheckTensorNANOrInf(const std::string& name, void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool create_local_scope, bool create_vars) { - // TODO(tonyyang-svail): - // - only runs on the first device (i.e. 
no interdevice communication) - // - will change to use multiple blocks for RNN op and Cond Op - PADDLE_ENFORCE_LT(static_cast(block_id), pdesc.Size()); - auto& block = pdesc.Block(block_id); - - Scope* local_scope = scope; - if (create_vars) { - if (create_local_scope) { - local_scope = &scope->NewScope(); - for (auto& var : block.AllVars()) { - if (var->Name() == framework::kEmptyVarName) { - continue; - } - - if (var->Persistable()) { - auto* ptr = scope->Var(var->Name()); - CreateTensor(ptr, var->GetType()); - VLOG(3) << "Create Variable " << var->Name() - << " global, which pointer is " << ptr; - } else { - auto* ptr = local_scope->Var(var->Name()); - CreateTensor(ptr, var->GetType()); - VLOG(3) << "Create Variable " << var->Name() - << " locally, which pointer is " << ptr; - } - } - } else { - for (auto& var : block.AllVars()) { - auto* ptr = local_scope->Var(var->Name()); - CreateTensor(ptr, var->GetType()); - VLOG(3) << "Create variable " << var->Name() << ", which pointer is " - << ptr; - } - } // if (create_local_scope) - } // if (create_vars) - - for (auto& op_desc : block.AllOps()) { - auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); - - VLOG(4) << place_ << " " << op->DebugStringEx(local_scope); - op->Run(*local_scope, place_); - VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); - - if (FLAGS_benchmark) { - VLOG(2) << "Memory used after operator " + op->Type() + " running: " - << memory::memory_usage(place_); - } - if (FLAGS_check_nan_inf) { - for (auto& vname : op->OutputVars(true)) { - auto* var = local_scope->FindVar(vname); - if (var == nullptr) continue; - if (var->IsType()) { - CheckTensorNANOrInf(vname, var->Get()); - } - } - } - } - if (create_vars && create_local_scope) { - scope->DeleteScope(local_scope); - } - if (FLAGS_benchmark) { - VLOG(2) << "-------------------------------------------------------"; - VLOG(2) << "Memory used after deleting local scope: " - << memory::memory_usage(place_); - VLOG(2) << "-------------------------------------------------------"; - } + auto* ctx = Prepare(pdesc, block_id); + RunPreparedContext(ctx, scope, create_local_scope, create_vars); + delete ctx; } // Check whether the block already has feed operators and feed_holder. 
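(Editor's note, not part of the patch: the rewritten `Executor::Run` above delegates to the new `Prepare()`/`RunPreparedContext()` pair, which lets callers build the operators once and execute them repeatedly. A hedged usage sketch follows; the wrapper function and its parameters are assumptions for illustration.)

```cpp
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/place.h"

// Hypothetical helper: run block 0 of `program` `repeats` times while creating
// the operators only once via Prepare()/RunPreparedContext().
void RunRepeatedly(const paddle::framework::ProgramDesc& program,
                   paddle::framework::Scope* scope, int repeats) {
  paddle::framework::Executor executor(paddle::platform::CPUPlace());
  auto* ctx = paddle::framework::Executor::Prepare(program, 0 /* block_id */);
  for (int i = 0; i < repeats; ++i) {
    executor.RunPreparedContext(ctx, scope, true /* create_local_scope */,
                                true /* create_vars */);
  }
  delete ctx;  // Prepare() returns a raw pointer owned by the caller.
}
```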
@@ -313,5 +258,81 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, delete copy_program; } +ExecutorPrepareContext* Executor::Prepare(const ProgramDesc& program, + int block_id) { + auto* ctx = new ExecutorPrepareContext(program, block_id); + PADDLE_ENFORCE_LT(static_cast(block_id), program.Size()); + auto& block = program.Block(block_id); + for (auto& op_desc : block.AllOps()) { + ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); + } + return ctx; +} + +void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, + bool create_local_scope, bool create_vars) { + auto& block = ctx->prog_.Block(ctx->block_id_); + + Scope* local_scope = scope; + if (create_vars) { + if (create_local_scope) { + local_scope = &scope->NewScope(); + for (auto& var : block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var->Persistable()) { + auto* ptr = scope->Var(var->Name()); + CreateTensor(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " global, which pointer is " << ptr; + } else { + auto* ptr = local_scope->Var(var->Name()); + CreateTensor(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " locally, which pointer is " << ptr; + } + } + } else { + for (auto& var : block.AllVars()) { + auto* ptr = local_scope->Var(var->Name()); + CreateTensor(ptr, var->GetType()); + VLOG(3) << "Create variable " << var->Name() << ", which pointer is " + << ptr; + } + } // if (create_local_scope) + } // if (create_vars) + + for (auto& op : ctx->ops_) { + VLOG(4) << place_ << " " << op->DebugStringEx(local_scope); + op->Run(*local_scope, place_); + VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); + + if (FLAGS_benchmark) { + VLOG(2) << "Memory used after operator " + op->Type() + " running: " + << memory::memory_usage(place_); + } + if (FLAGS_check_nan_inf) { + for (auto& vname : op->OutputVars(true)) { + auto* var = local_scope->FindVar(vname); + if (var == nullptr) continue; + if (var->IsType()) { + CheckTensorNANOrInf(vname, var->Get()); + } + } + } + } + if (create_vars && create_local_scope) { + scope->DeleteScope(local_scope); + } + if (FLAGS_benchmark) { + VLOG(2) << "-------------------------------------------------------"; + VLOG(2) << "Memory used after deleting local scope: " + << memory::memory_usage(place_); + VLOG(2) << "-------------------------------------------------------"; + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index c1f4d4e02a951e8b127b66cae125309e4798cc76..28ce3315154cea45412984df4daf7385ce2cf572 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -22,7 +22,7 @@ limitations under the License. 
*/ namespace paddle { namespace framework { - +struct ExecutorPrepareContext; class Executor { public: // TODO(dzhwinter) : Do not rely on this function, it will be removed @@ -38,8 +38,8 @@ class Executor { * ProgramDesc * Scope */ - void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true, - bool create_vars = true); + void Run(const ProgramDesc& prog, Scope* scope, int block_id, + bool create_local_scope = true, bool create_vars = true); void Run(const ProgramDesc& program, Scope* scope, std::map& feed_targets, @@ -47,6 +47,13 @@ class Executor { const std::string& feed_holder_name = "feed", const std::string& fetch_holder_name = "fetch"); + static ExecutorPrepareContext* Prepare(const ProgramDesc& program, + int block_id); + + void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, + bool create_local_scope = true, + bool create_vars = true); + private: const platform::Place place_; }; diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ac6289c5abe8f40ae9ee32aa3d58cdef3ff0e836..371c2fad97b1efd06eea9ac631122f194e65d656 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -74,6 +74,9 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { platform::SetDeviceId(dev_id); #endif } + // profile + auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place); + platform::RecordEvent record_event(Type(), dev_ctx); RunImpl(scope, place); } @@ -497,9 +500,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, RuntimeInferShapeContext infer_shape_ctx(*this, scope); this->InferShape(&infer_shape_ctx); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto dev_ctx = pool.Get(place); - // profile - platform::RecordEvent record_event(Type(), dev_ctx); + auto* dev_ctx = pool.Get(place); // check if op[type] has kernel registered. 
auto& all_op_kernels = AllOpKernels(); auto kernels_iter = all_op_kernels.find(type_); diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h index 27ab6e750c2e665fa5055a3ecfb2f315cb4000c0..e820c3d07e85fd1dea9080786b48ad031330ee00 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -26,7 +26,6 @@ class ReaderBase { PADDLE_ENFORCE(!shapes_.empty()); } virtual void ReadNext(std::vector* out) = 0; - virtual bool HasNext() const = 0; virtual void ReInit() = 0; @@ -52,8 +51,6 @@ class DecoratedReader : public ReaderBase { PADDLE_ENFORCE_NOT_NULL(reader_); } - bool HasNext() const override { return reader_->HasNext(); } - void ReInit() override { reader_->ReInit(); } protected: @@ -68,13 +65,25 @@ class ReaderHolder { ReaderBase* Get() const { return reader_.get(); } - void ReadNext(std::vector* out) { reader_->ReadNext(out); } - bool HasNext() const { return reader_->HasNext(); } - void ReInit() { reader_->ReInit(); } + void ReadNext(std::vector* out) { + PADDLE_ENFORCE_NOT_NULL(reader_); + reader_->ReadNext(out); + } + void ReInit() { + PADDLE_ENFORCE_NOT_NULL(reader_); + reader_->ReInit(); + } - DDim shape(size_t idx) const { return reader_->shape(idx); } - std::vector shapes() const { return reader_->shapes(); } + DDim shape(size_t idx) const { + PADDLE_ENFORCE_NOT_NULL(reader_); + return reader_->shape(idx); + } + std::vector shapes() const { + PADDLE_ENFORCE_NOT_NULL(reader_); + return reader_->shapes(); + } void set_shapes(const std::vector& shapes) { + PADDLE_ENFORCE_NOT_NULL(reader_); reader_->set_shapes(shapes); } diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index ea6c8cebd3ff16451c974bf3a0ded9d822a9caf8..17e38b1cf042657834b4d0d1c12cbbb92f19fa45 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include // for unique_ptr #include // for call_once +#include #include "glog/logging.h" #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/string/printf.h" @@ -102,6 +103,18 @@ void Scope::DeleteScope(Scope* scope) { } } +void Scope::EraseVars(std::vector& var_names) { + std::set var_set(var_names.begin(), var_names.end()); + for (auto it = vars_.begin(); it != vars_.end();) { + if (var_set.find(it->first) != var_set.end()) { + delete it->second; + it = vars_.erase(it); + } else { + ++it; + } + } +} + void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { auto origin_it = vars_.find(origin_name); diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index d8fad162e59c8ea6465b2dc78c90709a758ace24..c1e1f49caaa5a60df0e97289aada465b45213971 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -51,6 +51,8 @@ class Scope { /// Create a variable with a scope-unique name. Variable* Var(std::string* name = nullptr); + void EraseVars(std::vector& var_names); + /// Find a variable in the scope or any of its ancestors. Returns /// nullptr if cannot find. 
Variable* FindVar(const std::string& name) const; diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 9b465b85b0a02ffe990ab669a22f78e923e24f99..8b7533ce712b0a01060842b6f71449ed6bd23e2c 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -187,7 +187,6 @@ bool TensorContainsInf(const framework::Tensor& tensor) { void TensorToStream(std::ostream& os, const Tensor& tensor, const platform::DeviceContext& dev_ctx) { - // TODO(typhoonzero): serialize to ostream { // the 1st field, uint32_t version constexpr uint32_t version = 0; os.write(reinterpret_cast(&version), sizeof(version)); diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index 3adc260caf5eaaa1408a26ed91457666e6c4adce..df51fb24a588c84788d7d0b671f932ff4c40f9c2 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -67,10 +67,10 @@ class ThreadPool { } catch (platform::EnforceNotMet ex) { return std::unique_ptr( new platform::EnforceNotMet(ex)); - } catch (...) { - LOG(FATAL) - << "Unexpected exception is catched in thread pool. All " - "throwable exception in Fluid should be an EnforceNotMet."; + } catch (const std::exception& e) { + LOG(FATAL) << "Unexpected exception is catched in thread pool. All " + "throwable exception in Fluid should be an EnforceNotMet." + << e.what(); } return nullptr; }); diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 0f5fe6d0aa9a5522c67a3c06f8677f1f2f259eb3..dce541c0971a6ff9a3728e915fe8c3d009c23550 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -115,11 +115,11 @@ void TestInference(const std::string& dirname, #endif } - // Enable the profiler - paddle::platform::EnableProfiler(state); - // 2. Initialize the inference_program and load parameters std::unique_ptr inference_program; + + // Enable the profiler + paddle::platform::EnableProfiler(state); { paddle::platform::RecordEvent record_event( "init_program", @@ -143,6 +143,10 @@ void TestInference(const std::string& dirname, inference_program = paddle::inference::Load(executor, *scope, dirname); } } + // Disable the profiler and print the timing information + paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, + "load_program_profiler.txt"); + paddle::platform::ResetProfiler(); // 3. Get the feed_target_names and fetch_target_names const std::vector& feed_target_names = @@ -165,6 +169,12 @@ void TestInference(const std::string& dirname, // 6. 
Run the inference program { + // Ignore the profiling results of the first run + executor.Run(*inference_program, scope, feed_targets, fetch_targets); + + // Enable the profiler + paddle::platform::EnableProfiler(state); + // Run repeat times to profile the performance for (int i = 0; i < repeat; ++i) { paddle::platform::RecordEvent record_event( @@ -173,12 +183,13 @@ void TestInference(const std::string& dirname, executor.Run(*inference_program, scope, feed_targets, fetch_targets); } - } - // Disable the profiler and print the timing information - paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, - "profiler.txt"); - paddle::platform::ResetProfiler(); + // Disable the profiler and print the timing information + paddle::platform::DisableProfiler( + paddle::platform::EventSortingKey::kDefault, + "run_inference_profiler.txt"); + paddle::platform::ResetProfiler(); + } delete scope; } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 5d436a7e0c3752c889c19820507589f34d3bee94..625e0f7561899d30b40f9daa56f743a37bdaa27f 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -222,8 +222,6 @@ cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory) -if(WITH_GPU) - cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) -endif() cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) +nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index a5ec47d84fe423f77de494e342931a6d278049d2..72f8cb04f2de3af4ee526c3d9b86ff96e34f0b0a 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -63,13 +63,27 @@ class CastOpGradMaker : public framework::SingleGradOpDescMaker { } }; +class CastOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); + // CastOp kernel's device type is decided by input tensor place + kt.place_ = ctx.Input("X")->place(); + return kt; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; -REGISTER_OP_WITH_KERNEL(cast, ops::CastOpGradMaker, ops::CastOpInferShape, - ops::CastOpProtoMaker); +REGISTER_OPERATOR(cast, ops::CastOp, ops::CastOpGradMaker, + ops::CastOpInferShape, ops::CastOpProtoMaker); REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index d59cc2c9d424f067ca638cb76e52c2e95ae75182..0a8a5d4c71c4510f04eea2f7ef12f836d1fd9c9b 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -12,58 +12,21 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "mkldnn.hpp" -#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" namespace paddle { namespace operators { -using paddle::framework::Tensor; -using paddle::platform::MKLDNNDeviceContext; -using paddle::platform::MKLDNNMemDesc; - -using mkldnn::memory; // Note: paddle has also "memory" namespace -using mkldnn::primitive; -using mkldnn::convolution_forward; -using mkldnn::convolution_backward_weights; -using mkldnn::convolution_backward_data; -using mkldnn::convolution_direct; -using mkldnn::prop_kind; -using mkldnn::padding_kind; -using mkldnn::stream; - -namespace { -std::unique_ptr -ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, - const memory::desc& dst, const std::vector& strides, - const std::vector& paddings, - const mkldnn::engine& engine); - -convolution_backward_weights::primitive_desc ConvBwdWeightsPrimitiveDesc( - const memory::desc& src, const memory::desc& diff_weights, - const memory::desc& diff_dst, const std::vector& strides, - const std::vector& paddings, - const convolution_forward::primitive_desc& conv_pd, - const mkldnn::engine& engine); - -convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc( - const memory::desc& diff_src, const memory::desc& weights, - const memory::desc& diff_dst, const std::vector& strides, - const std::vector& paddings, - const convolution_forward::primitive_desc& conv_pd, - const mkldnn::engine& engine); -} // anonymous namespace - template -class ConvOpMkldnnKernel : public paddle::framework::OpKernel { +class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = + ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); auto* input = ctx.Input("Input"); @@ -88,7 +51,6 @@ class ConvOpMkldnnKernel : public paddle::framework::OpKernel { const T* input_data = input->data(); const T* filter_data = filter->data(); - // allocate memory for output T* output_data = output->mutable_data(ctx.GetPlace()); PADDLE_ENFORCE(input->dims().size() == 4, @@ -102,48 +64,69 @@ class ConvOpMkldnnKernel : public paddle::framework::OpKernel { std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); // TODO(pzelazko-intel): support more formats - // memory descriptors for convolution src/weight/dst - auto conv_src_md = - MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw); - auto conv_weights_md = - MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw); - auto conv_dst_md = - MKLDNNMemDesc(dst_tz, memory::data_type::f32, memory::format::nchw); - - // create memory primitives - auto conv_src_memory = - memory({conv_src_md, mkldnn_engine}, (void*)input_data); - auto conv_weights_memory = - memory({conv_weights_md, mkldnn_engine}, (void*)filter_data); - auto conv_dst_memory = memory({conv_dst_md, mkldnn_engine}, output_data); - - std::unique_ptr conv_pd = - ConvFwdPrimitiveDesc(conv_src_md, conv_weights_md, conv_dst_md, strides, - paddings, mkldnn_engine); - - // save p_conv_pd into dev_ctx to be referred in backward path - auto p_conv_pd = conv_pd.get(); - std::shared_ptr conv_pd_value = std::move(conv_pd); - dev_ctx.SetBlob(key_conv_pd, conv_pd_value); + auto src_md = platform::MKLDNNMemDesc( + src_tz, 
mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + auto weights_md = + platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32, + mkldnn::memory::format::oihw); + auto dst_md = platform::MKLDNNMemDesc( + dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + + auto src_memory = + mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data); + auto weights_memory = + mkldnn::memory({weights_md, mkldnn_engine}, (void*)filter_data); + auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, output_data); + + std::shared_ptr conv_pd = + ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, + mkldnn_engine); + + // save conv_pd into global device context to be referred in backward path + dev_ctx.SetBlob(key_conv_pd, conv_pd); // create convolution op primitive - auto conv_prim = convolution_forward(*p_conv_pd, conv_src_memory, - conv_weights_memory, conv_dst_memory); + auto conv_prim = mkldnn::convolution_forward(*conv_pd, src_memory, + weights_memory, dst_memory); + + // push primitive to stream and wait until it's executed + std::vector pipeline{conv_prim}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } - // push op to stream and wait MKLDNN until it's executed - std::vector pipeline{conv_prim}; - stream(stream::kind::eager).submit(pipeline).wait(); + private: + std::unique_ptr + ConvFwdPrimitiveDesc(const mkldnn::memory::desc& src, + const mkldnn::memory::desc& weights, + const mkldnn::memory::desc& dst, + const std::vector& strides, + const std::vector& paddings, + const mkldnn::engine& engine) const { + mkldnn::memory::dims stride_dims = {strides[0], strides[1]}; + mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto conv_desc = mkldnn::convolution_forward::desc( + mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, + dst, stride_dims, padding_dims, padding_dims, + mkldnn::padding_kind::zero); + + auto p_conv_pd = + new mkldnn::convolution_forward::primitive_desc(conv_desc, engine); + + return std::unique_ptr( + p_conv_pd); } }; template -class ConvGradOpMkldnnKernel : public paddle::framework::OpKernel { +class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = + ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); const Tensor* input = ctx.Input("Input"); @@ -170,7 +153,6 @@ class ConvGradOpMkldnnKernel : public paddle::framework::OpKernel { T* input_grad_data = nullptr; T* filter_grad_data = nullptr; - // allocate memory for gradient of input/filter if (input_grad) { input_grad_data = input_grad->mutable_data(ctx.GetPlace()); } @@ -184,130 +166,111 @@ class ConvGradOpMkldnnKernel : public paddle::framework::OpKernel { std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); // TODO(pzelazko-intel): support more formats - auto conv_src_md = - MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw); - auto conv_diff_src_md = - MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw); - auto conv_weights_md = - MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw); - auto conv_diff_weights_md = - MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw); - auto conv_diff_dst_md = - MKLDNNMemDesc(dst_tz, memory::data_type::f32, 
memory::format::nchw); + auto src_md = platform::MKLDNNMemDesc( + src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + auto diff_src_md = platform::MKLDNNMemDesc( + src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + auto weights_md = + platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32, + mkldnn::memory::format::oihw); + auto diff_weights_md = + platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32, + mkldnn::memory::format::oihw); + auto diff_dst_md = platform::MKLDNNMemDesc( + dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); // create memory - auto conv_diff_dst_memory = - memory({conv_diff_weights_md, mkldnn_engine}, (void*)output_grad_data); + auto diff_dst_memory = mkldnn::memory({diff_weights_md, mkldnn_engine}, + (void*)output_grad_data); // Retrieve conv_pd from device context - std::shared_ptr conv_pd; - convolution_forward::primitive_desc* p_conv_pd; - - conv_pd = dev_ctx.GetBlob(key_conv_pd); + auto conv_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_conv_pd)); PADDLE_ENFORCE(conv_pd != nullptr, "Fail to find conv_pd in device context"); - p_conv_pd = - static_cast(conv_pd.get()); // create backward conv primitive for weights if (filter_grad) { // create primitive descriptor - convolution_backward_weights::primitive_desc conv_bwd_weights_pd = - ConvBwdWeightsPrimitiveDesc(conv_src_md, conv_diff_weights_md, - conv_diff_dst_md, strides, paddings, - *p_conv_pd, mkldnn_engine); + mkldnn::convolution_backward_weights::primitive_desc conv_bwd_weights_pd = + ConvBwdWeightsPrimitiveDesc(src_md, diff_weights_md, diff_dst_md, + strides, paddings, *conv_pd, + mkldnn_engine); // create memory - auto conv_diff_weights_memory = memory( - {conv_diff_weights_md, mkldnn_engine}, (void*)filter_grad_data); - auto conv_src_memory = - memory({conv_src_md, mkldnn_engine}, (void*)input_data); + auto diff_weights_memory = mkldnn::memory( + {diff_weights_md, mkldnn_engine}, (void*)filter_grad_data); + auto src_memory = + mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data); // create backward conv primitive for weights - auto conv_bwd_weights_prim = convolution_backward_weights( - conv_bwd_weights_pd, conv_src_memory, conv_diff_dst_memory, - conv_diff_weights_memory); + auto conv_bwd_weights_prim = mkldnn::convolution_backward_weights( + conv_bwd_weights_pd, src_memory, diff_dst_memory, + diff_weights_memory); // push primitive and execute it - std::vector pipeline{conv_bwd_weights_prim}; - stream(stream::kind::eager).submit(pipeline).wait(); + std::vector pipeline{conv_bwd_weights_prim}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); } if (input_grad) { // create primitive descriptor - convolution_backward_data::primitive_desc conv_bwd_data_pd = - ConvBwdDataPrimitiveDesc(conv_diff_src_md, conv_weights_md, - conv_diff_dst_md, strides, paddings, - *p_conv_pd, mkldnn_engine); + mkldnn::convolution_backward_data::primitive_desc conv_bwd_data_pd = + ConvBwdDataPrimitiveDesc(diff_src_md, weights_md, diff_dst_md, + strides, paddings, *conv_pd, mkldnn_engine); // create memory - auto conv_diff_src_memory = - memory({conv_diff_src_md, mkldnn_engine}, (void*)input_grad_data); - auto conv_weights_memory = - memory({conv_weights_md, mkldnn_engine}, (void*)filter_data); + auto diff_src_memory = + mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)input_grad_data); + auto weights_memory = + mkldnn::memory({weights_md, mkldnn_engine}, (void*)filter_data); // create backward conv 
primitive for data - auto conv_bwd_data_prim = - convolution_backward_data(conv_bwd_data_pd, conv_diff_dst_memory, - conv_weights_memory, conv_diff_src_memory); + auto conv_bwd_data_prim = mkldnn::convolution_backward_data( + conv_bwd_data_pd, diff_dst_memory, weights_memory, diff_src_memory); - // push primitive and execute it - std::vector pipeline{conv_bwd_data_prim}; - stream(stream::kind::eager).submit(pipeline).wait(); + // push primitive to stream and wait until it's executed + std::vector pipeline{conv_bwd_data_prim}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); } } // Compute() + + private: + mkldnn::convolution_backward_weights::primitive_desc + ConvBwdWeightsPrimitiveDesc( + const mkldnn::memory::desc& src, const mkldnn::memory::desc& diff_weights, + const mkldnn::memory::desc& diff_dst, const std::vector& strides, + const std::vector& paddings, + const mkldnn::convolution_forward::primitive_desc& conv_pd, + const mkldnn::engine& engine) const { + auto conv_bwd_weights_desc = mkldnn::convolution_backward_weights::desc( + mkldnn::convolution_direct, src, diff_weights, diff_dst, strides, + paddings, paddings, mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_weights::primitive_desc( + conv_bwd_weights_desc, engine, conv_pd); + } + + mkldnn::convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc( + const mkldnn::memory::desc& diff_src, const mkldnn::memory::desc& weights, + const mkldnn::memory::desc& diff_dst, const std::vector& strides, + const std::vector& paddings, + const mkldnn::convolution_forward::primitive_desc& conv_pd, + const mkldnn::engine& engine) const { + auto conv_bwd_data_desc = mkldnn::convolution_backward_data::desc( + mkldnn::convolution_direct, diff_src, weights, diff_dst, strides, + paddings, paddings, mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_data::primitive_desc(conv_bwd_data_desc, + engine, conv_pd); + } }; -namespace { -std::unique_ptr ConvFwdPrimitiveDesc( - const memory::desc& src, const memory::desc& weights, - const memory::desc& dst, const std::vector& strides, - const std::vector& paddings, const mkldnn::engine& engine) { - mkldnn::memory::dims stride_dims = {strides[0], strides[1]}; - mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]}; - - auto conv_desc = mkldnn::convolution_forward::desc( - mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, dst, - stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - - auto p_conv_pd = new convolution_forward::primitive_desc(conv_desc, engine); - - return std::unique_ptr( - p_conv_pd); -} - -convolution_backward_weights::primitive_desc ConvBwdWeightsPrimitiveDesc( - const memory::desc& src, const memory::desc& diff_weights, - const memory::desc& diff_dst, const std::vector& strides, - const std::vector& paddings, - const convolution_forward::primitive_desc& conv_pd, - const mkldnn::engine& engine) { - auto conv_bwd_weights_desc = convolution_backward_weights::desc( - convolution_direct, src, diff_weights, diff_dst, strides, paddings, - paddings, padding_kind::zero); - return convolution_backward_weights::primitive_desc(conv_bwd_weights_desc, - engine, conv_pd); -} - -convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc( - const memory::desc& diff_src, const memory::desc& weights, - const memory::desc& diff_dst, const std::vector& strides, - const std::vector& paddings, - const convolution_forward::primitive_desc& conv_pd, - const mkldnn::engine& engine) { - auto 
conv_bwd_data_desc = convolution_backward_data::desc( - convolution_direct, diff_src, weights, diff_dst, strides, paddings, - paddings, padding_kind::zero); - return convolution_backward_data::primitive_desc(conv_bwd_data_desc, engine, - conv_pd); -} -} // anonymous namespace } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace, - ops::ConvOpMkldnnKernel); + ops::ConvMKLDNNOpKernel); REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::ConvGradOpMkldnnKernel); + ops::ConvMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/delete_var_op.cc b/paddle/fluid/operators/delete_var_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1fe9404c00335edbe3594486f8c403e69f2ab08f --- /dev/null +++ b/paddle/fluid/operators/delete_var_op.cc @@ -0,0 +1,52 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +class DeleteVarOp : public framework::OperatorBase { + public: + DeleteVarOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + dev_ctx.Wait(); + + auto delete_var_names = Inputs("X"); + const_cast(scope).EraseVars(delete_var_names); + } +}; + +class DeleteVarOpInfoMaker : public framework::OpProtoAndCheckerMaker { + public: + DeleteVarOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of delete op").AsDuplicable(); + AddComment(R"DOC( +Delete Operator. +It should not be configured by users directly. 
+)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(delete_var, paddle::operators::DeleteVarOp, + paddle::framework::EmptyGradOpMaker, + paddle::operators::DeleteVarOpInfoMaker); diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt index 0581bd2ac55218a2955fcb260d8b61cac0d210b5..94395ccfbcbd74ee40552a5c70dc8b8063a5f851 100644 --- a/paddle/fluid/operators/detail/CMakeLists.txt +++ b/paddle/fluid/operators/detail/CMakeLists.txt @@ -1,3 +1,6 @@ if(WITH_DISTRIBUTE) - grpc_library(sendrecvop_grpc SRCS sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) + grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set_source_files_properties(test_serde.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_test(serde_test SRCS test_serde.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc) endif() diff --git a/paddle/fluid/operators/detail/bytebuffer_stream.cc b/paddle/fluid/operators/detail/bytebuffer_stream.cc new file mode 100644 index 0000000000000000000000000000000000000000..741dd51de9e75feb608161579e56cb160b058ebb --- /dev/null +++ b/paddle/fluid/operators/detail/bytebuffer_stream.cc @@ -0,0 +1,88 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// NOTE: This file was originally created by tensorflow +// (https://github.com/tensorflow/tensorflow/) we borrow this +// file and did some modifications so that we can send gRPC +// requests without too much copying of the tensor data. + +#include "bytebuffer_stream.h" + +namespace paddle { +namespace operators { +namespace detail { + +GrpcByteBufferSource::GrpcByteBufferSource() {} + +bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) { + cur_ = -1; + left_ = 0; + ptr_ = nullptr; + byte_count_ = 0; + bool ok = src.Dump(&slices_).ok(); + if (!ok) { + slices_.clear(); + } + return ok; +} + +bool GrpcByteBufferSource::Next(const void** data, int* size) { + // Use loop instead of if in case buffer contained empty slices. + while (left_ == 0) { + // Advance to next slice. 
+ cur_++; + if (cur_ >= slices_.size()) { + return false; + } + const ::grpc::Slice& s = slices_[cur_]; + left_ = s.size(); + ptr_ = reinterpret_cast(s.begin()); + } + + *data = ptr_; + *size = left_; + byte_count_ += left_; + ptr_ += left_; + left_ = 0; + return true; +} + +void GrpcByteBufferSource::BackUp(int count) { + ptr_ -= count; + left_ += count; + byte_count_ -= count; +} + +bool GrpcByteBufferSource::Skip(int count) { + const void* data; + int size; + while (Next(&data, &size)) { + if (size >= count) { + BackUp(size - count); + return true; + } + // size < count; + count -= size; + } + // error or we have too large count; + return false; +} + +google::protobuf::int64 GrpcByteBufferSource::ByteCount() const { + return byte_count_; +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/bytebuffer_stream.h b/paddle/fluid/operators/detail/bytebuffer_stream.h new file mode 100644 index 0000000000000000000000000000000000000000..099deb12d0e436427c147ab9b1eb553b712e14fb --- /dev/null +++ b/paddle/fluid/operators/detail/bytebuffer_stream.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// NOTE: This file was originally created by tensorflow +// (https://github.com/tensorflow/tensorflow/) we borrow this +// file and did some modifications so that we can send gRPC +// requests without too much copying of the tensor data. + +#pragma once + +#include +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" + +namespace paddle { +namespace operators { +namespace detail { + +// A ZeroCopyInputStream that reads from a grpc::ByteBuffer. +class GrpcByteBufferSource + : public ::google::protobuf::io::ZeroCopyInputStream { + public: + GrpcByteBufferSource(); + bool Init(const ::grpc::ByteBuffer& src); // Can be called multiple times. + bool Next(const void** data, int* size) override; + void BackUp(int count) override; + bool Skip(int count) override; + ::google::protobuf::int64 ByteCount() const override; + + private: + std::vector<::grpc::Slice> slices_; + size_t cur_; // Current slice index. + int left_; // Number of bytes in slices_[cur_] left to yield. + const char* ptr_; // Address of next byte in slices_[cur_] to yield. + ::google::protobuf::int64 byte_count_; +}; + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/proto_encoder_helper.h b/paddle/fluid/operators/detail/proto_encoder_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..4a7bfb8bd586fe84c9243bc64117d146c4386674 --- /dev/null +++ b/paddle/fluid/operators/detail/proto_encoder_helper.h @@ -0,0 +1,147 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// NOTE: This file was originally created by tensorflow +// (https://github.com/tensorflow/tensorflow/) we borrow this +// file and did some modifications so that we can send gRPC +// requests without too much copying of the tensor data. + +#pragma once + +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace detail { + +char* EncodeVarint32(char* dst, uint32_t v) { + // Operate on characters as unsigneds + unsigned char* ptr = reinterpret_cast(dst); + static const int B = 128; + if (v < (1 << 7)) { + *(ptr++) = v; + } else if (v < (1 << 14)) { + *(ptr++) = v | B; + *(ptr++) = v >> 7; + } else if (v < (1 << 21)) { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = v >> 14; + } else if (v < (1 << 28)) { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = v >> 21; + } else { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = (v >> 21) | B; + *(ptr++) = v >> 28; + } + return reinterpret_cast(ptr); +} + +char* EncodeVarint64(char* dst, uint64_t v) { + static const int B = 128; + unsigned char* ptr = reinterpret_cast(dst); + while (v >= B) { + *(ptr++) = (v & (B - 1)) | B; + v >>= 7; + } + *(ptr++) = static_cast(v); + return reinterpret_cast(ptr); +} + +int VarintLength(uint64_t v) { + int len = 1; + while (v >= 128) { + v >>= 7; + len++; + } + return len; +} + +class ProtoEncodeHelper { + public: + ProtoEncodeHelper(char* buf, int max_size) + : base_(buf), p_(buf), limit_(base_ + max_size) {} + + ~ProtoEncodeHelper() { + // Make sure callers didn't do operations that went over max_size promised + PADDLE_ENFORCE_LE(p_, limit_); + } + + const char* data() const { return base_; } + size_t size() const { return p_ - base_; } + + void WriteUint64(int tag, uint64_t v) { + Encode32(combine(tag, WIRETYPE_VARINT)); + Encode64(v); + } + void WriteBool(int tag, bool v) { + Encode32(combine(tag, WIRETYPE_VARINT)); + EncodeBool(v); + } + void WriteString(int tag, const std::string& v) { + Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); + Encode32(v.size()); + EncodeBytes(v.data(), v.size()); + } + void WriteVarlengthBeginning(int tag, uint32_t len) { + Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); + Encode32(len); + } + void WriteRawBytes(const std::string& v) { EncodeBytes(v.data(), v.size()); } + + private: + // Note: this module's behavior must match the protocol buffer wire encoding + // format. + enum { + WIRETYPE_VARINT = 0, + WIRETYPE_LENGTH_DELIMITED = 2, + }; + static uint32_t combine(uint32_t tag, uint32_t type) { + return ((tag << 3) | type); + } + inline void Encode32(uint32_t v) { + if (v < 128) { + // Fast path for single-byte values. Many of the calls will use a + // constant value for v, so the comparison will get optimized away + // when Encode32 is inlined into the caller. + *p_ = v; + p_++; + } else { + p_ = EncodeVarint32(p_, v); + } + } + void Encode64(uint64_t v) { p_ = EncodeVarint64(p_, v); } + void EncodeBool(bool v) { + *p_ = (v ? 
1 : 0); // Equal to varint32 encoding of 0 or 1 + p_++; + } + void EncodeBytes(const char* bytes, int N) { + memcpy(p_, bytes, N); + p_ += N; + } + + char* base_; + char* p_; + char* limit_; // Just for CHECKs +}; + +} // detail +} // operators +} // paddle diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto index 8f962b4c69cc83dc2ab98b7dc27e18bc4b42bf18..b0215d4a80c9440f09c35434903fd6166b03e8b0 100644 --- a/paddle/fluid/operators/detail/send_recv.proto +++ b/paddle/fluid/operators/detail/send_recv.proto @@ -33,10 +33,34 @@ enum VarType { } message VariableMessage { + enum Type { + // Pod Types + BOOL = 0; + INT16 = 1; + INT32 = 2; + INT64 = 3; + FP16 = 4; + FP32 = 5; + FP64 = 6; + } + + message LodData { repeated int64 lod_data = 1; } + string varname = 1; // TODO(Yancey1989): reference framework::proto::VarDesc::VarType VarType type = 2; - bytes serialized = 3; + // bool persistable is not needed for sending. + // tensor info: + Type data_type = 3; + repeated int64 dims = 4; + + // lod details: + int64 lod_level = 5; + repeated LodData lod = 6; + // tensor data + bytes serialized = 7; + // selected_rows data + bytes rows = 8; } message VoidMessage {} diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc index 169fd40fd950a74e61a4ed06a370f25b533957db..39117eeeb611b025c426938c60ddf82c6af232ca 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.cc +++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc @@ -13,6 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/detail/bytebuffer_stream.h" +#include "paddle/fluid/operators/detail/proto_encoder_helper.h" namespace paddle { namespace operators { @@ -63,6 +68,233 @@ void DeserializeFromMessage(const sendrecv::VariableMessage& msg, } } +void SerializeToByteBuffer(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, + ::grpc::ByteBuffer* msg) { + using VarMsg = sendrecv::VariableMessage; + sendrecv::VariableMessage request; + std::string header; + request.AppendToString(&header); + // When using GPU, need to free the copied CPU buffer + // when the ByteBuffer destroies + // TODO(typhoonzero): add unref here, if we have dependent + // parallelism execution, need to know when to free the tensor. 
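+ // Message layout: the small VariableMessage header fields (varname, type,
+ // data_type, dims, lod, ...) are hand-encoded with ProtoEncodeHelper into
+ // the 1024-byte 'buf' below, while the tensor payload is attached as a
+ // separate grpc::Slice pointing at the tensor memory (or at a CPU staging
+ // copy of it on GPU), so the bulk data never goes through protobuf itself.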
+ DestroyCallback destroy_callback = [](void* backing) {}; + + void* buf = malloc(1024); + void* payload = nullptr; + size_t payload_size; + ProtoEncodeHelper e((char*)buf, 1024); + e.WriteString(VarMsg::kVarnameFieldNumber, name); + if (var->IsType()) { + e.WriteUint64(VarMsg::kTypeFieldNumber, 0); + } else if (var->IsType()) { + e.WriteUint64(VarMsg::kTypeFieldNumber, 1); + } + + switch (framework::ToVarType(var->Type())) { + case framework::proto::VarType_Type_LOD_TENSOR: { + auto tensor = var->Get(); + e.WriteUint64(VarMsg::kDataTypeFieldNumber, + framework::ToDataType(tensor.type())); + for (auto& dim : framework::vectorize(tensor.dims())) { + e.WriteUint64(VarMsg::kDimsFieldNumber, dim); + } + auto lod = tensor.lod(); // std::vector> + if (lod.size() > 0) { + e.WriteUint64(VarMsg::kLodLevelFieldNumber, lod.size()); + + for (auto& each : lod) { + e.WriteVarlengthBeginning(VarMsg::kLodFieldNumber, + 2 + // tag + varintlength of submessage + 1 + // kLodDataFieldNumber + each.size()); + // auto copied from GPU + for (auto& d : each) { + e.WriteUint64(VarMsg::LodData::kLodDataFieldNumber, d); + } + } + } + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE(platform::is_gpu_place(tensor.place())); + platform::CPUPlace cpu; + auto& gpu_dev_ctx = + static_cast(ctx); + auto copy_size = tensor.memory_size(); + payload = memory::Alloc(cpu, copy_size); + memory::Copy(cpu, payload, + boost::get(tensor.place()), + reinterpret_cast(tensor.data()), + copy_size, gpu_dev_ctx.stream()); + ctx.Wait(); + destroy_callback = [](void* backing) { + platform::CPUPlace cpu; + memory::Free(cpu, backing); + }; +#endif + } else { + payload = tensor.data(); + } + payload_size = tensor.memory_size(); + e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size); + } break; + case framework::proto::VarType_Type_SELECTED_ROWS: { + // TODO(typhoonzero): selectedrows implement should not use unique_ptr + auto* slr = var->GetMutable(); + e.WriteUint64(VarMsg::kDataTypeFieldNumber, + framework::ToDataType(slr->value().type())); + for (auto& dim : framework::vectorize(slr->value().dims())) { + e.WriteUint64(VarMsg::kDimsFieldNumber, dim); + } + e.WriteUint64(VarMsg::kLodLevelFieldNumber, 0); + auto* tensor = slr->mutable_value(); + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + platform::CPUPlace cpu; + auto& gpu_dev_ctx = + static_cast(ctx); + auto copy_size = tensor->memory_size(); + payload = memory::Alloc(cpu, copy_size); + memory::Copy(cpu, payload, + boost::get(tensor->place()), + reinterpret_cast(tensor->data()), + copy_size, gpu_dev_ctx.stream()); + ctx.Wait(); + destroy_callback = [](void* backing) { + platform::CPUPlace cpu; + memory::Free(cpu, backing); + }; +#endif + } else { + payload = slr->mutable_value()->data(); + } + payload_size = tensor->memory_size(); + e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size); + } break; + default: + PADDLE_THROW("Serialize does not support type: %s", + typeid(var->Type()).name()); + break; + } + // steal reference of tensor data + ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows + int num_slices = 2; // only SelectedRows have rows buffer + slices[0] = ::grpc::Slice(e.size()); + memcpy(const_cast(slices[0].begin()), e.data(), e.size()); + slices[1] = ::grpc::Slice( + grpc_slice_new_with_user_data(payload, payload_size, destroy_callback, + static_cast(payload)), + ::grpc::Slice::STEAL_REF); + + if (framework::ToVarType(var->Type()) == + 
framework::proto::VarType_Type_SELECTED_ROWS) { + auto* slr = var->GetMutable(); + + ProtoEncodeHelper e2((char*)buf, 128); + // NOTE: rows is of type int64_t + size_t rows_memory_size = + slr->rows().capacity() * framework::SizeOfType(typeid(int64_t)); + e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); + slices[2] = ::grpc::Slice(e2.size()); + memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); + + slices[3] = ::grpc::Slice( + grpc_slice_new_with_user_data( + const_cast( + reinterpret_cast(slr->rows().data())), + rows_memory_size, + [](void* backing) { + // TODO(typhoonzero): add unref here, same as above. + }, + const_cast( + reinterpret_cast(slr->rows().data()))), + ::grpc::Slice::STEAL_REF); + num_slices = 4; + } + + ::grpc::ByteBuffer tmp(&slices[0], num_slices); + msg->Swap(&tmp); +} + +void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, + const platform::DeviceContext& ctx, + framework::Variable* var) { + sendrecv::VariableMessage meta; + GrpcByteBufferSource source; + source.Init(msg); + ::google::protobuf::io::CodedInputStream input(&source); + // do zerocopy parsing + PADDLE_ENFORCE(meta.ParseFromCodedStream(&input)); + PADDLE_ENFORCE(input.ConsumedEntireMessage()); + // dims is needed by both tensor and selectedrows + std::vector vecdims; + for (auto& d : meta.dims()) { + vecdims.push_back(d); + } + framework::DDim dims = framework::make_ddim(vecdims); + + if (meta.type() == sendrecv::LOD_TENSOR) { + auto* tensor = var->GetMutable(); + tensor->Resize(dims); + void* tensor_data = tensor->mutable_data( + ctx.GetPlace(), + paddle::operators::detail::ToTypeIndex(meta.data_type())); + framework::LoD lod; + for (int i = 0; i < meta.lod_level(); ++i) { + framework::Vector v; + for (int j = 0; j < meta.lod(i).lod_data_size(); ++j) { + v.push_back(meta.lod(i).lod_data(j)); + } + lod.push_back(v); + } + tensor->set_lod(lod); + // How to avoid copying and use the message buffer directly? + // Maybe need to find a way to release all memory except tensor content. 
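+ // Copy the serialized bytes into the newly allocated tensor: on a GPU
+ // place the bytes are staged on the host and copied on the device stream
+ // (followed by ctx.Wait()); on a CPU place a plain memcpy suffices.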
+ if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + platform::CPUPlace cpu; + auto& gpu_dev_ctx = static_cast(ctx); + memory::Copy(boost::get(tensor->place()), + tensor_data, cpu, + reinterpret_cast(meta.serialized().data()), + meta.serialized().size(), gpu_dev_ctx.stream()); + ctx.Wait(); +#endif + } else { + memcpy(tensor_data, + reinterpret_cast(meta.serialized().data()), + meta.serialized().size()); + } + } else if (meta.type() == sendrecv::SELECTED_ROWS) { + auto* slr = var->GetMutable(); + auto* tensor = slr->mutable_value(); + int64_t* rows_data = slr->mutable_rows()->data(); + tensor->Resize(dims); + void* tensor_data = tensor->mutable_data( + ctx.GetPlace(), + paddle::operators::detail::ToTypeIndex(meta.data_type())); + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + platform::CPUPlace cpu; + auto& gpu_dev_ctx = static_cast(ctx); + memory::Copy(boost::get(tensor->place()), + tensor_data, cpu, + reinterpret_cast(meta.serialized().data()), + meta.serialized().size(), gpu_dev_ctx.stream()); + ctx.Wait(); +#endif + } else { + memcpy(tensor_data, + reinterpret_cast(meta.serialized().data()), + meta.serialized().size()); + } + // copy rows CPU data, GPU data will be copied lazly + memcpy(rows_data, reinterpret_cast(meta.rows().data()), + meta.rows().size()); + } +} + } // namespace detail } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.h b/paddle/fluid/operators/detail/sendrecvop_utils.h index 670d0e162473750d0a5f8e9025ef1cf9a9ef407c..5208091e54b4da2bb0265f84827ce23b57e954dc 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.h +++ b/paddle/fluid/operators/detail/sendrecvop_utils.h @@ -33,6 +33,8 @@ namespace detail { #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" +typedef void (*DestroyCallback)(void*); + void SerializeToMessage(const std::string& name, const framework::Variable* var, const platform::DeviceContext& ctx, sendrecv::VariableMessage* msg); @@ -40,6 +42,32 @@ void SerializeToMessage(const std::string& name, const framework::Variable* var, void DeserializeFromMessage(const sendrecv::VariableMessage& msg, const platform::DeviceContext& ctx, framework::Variable* var); + +void SerializeToByteBuffer(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, + ::grpc::ByteBuffer* msg); + +void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, + const platform::DeviceContext& ctx, + framework::Variable* var); + +inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) { + switch (type) { + case sendrecv::VariableMessage::FP32: + return typeid(float); // NOLINT + case sendrecv::VariableMessage::FP64: + return typeid(double); // NOLINT + case sendrecv::VariableMessage::INT32: + return typeid(int); // NOLINT + case sendrecv::VariableMessage::INT64: + return typeid(int64_t); // NOLINT + case sendrecv::VariableMessage::BOOL: + return typeid(bool); // NOLINT + default: + PADDLE_THROW("Not support type %d", type); + } +} + } // namespace detail } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/test_serde.cc b/paddle/fluid/operators/detail/test_serde.cc new file mode 100644 index 0000000000000000000000000000000000000000..2f06e5a686b996858d21930a1afa2861efca4a9b --- /dev/null +++ b/paddle/fluid/operators/detail/test_serde.cc @@ -0,0 +1,186 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace math = paddle::operators::math; +namespace memory = paddle::memory; + +void RunSerdeTestTensor(platform::Place place) { + // serialize var to ByteBuffer + framework::Variable var; + auto* tensor = var.GetMutable(); + tensor->Resize(framework::make_ddim({4, 8, 4, 2})); + framework::LoD lod; + lod.push_back(framework::Vector({1, 3, 8})); + tensor->set_lod(lod); + int tensor_numel = 4 * 8 * 4 * 2; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + tensor->mutable_data(place); + math::set_constant(ctx, tensor, 31.9); + + ::grpc::ByteBuffer msg; + operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); + EXPECT_GT(msg.Length(), 0); + + // deserialize + std::vector<::grpc::Slice> slices; + (void)msg.Dump(&slices); + std::string tmp; + for (const auto& s : slices) { + tmp.append(reinterpret_cast(s.begin()), s.size()); + } + sendrecv::VariableMessage varmsg; + EXPECT_TRUE(varmsg.ParseFromString(tmp)); + EXPECT_EQ(varmsg.varname(), "myvar"); + EXPECT_EQ(varmsg.type(), 0); + EXPECT_EQ(varmsg.dims()[0], 4); + EXPECT_EQ(varmsg.dims()[1], 8); + EXPECT_EQ(varmsg.dims()[2], 4); + EXPECT_EQ(varmsg.dims()[3], 2); + EXPECT_EQ(varmsg.lod_level(), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); + EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); + + const float* tensor_data = + reinterpret_cast(varmsg.serialized().data()); + for (int i = 0; i < tensor_numel; ++i) { + EXPECT_FLOAT_EQ(tensor_data[i], 31.9); + } + + // deserialize zero-copy + framework::Variable var2; + operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); + auto tensor2 = var2.Get(); + float* tensor_data2 = nullptr; + framework::Tensor tmp_tensor; + + if (platform::is_gpu_place(ctx.GetPlace())) { + platform::CPUPlace cpu; + framework::TensorCopy(tensor2, cpu, &tmp_tensor); + tensor_data2 = tmp_tensor.data(); + } else { + tensor_data2 = const_cast(tensor2.data()); + } + + EXPECT_EQ(varmsg.lod_level(), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); + EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); + for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); +} + +void RunSerdeTestSelectedRows(platform::Place place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + // serialize var to ByteBuffer + framework::Variable var; + auto* slr = var.GetMutable(); + 
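+ // The test builds a 2x10 SelectedRows value filled with 32.7 and rows
+ // {3, 10}, serializes it to a grpc::ByteBuffer, re-parses the metadata,
+ // and then deserializes zero-copy to verify the full round trip.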
auto* tensor = slr->mutable_value(); + auto* rows = slr->mutable_rows(); + tensor->Resize(framework::make_ddim({2, 10})); + tensor->mutable_data(place); + int tensor_numel = 2 * 10; + math::set_constant(ctx, tensor, 32.7); + rows->push_back(3); + rows->push_back(10); + + ::grpc::ByteBuffer msg; + operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); + EXPECT_GT(msg.Length(), 0); + + // deserialize + std::vector<::grpc::Slice> slices; + (void)msg.Dump(&slices); + std::string tmp; + for (const auto& s : slices) { + tmp.append(reinterpret_cast(s.begin()), s.size()); + } + sendrecv::VariableMessage varmsg; + EXPECT_TRUE(varmsg.ParseFromString(tmp)); + + EXPECT_EQ(varmsg.varname(), "myvar"); + EXPECT_EQ(varmsg.type(), 1); + + const float* tensor_data = + reinterpret_cast(varmsg.serialized().data()); + const int64_t* rows_data = + reinterpret_cast(varmsg.rows().data()); + for (int i = 0; i < tensor_numel; ++i) { + EXPECT_FLOAT_EQ(tensor_data[i], 32.7); + } + EXPECT_EQ(rows_data[0], 3); + EXPECT_EQ(rows_data[1], 10); + // deserialize zero-copy + framework::Variable var2; + operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); + + auto* slr2 = var2.GetMutable(); + auto* tensor2 = slr2->mutable_value(); + auto* rows2 = slr2->mutable_rows(); + float* tensor_data2 = nullptr; + framework::Tensor tmp_tensor; + + if (platform::is_gpu_place(ctx.GetPlace())) { + platform::CPUPlace cpu; + framework::TensorCopy(*tensor2, cpu, &tmp_tensor); + tensor_data2 = tmp_tensor.data(); + } else { + tensor_data2 = const_cast(tensor2->data()); + } + const int64_t* rows_data2 = rows2->data(); + + for (int i = 0; i < tensor_numel; ++i) { + EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); + } + EXPECT_EQ(rows_data2[0], 3); + EXPECT_EQ(rows_data2[1], 10); +} + +TEST(SelectedRows, CPU) { + platform::CPUPlace place; + RunSerdeTestSelectedRows(place); +} + +TEST(SelectedRows, GPU) { + platform::CUDAPlace place; + RunSerdeTestSelectedRows(place); +} + +TEST(Tensor, CPU) { + platform::CPUPlace place; + RunSerdeTestTensor(place); +} + +TEST(Tensor, GPU) { + platform::CUDAPlace place; + RunSerdeTestTensor(place); +} \ No newline at end of file diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h index a009e9dfce130bfd6c506c71b58a418a569bdf7a..8c15bfa36bfe72586cfcbdbd8efc4542253adaca 100644 --- a/paddle/fluid/operators/detection_map_op.h +++ b/paddle/fluid/operators/detection_map_op.h @@ -273,7 +273,6 @@ class DetectionMAPOpKernel : public framework::OpKernel { std::map>>& true_pos, std::map>>& false_pos, const int class_num) const { - constexpr T kEPS = static_cast(1e-6); const int* pos_count_data = input_pos_count.data(); for (int i = 0; i < class_num; ++i) { label_pos_count[i] = pos_count_data[i]; @@ -282,12 +281,11 @@ class DetectionMAPOpKernel : public framework::OpKernel { auto SetData = [](const framework::LoDTensor& pos_tensor, std::map>>& pos) { const T* pos_data = pos_tensor.data(); - auto pos_data_lod = pos_tensor.lod(); - for (size_t i = 0; i < pos_data_lod.size(); ++i) { - for (size_t j = pos_data_lod[0][i]; j < pos_data_lod[0][i + 1]; ++j) { + auto pos_data_lod = pos_tensor.lod()[0]; + for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) { + for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) { T score = pos_data[j * 2]; - int flag = 1; - if (pos_data[j * 2 + 1] < kEPS) flag = 0; + int flag = pos_data[j * 2 + 1]; pos[i].push_back(std::make_pair(score, flag)); } } diff --git a/paddle/fluid/operators/elementwise_add_op.cc 
b/paddle/fluid/operators/elementwise_add_op.cc index e9068fcd50ba9309a37939788ca8f67f1f6e25cd..4aab54f60236ecc5fa7f70e22f1553c3bfe68198 100644 --- a/paddle/fluid/operators/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise_add_op.cc @@ -29,8 +29,11 @@ class ElementwiseAddOpMaker : public ElementwiseOpMaker { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(elementwise_add, ops::ElementwiseOp, ops::ElementwiseAddOpMaker, - elementwise_add_grad, ops::ElementwiseOpGrad); +REGISTER_OPERATOR(elementwise_add, ops::ElementwiseOp, + ops::ElementwiseAddOpMaker, ops::ElementwiseOpInferVarType, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(elementwise_add_grad, ops::ElementwiseOpGrad); + REGISTER_OP_CPU_KERNEL( elementwise_add, ops::ElementwiseAddKernel, diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h index fe31bbaed44fced68b7b51dd2c2031950ec4247d..f04d8d8fd82ed2336dff9c5b88808dc32de6630a 100644 --- a/paddle/fluid/operators/elementwise_op.h +++ b/paddle/fluid/operators/elementwise_op.h @@ -41,6 +41,16 @@ class ElementwiseOp : public framework::OperatorWithKernel { } }; +class ElementwiseOpInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto x_var = op_desc.Input("X")[0]; + auto out_var = op_desc.Output("Out")[0]; + block->Var(out_var)->SetType(block->Var(x_var)->GetType()); + } +}; + class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { public: ElementwiseOpMaker(OpProto* proto, OpAttrChecker* op_checker) diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index f7f33917d7ef5bbcc7fb5d6e3d0a7f3ae63cde34..35d251f71a0cb631d5900498ea3188b5ddeae334 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -15,11 +15,23 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/math/math_function_impl.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { namespace math { +using float16 = paddle::platform::float16; + +template <> +void gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float16 alpha, const float16* A, const float16* B, const float16 beta, + float16* C) { + PADDLE_THROW("float16 GEMM not supported on CPU"); +} + template <> void gemm( const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, @@ -46,6 +58,15 @@ void gemm( beta, C, ldc); } +template <> +void gemm( + const platform::CPUDeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const float16 alpha, const float16* A, const int lda, const float16* B, + const int ldb, const float16 beta, float16* C, const int ldc) { + PADDLE_THROW("float16 GEMM not supported on CPU"); +} + template <> void gemm( const platform::CPUDeviceContext& context, const bool transA, @@ -68,6 +89,15 @@ void gemm( lda, B, ldb, beta, C, ldc); } +template <> +void matmul( + const platform::CPUDeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, float16 alpha, + framework::Tensor* matrix_out, float16 beta) { + PADDLE_THROW("float16 matmul not supported on CPU"); +} + template <> void matmul( const platform::CPUDeviceContext& context, @@ -126,6 +156,15 @@ void matmul( matrix_b.data(), beta, matrix_out->data()); } +template <> +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float16 alpha, const float16* A, const float16* B, const float16 beta, + float16* C, const int batchCount, const int strideA, const int strideB) { + PADDLE_THROW("float16 batched_gemm not supported on CPU"); +} + #ifdef PADDLE_WITH_MKLML // Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize. template <> diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index f8d0349ac5cd96791bcb508230f5ff6d594c0e05..3abbcdb71d03eaf6f8eba3d97150d27ac5a5405e 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -16,11 +16,43 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function_impl.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { namespace math { +using float16 = paddle::platform::float16; + +template <> +void gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float16 alpha, const float16* A, const float16* B, const float16 beta, + float16* C) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; + + const half h_alpha = static_cast(alpha); + const half h_beta = static_cast(beta); + const half* h_A = reinterpret_cast(A); + const half* h_B = reinterpret_cast(B); + half* h_C = reinterpret_cast(C); + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, + "cublas Hgemm requires GPU compute capability >= 53"); + PADDLE_ENFORCE(platform::dynload::cublasHgemm( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, + h_A, lda, &h_beta, h_C, N)); +} + template <> void gemm( const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, @@ -60,6 +92,31 @@ void gemm( lda, &beta, C, N)); } +template <> +void gemm( + const platform::CUDADeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const float16 alpha, const float16* A, const int lda, const float16* B, + const int ldb, const float16 beta, float16* C, const int ldc) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T; + + const half h_alpha = static_cast(alpha); + const half h_beta = static_cast(beta); + const half* h_A = reinterpret_cast(A); + const half* h_B = reinterpret_cast(B); + half* h_C = reinterpret_cast(C); + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, + "cublas Hgemm requires GPU compute capability >= 53"); + PADDLE_ENFORCE(platform::dynload::cublasHgemm( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, + h_A, lda, &h_beta, h_C, ldc)); +} + template <> void gemm( const platform::CUDADeviceContext& context, const bool transA, @@ -90,6 +147,35 @@ void gemm( lda, &beta, C, ldc)); } +template <> +void matmul( + const platform::CUDADeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, float16 alpha, + framework::Tensor* matrix_out, float16 beta) { + auto dim_a = matrix_a.dims(); + auto dim_b = matrix_b.dims(); + auto dim_out = matrix_out->dims(); + PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); + + PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) && + platform::is_gpu_place(matrix_b.place()) && + platform::is_gpu_place(matrix_out->place()), + "Matrix must all be in CUDAPlace"); + + int M = dim_out[0]; + int N = dim_out[1]; + int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = (trans_b == false) ? 
CblasNoTrans : CblasTrans; + + gemm( + context, transA, transB, M, N, K, alpha, matrix_a.data(), + matrix_b.data(), beta, matrix_out->data()); +} + template <> void matmul( const platform::CUDADeviceContext& context, @@ -148,6 +234,37 @@ void matmul( matrix_b.data(), beta, matrix_out->data()); } +template <> +void batched_gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float16 alpha, const float16* A, const float16* B, const float16 beta, + float16* C, const int batchCount, const int strideA, const int strideB) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int strideC = M * N; + + const half h_alpha = static_cast(alpha); + const half h_beta = static_cast(beta); + const half* h_A = reinterpret_cast(A); + const half* h_B = reinterpret_cast(B); + half* h_C = reinterpret_cast(C); + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, + "cublas Hgemm requires GPU compute capability >= 53"); + PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, + strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount)); +} + template <> void batched_gemm( const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu index 207d6a87bce178e446253ae2d880b6dd743cfc83..8982d9d066165a9da0461288685baa0c60e5f114 100644 --- a/paddle/fluid/operators/math/math_function_test.cu +++ b/paddle/fluid/operators/math/math_function_test.cu @@ -14,30 +14,41 @@ #include "gtest/gtest.h" #include "paddle/fluid/operators/math/math_function.h" -TEST(math_function, notrans_mul_trans) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input1_gpu; - paddle::framework::Tensor input2_gpu; - paddle::framework::Tensor out_gpu; - paddle::framework::Tensor out; - - auto* cpu_place = new paddle::platform::CPUPlace(); - float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); +void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size, + const std::vector& data) { + PADDLE_ENFORCE_EQ(size, data.size()); + for (size_t i = 0; i < data.size(); ++i) { + in_ptr[i] = paddle::platform::float16(data[i]); + } +} + +TEST(math_function, notrans_mul_trans_fp32) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor out_gpu; + Tensor out; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); + + float* input1_ptr = input1.mutable_data({2, 3}, cpu_place); float arr[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr, 6 * sizeof(float)); - auto* gpu_place = new paddle::platform::CUDAPlace(0); - paddle::platform::CUDADeviceContext context(*gpu_place); - - paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu); - paddle::framework::TensorCopy(input1, *gpu_place, context, &input2_gpu); + TensorCopy(input1, gpu_place, context, &input1_gpu); + 
TensorCopy(input1, gpu_place, context, &input2_gpu); - out_gpu.mutable_data({2, 2}, *gpu_place); + out_gpu.mutable_data({2, 2}, gpu_place); - paddle::operators::math::matmul( + paddle::operators::math::matmul( context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); - paddle::framework::TensorCopy(out_gpu, *cpu_place, context, &out); + TensorCopy(out_gpu, cpu_place, context, &out); float* out_ptr = out.data(); context.Wait(); @@ -45,33 +56,76 @@ TEST(math_function, notrans_mul_trans) { EXPECT_EQ(out_ptr[1], 14); EXPECT_EQ(out_ptr[2], 14); EXPECT_EQ(out_ptr[3], 50); - delete gpu_place; } -TEST(math_function, trans_mul_notrans) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input1_gpu; - paddle::framework::Tensor input2_gpu; - paddle::framework::Tensor out_gpu; - paddle::framework::Tensor out; +TEST(math_function, notrans_mul_trans_fp16) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor out_gpu; + Tensor out; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); + + // fp16 GEMM in cublas requires GPU compute capability >= 53 + if (context.GetComputeCapability() < 53) { + return; + } + + float16* input1_ptr = input1.mutable_data({2, 3}, cpu_place); + fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); + + TensorCopy(input1, gpu_place, context, &input1_gpu); + TensorCopy(input1, gpu_place, context, &input2_gpu); + + out_gpu.mutable_data({2, 2}, gpu_place); + + paddle::operators::math::matmul( + context, input1_gpu, false, input2_gpu, true, float16(1), &out_gpu, + float16(0)); + + TensorCopy(out_gpu, cpu_place, context, &out); + + float16* out_ptr = out.data(); + context.Wait(); + EXPECT_EQ(static_cast(out_ptr[0]), 5); + EXPECT_EQ(static_cast(out_ptr[1]), 14); + EXPECT_EQ(static_cast(out_ptr[2]), 14); + EXPECT_EQ(static_cast(out_ptr[3]), 50); +} + +TEST(math_function, trans_mul_notrans_fp32) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor out_gpu; + Tensor out; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); - auto* cpu_place = new paddle::platform::CPUPlace(); - float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float* input1_ptr = input1.mutable_data({2, 3}, cpu_place); float arr[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr, 6 * sizeof(float)); - auto* gpu_place = new paddle::platform::CUDAPlace(0); - paddle::platform::CUDADeviceContext context(*gpu_place); + TensorCopy(input1, gpu_place, context, &input1_gpu); + TensorCopy(input1, gpu_place, context, &input2_gpu); - paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu); - paddle::framework::TensorCopy(input1, *gpu_place, context, &input2_gpu); - - out_gpu.mutable_data({3, 3}, *gpu_place); + out_gpu.mutable_data({3, 3}, gpu_place); paddle::operators::math::matmul( context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); - paddle::framework::TensorCopy(out_gpu, *cpu_place, context, &out); + TensorCopy(out_gpu, cpu_place, context, &out); float* out_ptr = out.data(); context.Wait(); @@ -84,45 +138,93 @@ TEST(math_function, trans_mul_notrans) { EXPECT_EQ(out_ptr[6], 15); EXPECT_EQ(out_ptr[7], 22); EXPECT_EQ(out_ptr[8], 29); - delete gpu_place; } -TEST(math_function, gemm_notrans_cublas) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input2; - paddle::framework::Tensor input3; - 
paddle::framework::Tensor input1_gpu; - paddle::framework::Tensor input2_gpu; - paddle::framework::Tensor input3_gpu; +TEST(math_function, trans_mul_notrans_fp16) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor out_gpu; + Tensor out; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); + + // fp16 GEMM in cublas requires GPU compute capability >= 53 + if (context.GetComputeCapability() < 53) { + return; + } + + float16* input1_ptr = input1.mutable_data({2, 3}, cpu_place); + fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); + + TensorCopy(input1, gpu_place, context, &input1_gpu); + TensorCopy(input1, gpu_place, context, &input2_gpu); + + out_gpu.mutable_data({3, 3}, gpu_place); + + paddle::operators::math::matmul( + context, input1_gpu, true, input2_gpu, false, float16(1), &out_gpu, + float16(0)); + + TensorCopy(out_gpu, cpu_place, context, &out); + + float16* out_ptr = out.data(); + context.Wait(); + EXPECT_EQ(static_cast(out_ptr[0]), 9); + EXPECT_EQ(static_cast(out_ptr[1]), 12); + EXPECT_EQ(static_cast(out_ptr[2]), 15); + EXPECT_EQ(static_cast(out_ptr[3]), 12); + EXPECT_EQ(static_cast(out_ptr[4]), 17); + EXPECT_EQ(static_cast(out_ptr[5]), 22); + EXPECT_EQ(static_cast(out_ptr[6]), 15); + EXPECT_EQ(static_cast(out_ptr[7]), 22); + EXPECT_EQ(static_cast(out_ptr[8]), 29); +} + +TEST(math_function, gemm_notrans_cublas_fp32) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input2; + Tensor input3; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor input3_gpu; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); int m = 2; int n = 3; int k = 3; - auto* cpu_place = new paddle::platform::CPUPlace(); - float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float* input1_ptr = input1.mutable_data({2, 3}, cpu_place); float arr1[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr1, 6 * sizeof(float)); - float* input2_ptr = input2.mutable_data({3, 4}, *cpu_place); + float* input2_ptr = input2.mutable_data({3, 4}, cpu_place); float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; memcpy(input2_ptr, arr2, 12 * sizeof(float)); - float* input3_ptr = input3.mutable_data({2, 4}, *cpu_place); + float* input3_ptr = input3.mutable_data({2, 4}, cpu_place); float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; memcpy(input3_ptr, arr3, 8 * sizeof(float)); - auto* gpu_place = new paddle::platform::CUDAPlace(0); - paddle::platform::CUDADeviceContext context(*gpu_place); - - paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu); - paddle::framework::TensorCopy(input2, *gpu_place, context, &input2_gpu); - paddle::framework::TensorCopy(input3, *gpu_place, context, &input3_gpu); + TensorCopy(input1, gpu_place, context, &input1_gpu); + TensorCopy(input2, gpu_place, context, &input2_gpu); + TensorCopy(input3, gpu_place, context, &input3_gpu); float* a = input1_gpu.data(); float* b = input2_gpu.data(); - float* c = input3_gpu.mutable_data(*gpu_place); + float* c = input3_gpu.mutable_data(gpu_place); paddle::operators::math::gemm( context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); - paddle::framework::TensorCopy(input3_gpu, *cpu_place, context, &input3); + TensorCopy(input3_gpu, cpu_place, context, &input3); // numpy code: // a = np.arange(6).reshape(2, 3) @@ -139,47 +241,110 @@ TEST(math_function, gemm_notrans_cublas) { EXPECT_EQ(input3_ptr[5], 73); 
EXPECT_EQ(input3_ptr[6], 86); EXPECT_EQ(input3_ptr[7], 99); - delete gpu_place; } -TEST(math_function, gemm_trans_cublas) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input2; - paddle::framework::Tensor input3; - paddle::framework::Tensor input1_gpu; - paddle::framework::Tensor input2_gpu; - paddle::framework::Tensor input3_gpu; +TEST(math_function, gemm_notrans_cublas_fp16) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input2; + Tensor input3; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor input3_gpu; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); + + // fp16 GEMM in cublas requires GPU compute capability >= 53 + if (context.GetComputeCapability() < 53) { + return; + } + + int m = 2; + int n = 3; + int k = 3; + float16* input1_ptr = input1.mutable_data({2, 3}, cpu_place); + fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); + float16* input2_ptr = input2.mutable_data({3, 4}, cpu_place); + fill_fp16_data(input2_ptr, input2.numel(), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); + float16* input3_ptr = input3.mutable_data({2, 4}, cpu_place); + fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); + + TensorCopy(input1, gpu_place, context, &input1_gpu); + TensorCopy(input2, gpu_place, context, &input2_gpu); + TensorCopy(input3, gpu_place, context, &input3_gpu); + float16* a = input1_gpu.data(); + float16* b = input2_gpu.data(); + float16* c = input3_gpu.mutable_data(gpu_place); + + paddle::operators::math::gemm( + context, false, false, m, n, k, float16(1), a, 3, b + 1, 4, float16(1), + c + 1, 4); + + TensorCopy(input3_gpu, cpu_place, context, &input3); + + // numpy code: + // a = np.arange(6).reshape(2, 3) + // b = np.arange(12).reshape(3, 4)[:, 1:] + // c = np.arange(8).reshape(2, 4)[:, 1:] + // out = np.arange(8).reshape(2, 4) + // out[:, 1:] = np.dot(a, b) + c + context.Wait(); + EXPECT_EQ(static_cast(input3_ptr[0]), 0); + EXPECT_EQ(static_cast(input3_ptr[1]), 24); + EXPECT_EQ(static_cast(input3_ptr[2]), 28); + EXPECT_EQ(static_cast(input3_ptr[3]), 32); + EXPECT_EQ(static_cast(input3_ptr[4]), 4); + EXPECT_EQ(static_cast(input3_ptr[5]), 73); + EXPECT_EQ(static_cast(input3_ptr[6]), 86); + EXPECT_EQ(static_cast(input3_ptr[7]), 99); +} + +TEST(math_function, gemm_trans_cublas_fp32) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input2; + Tensor input3; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor input3_gpu; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); int m = 2; int n = 3; int k = 3; - auto* cpu_place = new paddle::platform::CPUPlace(); - float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float* input1_ptr = input1.mutable_data({2, 3}, cpu_place); float arr1[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr1, 6 * sizeof(float)); - float* input2_ptr = input2.mutable_data({4, 3}, *cpu_place); + float* input2_ptr = input2.mutable_data({4, 3}, cpu_place); float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}; memcpy(input2_ptr, arr2, 12 * sizeof(float)); - float* input3_ptr = input3.mutable_data({2, 4}, *cpu_place); + float* input3_ptr = input3.mutable_data({2, 4}, cpu_place); float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; memcpy(input3_ptr, arr3, 8 * sizeof(float)); - auto* gpu_place = new paddle::platform::CUDAPlace(0); - paddle::platform::CUDADeviceContext context(*gpu_place); - - paddle::framework::TensorCopy(input1, *gpu_place, 
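The cuBLAS GEMM tests above pass shifted pointers and explicit leading dimensions (a with lda = 3, b + 1 with ldb = 4, c + 1 with ldc = 4), so only the last three columns of the 2x4 output are overwritten, exactly as the numpy comment describes. A self-contained row-major reference that reproduces the asserted values 0, 24, 28, 32, 4, 73, 86, 99 without any Paddle or cuBLAS dependency:

```cpp
#include <cassert>

// Row-major GEMM with leading dimensions: C = alpha * A * B + beta * C.
void gemm_rm(int m, int n, int k, float alpha, const float* a, int lda,
             const float* b, int ldb, float beta, float* c, int ldc) {
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j) {
      float acc = 0;
      for (int p = 0; p < k; ++p) acc += a[i * lda + p] * b[p * ldb + j];
      c[i * ldc + j] = alpha * acc + beta * c[i * ldc + j];
    }
}

int main() {
  float a[6] = {0, 1, 2, 3, 4, 5};                       // arange(6).reshape(2, 3)
  float b[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};  // arange(12).reshape(3, 4)
  float c[8] = {0, 1, 2, 3, 4, 5, 6, 7};                 // arange(8).reshape(2, 4)
  // Same sub-matrix trick as the test: operate on b[:, 1:] and c[:, 1:].
  gemm_rm(2, 3, 3, 1.f, a, 3, b + 1, 4, 1.f, c + 1, 4);
  const float want[8] = {0, 24, 28, 32, 4, 73, 86, 99};
  for (int i = 0; i < 8; ++i) assert(c[i] == want[i]);
  return 0;
}
```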
context, &input1_gpu); - paddle::framework::TensorCopy(input2, *gpu_place, context, &input2_gpu); - paddle::framework::TensorCopy(input3, *gpu_place, context, &input3_gpu); + TensorCopy(input1, gpu_place, context, &input1_gpu); + TensorCopy(input2, gpu_place, context, &input2_gpu); + TensorCopy(input3, gpu_place, context, &input3_gpu); float* a = input1_gpu.data(); float* b = input2_gpu.data(); - float* c = input3_gpu.mutable_data(*gpu_place); + float* c = input3_gpu.mutable_data(gpu_place); paddle::operators::math::gemm( context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); - paddle::framework::TensorCopy(input3_gpu, *cpu_place, context, &input3); - context.Wait(); + TensorCopy(input3_gpu, cpu_place, context, &input3); + context.Wait(); EXPECT_EQ(input3_ptr[0], 0); EXPECT_EQ(input3_ptr[1], 24); EXPECT_EQ(input3_ptr[2], 28); @@ -188,27 +353,86 @@ TEST(math_function, gemm_trans_cublas) { EXPECT_EQ(input3_ptr[5], 73); EXPECT_EQ(input3_ptr[6], 86); EXPECT_EQ(input3_ptr[7], 99); - delete gpu_place; +} + +TEST(math_function, gemm_trans_cublas_fp16) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input2; + Tensor input3; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor input3_gpu; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); + + // fp16 GEMM in cublas requires GPU compute capability >= 53 + if (context.GetComputeCapability() < 53) { + return; + } + + int m = 2; + int n = 3; + int k = 3; + float16* input1_ptr = input1.mutable_data({2, 3}, cpu_place); + fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); + float16* input2_ptr = input2.mutable_data({4, 3}, cpu_place); + fill_fp16_data(input2_ptr, input2.numel(), + {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}); + float16* input3_ptr = input3.mutable_data({2, 4}, cpu_place); + fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); + + TensorCopy(input1, gpu_place, context, &input1_gpu); + TensorCopy(input2, gpu_place, context, &input2_gpu); + TensorCopy(input3, gpu_place, context, &input3_gpu); + float16* a = input1_gpu.data(); + float16* b = input2_gpu.data(); + float16* c = input3_gpu.mutable_data(gpu_place); + + paddle::operators::math::gemm( + context, false, true, m, n, k, float16(1), a, 3, b + 3, 3, float16(1), + c + 1, 4); + + TensorCopy(input3_gpu, cpu_place, context, &input3); + + context.Wait(); + EXPECT_EQ(static_cast(input3_ptr[0]), 0); + EXPECT_EQ(static_cast(input3_ptr[1]), 24); + EXPECT_EQ(static_cast(input3_ptr[2]), 28); + EXPECT_EQ(static_cast(input3_ptr[3]), 32); + EXPECT_EQ(static_cast(input3_ptr[4]), 4); + EXPECT_EQ(static_cast(input3_ptr[5]), 73); + EXPECT_EQ(static_cast(input3_ptr[6]), 86); + EXPECT_EQ(static_cast(input3_ptr[7]), 99); } template void GemvTest(int m, int n, bool trans) { - paddle::framework::Tensor mat_a; - paddle::framework::Tensor vec_b; - paddle::framework::Tensor vec_c; - auto* cpu_place = new paddle::platform::CPUPlace(); - - T* data_a = mat_a.mutable_data({m, n}, *cpu_place); - T* data_b = vec_b.mutable_data({trans ? m : n}, *cpu_place); - T* data_c = vec_c.mutable_data({trans ? 
n : m}, *cpu_place); - - auto* gpu_place = new paddle::platform::CUDAPlace(0); - paddle::framework::Tensor g_mat_a; - paddle::framework::Tensor g_vec_b; - paddle::framework::Tensor g_vec_c; - T* g_data_a = g_mat_a.mutable_data(mat_a.dims(), *gpu_place); - T* g_data_b = g_vec_b.mutable_data(vec_b.dims(), *gpu_place); - T* g_data_c = g_vec_c.mutable_data(vec_c.dims(), *gpu_place); + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor mat_a; + Tensor vec_b; + Tensor vec_c; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); + + T* data_a = mat_a.mutable_data({m, n}, cpu_place); + T* data_b = vec_b.mutable_data({trans ? m : n}, cpu_place); + T* data_c = vec_c.mutable_data({trans ? n : m}, cpu_place); + + Tensor g_mat_a; + Tensor g_vec_b; + Tensor g_vec_c; + T* g_data_a = g_mat_a.mutable_data(mat_a.dims(), gpu_place); + T* g_data_b = g_vec_b.mutable_data(vec_b.dims(), gpu_place); + T* g_data_c = g_vec_c.mutable_data(vec_c.dims(), gpu_place); for (int i = 0; i < mat_a.numel(); ++i) { data_a[i] = static_cast(i); @@ -217,16 +441,14 @@ void GemvTest(int m, int n, bool trans) { data_b[i] = static_cast(i); } - paddle::platform::CUDADeviceContext context(*gpu_place); - paddle::framework::TensorCopy(mat_a, *gpu_place, context, &g_mat_a); - paddle::framework::TensorCopy(vec_b, *gpu_place, context, &g_vec_b); + TensorCopy(mat_a, gpu_place, context, &g_mat_a); + TensorCopy(vec_b, gpu_place, context, &g_vec_b); - paddle::operators::math::gemv( + paddle::operators::math::gemv( context, trans, static_cast(m), static_cast(n), 1., g_data_a, g_data_b, 0., g_data_c); - paddle::framework::TensorCopy(g_vec_c, paddle::platform::CPUPlace(), context, - &vec_c); + TensorCopy(g_vec_c, cpu_place, context, &vec_c); if (!trans) { for (int i = 0; i < m; ++i) { diff --git a/paddle/fluid/operators/nccl_op.cc b/paddle/fluid/operators/nccl_op.cc index 9185666c56c4621d42429c9cfdb079001c6336f1..329656d26da0d32a4e30dd2aeecb9f7aa7f9a84d 100644 --- a/paddle/fluid/operators/nccl_op.cc +++ b/paddle/fluid/operators/nccl_op.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" -#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl_op_test.cu.cc index b4021a5dacd80a042042eadf9f5d1a932f0f00a2..90f6f955cea51ded2dbb2bde459113458d7749a4 100644 --- a/paddle/fluid/operators/nccl_op_test.cu.cc +++ b/paddle/fluid/operators/nccl_op_test.cu.cc @@ -14,19 +14,15 @@ limitations under the License. */ #include #include -#include #include #include #include -#include #include -#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" @@ -41,26 +37,35 @@ USE_CUDA_ONLY_OP(ncclBcast); namespace f = paddle::framework; namespace p = paddle::platform; -static std::vector gpu_list; - // test data amount -const f::DDim kDims = {100, 100}; +const f::DDim kDims = {20, 20}; // nccl op common tester, init communicator. 
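The GemvTest template above fills the matrix and vector with ascending values, runs gemv on the GPU and then verifies the result on the CPU. A row-major reference of the same product, shown only to make the trans/notrans convention explicit (y = A*x when trans is false, y = A^T*x when it is true); the concrete numbers are illustrative:

```cpp
#include <cassert>
#include <vector>

// y = A * x (trans == false) or y = A^T * x (trans == true), with A stored
// row-major as an m x n matrix.
void gemv_ref(bool trans, int m, int n, const std::vector<float>& a,
              const std::vector<float>& x, std::vector<float>* y) {
  int out_dim = trans ? n : m;
  int in_dim = trans ? m : n;
  y->assign(out_dim, 0.f);
  for (int i = 0; i < out_dim; ++i)
    for (int j = 0; j < in_dim; ++j)
      (*y)[i] += (trans ? a[j * n + i] : a[i * n + j]) * x[j];
}

int main() {
  // A = [[0, 1, 2], [3, 4, 5]] (m = 2, n = 3), x = [0, 1, 2].
  std::vector<float> a = {0, 1, 2, 3, 4, 5}, x = {0, 1, 2}, y;
  gemv_ref(false, 2, 3, a, x, &y);
  assert(y[0] == 5 && y[1] == 14);
  return 0;
}
```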
class NCCLTester : public ::testing::Test { public: virtual void SetUp() override { + int count = p::GetCUDADeviceCount(); + if (count <= 1) { + LOG(WARNING) + << "Cannot test multi-gpu nccl, because the CUDA device count is " + << count; + exit(0); + } + for (int i = 0; i < count; ++i) { + gpu_list_.emplace_back(i); + } + paddle::platform::CPUPlace cpu_place; - for (size_t i = 0; i < gpu_list.size(); ++i) { + for (size_t i = 0; i < gpu_list_.size(); ++i) { p::CUDAPlace place(i); - dev_ctxs.emplace_back(new p::CUDADeviceContext(place)); + dev_ctxs_.emplace_back(new p::CUDADeviceContext(place)); } NCCLInitOp(); } virtual void TearDown() override { - for (auto &device_context : dev_ctxs) { + for (auto &device_context : dev_ctxs_) { delete device_context; } } @@ -70,36 +75,40 @@ class NCCLTester : public ::testing::Test { std::unique_ptr op1(new f::OpDesc); op1->SetType("ncclInit"); + op1->SetInput("parallel_scopes", {"p_scopes"}); op1->SetOutput("Communicator", {"comm"}); - op1->SetAttr("gpus", {gpu_list}); - auto *var = g_scope.Var("comm"); + auto *var = g_scope_.Var("comm"); var->GetMutable(); + auto *scope_var = g_scope_.Var("p_scopes"); + auto *p_scopes = scope_var->GetMutable>(); + (*p_scopes).resize(gpu_list_.size()); + auto op = f::OpRegistry::CreateOp(*op1); VLOG(1) << "invoke NCCLInitOp."; - op->Run(g_scope, cpu_place); + op->Run(g_scope_, cpu_place); VLOG(1) << "NCCLInitOp finished."; } + int GetGPUData(int gpu_id) { return gpu_id + 42; } + template void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) { - std::unique_lock lk(mu); + std::unique_lock lk(mu_); const f::OpDesc *op1 = &op_desc; p::CUDAPlace place(gpu_id); - auto &ctx = dev_ctxs.at(gpu_id); + auto &ctx = dev_ctxs_.at(gpu_id); auto *send_tensor = scope->Var("st")->GetMutable(); auto *recv_tensor = scope->Var("rt")->GetMutable(); if (!send_tensor->numel()) { - send_tensor->Resize(kDims); send_tensor->mutable_data(kDims, place); - std::vector send_vector(f::product(kDims), gpu_id); + std::vector send_vector(f::product(kDims), GetGPUData(gpu_id)); paddle::framework::TensorFromVector(send_vector, *ctx, send_tensor); - ctx->Wait(); VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel(); } @@ -118,30 +127,14 @@ class NCCLTester : public ::testing::Test { } public: - std::vector dev_ctxs; - f::Scope g_scope; - std::mutex mu; + std::vector dev_ctxs_; + f::Scope g_scope_; + std::mutex mu_; + std::vector gpu_list_; }; // ncclInitOp with desc -TEST(NCCL, ncclInitOp) { - std::unique_ptr op_desc(new f::OpDesc); - - op_desc->SetType("ncclInit"); - op_desc->SetOutput("Communicator", {"x1"}); - op_desc->SetAttr("gpus", {gpu_list}); - - f::Scope g_scope; - paddle::platform::CPUPlace cpu_place; - - auto *var = g_scope.Var("x1"); - var->GetMutable(); - - auto op = f::OpRegistry::CreateOp(*op_desc); - VLOG(1) << "invoke NCCLInitOp."; - op->Run(g_scope, cpu_place); - VLOG(1) << "NCCLInitOp finished."; -} +TEST_F(NCCLTester, ncclInitOp) {} // ncclAllReduceOp with desc TEST_F(NCCLTester, ncclAllReduceOp) { @@ -155,23 +148,25 @@ TEST_F(NCCLTester, ncclAllReduceOp) { std::vector ths; - for (size_t i = 0; i < gpu_list.size(); ++i) { - dev_scopes.emplace_back(&g_scope.NewScope()); - std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], + for (size_t i = 0; i < gpu_list_.size(); ++i) { + dev_scopes.emplace_back(&g_scope_.NewScope()); + std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list_[i], *op2.get(), dev_scopes[i]); ths.emplace_back(std::move(th)); } - for (size_t i = 0; i < 
gpu_list.size(); ++i) { + for (size_t i = 0; i < gpu_list_.size(); ++i) { ths[i].join(); } - // check results - float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); + float expected_result = 0.0; + for (int gpu_id : gpu_list_) { + expected_result = expected_result + GetGPUData(gpu_id); + } for (size_t i = 0; i < dev_scopes.size(); ++i) { p::CPUPlace cpu_place; - p::CUDAPlace gpu_place(gpu_list[i]); + p::CUDAPlace gpu_place(gpu_list_[i]); auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get(); auto *rt = recv_tensor.data(); @@ -180,12 +175,12 @@ TEST_F(NCCLTester, ncclAllReduceOp) { auto *ct = result_tensor->mutable_data(cpu_place); paddle::memory::Copy( - cpu_place, ct, p::CUDAPlace(gpu_list[i]), rt, + cpu_place, ct, p::CUDAPlace(gpu_list_[i]), rt, recv_tensor.numel() * sizeof(float), - static_cast(dev_ctxs[i])->stream()); + static_cast(dev_ctxs_[i])->stream()); for (int64_t j = 0; j < f::product(kDims); ++j) { - ASSERT_NEAR(ct[j], result, 1e-5); + ASSERT_NEAR(ct[j], expected_result, 1e-5); } } } @@ -204,22 +199,24 @@ TEST_F(NCCLTester, ncclReduceOp) { std::vector ths; - for (size_t i = 0; i < gpu_list.size(); ++i) { - dev_scopes.emplace_back(&g_scope.NewScope()); - std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], + for (size_t i = 0; i < gpu_list_.size(); ++i) { + dev_scopes.emplace_back(&g_scope_.NewScope()); + std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list_[i], *op2.get(), dev_scopes[i]); ths.emplace_back(std::move(th)); } - for (size_t i = 0; i < gpu_list.size(); ++i) { + for (size_t i = 0; i < gpu_list_.size(); ++i) { ths[i].join(); } - // check results on - float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); + float expected_result = 0.0; + for (int gpu_id : gpu_list_) { + expected_result = expected_result + GetGPUData(gpu_id); + } p::CPUPlace cpu_place; - p::CUDAPlace gpu_place(gpu_list[kRoot]); + p::CUDAPlace gpu_place(gpu_list_[kRoot]); auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get(); auto *rt = recv_tensor.data(); @@ -229,12 +226,12 @@ TEST_F(NCCLTester, ncclReduceOp) { auto *ct = result_tensor->mutable_data(cpu_place); paddle::memory::Copy( - cpu_place, ct, p::CUDAPlace(gpu_list[kRoot]), rt, + cpu_place, ct, p::CUDAPlace(gpu_list_[kRoot]), rt, recv_tensor.numel() * sizeof(float), - static_cast(dev_ctxs[kRoot])->stream()); + static_cast(dev_ctxs_[kRoot])->stream()); for (int64_t j = 0; j < f::product(kDims); ++j) { - ASSERT_NEAR(ct[j], result, 1e-5); + ASSERT_NEAR(ct[j], expected_result, 1e-5); } } @@ -252,23 +249,22 @@ TEST_F(NCCLTester, ncclBcastOp) { std::vector ths; - for (size_t i = 0; i < gpu_list.size(); ++i) { - dev_scopes.emplace_back(&g_scope.NewScope()); - std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], + for (size_t i = 0; i < gpu_list_.size(); ++i) { + dev_scopes.emplace_back(&g_scope_.NewScope()); + std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list_[i], *op2.get(), dev_scopes[i]); ths.emplace_back(std::move(th)); } - for (size_t i = 0; i < gpu_list.size(); ++i) { + for (size_t i = 0; i < gpu_list_.size(); ++i) { ths[i].join(); } const int idx = 1; - // check results on - float result = kRoot; + float result = GetGPUData(kRoot); p::CPUPlace cpu_place; - p::CUDAPlace gpu_place(gpu_list[idx]); + p::CUDAPlace gpu_place(gpu_list_[idx]); auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get(); auto *rt = recv_tensor.data(); @@ -277,42 +273,11 @@ TEST_F(NCCLTester, ncclBcastOp) { auto *ct = result_tensor->mutable_data(cpu_place); paddle::memory::Copy( - cpu_place, ct, 
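The rewritten NCCL checks above no longer sum the raw GPU ids; every rank now fills its tensor with GetGPUData(gpu_id), i.e. gpu_id + 42, and the allreduce/reduce expectation is the sum of those values over all participating GPUs. A tiny sketch of that expected-value computation (the four-GPU list is just an example):

```cpp
#include <cassert>
#include <vector>

// Mirrors NCCLTester::GetGPUData(): each rank contributes gpu_id + 42, so an
// allreduce with ncclSum should yield the sum over all ranks.
int GetGPUData(int gpu_id) { return gpu_id + 42; }

int main() {
  std::vector<int> gpu_list = {0, 1, 2, 3};
  float expected_result = 0.f;
  for (int gpu_id : gpu_list) expected_result += GetGPUData(gpu_id);
  assert(expected_result == 42 + 43 + 44 + 45);  // 174 for four GPUs
  return 0;
}
```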
p::CUDAPlace(gpu_list[idx]), rt, + cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), rt, recv_tensor.numel() * sizeof(float), - static_cast(dev_ctxs[idx])->stream()); + static_cast(dev_ctxs_[idx])->stream()); for (int64_t j = 0; j < f::product(kDims); ++j) { ASSERT_NEAR(ct[j], result, 1e-5); } } - -int main(int argc, char **argv) { - // FIXME(tonyyang-svail): - // Due to the driver issue on our CI, disable for now - return 0; - const int dev_count = p::GetCUDADeviceCount(); - if (dev_count <= 1) { - LOG(WARNING) - << "Cannot test multi-gpu nccl, because the CUDA device count is " - << dev_count; - return 0; - } - - std::vector places; - - places.emplace_back(paddle::platform::CPUPlace()); - int count = paddle::platform::GetCUDADeviceCount(); - for (int i = 0; i < count; ++i) { - places.emplace_back(paddle::platform::CUDAPlace(i)); - gpu_list.emplace_back(i); - } - - VLOG(0) << " DeviceCount " << count; - paddle::platform::DeviceContextPool::Init(places); - - testing::InitGoogleTest(&argc, argv); - - // device context should be release before scope. - // otherwise driver will down. - return RUN_ALL_TESTS(); -} diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c88578570c1acdecaa97dd8b12a702778fef2b7e --- /dev/null +++ b/paddle/fluid/operators/pool_mkldnn_op.cc @@ -0,0 +1,217 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/pool_op.h" +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace operators { + +template +class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + const Tensor* input = ctx.Input("X"); + Tensor* output = ctx.Output("Out"); + + // Get an unique name from "argument" name of "Out" variable + // This name will be used as key when saving info into device context + const std::string key = ctx.op().Output("Out"); + const std::string key_pool_pd = key + "@pool_pd"; + const std::string key_pool_workspace_memory = + key + "@pool_workspace_memory"; + + std::string pooling_type = ctx.Attr("pooling_type"); + std::vector ksize = ctx.Attr>("ksize"); + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + if (ctx.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(input->dims()[i + 2]); + } + } + + // Only 2D pooling is supported now + PADDLE_ENFORCE(ksize.size() == 2, "ksize must be 2D, i.e. 
2D pooling"); + PADDLE_ENFORCE(pooling_type == "max" || pooling_type == "avg", + "pooling_type must be 'max' or 'avg'"); + PADDLE_ENFORCE(input->dims().size() == 4, + "Input dim must be with 4, i.e. NCHW"); + + const T* input_data = input->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + + std::vector src_tz = paddle::framework::vectorize2int(input->dims()); + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + + // TODO(pzelazko-intel): support more formats + auto src_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, + mkldnn::memory::format::nchw); + auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32, + mkldnn::memory::format::nchw); + + std::shared_ptr pool_pd = + CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize, + pooling_type, mkldnn_engine); + + // save pool_pd into global device context to be referred in backward path + dev_ctx.SetBlob(key_pool_pd, pool_pd); + + std::shared_ptr workspace_memory = + CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine); + + // save pool_workspace_memory to be referred in backward path + dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); + + auto src_memory = + mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data); + auto dst_memory = + mkldnn::memory({dst_md, mkldnn_engine}, (void*)output_data); + + auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory, + *workspace_memory); + + // push primitive to stream and wait until it's executed + std::vector pipeline{pool_prim}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } + + private: + std::unique_ptr CreatePrimitiveDesc( + const mkldnn::memory::desc& src, const mkldnn::memory::desc& dst, + const std::vector& stride, const std::vector& padding, + const std::vector& kernel, const std::string& pooling_type, + const mkldnn::engine& engine) const { + auto pool_desc = mkldnn::pooling_forward::desc( + mkldnn::prop_kind::forward, + pooling_type == "max" ? mkldnn::algorithm::pooling_max + : mkldnn::algorithm::pooling_avg, + src, dst, stride, kernel, padding, padding, mkldnn::padding_kind::zero); + + auto p_pool_pd = + new mkldnn::pooling_forward::primitive_desc(pool_desc, engine); + return std::unique_ptr(p_pool_pd); + } + + std::unique_ptr CreateWorkspaceMemory( + std::shared_ptr pool_pd, + const std::string& pooling_type, const mkldnn::engine& engine) const { + mkldnn::memory::primitive_desc workspace_md = + pooling_type == "max" + ? 
pool_pd->workspace_primitive_desc() + : mkldnn::memory::primitive_desc( + {{}, mkldnn::memory::f32, mkldnn::memory::format::nchw}, + engine); + + auto p_workspace_memory = new mkldnn::memory(workspace_md); + return std::unique_ptr(p_workspace_memory); + } +}; + +template +class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + const Tensor* in_x = ctx.Input("X"); + const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); + Tensor* in_x_grad = ctx.Output(framework::GradVarName("X")); + + // Get an unique name from "argument" name of "Out" variable + // This name will be used as key when referring info from device context + const std::string key = ctx.op().Input("Out"); + const std::string key_pool_pd = key + "@pool_pd"; + const std::string key_pool_workspace_memory = + key + "@pool_workspace_memory"; + + std::string pooling_type = ctx.Attr("pooling_type"); + std::vector ksize = ctx.Attr>("ksize"); + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + + if (ctx.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_x->dims()[i + 2]); + } + } + + auto& dev_ctx = + ctx.template device_context(); + const mkldnn::engine& mkldnn_engine = dev_ctx.GetEngine(); + + const T* out_grad_data = out_grad->data(); + T* in_x_grad_data = in_x_grad->mutable_data(ctx.GetPlace()); + + std::vector diff_src_tz = + paddle::framework::vectorize2int(in_x_grad->dims()); + std::vector diff_dst_tz = + paddle::framework::vectorize2int(out_grad->dims()); + + auto diff_src_md = platform::MKLDNNMemDesc(diff_src_tz, mkldnn::memory::f32, + mkldnn::memory::format::nchw); + auto diff_dst_md = platform::MKLDNNMemDesc(diff_dst_tz, mkldnn::memory::f32, + mkldnn::memory::format::nchw); + + // Retrieve pool_pd/pool_workspace_memory from device context + auto pool_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_pool_pd)); + PADDLE_ENFORCE(pool_pd != nullptr, + "Fail to find pool_pd in device context"); + + auto workspace_memory = std::static_pointer_cast( + dev_ctx.GetBlob(key_pool_workspace_memory)); + PADDLE_ENFORCE(workspace_memory != nullptr, + "Fail to find workspace_memory in device context"); + + auto pool_bwd_desc = mkldnn::pooling_backward::desc( + pooling_type == "max" ? 
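The forward and backward MKLDNN pooling kernels above communicate through the device context: the forward pass stores its primitive descriptor and workspace memory under keys derived from the name of the "Out" variable, and the backward pass retrieves them with the same keys. A stand-alone sketch of that caching pattern; BlobCache stands in for the MKLDNN device context, the key value is hypothetical, and only the key scheme and type-erased shared_ptr storage reflect the code above:

```cpp
#include <map>
#include <memory>
#include <string>
#include <utility>

// Type-erased per-key cache, analogous to the SetBlob/GetBlob calls above.
class BlobCache {
 public:
  void SetBlob(const std::string& key, std::shared_ptr<void> blob) {
    blobs_[key] = std::move(blob);
  }
  std::shared_ptr<void> GetBlob(const std::string& key) const {
    auto it = blobs_.find(key);
    return it == blobs_.end() ? nullptr : it->second;
  }

 private:
  std::map<std::string, std::shared_ptr<void>> blobs_;
};

int main() {
  BlobCache dev_ctx;
  const std::string key = "pool2d_0.tmp_0";          // would be ctx.op().Output("Out")
  const std::string key_pool_pd = key + "@pool_pd";  // same suffix as above
  dev_ctx.SetBlob(key_pool_pd, std::make_shared<int>(42));  // stands in for pool_pd
  // Backward pass: look the blob up again and cast back to the concrete type.
  auto pd = std::static_pointer_cast<int>(dev_ctx.GetBlob(key_pool_pd));
  return (pd && *pd == 42) ? 0 : 1;
}
```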
mkldnn::algorithm::pooling_max + : mkldnn::algorithm::pooling_avg, + diff_src_md, diff_dst_md, strides, ksize, paddings, paddings, + mkldnn::padding_kind::zero); + auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc( + pool_bwd_desc, mkldnn_engine, *pool_pd); + + auto diff_src_memory = + mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)in_x_grad_data); + auto diff_dst_memory = + mkldnn::memory({diff_dst_md, mkldnn_engine}, (void*)out_grad_data); + + auto bwd_prim = mkldnn::pooling_backward( + pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory); + + // push primitive to stream and wait until it's executed + std::vector pipeline{bwd_prim}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } // Compute() +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_KERNEL(pool2d, MKLDNN, ::paddle::platform::CPUPlace, + paddle::operators::PoolMKLDNNOpKernel); +REGISTER_OP_KERNEL(pool2d_grad, MKLDNN, ::paddle::platform::CPUPlace, + paddle::operators::PoolMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index ac22acb25a7ab33a26de49804667703e84b78a8a..d78da10016a0e2b1d9a0ca9f3dfe4e8009bbe61d 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -13,6 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/pool_op.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cudnn_helper.h" +#endif +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace operators { @@ -76,20 +82,18 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const { framework::OpKernelType PoolOp::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { - bool use_cudnn = ctx.Attr("use_cudnn"); - use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); + framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(ctx.GetPlace())) { - auto &dev_ctx = ctx.template device_context(); - use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + if (platform::CanCUDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kCUDNN; } #endif - framework::LibraryType library_; - if (use_cudnn) { - library_ = framework::LibraryType::kCUDNN; - } else { - library_ = framework::LibraryType::kPlain; +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; } +#endif std::string data_format = ctx.Attr("data_format"); framework::DataLayout layout_ = framework::StringToDataLayout(data_format); @@ -107,20 +111,18 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const { framework::OpKernelType PoolOpGrad::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { - bool use_cudnn = ctx.Attr("use_cudnn"); - use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); + framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(ctx.GetPlace())) { - auto &dev_ctx = ctx.template device_context(); - use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + if (platform::CanCUDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kCUDNN; } #endif - framework::LibraryType library_; - if (use_cudnn) { - library_ = framework::LibraryType::kCUDNN; - } else { - library_ = framework::LibraryType::kPlain; +#ifdef PADDLE_WITH_MKLDNN 
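The new GetExpectedKernelType logic above (and its grad-op twin just below) picks the kernel library with a simple fallback chain: cuDNN if it can be used, otherwise MKLDNN, otherwise the plain kernel. A minimal sketch of that selection; the two booleans stand in for platform::CanCUDNNBeUsed(ctx) and platform::CanMKLDNNBeUsed(ctx):

```cpp
enum class LibraryType { kPlain, kCUDNN, kMKLDNN };

// Prefer cuDNN, then MKLDNN, then the plain kernel, mirroring the #ifdef blocks above.
LibraryType ChooseLibrary(bool can_use_cudnn, bool can_use_mkldnn) {
  LibraryType library = LibraryType::kPlain;
  if (can_use_cudnn) {
    library = LibraryType::kCUDNN;
  }
  if (library == LibraryType::kPlain && can_use_mkldnn) {
    library = LibraryType::kMKLDNN;
  }
  return library;
}

int main() {
  // A CPU-only build with MKLDNN enabled would resolve to the MKLDNN kernel.
  return ChooseLibrary(false, true) == LibraryType::kMKLDNN ? 0 : 1;
}
```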
+ if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; } +#endif std::string data_format = ctx.Attr("data_format"); framework::DataLayout layout_ = framework::StringToDataLayout(data_format); @@ -181,6 +183,9 @@ Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker) "output height and width. False is the default. If it is set to False, " "the floor function will be used.") .SetDefault(false); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " @@ -276,6 +281,9 @@ Pool3dOpMaker::Pool3dOpMaker(OpProto *proto, OpAttrChecker *op_checker) "output height and width. False is the default. If it is set to False, " "the floor function will be used.") .SetDefault(false); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " diff --git a/paddle/fluid/operators/prior_box_op.cc b/paddle/fluid/operators/prior_box_op.cc index be7898c22190339e0717317807b91e038f4949f6..7ba55437cb20f802cc12ceea7777d7d78bba62a6 100644 --- a/paddle/fluid/operators/prior_box_op.cc +++ b/paddle/fluid/operators/prior_box_op.cc @@ -111,7 +111,8 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker { }); AddAttr>( "max_sizes", - "(vector) List of max sizes of generated prior boxes."); + "(vector) List of max sizes of generated prior boxes.") + .SetDefault(std::vector{}); AddAttr>( "aspect_ratios", "(vector) List of aspect ratios of generated prior boxes."); diff --git a/paddle/fluid/operators/prior_box_op.h b/paddle/fluid/operators/prior_box_op.h index 0113d2f09af90eec32632180741db3a1fc40c724..18bb2deb6b5acf626dfb2883a5771d9d195d45c0 100644 --- a/paddle/fluid/operators/prior_box_op.h +++ b/paddle/fluid/operators/prior_box_op.h @@ -97,9 +97,6 @@ class PriorBoxOpKernel : public framework::OpKernel { boxes->mutable_data(ctx.GetPlace()); vars->mutable_data(ctx.GetPlace()); - T inv_img_width = 1.0 / img_width; - T inv_img_height = 1.0 / img_height; - auto e_boxes = framework::EigenTensor::From(*boxes); for (int h = 0; h < feature_height; ++h) { for (int w = 0; w < feature_width; ++w) { @@ -110,36 +107,30 @@ class PriorBoxOpKernel : public framework::OpKernel { for (size_t s = 0; s < min_sizes.size(); ++s) { auto min_size = min_sizes[s]; // first prior: aspect_ratio = 1, size = min_size - box_width = box_height = min_size; + box_width = box_height = min_size / 2.; // xmin - e_boxes(h, w, idx, 0) = (center_x - box_width * 0.5) * inv_img_width; + e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; // ymin - e_boxes(h, w, idx, 1) = - (center_y - box_height * 0.5) * inv_img_height; + e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; // xmax - e_boxes(h, w, idx, 2) = (center_x + box_width * 0.5) * inv_img_width; + e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; // ymax - e_boxes(h, w, idx, 3) = - (center_y + box_height * 0.5) * inv_img_height; + e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; idx++; if (max_sizes.size() > 0) { auto max_size = max_sizes[s]; // second prior: aspect_ratio = 1, // size = sqrt(min_size * max_size) - box_width = box_height = sqrt(min_size * max_size); + box_width = box_height = sqrt(min_size * max_size) / 2.; // xmin - e_boxes(h, w, idx, 0) = - (center_x - box_width * 0.5) * inv_img_width; + e_boxes(h, w, idx, 0) = (center_x - box_width) / 
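The prior_box_op.h rewrite above keeps half of the box extent in box_width/box_height (hence the "/ 2.") and divides by the image size directly instead of multiplying by a cached inverse; the aspect-ratio priors handled just below follow the same pattern. A small worked example of the normalized corners this produces, with illustrative numbers rather than values from a Paddle test:

```cpp
#include <cmath>
#include <cstdio>

int main() {
  const float img_w = 300.f, img_h = 300.f;
  const float center_x = 4.f, center_y = 4.f;  // feature-map cell center in pixels
  const float min_size = 30.f, ar = 2.f;       // one min_size and one aspect ratio

  // First prior: aspect_ratio = 1, size = min_size; bw/bh hold half the extent.
  float bw = min_size / 2.f, bh = min_size / 2.f;
  std::printf("[%g %g %g %g]\n", (center_x - bw) / img_w, (center_y - bh) / img_h,
              (center_x + bw) / img_w, (center_y + bh) / img_h);

  // Aspect-ratio prior: width scaled by sqrt(ar), height by 1/sqrt(ar).
  bw = min_size * std::sqrt(ar) / 2.f;
  bh = min_size / std::sqrt(ar) / 2.f;
  std::printf("[%g %g %g %g]\n", (center_x - bw) / img_w, (center_y - bh) / img_h,
              (center_x + bw) / img_w, (center_y + bh) / img_h);
  return 0;
}
```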
img_width; // ymin - e_boxes(h, w, idx, 1) = - (center_y - box_height * 0.5) * inv_img_height; + e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; // xmax - e_boxes(h, w, idx, 2) = - (center_x + box_width * 0.5) * inv_img_width; + e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; // ymax - e_boxes(h, w, idx, 3) = - (center_y + box_height * 0.5) * inv_img_height; + e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; idx++; } @@ -149,20 +140,16 @@ class PriorBoxOpKernel : public framework::OpKernel { if (fabs(ar - 1.) < 1e-6) { continue; } - box_width = min_size * sqrt(ar); - box_height = min_size / sqrt(ar); + box_width = min_size * sqrt(ar) / 2.; + box_height = min_size / sqrt(ar) / 2.; // xmin - e_boxes(h, w, idx, 0) = - (center_x - box_width * 0.5) * inv_img_width; + e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; // ymin - e_boxes(h, w, idx, 1) = - (center_y - box_height * 0.5) * inv_img_height; + e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; // xmax - e_boxes(h, w, idx, 2) = - (center_x + box_width * 0.5) * inv_img_width; + e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; // ymax - e_boxes(h, w, idx, 3) = - (center_y + box_height * 0.5) * inv_img_height; + e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; idx++; } } diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc index 62beab82d4f2b0b795d5d32f50352172de6870cc..2a5605e0d378a184ae132e657b2872279784855d 100644 --- a/paddle/fluid/operators/read_op.cc +++ b/paddle/fluid/operators/read_op.cc @@ -60,15 +60,16 @@ class ReadOp : public framework::OperatorBase { const platform::Place& dev_place) const override { framework::ReaderHolder* reader = scope.FindVar(Input("Reader"))->GetMutable(); - if (!reader->HasNext()) { + std::vector out_arg_names = Outputs("Out"); + std::vector ins; + reader->ReadNext(&ins); + if (ins.empty()) { reader->ReInit(); + reader->ReadNext(&ins); PADDLE_ENFORCE( - reader->HasNext(), + !ins.empty(), "Reader can not read the next data even it has been re-initialized."); } - std::vector out_arg_names = Outputs("Out"); - std::vector ins; - reader->ReadNext(&ins); PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size()); for (size_t i = 0; i < ins.size(); ++i) { auto* out = diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt index 06489f32d64d69030c084a038acb78ac2bac6200..335c5b26a864381bf87a2824b78f521cdce063e4 100644 --- a/paddle/fluid/operators/reader/CMakeLists.txt +++ b/paddle/fluid/operators/reader/CMakeLists.txt @@ -2,4 +2,5 @@ cc_library(reader_op_registry SRCS reader_op_registry.cc DEPS operator op_regist op_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc DEPS reader_op_registry) op_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc DEPS reader_op_registry) op_library(create_batch_reader_op SRCS create_batch_reader_op.cc DEPS reader_op_registry) -set(READER_LIBRARY create_random_data_generator_op create_shuffle_reader_op create_batch_reader_op PARENT_SCOPE) +op_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS reader_op_registry) +set(READER_LIBRARY create_random_data_generator_op create_shuffle_reader_op create_batch_reader_op create_double_buffer_reader_op PARENT_SCOPE) diff --git a/paddle/fluid/operators/reader/create_batch_reader_op.cc b/paddle/fluid/operators/reader/create_batch_reader_op.cc index 
bac043a5529d877dba79c03f07b9d43c9b71d7aa..277f2856c07b3fec2113486539aec1d9139fae92 100644 --- a/paddle/fluid/operators/reader/create_batch_reader_op.cc +++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc @@ -68,10 +68,10 @@ void BatchReader::ReadNext(std::vector* out) { buffer_.clear(); buffer_.reserve(batch_size_); for (int i = 0; i < batch_size_; ++i) { - if (reader_->HasNext()) { - buffer_.push_back(std::vector()); - reader_->ReadNext(&buffer_.back()); - } else { + buffer_.push_back(std::vector()); + reader_->ReadNext(&buffer_.back()); + if (buffer_.back().empty()) { + buffer_.pop_back(); break; } } diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b6a0609a1e23195ececee0f16a69daa1c1c46ed8 --- /dev/null +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "paddle/fluid/framework/channel.h" +#include "paddle/fluid/operators/reader/reader_op_registry.h" + +namespace paddle { +namespace operators { +namespace reader { + +static constexpr size_t kDoubleBufferSize = 2; + +class DoubleBufferReader : public framework::DecoratedReader { + public: + explicit DoubleBufferReader(ReaderBase* reader) + : DecoratedReader(reader), + buffer_(framework::MakeChannel>( + kDoubleBufferSize)) { + std::thread prefetch(&DoubleBufferReader::PrefetchThreadFunc, this); + prefetch.detach(); + } + + void ReadNext(std::vector* out) override; + void ReInit() override; + + ~DoubleBufferReader() { buffer_->Close(); } + + private: + void PrefetchThreadFunc(); + + framework::Channel>* buffer_; +}; + +class CreateDoubleBufferReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) + ->Get(); + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + out->Reset(new DoubleBufferReader(underlying_reader.Get())); + } +}; + +class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase { + public: + CreateDoubleBufferReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) + : DecoratedReaderMakerBase(op_proto, op_checker) { + AddComment(R"DOC( + CreateDoubleBufferReader Operator + + A double buffer reader takes another reader as its 'underlying reader'. + It launches another thread to execute the 'underlying reader' asynchronously, + which prevents reading process from blocking subsequent training. 
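The DoubleBufferReader declared above prefetches batches on a detached thread through a bounded framework::Channel of size kDoubleBufferSize (2), and signals end-of-file by handing the consumer an empty batch, the same contract the reworked read_op relies on. A stand-alone sketch of that producer/consumer shape using only the standard library; it illustrates the idea and is not the operator's actual channel-based implementation:

```cpp
#include <condition_variable>
#include <cstddef>
#include <deque>
#include <mutex>
#include <thread>
#include <vector>

// Bounded prefetch queue: a worker thread fills `queue_` while the consumer
// drains it; an empty batch from ReadNext() means the data is exhausted.
class PrefetchingReader {
 public:
  explicit PrefetchingReader(std::vector<std::vector<float>> data)
      : data_(std::move(data)), worker_(&PrefetchingReader::Prefetch, this) {}

  // The consumer is expected to drain to EOF before destruction.
  ~PrefetchingReader() { worker_.join(); }

  // Returns an empty vector once the underlying data is exhausted.
  std::vector<float> ReadNext() {
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return !queue_.empty() || done_; });
    if (queue_.empty()) return {};
    std::vector<float> batch = std::move(queue_.front());
    queue_.pop_front();
    cv_.notify_all();
    return batch;
  }

 private:
  void Prefetch() {
    for (auto& batch : data_) {
      std::unique_lock<std::mutex> lock(mu_);
      cv_.wait(lock, [this] { return queue_.size() < kCapacity; });
      queue_.push_back(std::move(batch));
      cv_.notify_all();
    }
    std::lock_guard<std::mutex> lock(mu_);
    done_ = true;
    cv_.notify_all();
  }

  static constexpr std::size_t kCapacity = 2;  // plays the role of kDoubleBufferSize
  std::vector<std::vector<float>> data_;
  std::deque<std::vector<float>> queue_;
  std::mutex mu_;
  std::condition_variable cv_;
  bool done_ = false;
  std::thread worker_;  // declared last so all other members exist when it starts
};
```

A consumer simply loops on ReadNext() until it receives an empty batch, which is exactly how the new read_op detects exhaustion before calling ReInit().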
+ )DOC"); + } +}; + +void DoubleBufferReader::ReadNext(std::vector* out) { + out->clear(); + buffer_->Receive(out); +} + +void DoubleBufferReader::ReInit() { + reader_->ReInit(); + buffer_->Close(); + // The existing prefetch thread will terminate for the buffer_ is closed. + buffer_ = framework::MakeChannel>( + kDoubleBufferSize); + std::thread prefetch(&DoubleBufferReader::PrefetchThreadFunc, this); + prefetch.detach(); +} + +void DoubleBufferReader::PrefetchThreadFunc() { + VLOG(5) << "A new prefetch thread starts."; + while (true) { + std::vector batch; + reader_->ReadNext(&batch); + if (batch.empty()) { + // EOF + buffer_->Close(); + VLOG(5) << "Reached the end of the file. The prefetch thread terminates."; + break; + } + if (!buffer_->Send(&batch)) { + VLOG(5) << "WARNING: The double buffer channel has been closed. The " + "prefetch thread terminates."; + break; + } + } +} + +} // namespace reader +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators::reader; +REGISTER_DECORATED_READER_OPERATOR(create_double_buffer_reader, + ops::CreateDoubleBufferReaderOp, + ops::CreateDoubleBufferReaderOpMaker); diff --git a/paddle/fluid/operators/reader/create_random_data_generator_op.cc b/paddle/fluid/operators/reader/create_random_data_generator_op.cc index f77ab8ab196dae4cf9351cee9bc5566ec2c04e4b..73c39b5da4484b27f75aeba3c8171c5ffed2398f 100644 --- a/paddle/fluid/operators/reader/create_random_data_generator_op.cc +++ b/paddle/fluid/operators/reader/create_random_data_generator_op.cc @@ -50,8 +50,6 @@ class RandomDataGenerator : public framework::FileReader { } } - bool HasNext() const override { return true; } - void ReInit() override { return; } private: diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc index 3e8b463efc99e4a962e5ae14ab133cf634548756..4dac3831109beeed660d32f08fb27c7adf62ac2b 100644 --- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc +++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc @@ -39,10 +39,10 @@ void ShuffleReader::ReadNext(std::vector* out) { buffer_.clear(); buffer_.reserve(buffer_size_); for (int i = 0; i < buffer_size_; ++i) { - if (reader_->HasNext()) { - buffer_.push_back(std::vector()); - reader_->ReadNext(&buffer_.back()); - } else { + buffer_.push_back(std::vector()); + reader_->ReadNext(&buffer_.back()); + if (buffer_.back().empty()) { + buffer_.pop_back(); break; } } diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc index 7ea4f4b8d9feecac5bc2d0338bbbe9ab7a532040..f80769d7cd2d35261cd55fc1d6c8c20197f5e88c 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.cc +++ b/paddle/fluid/operators/reader/reader_op_registry.cc @@ -49,6 +49,10 @@ FileReaderMakerBase::FileReaderMakerBase( } void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE( + !ctx->IsRuntime(), + "'FileReaderInferShape' should only be invoked during compile time."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "The output file reader should not be null."); const auto shape_concat = ctx->Attrs().Get>("shape_concat"); @@ -56,16 +60,14 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const { std::vector shapes = RestoreShapes(shape_concat, ranks); ctx->SetReaderDims("Out", shapes); - if (ctx->IsRuntime()) { - const auto lod_levels = ctx->Attrs().Get>("lod_levels"); - PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(), - "The 
number of 'lod_levels'(%d) doesn't match the number " - "of 'shapes'(%d).", - lod_levels.size(), shapes.size()); - framework::VarDesc* reader = - boost::get(ctx->GetOutputVarPtrs("Out")[0]); - reader->SetLoDLevels(lod_levels); - } + const auto lod_levels = ctx->Attrs().Get>("lod_levels"); + PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(), + "The number of 'lod_levels'(%d) doesn't match the number " + "of 'shapes'(%d).", + lod_levels.size(), shapes.size()); + framework::VarDesc* reader = + boost::get(ctx->GetOutputVarPtrs("Out")[0]); + reader->SetLoDLevels(lod_levels); } void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc, @@ -77,19 +79,21 @@ void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc, void DecoratedReaderInferShape::operator()( framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(!ctx->IsRuntime(), + "'DecoratedReaderInferShape' should only be invoked during " + "compile time."); + PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"), "Input(UnderlyingReader) should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "The output decorated reader should not be null."); ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader")); - if (ctx->IsRuntime()) { - framework::VarDesc* in_reader = boost::get( - ctx->GetInputVarPtrs("UnderlyingReader")[0]); - framework::VarDesc* out_reader = - boost::get(ctx->GetOutputVarPtrs("Out")[0]); - out_reader->SetLoDLevels(in_reader->GetLoDLevels()); - } + framework::VarDesc* in_reader = boost::get( + ctx->GetInputVarPtrs("UnderlyingReader")[0]); + framework::VarDesc* out_reader = + boost::get(ctx->GetOutputVarPtrs("Out")[0]); + out_reader->SetLoDLevels(in_reader->GetLoDLevels()); } void DecoratedReaderInferVarType::operator()( const framework::OpDesc& op_desc, framework::BlockDesc* block) const { diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 178976f96fdbd08cead7b7c518ea1fbaaa2a5db8..8fdd08eae6b22cd57506d6e75182c1a7e2022562 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -24,15 +24,15 @@ limitations under the License. */ namespace paddle { namespace operators { -static bool IsVariableInitialized(const framework::Scope& scope, - const std::string& varname) { +static bool NeedSend(const framework::Scope& scope, + const std::string& varname) { auto* var = scope.FindVar(varname); PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.", varname); if (var->IsType()) { return var->Get().IsInitialized(); } else if (var->IsType()) { - return var->Get().value().IsInitialized(); + return var->Get().rows().size() > 0UL; } else { PADDLE_THROW( "Variable type in send side should be in " @@ -67,7 +67,7 @@ class SendOp : public framework::OperatorBase { detail::RPCClient* rpc_client = client_var->GetMutable(); for (size_t i = 0; i < ins.size(); i++) { - if (IsVariableInitialized(scope, ins[i])) { + if (NeedSend(scope, ins[i])) { VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]); } else { diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc index 7cc73de8788e3dceb763b6f5a1519459d0fb05dd..d0aa2f9cbadaadf4e7e625628d9db5677d50d277 100644 --- a/paddle/fluid/operators/sgd_op.cc +++ b/paddle/fluid/operators/sgd_op.cc @@ -39,6 +39,14 @@ class SGDOp : public framework::OperatorWithKernel { // and run time. 
ctx->SetOutputDim("ParamOut", param_dim); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Param")->type()), + ctx.GetPlace()); + } }; class SGDOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h index 2fec84815a9ecc63675de88816b23cfaa75aca65..0ad801079400f1830d85a945e57a434a86adeb00 100644 --- a/paddle/fluid/operators/sgd_op.h +++ b/paddle/fluid/operators/sgd_op.h @@ -47,6 +47,12 @@ class SGDOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(param, param_out); auto* grad = ctx.Input("Grad"); + // for distributed training, a sparse var may be empty, + // just skip updating. + if (grad->rows().size() == 0) { + return; + } + auto in_height = grad->height(); auto out_dims = param_out->dims(); PADDLE_ENFORCE_EQ(in_height, out_dims[0]); @@ -60,13 +66,15 @@ class SGDOpKernel : public framework::OpKernel { auto* in_data = in_value.data(); auto* out_data = param_out->data(); auto* lr = learning_rate->data(); - for (size_t i = 0; i < in_rows.size(); i++) { + PADDLE_ENFORCE(in_rows[i] < in_height, + "Input rows index should less than height"); for (int64_t j = 0; j < in_row_numel; j++) { out_data[in_rows[i] * in_row_numel + j] -= lr[0] * in_data[i * in_row_numel + j]; } } + } else { PADDLE_THROW("Unsupported Variable Type of Grad"); } diff --git a/paddle/fluid/operators/split_selected_rows_op.h b/paddle/fluid/operators/split_selected_rows_op.h index 23baf8e72eca87f3865fc2b63ce2de96f799dce3..0e9ce165b98845f4745ee70b028513ea31cc6657 100644 --- a/paddle/fluid/operators/split_selected_rows_op.h +++ b/paddle/fluid/operators/split_selected_rows_op.h @@ -21,15 +21,24 @@ limitations under the License. 
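The SGD kernel change above makes the sparse path tolerate empty gradients (a SelectedRows produced during distributed training may carry no rows at all) and bounds every row index against grad->height() before updating. A dependency-free sketch of that row-wise update, with the empty-gradient early return and the bound check made explicit:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Sparse SGD step: param[row] -= lr * grad_row for each row named in `rows`.
// `height` plays the role of grad->height(); `row_numel` is the row width.
void SparseSgdUpdate(const std::vector<int64_t>& rows, int64_t height,
                     int64_t row_numel, const std::vector<float>& grad_value,
                     float lr, std::vector<float>* param) {
  if (rows.empty()) return;  // empty sparse gradient: nothing to update
  for (std::size_t i = 0; i < rows.size(); ++i) {
    if (rows[i] < 0 || rows[i] >= height) continue;  // the real kernel enforces this bound
    for (int64_t j = 0; j < row_numel; ++j) {
      (*param)[rows[i] * row_numel + j] -= lr * grad_value[i * row_numel + j];
    }
  }
}
```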
*/ namespace paddle { namespace operators { -static int FindOutIdx(int row, const std::vector& height_sections) { - int offset = 0; - for (size_t i = 0; i < height_sections.size(); ++i) { - if (row >= offset && row < (offset + height_sections[i])) { - return i; +static int FindOutIdx(int row, const std::vector& abs_sections) { + for (size_t i = 1; i < abs_sections.size(); ++i) { + if (row < abs_sections[i]) { + return i - 1; } - offset += height_sections[i]; } - return -1; + return abs_sections.size() - 1; +} + +static std::vector ToAbsoluteSection( + const std::vector& height_sections) { + std::vector abs_sections; + abs_sections.resize(height_sections.size()); + abs_sections[0] = 0; + for (size_t i = 1; i < height_sections.size(); ++i) { + abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1]; + } + return abs_sections; } template @@ -40,16 +49,23 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel { auto outs = ctx.MultiOutput("Out"); auto height_sections = ctx.Attr>("height_sections"); + auto abs_sections = ToAbsoluteSection(height_sections); + auto x_rows = x->rows(); std::vector> outs_rows_idx; + std::vector> outs_dense_idx; + outs_rows_idx.resize(outs.size()); + outs_dense_idx.resize(outs.size()); auto row_numel = x->value().numel() / x->value().dims()[0]; auto src = x->value().data(); + // split rows index into output sparse vars for (size_t i = 0; i < x_rows.size(); ++i) { - int out_idx = FindOutIdx(x_rows[i], height_sections); - outs_rows_idx[out_idx].push_back(i); + int out_idx = FindOutIdx(x_rows[i], abs_sections); + outs_rows_idx[out_idx].push_back(x_rows[i]); + outs_dense_idx[out_idx].push_back(i); } auto place = ctx.GetPlace(); @@ -61,19 +77,20 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel { dims[0] = rows_idx.size(); outs[i]->mutable_value()->mutable_data(dims, x->place()); for (auto idx : rows_idx) { - outs[i]->mutable_rows()->push_back(x_rows[idx]); + outs[i]->mutable_rows()->push_back(idx - abs_sections[i]); } auto dst = outs[i]->mutable_value()->mutable_data(ctx.GetPlace()); for (size_t j = 0; j < rows_idx.size(); j++) { if (platform::is_cpu_place(place)) { - memory::Copy(platform::CPUPlace(), dst + j * row_numel, - platform::CPUPlace(), src + rows_idx[j] * row_numel, - sizeof(T) * row_numel); + memory::Copy( + platform::CPUPlace(), dst + j * row_numel, platform::CPUPlace(), + src + outs_dense_idx[i][j] * row_numel, sizeof(T) * row_numel); } else { #ifdef PADDLE_WITH_CUDA auto stream = ctx.cuda_device_context().stream(); memory::Copy(platform::CUDAPlace(), dst + j * row_numel, - platform::CUDAPlace(), src + rows_idx[j] * row_numel, + platform::CUDAPlace(), + src + outs_dense_idx[i][j] * row_numel, sizeof(T) * row_numel, stream); #else PADDLE_THROW("Paddle is not compiled with GPU"); diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index c3abb3ea4a53126c22c817069e8ad955b202f09d..d3d5c8a3429e2070c5472355b4440401eaa699cb 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -76,10 +76,16 @@ class SumOp : public framework::OperatorWithKernel { static_cast(dtype), ctx.device_context()); } else if (x_vars[0]->IsType()) { - return framework::OpKernelType( - framework::ToDataType( - x_vars[0]->Get().value().type()), - ctx.device_context()); + for (auto& var : x_vars) { + auto& value = var->Get().value(); + if (value.IsInitialized()) { + return framework::OpKernelType(framework::ToDataType(value.type()), + ctx.device_context()); + } + } + // if input sparse vars are not 
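The split_selected_rows change above first turns height_sections into absolute starting offsets and then locates each row's output section by scanning those offsets; the row index stored into each output is made local by subtracting the section offset. A small reproduction of that logic with a worked example:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Same shape as the helpers above, using int sections for brevity.
static std::vector<int> ToAbsoluteSection(const std::vector<int>& height_sections) {
  std::vector<int> abs_sections(height_sections.size());
  abs_sections[0] = 0;
  for (std::size_t i = 1; i < height_sections.size(); ++i) {
    abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1];
  }
  return abs_sections;
}

static int FindOutIdx(int row, const std::vector<int>& abs_sections) {
  for (std::size_t i = 1; i < abs_sections.size(); ++i) {
    if (row < abs_sections[i]) return static_cast<int>(i) - 1;
  }
  return static_cast<int>(abs_sections.size()) - 1;
}

int main() {
  std::vector<int> height_sections = {3, 5, 7};
  std::vector<int> abs_sections = ToAbsoluteSection(height_sections);  // {0, 3, 8}
  assert(abs_sections[1] == 3 && abs_sections[2] == 8);
  // Global row 9 falls into the third output; its local index there is 9 - 8 = 1,
  // matching the `idx - abs_sections[i]` pushed into the output rows above.
  int out_idx = FindOutIdx(9, abs_sections);
  assert(out_idx == 2 && 9 - abs_sections[out_idx] == 1);
  return 0;
}
```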
initialized, use an default kernel type. + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.device_context()); } else if (x_vars[0]->IsType()) { for (auto& x_var : x_vars) { auto& array = x_var->Get(); diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 48b2d2779aeeb168cf87e61557e01d5cbde476b3..e7e5346cdca5efaf81c2b0fddedde7406e3b874d 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -109,6 +109,12 @@ class SumKernel : public framework::OpKernel { in_dim[0] = static_cast(first_dim); out_value->Resize(framework::make_ddim(in_dim)); + + // if all the input sparse vars are empty, no need to + // merge these vars. + if (first_dim == 0UL) { + return; + } out_value->mutable_data(context.GetPlace()); math::SelectedRowsAddTo functor; @@ -116,7 +122,7 @@ class SumKernel : public framework::OpKernel { int64_t offset = 0; for (int i = 0; i < N; i++) { auto &sel_row = get_selected_row(i); - if (!sel_row.value().IsInitialized() || sel_row.rows().size() == 0) { + if (sel_row.rows().size() == 0) { continue; } PADDLE_ENFORCE_EQ(out->height(), sel_row.height()); diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index a1a743d94e204ca506c994f8fabb6bbf8c22cea5..7eec6ab657723c6390dfa14a78d6c49a76f2a279 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -48,7 +48,6 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_ nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context) -nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context) cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS}) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index bb9fbd468f38fffc94107e321e777fc0e772fbe6..98b4178177b0a8bafd6fe34a92be2a07a2fbc5a7 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -127,6 +127,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { SetDeviceId(place_.device); + compute_capability = GetCUDAComputeCapability(place_.device); multi_process = GetCUDAMultiProcessors(place_.device); max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device); PADDLE_ENFORCE(cudaStreamCreate(&stream_)); @@ -162,6 +163,10 @@ void CUDADeviceContext::Wait() const { PADDLE_ENFORCE(cudaGetLastError()); } +int CUDADeviceContext::GetComputeCapability() const { + return compute_capability; +} + int CUDADeviceContext::GetMaxPhysicalThreadCount() const { return multi_process * max_threads_per_mp; } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index e779644190de1246cd650fbf91eeaeb03494643f..603b890af13b529c490c29112a73a09cc815d07a 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -79,6 +79,9 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return place in the device context. */ Place GetPlace() const override; + /*! \brief Return compute capability in the device context. */ + int GetComputeCapability() const; + /*! 
\brief Return the max physical thread count in the device context */ int GetMaxPhysicalThreadCount() const; @@ -104,6 +107,7 @@ class CUDADeviceContext : public DeviceContext { cudnnHandle_t cudnn_handle_; cublasHandle_t cublas_handle_; + int compute_capability; int multi_process; int max_threads_per_mp; }; diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index 580ed9bb57fca942bb76b9f7bf76f48ea281a0de..fa9041134d863ebfd8d1e00379da3b92323ae6e3 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -68,6 +68,8 @@ extern void *cublas_dso_handle; __macro(cublasDgemv_v2); \ __macro(cublasSgemm_v2); \ __macro(cublasDgemm_v2); \ + __macro(cublasHgemm); \ + __macro(cublasSgemmEx); \ __macro(cublasSgeam_v2); \ __macro(cublasDgeam_v2); \ __macro(cublasCreate_v2); \ @@ -83,6 +85,7 @@ extern void *cublas_dso_handle; __macro(cublasDgemmStridedBatched); \ __macro(cublasCgemmStridedBatched); \ __macro(cublasZgemmStridedBatched); \ + __macro(cublasHgemmStridedBatched); \ __macro(cublasSgetrfBatched); \ __macro(cublasSgetriBatched); \ __macro(cublasDgetrfBatched); \ diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index da4041bad0d82fe1c8c7a12fd0c7177e6dbddef3..dd70ff9ff574b32bc96a9e8255b1bf77a5cc84e4 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -33,6 +33,15 @@ int GetCUDADeviceCount() { return count; } +int GetCUDAComputeCapability(int id) { + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + cudaDeviceProp device_prop; + PADDLE_ENFORCE(cudaGetDeviceProperties(&device_prop, id), + "cudaGetDeviceProperties failed in " + "paddle::platform::GetCUDAComputeCapability"); + return device_prop.major * 10 + device_prop.minor; +} + int GetCUDAMultiProcessors(int id) { PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); int count; diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index c38ccf0f2ade1d2405177b541b33fd84283726ff..fa469fa77f5ca780da153cc87da8d04f239711f3 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/gpu_info.h @@ -30,6 +30,9 @@ const std::string kEnvFractionGpuMemoryToUse = //! Get the total number of GPU devices in system. int GetCUDADeviceCount(); +//! Get the compute capability of the ith GPU (format: major * 10 + minor) +int GetCUDAComputeCapability(int i); + //! Get the MultiProcessors of the ith GPU. int GetCUDAMultiProcessors(int i); diff --git a/paddle/fluid/platform/nccl_test.cu b/paddle/fluid/platform/nccl_test.cu deleted file mode 100644 index 32a293796c09e5254c5eb48d11fa74617b3465ac..0000000000000000000000000000000000000000 --- a/paddle/fluid/platform/nccl_test.cu +++ /dev/null @@ -1,153 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include - -#include "glog/logging.h" -#include "gtest/gtest.h" - -#include "paddle/fluid/framework/init.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/dynload/nccl.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" - -static int dev_count = 0; - -namespace paddle { -namespace platform { - -TEST(NCCL, init) { - std::vector comms; - comms.resize(dev_count); - PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr)); - - for (int i = 0; i < dev_count; ++i) { - dynload::ncclCommDestroy(comms[i]); - } -} - -template -struct PerThreadData { - thrust::device_vector send_buff; - thrust::device_vector recv_buff; - CUDADeviceContext dev_ctx; - - T* SendBuff() { return thrust::raw_pointer_cast(send_buff.data()); } - - T* RecvBuff() { return thrust::raw_pointer_cast(recv_buff.data()); } - - PerThreadData(int gpu_id, size_t size) : dev_ctx(CUDAPlace(gpu_id)) { - send_buff.resize(size); - for (size_t i = 0; i < size; ++i) { - send_buff[i] = static_cast(i); - } - recv_buff.resize(size); - } -}; - -static constexpr int ELEM_COUNT = 10000; - -TEST(NCCL, all_reduce) { - std::vector comms; - comms.resize(dev_count); - VLOG(1) << "Initializing ncclComm"; - dynload::ncclCommInitAll(comms.data(), dev_count, nullptr); - VLOG(1) << "ncclComm initialized"; - VLOG(1) << "Creating thread data"; - std::vector>> data; - data.reserve(dev_count); - for (int i = 0; i < dev_count; ++i) { - VLOG(1) << "Creating thread data for device " << i; - SetDeviceId(i); - data.emplace_back(new PerThreadData(i, ELEM_COUNT)); - } - VLOG(1) << "Thread data created"; - - VLOG(1) << "Check send_buf data"; - for (int i = 0; i < dev_count; ++i) { - VLOG(1) << "Check on device " << i; - SetDeviceId(i); - thrust::host_vector tmp = data[i]->send_buff; - for (size_t j = 0; j < tmp.size(); ++j) { - ASSERT_NEAR(static_cast(j), tmp[j], 1e-5); - } - } - - VLOG(1) << "Invoking ncclAllReduce"; - - dynload::ncclGroupStart(); - for (int i = 0; i < dev_count; ++i) { - VLOG(1) << "Invoking ncclAllReduce with device " << i; - SetDeviceId(i); - PADDLE_ENFORCE(dynload::ncclAllReduce( - data[i]->SendBuff(), data[i]->RecvBuff(), ELEM_COUNT, ncclDouble, - ncclSum, comms[i], data[i]->dev_ctx.stream())); - VLOG(1) << "Invoked ncclAllReduce for device " << i; - } - dynload::ncclGroupEnd(); - - VLOG(1) << "Invoked ncclAllReduce"; - - VLOG(1) << "Sync devices"; - for (int i = 0; i < dev_count; ++i) { - VLOG(1) << "Sync device " << i; - SetDeviceId(i); - data[i]->dev_ctx.Wait(); - } - VLOG(1) << "device synced"; - - for (int i = 0; i < dev_count; ++i) { - SetDeviceId(i); - VLOG(1) << "Checking vector on device " << i; - thrust::host_vector tmp = data[i]->recv_buff; - for (size_t j = 0; j < tmp.size(); ++j) { - auto elem = static_cast(j); - elem *= dev_count; - ASSERT_NEAR(tmp[j], elem, 1e-4); - } - } - - for (int i = 0; i < dev_count; ++i) { - dynload::ncclCommDestroy(comms[i]); - } -} -} // namespace platform -} // namespace paddle - -int main(int argc, char** argv) { - dev_count = paddle::platform::GetCUDADeviceCount(); - if (dev_count <= 1) { - LOG(WARNING) - << "Cannot test multi-gpu nccl, because the CUDA device count is " - << dev_count; - return 0; - } - - std::vector places; - - places.emplace_back(paddle::platform::CPUPlace()); - int count = paddle::platform::GetCUDADeviceCount(); - for (int i = 0; i < count; ++i) { - places.emplace_back(paddle::platform::CUDAPlace(i)); - } - - VLOG(0) << " DeviceCount " << count; - 
paddle::platform::DeviceContextPool::Init(places); - - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index b0a2497d919b65afbe5eeaf4fe47c19baa1aba1c..45a64f43846e79c27295e52c59dca6bdfaa120a3 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -161,6 +161,8 @@ void BindBlockDesc(py::module &m) { py::return_value_policy::reference) .def("prepend_op", &BlockDesc::PrependOp, py::return_value_policy::reference) + .def("insert_op", &BlockDesc::InsertOp, + py::return_value_policy::reference) .def("remove_op", &BlockDesc::RemoveOp) .def("var", [](BlockDesc &self, py::bytes byte_name) { diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 0df3fd0343dbdaee88192a402d990fdfc2235811..84a57aff516ad2d7ba1efaf1d530e77747d3b254 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -37,7 +37,7 @@ from distribute_transpiler_simple import SimpleDistributeTranspiler from concurrency import (Go, make_channel, channel_send, channel_recv, channel_close) import clip -from memory_optimization_transpiler import memory_optimize +from memory_optimization_transpiler import memory_optimize, release_memory import profiler import unique_name @@ -63,6 +63,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + [ 'SimpleDistributeTranspiler', 'DistributeTranspiler', 'memory_optimize', + 'release_memory', 'profiler', 'unique_name', ] diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 09d137c9017bb4185308af94287fe0e8aa005505..b6f20daee3a585777a23255355f0a0e31328d23f 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -457,7 +457,8 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, "Out": [_append_grad_suffix_(loss.name)] }, {"shape": [1], "value": 1.0, - "dtype": loss.dtype}) + "dtype": loss.dtype, + "force_cpu": False}) root_block.desc.append_op().copy_from(op_desc) block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0])) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 1817caa94275e4efa47ec1a5a0aa861255c75561..35aa80a2ae9a6289665b581275fb86c3931fd7a8 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -102,6 +102,9 @@ def save_vars(executor, save_var_map = {} for each_var in vars: + # NOTE: don't save the variable which type is RAW + if each_var.type == core.VarDesc.VarType.RAW: + continue new_var = _clone_var_in_block_(save_block, each_var) if filename is None: save_block.append_op( diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 2bf7cf21ca94c742a15d980194b896d9ec8ad91b..ea189749bc6cc1e37c1efc6fea424143b887cecd 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -130,8 +130,13 @@ def detection_output(loc, target_box=loc, code_type='decode_center_size') - nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype) + old_shape = scores.shape + scores = ops.reshape(x=scores, shape=(-1, old_shape[-1])) + scores = ops.softmax(x=scores) + scores = ops.reshape(x=scores, shape=old_shape) scores = nn.transpose(scores, perm=[0, 2, 1]) + + nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype) helper.append_op( type="multiclass_nms", inputs={'Scores': scores, @@ -562,16 +567,16 @@ def multi_box_head(inputs, base_size, num_classes, aspect_ratios, - min_ratio, - 
max_ratio, + min_ratio=None, + max_ratio=None, min_sizes=None, max_sizes=None, steps=None, step_w=None, step_h=None, offset=0.5, - variance=[0.1, 0.1, 0.1, 0.1], - flip=False, + variance=[0.1, 0.1, 0.2, 0.2], + flip=True, clip=False, kernel_size=1, pad=0, @@ -614,7 +619,7 @@ def multi_box_head(inputs, the inputs[i] will be automatically calculated. Default: None. offset(float): Prior boxes center offset. Default: 0.5 variance(list|tuple): the variances to be encoded in prior boxes. - Default:[0.1, 0.1, 0.1, 0.1]. + Default:[0.1, 0.1, 0.2, 0.2]. flip(bool): Whether to flip aspect ratios. Default:False. clip(bool): Whether to clip out-of-boundary boxes. Default: False. kernel_size(int): The kernel size of conv2d. Default: 1. @@ -668,6 +673,19 @@ def multi_box_head(inputs, helper = LayerHelper("prior_box", **locals()) dtype = helper.input_dtype() + attrs = { + 'min_sizes': min_sizes, + 'aspect_ratios': aspect_ratios, + 'variances': variance, + 'flip': flip, + 'clip': clip, + 'step_w': step_w, + 'step_h': step_h, + 'offset': offset + } + if len(max_sizes) > 0 and max_sizes[0] > 0: + attrs['max_sizes'] = max_sizes + box = helper.create_tmp_variable(dtype) var = helper.create_tmp_variable(dtype) helper.append_op( @@ -676,17 +694,7 @@ def multi_box_head(inputs, "Image": image}, outputs={"Boxes": box, "Variances": var}, - attrs={ - 'min_sizes': min_sizes, - 'max_sizes': max_sizes, - 'aspect_ratios': aspect_ratios, - 'variances': variance, - 'flip': flip, - 'clip': clip, - 'step_w': step_w, - 'step_h': step_h, - 'offset': offset - }) + attrs=attrs, ) return box, var def _reshape_with_axis_(input, axis=1): @@ -714,7 +722,7 @@ def multi_box_head(inputs, if num_layer <= 2: assert min_sizes is not None and max_sizes is not None assert len(min_sizes) == num_layer and len(max_sizes) == num_layer - else: + elif min_sizes is None and max_sizes is None: min_sizes = [] max_sizes = [] step = int(math.floor(((max_ratio - min_ratio)) / (num_layer - 2))) @@ -759,9 +767,6 @@ def multi_box_head(inputs, min_size = [min_size] if not _is_list_or_tuple_(max_size): max_size = [max_size] - if not (len(max_size) == len(min_size)): - raise ValueError( - 'the length of max_size and min_size should be equal.') aspect_ratio = [] if aspect_ratios is not None: @@ -779,7 +784,7 @@ def multi_box_head(inputs, num_boxes = box.shape[2] - # get box_loc + # get loc num_loc_output = num_boxes * 4 mbox_loc = nn.conv2d( input=input, @@ -796,7 +801,7 @@ def multi_box_head(inputs, mbox_loc_flatten = ops.reshape(mbox_loc, shape=new_shape) mbox_locs.append(mbox_loc_flatten) - # get conf_loc + # get conf num_conf_output = num_boxes * num_classes conf_loc = nn.conv2d( input=input, diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index b4fa530aa66fa1b84673b504a922595b83f42268..10b0405f47097fa3d83690e519ea878e082f68b9 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1406,6 +1406,7 @@ def pool2d(input, global_pooling=False, use_cudnn=True, ceil_mode=False, + use_mkldnn=False, name=None): """ This function adds the operator for pooling in 2 dimensions, using the @@ -1443,7 +1444,8 @@ def pool2d(input, "strides": pool_stride, "paddings": pool_padding, "use_cudnn": use_cudnn, - "ceil_mode": ceil_mode + "ceil_mode": ceil_mode, + "use_mkldnn": use_mkldnn }) return pool_out diff --git a/python/paddle/fluid/memory_optimization_transpiler.py b/python/paddle/fluid/memory_optimization_transpiler.py index 
4fa2d03ef563b98b2eec576bf87d4b2e54ca0a36..41d1eca82e8b680977f44f1756c25c37340668a4 100644 --- a/python/paddle/fluid/memory_optimization_transpiler.py +++ b/python/paddle/fluid/memory_optimization_transpiler.py @@ -29,7 +29,10 @@ dtype_to_size = { core.VarDesc.VarType.BOOL: 1 } -sub_block_ops = ["while", "while_grad", "parallel_do", "parallel_do_grad"] +sub_block_ops = [ + "while", "while_grad", "parallel_do", "parallel_do_grad", + "conditional_block", "conditional_block_grad" +] PRINT_LOG = False @@ -122,36 +125,80 @@ class ControlFlowGraph(object): else: return block_desc.find_var_recursive(str(var_name)) - def memory_optimize(self): - def check_var_validity(block_desc, x, is_forward): - if str(x) == "@EMPTY@": - return False - if not self._has_var(block_desc, x, is_forward): - return False - if self._find_var(block_desc, x, is_forward).persistable(): - return False - if self._find_var( - block_desc, x, - is_forward).type() != core.VarDesc.VarType.LOD_TENSOR: - return False - if x in self._skip_opt: - return False - if not self._find_var(block_desc, x, is_forward).shape(): - return False - return True + def _check_var_validity(self, block_desc, x, is_forward): + if str(x) == "@EMPTY@": + return False + if not self._has_var(block_desc, x, is_forward): + return False + if self._find_var(block_desc, x, is_forward).persistable(): + return False + if self._find_var(block_desc, x, + is_forward).type() != core.VarDesc.VarType.LOD_TENSOR: + return False + if x in self._skip_opt: + return False + if not self._find_var(block_desc, x, is_forward).shape(): + return False + return True - self._build_graph() + def _update_skip_opt_set(self): + for i in range(self.op_size): + op = self._ops[i] + if op.type() == "fill_constant" and op.attr("force_cpu") == True: + self._skip_opt.update(op.output_arg_names()) + + def release_memory(self): self._dataflow_analyze() + self._update_skip_opt_set() + fwd_id = 0 + bwd_id = 0 + for i in range(self.op_size): + op = self._ops[i] + if op.type() in sub_block_ops: + continue + block_desc = op.block() + is_forward = i < self._forward_num + in_diff, out_diff = self._get_diff(self._live_in[i], + self._live_out[i]) + can_optimize = filter( + lambda x: self._check_var_validity(block_desc, x, is_forward), + in_diff) + if can_optimize: + index = i + fwd_id + 1 if is_forward else i - self._forward_num + bwd_id + 1 + delete_op = block_desc.insert_op(index) + delete_op.set_type("delete_var") + delete_op.set_input("X", can_optimize) + if is_forward: + fwd_id += 1 + else: + bwd_id += 1 + + def memory_optimize(self, level=0): + def compare_shape(x_shape, cache_shape, opt_level): + if opt_level == 0: + return x_shape == cache_shape + if opt_level == 1: + if (x_shape[0] == -1) ^ (cache_shape[0] == -1): + return False + x_size = abs(reduce(lambda x, y: x * y, x_shape)) + cache_size = abs(reduce(lambda x, y: x * y, cache_shape)) + if x_size <= cache_size: + return True + return False + + self._dataflow_analyze() + self._update_skip_opt_set() self.pool = [] for i in range(self.op_size): op = self._ops[i] if op.type() in sub_block_ops: continue block_desc = op.block() + self.current_block_desc = block_desc is_forward = i < self._forward_num if self.pool: defs_can_optimize = filter( - lambda x: check_var_validity(block_desc, x, is_forward), + lambda x: self._check_var_validity(block_desc, x, is_forward), self._defs[i]) out_pair = [ (x, self._find_var(block_desc, x, is_forward).shape()) @@ -164,7 +211,7 @@ class ControlFlowGraph(object): for index, cache_pair in enumerate(self.pool): 
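The hunk that follows replaces the exact-shape test with `compare_shape`, so under `level=1` a variable may reuse a pooled cache variable whose element count is at least as large, provided the two shapes agree on whether the batch dimension is dynamic. A small illustration of that rule, treating the nested `compare_shape` helper defined above as if it were a standalone function (hypothetical usage, for clarity only):

```python
# Semantics of compare_shape as introduced in this patch:
#   level 0 -> shapes must match exactly
#   level 1 -> reject if exactly one shape has a -1 (dynamic) batch dim,
#              otherwise allow reuse when the candidate's element count
#              does not exceed the cached variable's element count
print(compare_shape([64, 32], [64, 32], 0))    # True: exact match at level 0
print(compare_shape([-1, 32], [128, 32], 1))   # False: only one side is dynamic
print(compare_shape([64, 32], [128, 32], 1))   # True: 2048 <= 4096 elements
```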
cache_var = cache_pair[0] cache_shape = cache_pair[1] - if x_shape == cache_shape: + if compare_shape(x_shape, cache_shape, level): if self._has_var(block_desc, cache_var, is_forward): x_dtype = self._find_var(block_desc, x, is_forward).dtype() @@ -196,7 +243,7 @@ class ControlFlowGraph(object): in_diff, out_diff = self._get_diff(self._live_in[i], self._live_out[i]) can_optimize = filter( - lambda x: check_var_validity(block_desc, x, is_forward), + lambda x: self._check_var_validity(block_desc, x, is_forward), in_diff) if can_optimize: for var_name in can_optimize: @@ -270,7 +317,8 @@ def _get_cfgs(input_program): ([block_desc.op(i) for i in range(op_size)], op_size, set())) sub_block_pair = [("while", "while_grad"), ("parallel_do", - "parallel_do_grad")] + "parallel_do_grad"), + ("conditional_block", "conditional_block_grad")] ops_list.extend(_process_sub_block_pair(pdesc, sub_block_pair)) @@ -281,9 +329,15 @@ def _get_cfgs(input_program): return cfgs -def memory_optimize(input_program, print_log=False): +def memory_optimize(input_program, print_log=False, level=0): global PRINT_LOG PRINT_LOG = print_log cfgs = _get_cfgs(input_program) for cfg in cfgs: - cfg.memory_optimize() + cfg.memory_optimize(level) + + +def release_memory(input_program): + cfgs = _get_cfgs(input_program) + for cfg in cfgs: + cfg.release_memory() diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index 8c627ad55bcbaf0cb0e8dd74a3e67ed40a6245db..3b2e1a3073251a6d6460450dc957e1b5c7a873c5 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -45,7 +45,8 @@ def simple_img_conv_pool(input, pool_size=pool_size, pool_type=pool_type, pool_stride=pool_stride, - use_cudnn=use_cudnn) + use_cudnn=use_cudnn, + use_mkldnn=use_mkldnn) return pool_out @@ -107,7 +108,8 @@ def img_conv_group(input, pool_size=pool_size, pool_type=pool_type, pool_stride=pool_stride, - use_cudnn=use_cudnn) + use_cudnn=use_cudnn, + use_mkldnn=use_mkldnn) return pool_out diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 1c12d53e4f352b7e4dea980301973a69665e49f9..421963a2f9120dae3a72142681f0a30232c11166 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -92,7 +92,10 @@ class Optimizer(object): # create learning rate variable for every parameter param = param_and_grad[0] param_lr = param.optimize_attr['learning_rate'] - return self.global_learning_rate() * param_lr + if param_lr == 1.0: + return self.global_learning_rate() + else: + return self.global_learning_rate() * param_lr def _create_accumulators(self, block, parameters): """Create all accumulators needed by the parameters diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py index c9d2a5ecaab0669f308b5b9c5cf74d0212fa462a..ad79e96b958b36a06c8a3cc990dbe3608e32c9ac 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py @@ -50,6 +50,7 @@ sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01) sgd_optimizer.minimize(avg_cost) fluid.memory_optimize(fluid.default_main_program(), print_log=True) +# fluid.release_memory(fluid.default_main_program()) BATCH_SIZE = 200 @@ -69,8 +70,6 @@ exe.run(fluid.default_startup_program()) PASS_NUM = 100 for pass_id in range(PASS_NUM): - fluid.io.save_persistables(exe, "./fit_a_line.model/") - fluid.io.load_persistables(exe, 
"./fit_a_line.model/") for data in train_reader(): avg_loss_value, = exe.run(fluid.default_main_program(), feed=feeder.feed(data), diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py index 80ff11f8d78b0a22fc6aefd722c9e6a2c23fbd5c..204669d7e6176e9e8250e8aebc2d10441fa24b67 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py @@ -125,9 +125,10 @@ opts = optimizer.minimize(avg_cost) batch_size = fluid.layers.create_tensor(dtype='int64') batch_acc = fluid.layers.accuracy(input=predict, label=label, total=batch_size) -fluid.memory_optimize(fluid.default_main_program()) +# fluid.memory_optimize(fluid.default_main_program(), level=0) +fluid.release_memory(fluid.default_main_program()) -BATCH_SIZE = 128 +BATCH_SIZE = 16 PASS_NUM = 1 # fix the order of training data @@ -159,8 +160,7 @@ for pass_id in range(PASS_NUM): print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str( pass_acc)) # this model is slow, so if we can train two mini batch, we think it works properly. - - if i > 2: + if i > 0: exit(0) if math.isnan(float(loss)): sys.exit("got NaN loss, training failed.") diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py index 689a75afc7ccdf84142f5531a438e1f9af7af4ca..a24834a6f0b19d1265f6c8d7089d31583af82d1f 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py @@ -105,7 +105,8 @@ def main(): optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) optimizer.minimize(avg_cost) - fluid.memory_optimize(fluid.default_main_program()) + # fluid.memory_optimize(fluid.default_main_program()) + fluid.release_memory(fluid.default_main_program()) # fix the order of training data train_data = paddle.batch( diff --git a/python/paddle/fluid/tests/test_cpp_reader.py b/python/paddle/fluid/tests/test_cpp_reader.py index b65592057817cef83bf2157c55bacea5bbe34ea1..4b0d039b7e05a55980946a8949e32802e9e57c20 100644 --- a/python/paddle/fluid/tests/test_cpp_reader.py +++ b/python/paddle/fluid/tests/test_cpp_reader.py @@ -15,16 +15,30 @@ import paddle.v2 as paddle import paddle.fluid as fluid import numpy as np +import sys -prog = fluid.framework.Program() -block = prog.current_block() +startup_prog = fluid.framework.Program() +startup_block = startup_prog.current_block() -random_reader = block.create_var( +random_reader = startup_block.create_var( type=fluid.core.VarDesc.VarType.READER, name="RandomDataGenerator") random_reader.desc.set_dtypes( [fluid.core.VarDesc.VarType.FP32, fluid.core.VarDesc.VarType.FP32]) +random_reader.persistable = True +shuffle_reader = startup_block.create_var( + type=fluid.core.VarDesc.VarType.READER, name="ShuffleReader") +shuffle_reader.persistable = True +batch_reader = startup_block.create_var( + type=fluid.core.VarDesc.VarType.READER, name="BatchReader") +batch_reader.persistable = True +double_buffer = startup_block.create_var( + type=fluid.core.VarDesc.VarType.READER, name="DoubleBuffer") +double_buffer.persistable = True + +main_prog = startup_prog.clone() +main_block = main_prog.current_block() 
-create_random_data_generator_op = block.append_op( +create_random_data_generator_op = startup_block.append_op( type="create_random_data_generator", outputs={"Out": random_reader}, attrs={ @@ -34,37 +48,45 @@ create_random_data_generator_op = block.append_op( "max": 1.0, 'lod_levels': [0, 0] }) -shuffle_reader = block.create_var( - type=fluid.core.VarDesc.VarType.READER, name="ShuffleReader") -create_shuffle_reader_op = block.append_op( +create_shuffle_reader_op = startup_block.append_op( type="create_shuffle_reader", inputs={"UnderlyingReader": random_reader}, outputs={"Out": shuffle_reader}, attrs={"buffer_size": 7}) -batch_reader = block.create_var( - type=fluid.core.VarDesc.VarType.READER, name="BatchReader") - -create_batch_reader_op = block.append_op( +create_batch_reader_op = startup_block.append_op( type="create_batch_reader", inputs={"UnderlyingReader": shuffle_reader}, outputs={"Out": batch_reader}, attrs={"batch_size": 10}) -out1 = block.create_var(type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out1") -out2 = block.create_var(type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out2") +create_double_buffer_reader_op = startup_block.append_op( + type="create_double_buffer_reader", + inputs={"UnderlyingReader": batch_reader}, + outputs={"Out": double_buffer}) + +out1 = main_block.create_var( + type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out1") +out2 = main_block.create_var( + type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out2") -read_op = block.append_op( - type="read", inputs={"Reader": batch_reader}, +main_block.var("DoubleBuffer").desc.set_shapes(double_buffer.desc.shapes()) +main_block.var("DoubleBuffer").desc.set_dtypes(double_buffer.desc.dtypes()) +main_block.var("DoubleBuffer").desc.set_lod_levels( + double_buffer.desc.lod_levels()) + +read_op = main_block.append_op( + type="read", + inputs={"Reader": double_buffer}, outputs={"Out": [out1, out2]}) place = fluid.CPUPlace() exe = fluid.Executor(place) -[res1, res2] = exe.run(prog, fetch_list=[out1, out2]) - -if not (res1.shape == (10, 2) and res2.shape == (10, 1)): - exit(1) +exe.run(startup_prog) -exit(0) +for i in range(1, 100): + [res1, res2] = exe.run(main_prog, fetch_list=[out1, out2]) + if not (res1.shape == (10, 2) and res2.shape == (10, 1)): + exit(1) diff --git a/python/paddle/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py index f3197a623efb1730d27467c5650bc90f2762e7b2..a905a854ad157ffa3d7816dfbd445f3e344a1249 100644 --- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py +++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py @@ -166,8 +166,6 @@ class TestDetectionMAPOp(OpTest): elif not difficult: label_count[label] += 1 - true_pos = collections.defaultdict(list) - false_pos = collections.defaultdict(list) for (label, score, tp, fp) in tf_pos: true_pos[label].append([score, tp]) false_pos[label].append([score, fp]) diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py index e75a6529e9fa265121ba187f3ed6bc0273c058d7..6382e290eb30c621da64d5c600be6d8a7c6254f1 100644 --- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py +++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py @@ -19,6 +19,7 @@ import unittest import paddle.fluid as fluid import paddle.fluid.layers as layers import paddle.fluid.framework as framework +import paddle.fluid.core as core def exponential_decay(learning_rate, @@ -81,12 
+82,25 @@ def piecewise_decay(global_step, boundaries, values): class TestLearningRateDecay(unittest.TestCase): def check_decay(self, python_decay_fn, fluid_decay_fn, kwargs): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for place in places: + self.check_decay_with_place(place, python_decay_fn, fluid_decay_fn, + kwargs) + + def check_decay_with_place(self, place, python_decay_fn, fluid_decay_fn, + kwargs): + decayed_lr = fluid_decay_fn(**kwargs) place = fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) + + fluid.memory_optimize(fluid.default_main_program()) + for step in range(10): lr_val, = exe.run(fluid.default_main_program(), feed={}, diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py index 9d87f4daa98da42fcc33aba4b51a4528343fb137..e775db1d10f4561b6fb90631757a25c9f74cb777 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -21,31 +21,43 @@ from paddle.fluid.backward import append_backward class TestOptimizer(unittest.TestCase): def test_sgd_optimizer(self): - init_program = framework.Program() - program = framework.Program() - block = program.global_block() - mul_x = block.create_parameter( - dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") - mul_y = block.create_var( - dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") - mul_out = block.create_var( - dtype="float32", shape=[5, 8], lod_level=0, name="mul.out") - mean_out = block.create_var( - dtype="float32", shape=[1], lod_level=0, name="mean.out") - block.append_op( - type="mul", - inputs={"X": mul_x, - "Y": mul_y}, - outputs={"Out": mul_out}, - attrs={"x_num_col_dims": 1}) - block.append_op( - type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) - sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01) - opts, _ = sgd_optimizer.minimize(mean_out, init_program) + def check_sgd_optimizer(optimizer_attr): + init_program = framework.Program() + program = framework.Program() + block = program.global_block() + mul_x = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="mul.x", + optimize_attr=optimizer_attr) + mul_y = block.create_var( + dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") + mul_out = block.create_var( + dtype="float32", shape=[5, 8], lod_level=0, name="mul.out") + mean_out = block.create_var( + dtype="float32", shape=[1], lod_level=0, name="mean.out") + block.append_op( + type="mul", + inputs={"X": mul_x, + "Y": mul_y}, + outputs={"Out": mul_out}, + attrs={"x_num_col_dims": 1}) + block.append_op( + type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) + sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01) + opts, _ = sgd_optimizer.minimize(mean_out, init_program) + return opts + + opts = check_sgd_optimizer({'learning_rate': 1.1}) self.assertEqual(len(opts), 3) self.assertEqual([op.type for op in opts], ["fill_constant", "elementwise_mul", "sgd"]) + opts = check_sgd_optimizer({'learning_rate': 1.0}) + self.assertEqual(len(opts), 1) + self.assertEqual([op.type for op in opts], ["sgd"]) + class TestMomentumOptimizer(unittest.TestCase): class MockMomentum(optimizer.MomentumOptimizer): @@ -60,7 +72,11 @@ class TestMomentumOptimizer(unittest.TestCase): program = framework.Program() block = program.global_block() mul_x = block.create_parameter( - dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") + 
dtype="float32", + shape=[5, 10], + lod_level=0, + name="mul.x", + optimize_attr={'learning_rate': 1.1}) mul_y = block.create_var( dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") mul_out = block.create_var( @@ -110,7 +126,11 @@ class TestMomentumOptimizer(unittest.TestCase): program = framework.Program() block = program.global_block() mul_x = block.create_parameter( - dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") + dtype="float32", + shape=[5, 10], + lod_level=0, + name="mul.x", + optimize_attr={'learning_rate': 1.1}) mul_y = block.create_var( dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") mul_out = block.create_var( @@ -169,7 +189,11 @@ class TestAdagradOptimizer(unittest.TestCase): program = framework.Program() block = program.global_block() mul_x = block.create_parameter( - dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") + dtype="float32", + shape=[5, 10], + lod_level=0, + name="mul.x", + optimize_attr={'learning_rate': 1.1}) mul_y = block.create_var( dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") mul_out = block.create_var( @@ -229,7 +253,11 @@ class TestAdamOptimizer(unittest.TestCase): program = framework.Program() block = program.global_block() mul_x = block.create_parameter( - dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") + dtype="float32", + shape=[5, 10], + lod_level=0, + name="mul.x", + optimize_attr={'learning_rate': 1.1}) mul_y = block.create_var( dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") mul_out = block.create_var( @@ -292,7 +320,11 @@ class TestAdamaxOptimizer(unittest.TestCase): program = framework.Program() block = program.global_block() mul_x = block.create_parameter( - dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") + dtype="float32", + shape=[5, 10], + lod_level=0, + name="mul.x", + optimize_attr={'learning_rate': 1.1}) mul_y = block.create_var( dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") mul_out = block.create_var( @@ -352,7 +384,11 @@ class TestDecayedAdagradOptimizer(unittest.TestCase): program = framework.Program() block = program.global_block() mul_x = block.create_parameter( - dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") + dtype="float32", + shape=[5, 10], + lod_level=0, + name="mul.x", + optimize_attr={'learning_rate': 1.1}) mul_y = block.create_var( dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") mul_out = block.create_var( diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py index d2107fb4796a588b87d09f3d67e08566c12ffefb..964d78f1966aa10e36eeaabe943d44e002d50293 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py @@ -79,6 +79,7 @@ def avg_pool2D_forward_naive(x, class TestPool2d_Op(OpTest): def setUp(self): self.use_cudnn = False + self.use_mkldnn = False self.init_test_case() self.init_global_pool() self.init_op_type() @@ -99,6 +100,7 @@ class TestPool2d_Op(OpTest): 'pooling_type': self.pool_type, 'global_pooling': self.global_pool, 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn, 'ceil_mode': self.ceil_mode, 'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter } @@ -260,5 +262,42 @@ class TestCeilModeCase4(TestCase2): self.ceil_mode = True +#--------------------test pool2d MKLDNN-------------------- +class TestMKLDNNCase1(TestPool2d_Op): + def init_op_type(self): + self.use_mkldnn = True + self.op_type = "pool2d" + + +class TestMKLDNNCase2(TestCase1): 
+ def init_op_type(self): + self.use_mkldnn = True + self.op_type = "pool2d" + + +class TestMKLDNNCase3(TestCase2): + def init_op_type(self): + self.use_mkldnn = True + self.op_type = "pool2d" + + +class TestMKLDNNCase4(TestCase3): + def init_op_type(self): + self.use_mkldnn = True + self.op_type = "pool2d" + + +class TestMKLDNNCase5(TestCase4): + def init_op_type(self): + self.use_mkldnn = True + self.op_type = "pool2d" + + +class TestMKLDNNCase6(TestCase5): + def init_op_type(self): + self.use_mkldnn = True + self.op_type = "pool2d" + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py index 286d305a777a4683d42a4d3d2d5d5f0c5b6ac12a..61040a39ced6dc57d05a10bf0605c80011db45c3 100644 --- a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py @@ -60,8 +60,8 @@ class TestSpliteSelectedRows(unittest.TestCase): # expected output selected rows expected_out0_rows = [0, 4] - expected_out1_rows = [5, 7] - expected_out4_rows = [20] + expected_out1_rows = [0, 2] + expected_out4_rows = [0] op = Operator( "split_selected_rows", @@ -101,7 +101,7 @@ class TestSpliteSelectedRows(unittest.TestCase): out0_grad_tensor.set(np_array, place) out1_grad = scope.var("out1@GRAD").get_selected_rows() - rows1 = [7, 5] + rows1 = [2, 0] out1_grad.set_rows(rows1) out1_grad.set_height(height) out1_grad_tensor = out1_grad.get_tensor()
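Taken together, the transpiler changes in this patch give `paddle.fluid` two memory-saving entry points: `memory_optimize`, which now takes a `level` argument (0 requires exact shape matches for variable reuse, 1 also allows reuse of a larger cached variable), and the new `release_memory`, which inserts `delete_var` ops as soon as a variable drops out of the live set. A minimal usage sketch, mirroring how the book_memory_optimization tests above call them (network construction is assumed to have happened already):

```python
import paddle.fluid as fluid

# ... build the network and call optimizer.minimize(avg_cost) beforehand ...

# Option 1: reuse variables in place; level=1 relaxes the shape check so a
# smaller variable may take over a larger cached one.
fluid.memory_optimize(fluid.default_main_program(), print_log=True, level=0)

# Option 2 (alternative): keep variables distinct but free them eagerly by
# inserting delete_var ops after their last use.
# fluid.release_memory(fluid.default_main_program())

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
```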