diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 25c6b4ef52d3f8ebff1572ae8d348be7c577c08c..9686df00219001769d074ee815d9cc8db0258496 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -51,7 +51,7 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${MKLDNN_DEPENDS} GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" - GIT_TAG "v0.9" + GIT_TAG "v0.10" PREFIX ${MKLDNN_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index e9fd3d4bedc983ae7c544cf289dc841cf22f9de4..74f3279831357c21038df133df0f5a432a6dfd20 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -28,7 +28,7 @@ INCLUDE(ExternalProject) SET(MKLML_PROJECT "extern_mklml") SET(MKLML_VER "mklml_lnx_2018.0.20170720") -SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz") +SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz") SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") SET(MKLML_DST_DIR "mklml") @@ -54,7 +54,8 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${MKLML_SOURCE_DIR} DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate -qO- ${MKLML_URL} | tar xz -C ${MKLML_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${MKLML_URL} -c -q -O ${MKLML_VER}.tgz + && tar zxf ${MKLML_VER}.tgz DOWNLOAD_NO_PROGRESS 1 UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT} diff --git a/doc/getstarted/build_and_install/index_cn.rst b/doc/getstarted/build_and_install/index_cn.rst index a24df6c518fad84a48061ecb34ee46cb312a4995..dd9923697ab85825557aa89a08870bece7c76673 100644 --- a/doc/getstarted/build_and_install/index_cn.rst +++ b/doc/getstarted/build_and_install/index_cn.rst @@ -6,14 +6,12 @@ 安装流程 ++++++++ -PaddlePaddle提供数个预编译的二进制来进行安装,包括Docker镜像,ubuntu的deb安装包等。我们推荐使用Docker镜像来部署环境,同时欢迎贡献更多的安装包。 +PaddlePaddle提供Docker镜像来部署环境。 .. toctree:: :maxdepth: 1 docker_install_cn.rst - ubuntu_install_cn.rst - 编译流程 diff --git a/doc/getstarted/build_and_install/index_en.rst b/doc/getstarted/build_and_install/index_en.rst index 1bfd4f75c0b9b82d61d28a30f03181f7be159f24..8a53588e0439df8f4d5fd529b7a20262c67d4e58 100644 --- a/doc/getstarted/build_and_install/index_en.rst +++ b/doc/getstarted/build_and_install/index_en.rst @@ -8,14 +8,13 @@ Install PaddlePaddle :maxdepth: 1 docker_install_en.rst - ubuntu_install_en.rst Build from Source ----------------- .. warning:: - Please use :code:`deb` package or :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code. + Please use :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code. .. 
toctree:: :maxdepth: 1 diff --git a/doc/getstarted/build_and_install/ubuntu_install_cn.rst b/doc/getstarted/build_and_install/ubuntu_install_cn.rst deleted file mode 100644 index 9e39ccb00f5d5655c30148900a3d76a22aacfc01..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/ubuntu_install_cn.rst +++ /dev/null @@ -1,71 +0,0 @@ -Ubuntu部署PaddlePaddle -=================================== - -PaddlePaddle提供了ubuntu 14.04 deb安装包。 - -安装 ------- - -安装包的下载地址是\: https://github.com/PaddlePaddle/Paddle/releases - -它包含四个版本\: - -* cpu版本: 支持主流x86处理器平台, 使用了avx指令集。 - -* cpu-noavx版本:支持主流x86处理器平台,没有使用avx指令集。 - -* gpu版本:支持主流x86处理器平台,支持nvidia cuda平台,使用了avx指令集。 - -* gpu-noavx版本:支持主流x86处理器平台,支持nvidia cuda平台,没有使用avx指令集。 - -下载完相关安装包后,执行: - -.. code-block:: shell - - sudo apt-get install gdebi - gdebi paddle-*-cpu.deb - -或者: - -.. code-block:: shell - - dpkg -i paddle-*-cpu.deb - apt-get install -f - - -在 :code:`dpkg -i` 的时候如果报一些依赖未找到的错误是正常的, -在 :code:`apt-get install -f` 里会继续安装 PaddlePaddle。 - -安装完成后,可以使用命令 :code:`paddle version` 查看安装后的paddle 版本: - -.. code-block:: shell - - PaddlePaddle 0.8.0b1, compiled with - with_avx: ON - with_gpu: OFF - with_double: OFF - with_python: ON - with_rdma: OFF - with_timer: OFF - with_predict_sdk: - - -可能遇到的问题 --------------- - -libcudart.so/libcudnn.so找不到 -++++++++++++++++++++++++++++++ - -安装完成后,运行 :code:`paddle train` 报错\: - -.. code-block:: shell - - 0831 12:36:04.151525 1085 hl_dso_loader.cc:70] Check failed: nullptr != *dso_handle For Gpu version of PaddlePaddle, it couldn't find CUDA library: libcudart.so Please make sure you already specify its path.Note: for training data on Cpu using Gpu version of PaddlePaddle,you must specify libcudart.so via LD_LIBRARY_PATH. - -原因是未设置cuda运行时环境变量。 如果使用GPU版本的PaddlePaddle,请安装CUDA 7.5 和CUDNN 5到本地环境中,并设置: - -.. code-block:: shell - - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib:$LD_LIBRARY_PATH - export PATH=/usr/local/cuda/bin:$PATH - diff --git a/doc/getstarted/build_and_install/ubuntu_install_en.rst b/doc/getstarted/build_and_install/ubuntu_install_en.rst deleted file mode 100644 index ea8042085bf458be96e71017d229d88ad867695b..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/ubuntu_install_en.rst +++ /dev/null @@ -1,25 +0,0 @@ -Debian Package installation guide -================================= - -PaddlePaddle supports :code:`deb` pacakge. The installation of this :code:`deb` package is tested in ubuntu 14.04, but it should be support other debian based linux, too. - -There are four versions of debian package, :code:`cpu`, :code:`gpu`, :code:`cpu-noavx`, :code:`gpu-noavx`. And :code:`noavx` version is used to support CPU which does not contain :code:`AVX` instructions. The download url of :code:`deb` package is \: https://github.com/baidu/Paddle/releases/ - - -After downloading PaddlePaddle deb packages, you can use :code:`gdebi` install. - -.. code-block:: bash - - gdebi paddle-*.deb - -If :code:`gdebi` is not installed, you can use :code:`sudo apt-get install gdebi` to install it. - -Or you can use following commands to install PaddlePaddle. - -.. code-block:: bash - - dpkg -i paddle-*.deb - apt-get install -f - -And if you use GPU version deb package, you need to install CUDA toolkit and cuDNN, and set related environment variables(such as LD_LIBRARY_PATH) first. It is normal when `dpkg -i` get errors. `apt-get install -f` will continue install paddle, and install dependences. 
- diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt index c572a9d433bc16e6733b8fc9367970bef28e699a..f43f15e5cacb70b625d7791e1e02ce7780286200 100644 --- a/paddle/function/CMakeLists.txt +++ b/paddle/function/CMakeLists.txt @@ -21,6 +21,8 @@ if(USE_NNPACK) endif() endif() +list(APPEND cpp_files neon/NeonDepthwiseConv.cpp) + add_library(paddle_function STATIC ${cpp_files} ${cu_objs}) add_dependencies(paddle_function ${external_project_dependencies}) add_dependencies(paddle_function paddle_proto) @@ -42,11 +44,11 @@ if(WITH_GPU) add_simple_unittest(RowConvOpTest) add_simple_unittest(BlockExpandOpTest) add_simple_unittest(CropOpTest) - add_simple_unittest(DepthwiseConvOpTest) endif() add_simple_unittest(Im2ColTest) add_simple_unittest(GemmConvOpTest) +add_simple_unittest(DepthwiseConvOpTest) endif() add_style_check_target(paddle_function ${h_files})
diff --git a/paddle/function/DepthwiseConvOpTest.cpp b/paddle/function/DepthwiseConvOpTest.cpp index f44ae0c342e9536366e2b537694cee81fcb1a6ed..d8e8c889d5c23bf9b2b5fd0b0393395883188fd8 100644 --- a/paddle/function/DepthwiseConvOpTest.cpp +++ b/paddle/function/DepthwiseConvOpTest.cpp @@ -34,4 +34,13 @@ TEST(DepthwiseConv, BackwardFilter) { } #endif +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + +TEST(DepthwiseConv, Forward) { + DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>( + "GemmConv-CPU", "NeonDepthwiseConv-CPU", forward); +} + +#endif + } // namespace paddle
diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h index 48e2e32f9256fb49c67ba25e9b5a47d72499758b..9b91e223a6a28586b11fe7ed4a44421e029a67bb 100644 --- a/paddle/function/Im2Col.h +++ b/paddle/function/Im2Col.h @@ -16,6 +16,7 @@ limitations under the License. */ #include "TensorShape.h" #include "TensorType.h" +#include "neon/neon_util.h" namespace paddle { @@ -93,4 +94,95 @@ public: int paddingWidth); }; +template <class T> +struct Padding { + static void run(const T* src, + T* dest, + int channels, + int inputHeight, + int inputWidth, + int paddingHeight, + int paddingWidth) { + const int destWidth = inputWidth + 2 * paddingWidth; + for (int c = 0; c < channels; c++) { + if (paddingHeight > 0) { + memset(dest, 0, destWidth * paddingHeight * sizeof(T)); + dest += destWidth * paddingHeight; + } + + for (int i = 0; i < inputHeight; i++) { + // padding head + for (int j = 0; j < paddingWidth; j++) { + *dest++ = T(0); + } + + memcpy(dest, src, inputWidth * sizeof(T)); + dest += inputWidth; + src += inputWidth; + + // padding tail + for (int j = 0; j < paddingWidth; j++) { + *dest++ = T(0); + } + } + + if (paddingHeight > 0) { + memset(dest, 0, destWidth * paddingHeight * sizeof(T)); + dest += destWidth * paddingHeight; + } + } + } +}; + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +template <> +struct Padding<float> { + static void run(const float* src, + float* dest, + int channels, + int inputHeight, + int inputWidth, + int paddingHeight, + int paddingWidth) { + const int destWidth = inputWidth + 2 * paddingWidth; + for (int c = 0; c < channels; c++) { + if (paddingHeight > 0) { + memset(dest, 0, destWidth * paddingHeight * sizeof(float)); + dest += destWidth * paddingHeight; + } + + for (int i = 0; i < inputHeight; i++) { + // padding head + for (int j = 0; j < paddingWidth; j++) { + *dest++ = float(0); + } + + int step = inputWidth >> 2; + int remain = inputWidth & 3; + for (int s = 0; s < step; s++) { + float32x4_t s0 = vld1q_f32(src); + vst1q_f32(dest, s0); + src += 4; + dest += 4; + } + for (int r = 0; r < remain; r++) { + *dest++ = *src++; + } + + // padding tail + for (int j = 0; j < paddingWidth; j++) { + *dest++ = float(0); + } + } + + if (paddingHeight > 0) { + memset(dest, 0, destWidth * paddingHeight * sizeof(float)); + dest += destWidth * paddingHeight; + } + } + } +}; + +#endif + } // namespace paddle
diff --git a/paddle/function/neon/NeonDepthwiseConv.cpp b/paddle/function/neon/NeonDepthwiseConv.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f09e98587d1681d29a79a9cb0303c2d4356c6935 --- /dev/null +++ b/paddle/function/neon/NeonDepthwiseConv.cpp @@ -0,0 +1,577 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "neon_util.h" +#include "paddle/function/ConvOp.h" +#include "paddle/function/Im2Col.h" + +namespace paddle { + +namespace neon { + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + +template <int filterSize, int stride> +struct DepthwiseConvKernel {}; + +inline float32_t conv3x3(float32x4_t r0, + float32x4_t r1, + float32x4_t r2, + float32x4_t k0, + float32x4_t k1, + float32x4_t k2) { + float32x4_t tmp; + tmp = vmulq_f32(r0, k0); + tmp = vmlaq_f32(tmp, r1, k1); + tmp = vmlaq_f32(tmp, r2, k2); + return vaddvq_f32(tmp); +} + +inline float32_t conv4x4(float32x4_t r0, + float32x4_t r1, + float32x4_t r2, + float32x4_t r3, + float32x4_t k0, + float32x4_t k1, + float32x4_t k2, + float32x4_t k3) { + float32x4_t tmp; + tmp = vmulq_f32(r0, k0); + tmp = vmlaq_f32(tmp, r1, k1); + tmp = vmlaq_f32(tmp, r2, k2); + tmp = vmlaq_f32(tmp, r3, k3); + return vaddvq_f32(tmp); +} + +/** + * Each step calculates four elements of the output. + * First step: + * R0[0, 1, 2, 3...] * K[0][0] + * R0[1, 2, 3, 4...] * K[0][1] + * R0[2, 3, 4, 5...] * K[0][2] + * R1[0, 1, 2, 3...] * K[1][0] + * R1[1, 2, 3, 4...] * K[1][1] + * R1[2, 3, 4, 5...] * K[1][2] + * R2[0, 1, 2, 3...] * K[2][0] + * R2[1, 2, 3, 4...] * K[2][1] + * + R2[2, 3, 4, 5...]
* K[2][2] + * ------------------------------ + * Output[0, 1, 2, 3] + */ +template <> +struct DepthwiseConvKernel<3, 1> { + static void run(const float* inputData, + const float* filterData, + int inputHeight, + int inputWidth, + int outputChannels, + int outputHeight, + int outputWidth, + int filterMultiplier, + float* outputData) { + const int steps = outputWidth >> 2; + const int remain = outputWidth & 3; + for (int c = 0; c < outputChannels; c++, filterData += 9) { + // Load the filters + float32x4_t k[3]; + k[0] = vld1q_f32(filterData); + k[1] = vld1q_f32(filterData + 3); + k[2] = vld1q_f32(filterData + 6); + k[0] = vsetq_lane_f32(0.f, k[0], 3); + k[1] = vsetq_lane_f32(0.f, k[1], 3); + k[2] = vsetq_lane_f32(0.f, k[2], 3); + + const float* r0 = + inputData + (c / filterMultiplier) * (inputHeight * inputWidth); + const float* r1 = r0 + inputWidth; + const float* r2 = r0 + inputWidth * 2; + float32x4_t input[3][3]; + for (int h = 0; h < outputHeight; h++) { + for (int s = 0; s < steps; s++) { + // Load the inputs + float32x4_t tmp; + input[0][0] = vld1q_f32(r0); + tmp = vld1q_f32(r0 + 4); + input[0][1] = vextq_f32(input[0][0], tmp, 1); + input[0][2] = vextq_f32(input[0][0], tmp, 2); + input[1][0] = vld1q_f32(r1); + tmp = vld1q_f32(r1 + 4); + input[1][1] = vextq_f32(input[1][0], tmp, 1); + input[1][2] = vextq_f32(input[1][0], tmp, 2); + input[2][0] = vld1q_f32(r2); + tmp = vld1q_f32(r2 + 4); + input[2][1] = vextq_f32(input[2][0], tmp, 1); + input[2][2] = vextq_f32(input[2][0], tmp, 2); + + float32x4_t tmp1 = vdupq_n_f32(0.f); + float32x4_t tmp2 = vdupq_n_f32(0.f); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); + tmp1 = vaddq_f32(tmp1, tmp2); + + vst1q_f32(outputData, tmp1); + r0 += 4; + r1 += 4; + r2 += 4; + outputData += 4; + } + + for (int r = 0; r < remain; r++) { + float32x4_t i0 = vld1q_f32(r0); + float32x4_t i1 = vld1q_f32(r1); + float32x4_t i2 = vld1q_f32(r2); + *outputData = conv3x3(i0, i1, i2, k[0], k[1], k[2]); + r0++; + r1++; + r2++; + outputData++; + } + + r0 += 2; + r1 += 2; + r2 += 2; + } + } + } +}; + +/** + * Each step calculates four elements of the output. + * First step: + * R0[0, 2, 4, 6...] * K[0][0] + * R0[1, 3, 5, 7...] * K[0][1] + * R0[2, 4, 6, 8...] * K[0][2] + * R1[0, 2, 4, 6...] * K[1][0] + * R1[1, 3, 5, 7...] * K[1][1] + * R1[2, 4, 6, 8...] * K[1][2] + * R2[0, 2, 4, 6...] * K[2][0] + * R2[1, 3, 5, 7...] * K[2][1] + * R2[2, 4, 6, 8...] 
* K[2][2] + * ------------------------------ + * Output[0, 1, 2, 3] + */ +template <> +struct DepthwiseConvKernel<3, 2> { + static void run(const float* inputData, + const float* filterData, + int inputHeight, + int inputWidth, + int outputChannels, + int outputHeight, + int outputWidth, + int filterMultiplier, + float* outputData) { + const int steps = outputWidth >> 2; + const int remain = outputWidth & 3; + for (int c = 0; c < outputChannels; c++, filterData += 9) { + // Load the filters + float32x4_t k[3]; + k[0] = vld1q_f32(filterData); + k[1] = vld1q_f32(filterData + 3); + k[2] = vld1q_f32(filterData + 6); + k[0] = vsetq_lane_f32(0.f, k[0], 3); + k[1] = vsetq_lane_f32(0.f, k[1], 3); + k[2] = vsetq_lane_f32(0.f, k[2], 3); + + const float* start = + inputData + (c / filterMultiplier) * (inputHeight * inputWidth); + float32x4_t input[3][3]; + for (int h = 0; h < outputHeight; h++) { + const float* r0 = start + 2 * h * inputWidth; + const float* r1 = start + (2 * h + 1) * inputWidth; + const float* r2 = start + (2 * h + 2) * inputWidth; + for (int s = 0; s < steps; s++) { + // Load the inputs + float32x4_t data1; + float32x4x2_t data2; + + data2 = vld2q_f32(r0); + input[0][0] = data2.val[0]; + input[0][1] = data2.val[1]; + data1 = vld1q_f32(r0 + 8); + input[0][2] = vextq_f32(data2.val[0], data1, 1); + + data2 = vld2q_f32(r1); + input[1][0] = data2.val[0]; + input[1][1] = data2.val[1]; + data1 = vld1q_f32(r1 + 8); + input[1][2] = vextq_f32(data2.val[0], data1, 1); + + data2 = vld2q_f32(r2); + input[2][0] = data2.val[0]; + input[2][1] = data2.val[1]; + data1 = vld1q_f32(r2 + 8); + input[2][2] = vextq_f32(data2.val[0], data1, 1); + + float32x4_t tmp1 = vdupq_n_f32(0.f); + float32x4_t tmp2 = vdupq_n_f32(0.f); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); + tmp1 = vaddq_f32(tmp1, tmp2); + + vst1q_f32(outputData, tmp1); + r0 += 8; + r1 += 8; + r2 += 8; + outputData += 4; + } + + for (int r = 0; r < remain; r++) { + float32x4_t i0 = vld1q_f32(r0); + float32x4_t i1 = vld1q_f32(r1); + float32x4_t i2 = vld1q_f32(r2); + *outputData = conv3x3(i0, i1, i2, k[0], k[1], k[2]); + r0 += 2; + r1 += 2; + r2 += 2; + outputData++; + } + } + } + } +}; + +/** + * Each step calculates four elements of the output. 
+ */ +template <> +struct DepthwiseConvKernel<4, 1> { + static void run(const float* inputData, + const float* filterData, + int inputHeight, + int inputWidth, + int outputChannels, + int outputHeight, + int outputWidth, + int filterMultiplier, + float* outputData) { + const int steps = outputWidth >> 2; + const int remain = outputWidth & 3; + for (int c = 0; c < outputChannels; c++, filterData += 16) { + // Load the filters + float32x4_t k[4]; + k[0] = vld1q_f32(filterData); + k[1] = vld1q_f32(filterData + 4); + k[2] = vld1q_f32(filterData + 8); + k[3] = vld1q_f32(filterData + 12); + + const float* r0 = + inputData + (c / filterMultiplier) * (inputHeight * inputWidth); + const float* r1 = r0 + inputWidth; + const float* r2 = r0 + inputWidth * 2; + const float* r3 = r0 + inputWidth * 3; + float32x4_t input[4][4]; + for (int h = 0; h < outputHeight; h++) { + for (int s = 0; s < steps; s++) { + // Load the inputs + float32x4_t tmp; + input[0][0] = vld1q_f32(r0); + tmp = vld1q_f32(r0 + 4); + input[0][1] = vextq_f32(input[0][0], tmp, 1); + input[0][2] = vextq_f32(input[0][0], tmp, 2); + input[0][3] = vextq_f32(input[0][0], tmp, 3); + + input[1][0] = vld1q_f32(r1); + tmp = vld1q_f32(r1 + 4); + input[1][1] = vextq_f32(input[1][0], tmp, 1); + input[1][2] = vextq_f32(input[1][0], tmp, 2); + input[1][3] = vextq_f32(input[1][0], tmp, 3); + + input[2][0] = vld1q_f32(r2); + tmp = vld1q_f32(r2 + 4); + input[2][1] = vextq_f32(input[2][0], tmp, 1); + input[2][2] = vextq_f32(input[2][0], tmp, 2); + input[2][3] = vextq_f32(input[2][0], tmp, 3); + + input[3][0] = vld1q_f32(r3); + tmp = vld1q_f32(r3 + 4); + input[3][1] = vextq_f32(input[3][0], tmp, 1); + input[3][2] = vextq_f32(input[3][0], tmp, 2); + input[3][3] = vextq_f32(input[3][0], tmp, 3); + + float32x4_t tmp1 = vdupq_n_f32(0.f); + float32x4_t tmp2 = vdupq_n_f32(0.f); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3); + tmp1 = vaddq_f32(tmp1, tmp2); + + vst1q_f32(outputData, tmp1); + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + outputData += 4; + } + + for (int r = 0; r < remain; r++) { + float32x4_t i0 = vld1q_f32(r0); + float32x4_t i1 = vld1q_f32(r1); + float32x4_t i2 = vld1q_f32(r2); + float32x4_t i3 = vld1q_f32(r3); + *outputData = conv4x4(i0, i1, i2, i3, k[0], k[1], k[2], k[3]); + r0++; + r1++; + r2++; + r3++; + outputData++; + } + + r0 += 3; + r1 += 3; + r2 += 3; + r3 += 3; + } + } + } +}; + +/** + * Each step calculates four elements of the output. 
+ */ +template <> +struct DepthwiseConvKernel<4, 2> { + static void run(const float* inputData, + const float* filterData, + int inputHeight, + int inputWidth, + int outputChannels, + int outputHeight, + int outputWidth, + int filterMultiplier, + float* outputData) { + const int steps = outputWidth >> 2; + const int remain = outputWidth & 3; + for (int c = 0; c < outputChannels; c++, filterData += 16) { + // Load the filters + float32x4_t k[4]; + k[0] = vld1q_f32(filterData); + k[1] = vld1q_f32(filterData + 4); + k[2] = vld1q_f32(filterData + 8); + k[3] = vld1q_f32(filterData + 12); + + const float* start = + inputData + (c / filterMultiplier) * (inputHeight * inputWidth); + float32x4_t input[4][4]; + for (int h = 0; h < outputHeight; h++) { + const float* r0 = start + 2 * h * inputWidth; + const float* r1 = start + (2 * h + 1) * inputWidth; + const float* r2 = start + (2 * h + 2) * inputWidth; + const float* r3 = start + (2 * h + 3) * inputWidth; + for (int s = 0; s < steps; s++) { + // Load the inputs + float32x4x2_t data1; + float32x4x2_t data2; + + data1 = vld2q_f32(r0); + data2 = vld2q_f32(r0 + 8); + input[0][0] = data1.val[0]; + input[0][1] = data1.val[1]; + input[0][2] = vextq_f32(data1.val[0], data2.val[0], 1); + input[0][3] = vextq_f32(data1.val[1], data2.val[1], 1); + + data1 = vld2q_f32(r1); + data2 = vld2q_f32(r1 + 8); + input[1][0] = data1.val[0]; + input[1][1] = data1.val[1]; + input[1][2] = vextq_f32(data1.val[0], data2.val[0], 1); + input[1][3] = vextq_f32(data1.val[1], data2.val[1], 1); + + data1 = vld2q_f32(r2); + data2 = vld2q_f32(r2 + 8); + input[2][0] = data1.val[0]; + input[2][1] = data1.val[1]; + input[2][2] = vextq_f32(data1.val[0], data2.val[0], 1); + input[2][3] = vextq_f32(data1.val[1], data2.val[1], 1); + + data1 = vld2q_f32(r3); + data2 = vld2q_f32(r3 + 8); + input[3][0] = data1.val[0]; + input[3][1] = data1.val[1]; + input[3][2] = vextq_f32(data1.val[0], data2.val[0], 1); + input[3][3] = vextq_f32(data1.val[1], data2.val[1], 1); + + float32x4_t tmp1 = vdupq_n_f32(0.f); + float32x4_t tmp2 = vdupq_n_f32(0.f); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3); + tmp1 = vaddq_f32(tmp1, tmp2); + + vst1q_f32(outputData, tmp1); + r0 += 8; + r1 += 8; + r2 += 8; + r3 += 8; + outputData += 4; + } + + for (int r = 0; r < remain; r++) { + float32x4_t i0 = vld1q_f32(r0); + float32x4_t i1 = vld1q_f32(r1); + float32x4_t i2 = vld1q_f32(r2); + float32x4_t i3 = vld1q_f32(r3); + *outputData = conv4x4(i0, i1, i2, i3, k[0], k[1], k[2], k[3]); + r0 += 2; + r1 += 2; + r2 += 2; + r3 += 2; + outputData++; + } + } + } + } +}; + +template <DeviceType Device> +class NeonDepthwiseConvFunction : public ConvFunctionBase { +public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + + size_t batchSize = input[0]; + size_t inputChannels = input[1]; + size_t inputHeight = input[2]; + size_t inputWidth = input[3]; + size_t filterHeight = getFilterHeight(filter); + size_t filterWidth = getFilterWidth(filter); + size_t outputChannels = output[1]; + size_t outputHeight = output[2]; + size_t outputWidth = output[3]; + size_t filterMultiplier = outputChannels / groups_; + CHECK_EQ(inputChannels, groups_); + + // only support strideH() == strideW() and filterHeight == filterWidth. + CHECK_EQ(strideH(), strideW()); + CHECK_EQ(filterHeight, filterWidth); + + float* inputData = inputs[0].data<float>(); + float* filterData = inputs[1].data<float>(); + float* outputData = outputs[0].data<float>(); + + // padding the input + float* inputPadding = inputData; + if (paddingH() > 0 || paddingW() > 0) { + int newSize = batchSize * inputChannels * (inputHeight + 2 * paddingH()) * + (inputWidth + 2 * paddingW()); + resizeBuffer<Device>(newSize); + inputPadding = reinterpret_cast<float*>(memory_->getBuf()); + Padding<float>::run(inputData, + inputPadding, + batchSize * inputChannels, + inputHeight, + inputWidth, + paddingH(), + paddingW()); + + // height and width of padding data + inputHeight += 2 * paddingH(); + inputWidth += 2 * paddingW(); + } + + std::function<void( + const float*, const float*, int, int, int, int, int, int, float*)> + DepthWiseConv; + + if (filterWidth == 3 && strideW() == 1) { + DepthWiseConv = DepthwiseConvKernel<3, 1>::run; + } else if (filterWidth == 3 && strideW() == 2) { + DepthWiseConv = DepthwiseConvKernel<3, 2>::run; + } else if (filterWidth == 4 && strideW() == 1) { + DepthWiseConv = DepthwiseConvKernel<4, 1>::run; + } else if (filterWidth == 4 && strideW() == 2) { + DepthWiseConv = DepthwiseConvKernel<4, 2>::run; + } else { + LOG(FATAL) << "Not supported"; + } + + for (size_t i = 0; i < batchSize; i++) { + DepthWiseConv(inputPadding, + filterData, + inputHeight, + inputWidth, + outputChannels, + outputHeight, + outputWidth, + filterMultiplier, + outputData); + inputPadding += inputChannels * inputHeight * inputWidth; + outputData += outputChannels * outputHeight * outputWidth; + } + } +}; + +REGISTER_TYPED_FUNC(NeonDepthwiseConv, CPU, NeonDepthwiseConvFunction); + +#endif + +} // namespace neon +} // namespace paddle
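As a reading aid (not part of the patch): a plain scalar sketch of what `DepthwiseConvKernel<3, 1>::run` above computes. It assumes the input has already been zero-padded, as `NeonDepthwiseConvFunction::calc` guarantees before dispatching, and that `filterMultiplier == 1`, so each output channel reads exactly one input channel.

```cpp
// Reference depthwise 3x3 convolution, stride 1, on a pre-padded input
// (illustration only): outputHeight == inputHeight - 2 and
// outputWidth == inputWidth - 2.
void depthwiseConv3x3Reference(const float* input,
                               const float* filter,  // 9 weights per channel
                               int inputHeight,
                               int inputWidth,
                               int channels,
                               float* output) {
  const int outputHeight = inputHeight - 2;
  const int outputWidth = inputWidth - 2;
  for (int c = 0; c < channels; c++) {
    const float* in = input + c * inputHeight * inputWidth;
    const float* k = filter + c * 9;
    for (int h = 0; h < outputHeight; h++) {
      for (int w = 0; w < outputWidth; w++) {
        float sum = 0.f;
        for (int kh = 0; kh < 3; kh++) {
          for (int kw = 0; kw < 3; kw++) {
            sum += in[(h + kh) * inputWidth + (w + kw)] * k[kh * 3 + kw];
          }
        }
        *output++ = sum;
      }
    }
  }
}
```

The NEON kernel produces the same values four outputs at a time: the nine filter taps stay in three `float32x4_t` registers, `vextq_f32` builds the shifted input windows, and that is why `steps = outputWidth >> 2` while the `remain` loop falls back to `conv3x3` for the last few columns.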
diff --git a/paddle/function/neon/neon_util.h b/paddle/function/neon/neon_util.h new file mode 100644 index 0000000000000000000000000000000000000000..56b3febe2d27bb4fbf57e49079b3ad071d556914 --- /dev/null +++ b/paddle/function/neon/neon_util.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + +#include <arm_neon.h> + +namespace paddle { + +namespace neon { + +inline float32x4_t vld1q_f32_aligned(const float* p) { + return vld1q_f32( + (const float*)__builtin_assume_aligned(p, sizeof(float32x4_t))); +} + +#ifndef __aarch64__ +inline float32_t vaddvq_f32(float32x4_t a) { + float32x2_t v = vadd_f32(vget_high_f32(a), vget_low_f32(a)); + return vget_lane_f32(vpadd_f32(v, v), 0); +} + +inline float32x4_t vmlaq_laneq_f32(float32x4_t a, + float32x4_t b, + float32x4_t v, + const int lane) { + return vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane)); +} +#endif + +} // namespace neon +} // namespace paddle + +#endif
diff --git a/paddle/gserver/layers/CostLayer.h b/paddle/gserver/layers/CostLayer.h index 0ce72ef40a5ac23d20f485eb5b518186d1ec0686..0f655b48eea051c41ce17c0a41189b26188cc866 100644 --- a/paddle/gserver/layers/CostLayer.h +++ b/paddle/gserver/layers/CostLayer.h @@ -318,7 +318,9 @@ public: void forwardImp(Matrix& output, Argument& label, Matrix& cost) override; - void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad) {} + void backwardImp(Matrix& outputValue, + Argument& label, + Matrix& outputGrad) override {} }; /**
diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.cpp b/paddle/gserver/layers/CrossEntropyOverBeam.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4acc077035b17fdf5ec06e0d4d916fa0a62f6cba --- /dev/null +++ b/paddle/gserver/layers/CrossEntropyOverBeam.cpp @@ -0,0 +1,393 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "CrossEntropyOverBeam.h" + +namespace paddle { + +void CostForOneSequence::calValidExpandStep() { + validExpansionCount_ = 0; + goldAsExtraPath_ = true; + + for (size_t i = 0; i < beams_->expansionCount; ++i) { + real gold = static_cast<real>(beams_->gold[i]); + if (i) { + real* start = beams_->candidateIds[i - 1]->getData(); + goldRowIds_[i] = std::count_if( + start, + start + goldRowIds_[i - 1] * beamSize_ + goldColIds_[i - 1], + [](const real& val) { return val != -1.; }); + } else { + goldRowIds_[i] = 0; + } + + real* start = + beams_->candidateIds[i]->getData() + goldRowIds_[i] * beamSize_; + real* findEnd = std::find(start, start + beamSize_, gold); + validExpansionCount_++; + + if (start + beamSize_ == findEnd) return; + goldColIds_[i] = findEnd - start; + } + if (goldColIds_[beams_->expansionCount - 1] != -1) goldAsExtraPath_ = false; +} + +size_t CostForOneSequence::initLastExpansion() { + int beamId = validExpansionCount_ - 1; + const MatrixPtr candidates = beams_->candidateIds[beamId]; + size_t height = candidates->getHeight(); + + /* initialize the last expansion.
*/ + size_t pathCount = std::count_if(candidates->getData(), + candidates->getData() + height * beamSize_, + [](const real& val) { return val != -1; }); + /* + * if the gold sequence falls off the beam during search, add the gold + * sequence as the last path into all the expanded candidates. + */ + if (goldAsExtraPath_) goldIdsInFinalExpansion_ = pathCount++; + + pathRowIdsInEachBeam_.clear(); + pathRowIdsInEachBeam_.resize(validExpansionCount_, + std::vector<int>(pathCount, 0)); + parentIdsInBeam_.clear(); + parentIdsInBeam_.resize(pathCount, 0); + + if (goldAsExtraPath_) { + /* add gold sequence into the total expansion. */ + pathRowIdsInEachBeam_[beamId].back() = + beams_->gold[beamId] + + getSeqStartPos(beamId, goldRowIds_[validExpansionCount_ - 1]); + parentIdsInBeam_.back() = goldRowIds_[validExpansionCount_ - 1]; + } else { + size_t goldOffset = goldRowIds_[beamId] * beamSize_ + goldColIds_[beamId]; + goldIdsInFinalExpansion_ = + std::count_if(candidates->getData(), + candidates->getData() + goldOffset, + [](const real& val) { return val != -1.; }); + } + + /* + * TODO(caoying): fix this, store the indices of selected candidate + * paths into Argument.ids + */ + real* ids = candidates->getData(); + size_t curIdx = 0; + for (size_t i = 0; i < height; ++i) { + int basePos = getSeqStartPos(beamId, i); + for (size_t j = 0; j < beamSize_; ++j) { + int id = ids[i * beamSize_ + j]; + if (id == -1) continue; + pathRowIdsInEachBeam_[beamId][curIdx] = id + basePos; + parentIdsInBeam_[curIdx++] = i; + } + } + return pathCount; +} + +void CostForOneSequence::constructTotalExpansion() { + /* + * construct the entire expanded beam by beginning with the last search + * in which gold falls off the beam. + */ + size_t totalPathCount = initLastExpansion(); + + for (int beamId = validExpansionCount_ - 2; beamId >= 0; --beamId) { + const MatrixPtr candidates = beams_->candidateIds[beamId]; + real* ids = candidates->getData(); + + int lastParentIdInBeam = -1; + int basePos = -1; + for (size_t i = 0; + i < (goldAsExtraPath_ ? totalPathCount - 1 : totalPathCount); + ++i) { + int id = ids[parentIdsInBeam_[i]]; + int parentRowId = std::div(parentIdsInBeam_[i], beamSize_).quot; + if (parentIdsInBeam_[i] != lastParentIdInBeam) + basePos = getSeqStartPos(beamId, parentRowId); + + pathRowIdsInEachBeam_[beamId][i] = id + basePos; + lastParentIdInBeam = parentIdsInBeam_[i]; + parentIdsInBeam_[i] = parentRowId; + + if (goldAsExtraPath_) + pathRowIdsInEachBeam_[beamId][totalPathCount - 1] = + beams_->gold[beamId] + getSeqStartPos(beamId, goldRowIds_[beamId]); + } + } +} + +real CostForOneSequence::globallyNormalizedScore() { + expandedPathScores_.resize(validExpansionCount_); + + Matrix::resizeOrCreate( + softmaxOut_, 1, pathRowIdsInEachBeam_[0].size(), false, false); + softmaxOut_->zeroMem(); + MatrixPtr tmp = Matrix::create( + softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false); + + for (size_t i = 0; i < validExpansionCount_; ++i) { + Matrix::resizeOrCreate(expandedPathScores_[i], + pathRowIdsInEachBeam_[i].size(), + 1, + false, + false); + expandedPathScores_[i]->zeroMem(); + + IVectorPtr rowIds = IVector::create(pathRowIdsInEachBeam_[i].data(), + pathRowIdsInEachBeam_[i].size(), + false); + expandedPathScores_[i]->selectRows(*(beams_->scores[i]), *rowIds); + tmp->add(*expandedPathScores_[i]); + } + + softmaxOut_->softmax(*softmaxOut_); + return -std::log(softmaxOut_->getData()[goldIdsInFinalExpansion_]); +} + +real CostForOneSequence::forward() { + calValidExpandStep(); + constructTotalExpansion(); + return globallyNormalizedScore(); +} + +void CostForOneSequence::backward() { + /* + * when the softmax layer is the output layer and is combined with + * cross-entropy as the cost, the derivative with regard to softmax's input + * is simply: + * + * grad_i = softmax_out_i - target_i, + * + * and here a hard label is used. + */ + softmaxOut_->getData()[goldIdsInFinalExpansion_] -= 1.; + + MatrixPtr tmp = Matrix::create( + softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false); + + for (size_t i = 0; i < validExpansionCount_; ++i) { + IVectorPtr rowIds = IVector::create(pathRowIdsInEachBeam_[i].data(), + pathRowIdsInEachBeam_[i].size(), + false); + /* + beams_->scoreGrad[i] has been initialized outside this class; this + class only keeps a pointer to the original input gradients, + so there is no need to allocate or initialize the memory here. + */ + tmp->addToRows(*beams_->scoreGrad[i], *rowIds); + } +} + +REGISTER_LAYER(cross_entropy_over_beam, CrossEntropyOverBeam); + +bool CrossEntropyOverBeam::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + CHECK_EQ(0U, inputLayers_.size() % 3) << "Error input number."; + + beamExpanCount_ = inputLayers_.size() / 3; + + candidateScores_.resize(beamExpanCount_); + candidateScoreGrad_.resize(beamExpanCount_); + + candidateInBeam_.resize(beamExpanCount_); + goldSequence_.resize(beamExpanCount_); + gradToInputs_.resize(beamExpanCount_); + + setNeedSequenceInfo(false); + return true; +} + +void CrossEntropyOverBeam::checkInputs() { + batchSize_ = 0; + for (size_t i = 0; i < beamExpanCount_; ++i) { + const Argument& scores = getInput(i * 3); + const Argument& selCandidates = getInput(i * 3 + 1); + const Argument& goldSeq = getInput(i * 3 + 2); + + if (i) { + CHECK(scores.hasSubseq()) << "input " << i << " " + << inputLayers_[i * 3]->getName() + << " should be a nested sequence"; + CHECK_EQ(getInputValue(i * 3 + 1)->getWidth(), beamSize_); + CHECK_EQ(scores.getNumSequences(), batchSize_); + CHECK_EQ(scores.getNumSubSequences(), selCandidates.getBatchSize()); + } else { + CHECK(scores.hasSeq()) << "input " << i << " " + << inputLayers_[i]->getName() + << " should be a sequence"; + batchSize_ = scores.getNumSequences(); + beamSize_ = getInputValue(i * 3 + 1)->getWidth(); + CHECK_EQ(batchSize_, selCandidates.getBatchSize()); + } + CHECK_EQ(1U, scores.value->getWidth()); + CHECK_EQ(batchSize_, goldSeq.getBatchSize()); + } +} + +void CrossEntropyOverBeam::copyInputsToCpu() { + auto copyValue = [](const MatrixPtr& src, MatrixPtr& trg) { + if (dynamic_cast<GpuMatrix*>(src.get())) { + Matrix::resizeOrCreate( + trg, src->getHeight(), src->getWidth(), false, false); + trg->copyFrom(*src); + } else { + trg = std::move(src); + } + }; + + auto copyIds = [](const IVectorPtr& src, IVectorPtr& trg) { + if (dynamic_cast<GpuIVector*>(src.get())) { + IVector::resizeOrCreate(trg, src->getSize(), false); + trg->copyFrom(*src); + } else { + trg = std::move(src); + } + }; + + beamSplitPos_.clear(); + beamSplitPos_.resize(batchSize_, std::vector<int>(beamExpanCount_, 0)); + for (size_t i = 0; i < beamExpanCount_; ++i) { + copyValue(getInputValue(i * 3), candidateScores_[i]); + copyValue(getInputValue(i * 3 + 1), candidateInBeam_[i]); + copyIds(getInput(i * 3 + 2).ids, goldSequence_[i]); + + if (i) { + ICpuGpuVectorPtr seqInfo = getInput(i * 3).sequenceStartPositions; + const int* seqStarts = seqInfo->getMutableData(false); + ICpuGpuVectorPtr subSeqInfo = getInput(i * 3).subSequenceStartPositions; + const int* subSeqStarts = subSeqInfo->getMutableData(false); + + size_t seqId = 1; + for (size_t subSeqId = 0; subSeqId < subSeqInfo->getSize() - 1; + ++subSeqId) { + CHECK_LT(seqId, seqInfo->getSize()); + if (subSeqStarts[subSeqId] == seqStarts[seqId]) { + beamSplitPos_[seqId][i] = beamSplitPos_[seqId - 1][i]; + seqId++; + } + beamSplitPos_[seqId - 1][i]++; + } + } else { + for (size_t j = 0; j < batchSize_; ++j) beamSplitPos_[j][i] = j + 1; + } + } +} + +void CrossEntropyOverBeam::splitBatchBeams() { + beamCosts_.resize(batchSize_); + beamPerSeq_.resize(batchSize_, BeamExpansion(beamExpanCount_)); + + for (size_t i = 0; i < beamExpanCount_; ++i) { + int* seqStarts = + getInput(i * 3).sequenceStartPositions->getMutableData(false); + + int* subSeqStarts = nullptr; + int maxLen = 0; + if (i) { + subSeqStarts = + getInput(i * 3).subSequenceStartPositions->getMutableData(false); + maxLen = getInput(i * 3).subSequenceStartPositions->getSize() - 1; + } else { + maxLen = getInput(i).sequenceStartPositions->getSize() - 1; + } + + for (size_t j = 0; j < batchSize_; ++j) { + beamPerSeq_[j].scores[i] = + Matrix::create(candidateScores_[i]->getData() + seqStarts[j], + seqStarts[j + 1] - seqStarts[j], + 1, + false, + false); + beamPerSeq_[j].scoreGrad[i] = + Matrix::create(candidateScoreGrad_[i]->getData() + seqStarts[j], + seqStarts[j + 1] - seqStarts[j], + 1, + false, + false); + + int offset = j ? beamSplitPos_[j - 1][i] : 0; + int height = beamSplitPos_[j][i] - (j ? beamSplitPos_[j - 1][i] : 0); + CHECK_GE(maxLen, offset + height); + beamPerSeq_[j].seqInfo[i] = IVector::create( + (i ? subSeqStarts : seqStarts) + offset, height + 1, false); + + beamPerSeq_[j].candidateIds[i] = + Matrix::create(candidateInBeam_[i]->getData() + offset * beamSize_, + height, + beamSize_, + false, + false); + beamPerSeq_[j].gold[i] = goldSequence_[i]->getData()[j]; + + CHECK_LE(beamPerSeq_[j].gold[i], seqStarts[j + 1] - seqStarts[j]); + } + } +} + +void CrossEntropyOverBeam::resizeOutput() { + Matrix::resizeOrCreate(output_.value, batchSize_, 1, false, false); + output_.value->zeroMem(); + + for (size_t i = 0; i < beamExpanCount_; ++i) { + MatrixPtr inGrad = getInputGrad(i * 3); + if (dynamic_cast<GpuMatrix*>(inGrad.get())) { + Matrix::resizeOrCreate(candidateScoreGrad_[i], + inGrad->getHeight(), + inGrad->getWidth(), + false, + false); + } else { + candidateScoreGrad_[i] = std::move(inGrad); + } + candidateScoreGrad_[i]->zeroMem(); + } +} + +void CrossEntropyOverBeam::copyGradToGpu(size_t copyCount) { + for (size_t i = 0; i < beamExpanCount_; ++i) { + if (dynamic_cast<GpuMatrix*>(getInputGrad(i * 3).get())) + getInputGrad(i * 3)->copyFrom(*candidateScoreGrad_[i]); + + if (i == copyCount - 1) break; + } +} + +void CrossEntropyOverBeam::forward(PassType passType) { + Layer::forward(passType); + + checkInputs(); + copyInputsToCpu(); + + resizeOutput(); + splitBatchBeams(); + + MatrixPtr outputValue = getOutputValue(); + for (size_t i = 0; i < batchSize_; ++i) { + beamCosts_[i].setData( + std::move(std::make_shared<BeamExpansion>(beamPerSeq_[i])), beamSize_); + outputValue->getData()[i] = beamCosts_[i].forward(); + } +} + +void CrossEntropyOverBeam::backward(const UpdateCallback& callback) { + for (size_t i = 0; i < batchSize_; ++i) { + beamCosts_[i].backward(); + copyGradToGpu(beamCosts_[i].getValidExpansionCount()); + } +} + +} // namespace paddle
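The `backward()` comment above uses the standard identity: for cost = -log(softmax(x)[gold]) with a hard label, d cost / d x_i = softmax(x)_i - 1{i == gold}, which is exactly why the code only subtracts 1 at `goldIdsInFinalExpansion_`. A self-contained sketch of that rule (illustration only, not part of the patch):

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Gradient of -log(softmax(scores)[goldIdx]) with respect to scores.
std::vector<float> crossEntropySoftmaxGrad(const std::vector<float>& scores,
                                           size_t goldIdx) {
  std::vector<float> grad(scores.size());
  // numerically stable softmax
  float maxScore = *std::max_element(scores.begin(), scores.end());
  float sum = 0.f;
  for (size_t i = 0; i < scores.size(); ++i) {
    grad[i] = std::exp(scores[i] - maxScore);
    sum += grad[i];
  }
  for (auto& g : grad) {
    g /= sum;  // grad now holds the softmax output
  }
  grad[goldIdx] -= 1.f;  // subtract the hard one-hot label
  return grad;
}
```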
diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.h b/paddle/gserver/layers/CrossEntropyOverBeam.h new file mode 100644 index 0000000000000000000000000000000000000000..5643556f43370912a730d9895658d8944f50dced --- /dev/null +++ b/paddle/gserver/layers/CrossEntropyOverBeam.h @@ -0,0 +1,135 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" + +namespace paddle { + +/* This struct stores the beams in all search steps for a single sequence. */ +struct BeamExpansion { + std::vector<MatrixPtr> scores; + std::vector<IVectorPtr> seqInfo; + + std::vector<MatrixPtr> candidateIds; + std::vector<int> gold; + + std::vector<MatrixPtr> scoreGrad; + + size_t expansionCount; + + explicit BeamExpansion(int n) { + expansionCount = n; + scores.resize(expansionCount); + seqInfo.resize(expansionCount); + candidateIds.resize(expansionCount); + scoreGrad.resize(expansionCount); + + gold.resize(expansionCount); + } +}; +typedef std::shared_ptr<BeamExpansion> BeamExpansionPtr; + +class CostForOneSequence { +public: + CostForOneSequence() + : beamSize_(0), validExpansionCount_(0), goldAsExtraPath_(false) {} + void setData(const BeamExpansionPtr bPtr, size_t beamSize) { + beams_ = bPtr; + beamSize_ = beamSize; + + expandedPathScores_.clear(); + expandedPathScores_.resize(beams_->expansionCount); + + goldRowIds_.clear(); + goldRowIds_.resize(beams_->expansionCount, 0); + goldColIds_.clear(); + goldColIds_.resize(beams_->expansionCount, -1); + } + size_t getValidExpansionCount() { return validExpansionCount_; } + + real forward(); + void backward(); + +private: + void calValidExpandStep(); + void constructTotalExpansion(); + size_t initLastExpansion(); + real globallyNormalizedScore(); + + int getSeqStartPos(size_t beamId, size_t rowId) { + CHECK_GT(beams_->seqInfo[beamId]->getSize() - 1, rowId); + int* starts = beams_->seqInfo[beamId]->getData(); + return starts[rowId] - starts[0]; + } + + size_t beamSize_; + size_t validExpansionCount_; + bool goldAsExtraPath_; + std::vector<int> goldRowIds_; + std::vector<int> goldColIds_; + + BeamExpansionPtr beams_; + std::vector<std::vector<int>> pathRowIdsInEachBeam_; + std::vector<int> parentIdsInBeam_; + size_t goldIdsInFinalExpansion_; + + std::vector<MatrixPtr> expandedPathScores_; + + MatrixPtr softmaxOut_; +}; + +class CrossEntropyOverBeam : public Layer { +public: + explicit CrossEntropyOverBeam(const LayerConfig& config) : Layer(config) {} + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + void forward(PassType passType) override; + void backward(const UpdateCallback& callback) override; + +private: + void checkInputs(); + void copyInputsToCpu(); + void resizeOutput(); + void copyGradToGpu(size_t copyCount); + void splitBatchBeams(); + + size_t beamExpanCount_; + size_t batchSize_; + size_t beamSize_; + + /* + * the process of constructing beams is not friendly to GPU, so currently + * this layer only runs on CPU; if any of its inputs is on GPU memory, it is + * first copied to CPU memory. + */ + std::vector<MatrixPtr> candidateScores_; + std::vector<MatrixPtr> candidateScoreGrad_; + std::vector<MatrixPtr> candidateInBeam_; + std::vector<MatrixPtr> gradToInputs_; + std::vector<IVectorPtr> goldSequence_; + std::vector<std::vector<int>> beamSplitPos_; + + /* + * split the entire batch of beams into beams per sequence and store the + * result in this member. + */ + std::vector<BeamExpansion> beamPerSeq_; + /* beamCosts_ is used to propagate error in one sequence. */ + std::vector<CostForOneSequence> beamCosts_; +}; + +} // namespace paddle
diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp index 0ece2799318ea5ecc91f97f71289d4d07246dcaa..20de475fc3f6b6f3c05ac26bea8363daff0cf110 100644 --- a/paddle/gserver/layers/ExpandConvLayer.cpp +++ b/paddle/gserver/layers/ExpandConvLayer.cpp @@ -29,6 +29,10 @@ namespace paddle { REGISTER_LAYER(exconv, ExpandConvLayer); REGISTER_LAYER(exconvt, ExpandConvLayer); +inline bool isDepthwiseConv(int channels, int groups) { + return channels == groups; +} + bool ExpandConvLayer::init(const LayerMap &layerMap, const ParameterMap &parameterMap) { /* Initialize the basic convolutional parent class */ @@ -47,14 +51,27 @@ bool ExpandConvLayer::init(const LayerMap &layerMap, std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]}; std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]}; - if (useGpu_ && (size_t)groups_[i] == (size_t)channels_[i] && !isDeconv_) { + // Convolution Layer uses the GemmConv function by default. + convType = "GemmConv"; + convGradInputType = "GemmConvGradInput"; + convGradFilterType = "GemmConvGradFilter"; + + // If depthwise convolution and useGpu == true + if (useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) { convType = "DepthwiseConv"; convGradInputType = "DepthwiseConvGradInput"; convGradFilterType = "DepthwiseConvGradFilter"; - } else { - convType = "GemmConv"; - convGradInputType = "GemmConvGradInput"; - convGradFilterType = "GemmConvGradFilter"; + } + + // If depthwise convolution and useGpu == false and ARM-NEON + if (!useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) { +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + if ((filterSize_[i] == filterSizeY_[i]) && + (filterSize_[i] == 3 || filterSize_[i] == 4) && + (stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2)) { + convType = "NeonDepthwiseConv"; + } +#endif } if (FLAGS_use_nnpack && !isDeconv_) {
diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp index d5621412caee843e24a0d0c9b7096402765738c7..2bc20eee6c452d0943dbf43b17ebe77976c97489 100644 --- a/paddle/gserver/layers/Layer.cpp +++ b/paddle/gserver/layers/Layer.cpp @@ -41,7 +41,7 @@ namespace paddle { Layer::Layer(const LayerConfig& config, bool useGpu) : config_(config), useGpu_(useGpu), - deviceId_(-1), + deviceId_(CPU_DEVICE), needSequenceInfo_(true) {} bool Layer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h index 0ed482889d0cea884db3759620088575c5b10201..edef36194aabdb9c122ec3423deb036169a34d7c 100644 --- a/paddle/gserver/layers/Layer.h +++ b/paddle/gserver/layers/Layer.h @@ -59,7 +59,12 @@ protected: LayerConfig config_; /// whether to use GPU bool useGpu_; - /// Device Id. CPU is -1, and GPU is 0, 1, 2 ... + /// Paddle device ID, MKLDNN is -2, CPU is -1 + enum PADDLE_DEVICE_ID { + MKLDNN_DEVICE = -2, + CPU_DEVICE = -1, + }; + /// Device Id. MKLDNN is -2, CPU is -1, and GPU is 0, 1, 2 ... int deviceId_; /// Input layers std::vector<LayerPtr> inputLayers_; @@ -77,6 +82,7 @@ Argument output_; /// Several outputs stored on different devices, used in 'parallel_nn' case, /// and record them by deviceId_. + /// Also used in 'use_mkldnn' case. std::vector<Argument> outputOtherDevice_; /// If there are several outputs, map them by each name.
std::map outputMap_; @@ -172,6 +178,13 @@ protected: return inputLayer.getOutput(deviceId_); } + /** + * Get the argument of input layer with deviceId. + */ + const Argument& getInput(size_t inputIndex, int deviceId) const { + return inputLayers_[inputIndex]->getOutput(deviceId); + } + /** * Get the forward-input value. */ @@ -186,6 +199,13 @@ protected: return inputLayer.getOutput(deviceId_).value; } + /** + * Get the forward-input value with deviceId. + */ + const MatrixPtr& getInputValue(int inputIndex, int deviceId) { + return inputLayers_[inputIndex]->getOutput(deviceId).value; + } + /** * Get the forward-input grad. */ @@ -200,6 +220,13 @@ protected: return inputLayer.getOutput(deviceId_).grad; } + /** + * Get the forward-input grad. + */ + const MatrixPtr& getInputGrad(int inputIndex, int deviceId) { + return inputLayers_[inputIndex]->getOutput(deviceId).grad; + } + /** * Get the forward-input label. */ diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp index d201fac65e0459050304195140e1aae081468f43..8318c8c519a4cec1610eadd28320ee5ce0b4147d 100644 --- a/paddle/gserver/layers/MKLDNNFcLayer.cpp +++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp @@ -61,43 +61,42 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() { return; } - // TODO(TJ): dst format should get from wgtVal_ - int dstFmt = PARAM_FORMAT_MKLDNN_OI; - int srcFmt = weight_->getParameterPtr()->getHeaderFormat(); - if (srcFmt == dstFmt) { - return; - } - - // The weight_ is transposed from initial paddle weight - MatrixPtr paddleWgt = Matrix::create( - weight_->getW()->getData(), iLayerSize_, oc_, false, false); - - // TODO(TJ): remove this print when do not need differ weights - std::ostringstream ostr; - paddleWgt->print(ostr); - VLOG(MKLDNN_ALL) << "Initial Weight from paddle: " << std::endl << ostr.str(); - - // The mkldnn weight is transposed from initial paddle matrix - MatrixPtr paddleWgtT; - paddleWgt->transpose(paddleWgtT, true); - weight_->getW()->copyFrom(*paddleWgtT); - weight_->getParameterPtr()->setHeaderFormat(dstFmt); + CHECK(wgtVal_) << "should have been initialized"; + bool hasNoSpatial_ = ih_ == 1 && iw_ == 1; + auto targetDim = wgtVal_->getDims(); + auto srcFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo; + wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim); hasInitedWgt_ = true; } void MKLDNNFcLayer::convertWeightsToPaddle() { - MatrixPtr dnnWgt = weight_->getW(); - MatrixPtr paddleWgt; - dnnWgt->transpose(paddleWgt, true); - - // copy paddle weight and override on weight_ - MatrixPtr dnnWgtT = Matrix::create( - dnnWgt->getData(), dnnWgt->getWidth(), dnnWgt->getHeight(), false, false); - dnnWgtT->copyFrom(*paddleWgt); + CHECK(wgtVal_) << "should have been initialized"; + bool hasNoSpatial_ = ih_ == 1 && iw_ == 1; + auto targetDim = wgtVal_->getDims(); + auto dstFmt = hasNoSpatial_ ? 
memory::format::io : memory::format::ihwo; + wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim); +} + +void MKLDNNFcLayer::convertOutputToOtherDevice() { + copyOutputInfoToOtherDevice(); + // find other cpu device and reorder output to cpu device + int cnt = 0; + for (size_t i = 0; i < outputOtherDevice_.size(); i++) { + if (outputOtherDevice_[i].deviceId == CPU_DEVICE) { + // fc cpu output value does not need conversion + // just share the pointer + outputOtherDevice_[i].value = output_.value; + ++cnt; + } + } + + if (cnt > 1) { + LOG(WARNING) << "should not have more than one CPU device"; + } } void MKLDNNFcLayer::reshape() { - const Argument& input = getInput(0); + const Argument& input = getInput(0, getPrev(0)->getDeviceId()); int batchSize = input.getBatchSize(); if (bs_ == batchSize) { return; @@ -111,10 +110,6 @@ void MKLDNNFcLayer::reshape() { if (iw_ == 0) { iw_ = 1; } - hasSpatial_ = true; - if (ih_ == 1 && iw_ == 1) { - hasSpatial_ = false; - } CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize()); ic_ = iLayerSize_ / (ih_ * iw_); CHECK_EQ(size_t(ic_ * ih_ * iw_), iLayerSize_) << "not divisible"; @@ -135,37 +130,53 @@ void MKLDNNFcLayer::resetFwd() { bool hasBias = biases_ && biases_->getW(); - real* iData = getInputValue(0)->getData(); - real* oData = getOutputValue()->getData(); - real* wData = weight_->getW()->getData(); - real* bData = hasBias ? biases_->getW()->getData() : NULL; - - // TODO(TJ): below create should be covered in MkldnnMatrix - // create memory desc - memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw) - : createMD({bs_, ic_}, format::nc); - memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw) - : createMD({oc_, ic_}, format::oi); - memory::desc bMD = bData != NULL ? createMD({oc_}, format::x) - : createMD({}, format::format_undef); - memory::desc oMD = createMD({bs_, oc_}, format::nc); - - // create memory primitive desc and memory self - inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData)); - wgtVal_.reset(new memory(memory::primitive_desc(wMD, engine_), wData)); - outVal_.reset(new memory(memory::primitive_desc(oMD, engine_), oData)); + const MatrixPtr& wgt = weight_->getW(); + const MatrixPtr& bias = hasBias ? biases_->getW() : nullptr; + const MatrixPtr& out = output_.value; + + if (inputIsOnlyMKLDNN()) { + const MatrixPtr& in = getInputValue(0); + inVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(in); + CHECK(inVal_) << "Input should be MKLDNNMatrix"; + } else { + CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet"; + const MatrixPtr& in = getInputValue(0, CPU_DEVICE); + inVal_ = MKLDNNMatrix::create( + in, memory::dims{bs_, ic_, ih_, iw_}, format::nchw, engine_); + } + inVal_->downSpatial(); + wgtVal_ = MKLDNNMatrix::create( + wgt, memory::dims{oc_, ic_, ih_, iw_}, format::oihw, engine_); + wgtVal_->downSpatial(); + biasVal_ = + hasBias ? MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr; + outVal_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_); + + // change original output value to mkldnn output value + output_.value = std::dynamic_pointer_cast<Matrix>(outVal_); + if (!outputIsOnlyMKLDNN()) { + convertOutputToOtherDevice(); + } + // create forward handle prop_kind pk = prop_kind::forward; - fc_fwd::desc fwdDesc = bData != NULL ? fc_fwd::desc(pk, iMD, wMD, bMD, oMD) - : fc_fwd::desc(pk, iMD, wMD, oMD); + fc_fwd::desc fwdDesc = hasBias ?
fc_fwd::desc(pk, + inVal_->getMemoryDesc(), + wgtVal_->getMemoryDesc(), + biasVal_->getMemoryDesc(), + outVal_->getMemoryDesc()) + : fc_fwd::desc(pk, + inVal_->getMemoryDesc(), + wgtVal_->getMemoryDesc(), + outVal_->getMemoryDesc()); fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_); - - if (bData != NULL) { - biasVal_.reset(new memory(memory::primitive_desc(bMD, engine_), bData)); + if (hasBias) { fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_)); } else { fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *outVal_)); } + printValueFormatFlow(); + pipelineFwd_.clear(); pipelineFwd_.push_back(*fwd_); } @@ -175,45 +186,46 @@ void MKLDNNFcLayer::resetBwd() { return; } needResetBwd_ = false; - bool hasBias = biases_ && biases_->getWGrad(); - real* iData = getInputValue(0)->getData(); - real* iDiff = getInputGrad(0) != nullptr ? getInputGrad(0)->getData() : NULL; - real* oDiff = getOutputGrad()->getData(); - real* wDiff = weight_->getWGrad()->getData(); - real* bDiff = hasBias ? biases_->getWGrad()->getData() : NULL; /// backward weight - // create memory desc for backward memory - memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw) - : createMD({bs_, ic_}, format::nc); - memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw) - : createMD({oc_, ic_}, format::oi); - memory::desc oMD = createMD({bs_, oc_}, format::nc); - memory::desc bMD = bDiff != NULL ? createMD({oc_}, format::x) - : createMD({}, format::format_undef); - - if (inVal_) { - // update data - inVal_->set_data_handle(iData); - } else { - inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData)); - } - - // create memory primitive desc and memory self - wgtGrad_.reset(new memory(memory::primitive_desc(wMD, engine_), wDiff)); - outGrad_.reset(new memory(memory::primitive_desc(oMD, engine_), oDiff)); - - fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward, iMD, wMD, oMD); + CHECK(inVal_) << "Should have input value"; + const MatrixPtr& wgt = weight_->getWGrad(); + const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr; + + // TODO(TJ): merge outgrad + int device = outputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE; + // for MKLDNN device: + // can not directly cast outputgrad to mkldnnmatrix, + // since each layer can not write the inputgrad to mkldnn inputgrad. + // So just create from matrix with outputvalue format. + // for CPU device: + // fc do not need to convert from cpu device since output is always nc format + // only need create from cpu device + const MatrixPtr& out = getOutput(device).grad; + outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc()); + wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPrimitiveDesc()); + biasGrad_ = hasBias ? MKLDNNMatrix::create(bias, biasVal_->getPrimitiveDesc()) + : nullptr; + + // create memory primitive desc + fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward, + inVal_->getMemoryDesc(), + wgtGrad_->getMemoryDesc(), + outGrad_->getMemoryDesc()); fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_); - fc_bwdWgt::desc bwdWgtDesc = bDiff != NULL - ? fc_bwdWgt::desc(iMD, wMD, bMD, oMD) - : fc_bwdWgt::desc(iMD, wMD, oMD); + fc_bwdWgt::desc bwdWgtDesc = hasBias + ? 
fc_bwdWgt::desc(inVal_->getMemoryDesc(),
+                                                     wgtGrad_->getMemoryDesc(),
+                                                     biasGrad_->getMemoryDesc(),
+                                                     outGrad_->getMemoryDesc())
+                                   : fc_bwdWgt::desc(inVal_->getMemoryDesc(),
+                                                     wgtGrad_->getMemoryDesc(),
+                                                     outGrad_->getMemoryDesc());
   fc_bwdWgt::primitive_desc bwdWgtPD =
       fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, fwdPD);

-  if (bDiff != NULL) {
-    biasGrad_.reset(new memory(memory::primitive_desc(bMD, engine_), bDiff));
+  if (hasBias) {
     bwdWgt_.reset(
         new fc_bwdWgt(bwdWgtPD, *inVal_, *outGrad_, *wgtGrad_, *biasGrad_));
   } else {
@@ -223,15 +235,26 @@ void MKLDNNFcLayer::resetBwd() {
   pipelineBwd_.push_back(*bwdWgt_);

   /// backward data
-  if (iDiff == NULL) {
+  device = inputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
+  const MatrixPtr& in = getInputGrad(0, device);
+  if (in == nullptr) {
     return;
   }
-  fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(iMD, wMD, oMD);
+  if (getInput(0, device).getAllCount() > 1) {
+    // TODO(TJ): use outputMaps_ ways when merge outgrad done
+  } else {
+    inGrad_ = MKLDNNMatrix::create(in, inVal_->getPrimitiveDesc());
+  }
+
+  fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(inVal_->getMemoryDesc(),
+                                                  wgtGrad_->getMemoryDesc(),
+                                                  outGrad_->getMemoryDesc());
   fc_bwdData::primitive_desc bwdDataPD =
       fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD);
-  inGrad_.reset(new memory(memory::primitive_desc(iMD, engine_), iDiff));
+  CHECK(wgtVal_) << "Should have weight memory";
   bwdData_.reset(new fc_bwdData(bwdDataPD, *outGrad_, *wgtVal_, *inGrad_));
+  printGradFormatFlow();
   pipelineBwd_.push_back(*bwdData_);
 }

@@ -241,11 +264,7 @@ void MKLDNNFcLayer::forward(PassType passType) {
   {
     REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
-
-    // update input data
-    // since it might be changed if this is after data layer
-    real* iData = getInputValue(0)->getData();
-    inVal_->set_data_handle(iData);
+    syncInputValue();

     // just submit forward pipeline
     stream_->submit(pipelineFwd_);
@@ -267,10 +286,7 @@ void MKLDNNFcLayer::backward(const UpdateCallback& callback) {
     REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
     resetBwd();

-    // update diff
-    real* oDiff = getOutputGrad()->getData();
-    outGrad_->set_data_handle(oDiff);
-
+    syncOutputGrad();
     // just submit backward pipeline
     stream_->submit(pipelineBwd_);
   }
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/gserver/layers/MKLDNNFcLayer.h
index 7954852a23f81d36d5fb0ae6a19768f419886fb1..e138a6faf181c412949218458e7ecf800a0d6a07 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
@@ -32,16 +32,13 @@ protected:
   // if has already init the weight
   bool hasInitedWgt_;

-  // if input layer has image size info (ih>1 && iw>1)
-  bool hasSpatial_;
-
   // fc weight and bias
   std::unique_ptr<Weight> weight_;
   std::unique_ptr<Weight> biases_;

 public:
   explicit MKLDNNFcLayer(const LayerConfig& config)
-      : MKLDNNLayer(config), hasInitedWgt_(false), hasSpatial_(true) {}
+      : MKLDNNLayer(config), hasInitedWgt_(false) {}

   ~MKLDNNFcLayer() {}

@@ -75,6 +72,8 @@ protected:
    * only would be called when needed
    */
   void resetBwd();
+
+  void convertOutputToOtherDevice() override;
 };

 }  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index 63e29f447eede5ff9df8715bc9140b64ab7f7d17..b983b833d510b823c5d4cff0b9390173e4cefc89 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -18,9 +18,9 @@ limitations under the License.
 */
 #include "Layer.h"
 #include "MKLDNNBase.h"
 #include "mkldnn.hpp"
+#include "paddle/math/MKLDNNMatrix.h"

 DECLARE_bool(use_mkldnn);
-DECLARE_bool(use_mkldnn_wgt);

 namespace paddle {
@@ -52,15 +52,15 @@ protected:
   std::vector<mkldnn::primitive> pipelineFwd_;
   std::vector<mkldnn::primitive> pipelineBwd_;

-  // TODO(TJ): change below memory as MKLDNNMatrixPtr type
-  std::shared_ptr<mkldnn::memory> inVal_;
-  std::shared_ptr<mkldnn::memory> inGrad_;
-  std::shared_ptr<mkldnn::memory> outVal_;
-  std::shared_ptr<mkldnn::memory> outGrad_;
-  std::shared_ptr<mkldnn::memory> wgtVal_;
-  std::shared_ptr<mkldnn::memory> wgtGrad_;
-  std::shared_ptr<mkldnn::memory> biasVal_;
-  std::shared_ptr<mkldnn::memory> biasGrad_;
+  // MKLDNNMatrixPtr
+  MKLDNNMatrixPtr inVal_;
+  MKLDNNMatrixPtr inGrad_;
+  MKLDNNMatrixPtr outVal_;
+  MKLDNNMatrixPtr outGrad_;
+  MKLDNNMatrixPtr wgtVal_;
+  MKLDNNMatrixPtr wgtGrad_;
+  MKLDNNMatrixPtr biasVal_;
+  MKLDNNMatrixPtr biasGrad_;

 public:
   explicit MKLDNNLayer(const LayerConfig& config)
@@ -83,17 +83,21 @@ public:

   virtual bool init(const LayerMap& layerMap,
                     const ParameterMap& parameterMap) {
+    CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
+                            << "Please set WITH_MKLDNN=ON "
+                            << "and set use_mkldnn=True";
+    CHECK(!useGpu_) << "Do not support GPU yet";
+
+    // set device id before Layer::init
+    setDevice(MKLDNN_DEVICE);
+    // change param device to MKLDNN device
+    setParamsDevice(MKLDNN_DEVICE, parameterMap);
     if (!Layer::init(layerMap, parameterMap)) {
       return false;
     }
-    CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
-                            << "Please set WITH_MKLDNN=ON "
-                            << "and set use_mkldnn=True";

     stream_.reset(new MKLDNNStream());
     engine_ = CPUEngine::Instance().getEngine();
-
-    // TODO(TJ): deivecId
     return true;
   }

@@ -109,6 +113,12 @@ public:
    */
   virtual void convertWeightsToPaddle() {}

+  /**
+   * Convert the MKLDNN output to another device.
+   * Only the CPU device is supported yet.
+   */
+  virtual void convertOutputToOtherDevice() {}
+
   /**
    * print info about sizes
    */
@@ -118,14 +128,124 @@ public:
               << ", oh: " << oh_ << ", ow: " << ow_;
   }

-  // TODO(TJ): move to MkldnnMatrix
-  // create memory desc
-  inline mkldnn::memory::desc createMD(
-      mkldnn::memory::dims dims,
-      mkldnn::memory::format fmt,
-      mkldnn::memory::data_type type = mkldnn::memory::data_type::f32) {
-    // TODO(TJ): isFmtSuppoted(fmt)
-    return mkldnn::memory::desc(dims, type, fmt);
+  /**
+   * Print the mkldnn memory format flow of value
+   */
+  virtual void printValueFormatFlow() {
+    if (inVal_ && outVal_) {
+      VLOG(MKLDNN_FMTS) << "value format flow --- " << inVal_->getFormat()
+                        << " >>> " << outVal_->getFormat();
+    }
+  }
+
+  /**
+   * Print the mkldnn memory format flow of grad
+   */
+  virtual void printGradFormatFlow() {
+    if (inGrad_ && outGrad_) {
+      VLOG(MKLDNN_FMTS) << "grad format flow --- " << inGrad_->getFormat()
+                        << " <<< " << outGrad_->getFormat();
+    }
+  }
+
+protected:
+  /**
+   * Copy the image size and sequence info to the other devices.
+   * @note: can not directly use Layer::copyOutputToOtherDevice here, since it
+   *        only copies the base info and does not copy the data values.
+   */
+  void copyOutputInfoToOtherDevice() {
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      outputOtherDevice_[i].setFrameHeight(output_.getFrameHeight());
+      outputOtherDevice_[i].setFrameWidth(output_.getFrameWidth());
+      outputOtherDevice_[i].sequenceStartPositions =
+          output_.sequenceStartPositions;
+      outputOtherDevice_[i].subSequenceStartPositions =
+          output_.subSequenceStartPositions;
+      outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
+    }
+  }
+
+  /**
+   * Whether the input is only on the MKLDNN device.
+   * Otherwise, only a previous layer using the CPU device is supported.
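+   * For example, the value of a preceding MKLDNN layer can be used as-is,
+   * while the value of a CPU-device layer (e.g. a data layer) must first be
+   * wrapped into an MKLDNNMatrix, as done in MKLDNNFcLayer::resetFwd().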
+   */
+  bool inputIsOnlyMKLDNN(int index = 0) {
+    int prevDevice = getPrev(index)->getDeviceId();
+    if (prevDevice == MKLDNN_DEVICE) {
+      return true;
+    } else {
+      // do not support GPU yet
+      CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet";
+      return false;
+    }
+  }
+
+  /**
+   * Whether the output is only on the MKLDNN device.
+   * Otherwise, the other devices should only use the CPU device.
+   */
+  bool outputIsOnlyMKLDNN() {
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
+          << "Only support CPU as the other device yet";
+    }
+    return outputOtherDevice_.size() == 0;
+  }
+
+  /**
+   * Sync the input value data
+   */
+  void syncInputValue() {
+    if (inputIsOnlyMKLDNN()) {
+      return;
+    }
+    real* iData = getInputValue(0, CPU_DEVICE)->getData();
+    // update the input data, since it might have been changed
+    // if this layer is after a data layer
+    inVal_->updateData(iData);
+  }
+
+  /**
+   * Sync the output grad data
+   */
+  void syncOutputGrad() {
+    if (outputIsOnlyMKLDNN()) {
+      return;
+    }
+
+    // update diff
+    real* oDiff = getOutput(CPU_DEVICE).grad->getData();
+    outGrad_->updateData(oDiff);
+  }
+
+  /**
+   * Set deviceId of this layer.
+   */
+  void setDevice(int id) { deviceId_ = id; }
+
+  /**
+   * Set deviceId of the params used in this layer.
+   */
+  void setParamsDevice(int id, const ParameterMap& parameterMap) {
+    for (auto& inputConfig : config_.inputs()) {
+      if (inputConfig.has_input_parameter_name()) {
+        ParameterPtr parameter;
+        std::string name = inputConfig.input_parameter_name();
+        CHECK(mapGet(name, parameterMap, &parameter))
+            << "Cannot find input parameter " << name << " for layer "
+            << getName();
+        parameter->setDevice(id);
+      }
+    }
+    if (config_.has_bias_parameter_name()) {
+      ParameterPtr parameter;
+      std::string name = config_.bias_parameter_name();
+      CHECK(mapGet(name, parameterMap, &parameter))
+          << "Cannot find bias parameter " << name << " for layer "
+          << getName();
+      parameter->setDevice(id);
+    }
+  }
 };
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index 346c01ced648e47a5516c810e1e975a3a5ed2394..de9b8e63dfc4291f8f42ca8c57cb5eb6baed8d8e 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -34,6 +34,13 @@ add_unittest_without_exec(test_CRFLayerGrad
 add_test(NAME test_CRFLayerGrad
          COMMAND test_CRFLayerGrad)

+################ test_CrossEntropyOverBeam ####################
+add_unittest_without_exec(test_CrossEntropyOverBeam
+    test_CrossEntropyOverBeamGrad.cpp
+    LayerGradUtil.cpp)
+add_test(NAME test_CrossEntropyOverBeam
+         COMMAND test_CrossEntropyOverBeam)
+
 ################ test_SeqSliceLayerGrad ####################
 add_unittest_without_exec(test_SeqSliceLayerGrad
     test_SeqSliceLayerGrad.cpp
diff --git a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..538d18cdc3d262df0ddb031d9e6b38a3fea57606
--- /dev/null
+++ b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
@@ -0,0 +1,353 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <random>
+#include <sstream>
+
+#include <gtest/gtest.h>
+#include "ModelConfig.pb.h"
+#include "paddle/gserver/layers/DataLayer.h"
+#include "paddle/trainer/Trainer.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+const size_t MAX_SEQ_NUM = 23;
+const size_t MAX_SEQ_LEN = 50;
+const size_t MAX_BEAM_SIZE = 27;
+
+const size_t SEED = (size_t)(time(NULL));
+
+struct SingleBeamExpansion {
+  vector<int> seqStartPos;
+  vector<int> subSeqStartPos;
+  vector<real> candidateScores;
+
+  // TODO(caoying): store this into Argument.ids
+  vector<real> selectedIndices;
+
+  vector<int> groundTruth;
+  vector<int> inBeam;
+  vector<int> rowIdxInBeam;
+  vector<int> colIdxInBeam;
+
+  void resetGroundTruth(size_t n) {
+    groundTruth.clear();
+    groundTruth.resize(n, -1);
+
+    inBeam.clear();
+    inBeam.resize(n, 0);
+
+    rowIdxInBeam.clear();
+    rowIdxInBeam.resize(n, -1);
+
+    colIdxInBeam.clear();
+    colIdxInBeam.resize(n, -1);
+  }
+};
+
+inline float randFloat() {
+  return static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
+}
+
+void genRand(real* numbers, size_t n) {
+  default_random_engine generator;
+  uniform_real_distribution<double> distribution(0.0, 1.0);
+  for (size_t i = 0; i < n; ++i) numbers[i] = distribution(generator);
+}
+
+vector<real> randSampling(real range, int n) {
+  CHECK_GE(range, n);
+  vector<real> num(range);
+  iota(begin(num), end(num), 0.);
+  if (range == n) return num;
+
+  random_shuffle(begin(num), end(num));
+  num.resize(n);
+  sort(begin(num), end(num));
+  return num;
+}
+
+void genCandidateScores(bool hasSubseq,
+                        size_t beamSize,
+                        SingleBeamExpansion& prevBeam,
+                        SingleBeamExpansion& curBeam) {
+  vector<int>& seqStartPos = curBeam.seqStartPos;
+  seqStartPos.resize(1, 0);
+  vector<int>& subSeqStartPos = curBeam.subSeqStartPos;
+  subSeqStartPos.resize(1, 0);
+
+  srand(SEED);
+  if (prevBeam.selectedIndices.size()) {
+    if (prevBeam.subSeqStartPos.size() > 1) {
+      int seqIdx = 1;
+      // samples in the previous beam are nested sequences.
+      for (size_t i = 1; i < prevBeam.subSeqStartPos.size(); ++i) {
+        for (size_t j = 0; j < beamSize; ++j) {
+          if (prevBeam.selectedIndices[(i - 1) * beamSize + j] == -1.) break;
+          subSeqStartPos.push_back(1 + (rand() % MAX_SEQ_LEN) +
+                                   subSeqStartPos.back());
+        }
+        if (prevBeam.seqStartPos[seqIdx] == prevBeam.subSeqStartPos[i]) {
+          seqStartPos.push_back(subSeqStartPos.back());
+          seqIdx++;
+        }
+      }
+    } else {
+      for (size_t i = 0; i <= prevBeam.selectedIndices.size(); ++i) {
+        if (i && i % beamSize == 0) {
+          seqStartPos.push_back(subSeqStartPos.back());
+          if (i == prevBeam.selectedIndices.size()) break;
+        }
+        if (prevBeam.selectedIndices[i] == -1.) continue;
+        subSeqStartPos.push_back(subSeqStartPos.back() +
+                                 (1 + (rand() % MAX_SEQ_LEN)));
+      }
+    }
+  } else {
+    // the first beam expansion
+    int seqNum = 1 + (rand() % MAX_SEQ_NUM);
+    for (int i = 0; i < seqNum; ++i) {
+      if (hasSubseq) {
+        for (size_t j = 0; j < 1 + (rand() % MAX_SEQ_NUM); ++j)
+          subSeqStartPos.push_back(subSeqStartPos.back() +
+                                   (1 + (rand() % MAX_SEQ_LEN)));
+        seqStartPos.push_back(subSeqStartPos.back());
+      } else {
+        seqStartPos.push_back(seqStartPos.back() +
+                              (1 + (rand() % MAX_SEQ_LEN)));
+      }
+    }
+  }
+
+  size_t totalSeqNum = hasSubseq ?
subSeqStartPos.back() : seqStartPos.back();
+  curBeam.candidateScores.resize(totalSeqNum, 0.);
+  genRand(curBeam.candidateScores.data(), totalSeqNum);
+}
+
+void genSelectedIndices(size_t beamSize,
+                        vector<int>& seqStartPos,
+                        vector<real>& selectedIndices) {
+  size_t selectedIdsCount = beamSize * (seqStartPos.size() - 1);
+  selectedIndices.resize(selectedIdsCount, -1.);
+
+  for (size_t i = 0; i < seqStartPos.size() - 1; ++i) {
+    int seqLen = seqStartPos[i + 1] - seqStartPos[i];
+    int n = min(seqLen, static_cast<int>(beamSize));
+    vector<real> ids = randSampling(seqLen, n);
+    memcpy(selectedIndices.data() + i * beamSize,
+           ids.data(),
+           sizeof(real) * ids.size());
+  }
+}
+
+void genGroundTruth(vector<SingleBeamExpansion>& beamExpansions,
+                    size_t beamSize) {
+  SingleBeamExpansion& beam = beamExpansions[1];
+  size_t seqNum = beam.seqStartPos.size() - 1;
+  for (size_t i = 2; i < beamExpansions.size(); ++i)
+    CHECK_EQ(seqNum, beamExpansions[i].seqStartPos.size() - 1);
+
+  srand(SEED);
+
+  // initialize the first beam.
+  beam.resetGroundTruth(seqNum);
+  for (size_t i = 0; i < seqNum; ++i) {
+    if (randFloat() > 0.5) {
+      /*
+       * force the randomly generated label to fall in the beam with a chance
+       * of 0.5; otherwise, when the sequence length is relatively long and
+       * the beam size is relatively small, the gold sequence falls off the
+       * beam in the first search.
+       */
+      real* begPos = beam.selectedIndices.data() + i * beamSize;
+      beam.colIdxInBeam[i] =
+          rand() % count_if(begPos, begPos + beamSize, [](const real& val) {
+            return val != -1.;
+          });
+      beam.groundTruth[i] =
+          beam.selectedIndices[i * beamSize + beam.colIdxInBeam[i]];
+      beam.inBeam[i] = 1;
+    } else {
+      int label = rand() % (beam.seqStartPos[i + 1] - beam.seqStartPos[i]);
+      beam.groundTruth[i] = label;
+
+      real* begPos = beam.selectedIndices.data() + i * beamSize;
+      real* endPos = begPos + beamSize;
+      real* lblPos = find(begPos, endPos, real(label));
+      if (lblPos != endPos) {
+        beam.inBeam[i] = 1;
+        beam.colIdxInBeam[i] = lblPos - begPos;
+      }
+    }
+    beam.rowIdxInBeam[i] = i;
+  }
+
+  // iterate over each beam expansion
+  for (size_t i = 2; i < beamExpansions.size(); ++i) {
+    SingleBeamExpansion& curBeam = beamExpansions[i];
+    SingleBeamExpansion& prevBeam = beamExpansions[i - 1];
+    curBeam.resetGroundTruth(seqNum);
+
+    // iterate over each sequence
+    for (size_t j = 0; j < seqNum; ++j) {
+      if (!prevBeam.inBeam[j]) continue;
+
+      // the gold sequence fell in the beam in the previous search.
+      real* begPos = prevBeam.selectedIndices.data();
+      int offset =
+          prevBeam.rowIdxInBeam[j] * beamSize + prevBeam.colIdxInBeam[j];
+      curBeam.rowIdxInBeam[j] = count_if(
+          begPos, begPos + offset, [](const real& val) { return val != -1.; });
+
+      if (randFloat() > 0.5) {
+        // force the randomly generated label to fall in the beam with a
+        // chance of 0.5.
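+        // e.g. with beamSize = 4 and a selected row {7, 2, 9, -1}, count_if
+        // below yields 3, so n is drawn from {0, 1, 2} and the ground truth
+        // becomes one of the surviving candidates 7, 2 or 9.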
+
+        real* start =
+            curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
+        int n = rand() % count_if(start, start + beamSize, [](const real& val) {
+                  return val != -1.;
+                });
+        curBeam.colIdxInBeam[j] = n;
+        curBeam.groundTruth[j] = *(start + n);
+        curBeam.inBeam[j] = 1;
+      } else {
+        CHECK_LE(curBeam.rowIdxInBeam[j] + 1,
+                 curBeam.subSeqStartPos.size() - 1);
+        int start = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j]];
+        int end = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j] + 1];
+        CHECK_GT(size_t(end), size_t(start));
+        int label = rand() % (end - start);
+
+        curBeam.groundTruth[j] = label;
+        real* findBeg =
+            curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
+        real* lblPos =
+            find(findBeg, findBeg + beamSize, static_cast<real>(label));
+        if (lblPos != (findBeg + beamSize)) {
+          curBeam.inBeam[j] = 1;
+          curBeam.colIdxInBeam[j] = lblPos - findBeg;
+        }
+      }
+    }
+  }
+}
+
+void genOneBeam(size_t beamSize,
+                bool hasSubseq,
+                SingleBeamExpansion& prevBeam,
+                SingleBeamExpansion& curBeam) {
+  genCandidateScores(hasSubseq, beamSize, prevBeam, curBeam);
+  genSelectedIndices(beamSize,
+                     hasSubseq ? curBeam.subSeqStartPos : curBeam.seqStartPos,
+                     curBeam.selectedIndices);
+}
+
+void genRandomBeamExpansion(size_t expansionCount,
+                            size_t beamSize,
+                            vector<SingleBeamExpansion>& beamExpansions) {
+  beamExpansions.clear();
+  beamExpansions.resize(expansionCount + 1);
+
+  // beamExpansions[0] is reserved.
+  for (size_t i = 1; i <= expansionCount; ++i)
+    genOneBeam(beamSize, bool(i - 1), beamExpansions[i - 1], beamExpansions[i]);
+  genGroundTruth(beamExpansions, beamSize);
+}
+
+void testCrossEntropyOverBeam(bool useGpu,
+                              size_t beamSize,
+                              vector<SingleBeamExpansion>& beams) {
+  TestConfig config;
+  config.layerConfig.set_type("cross_entropy_over_beam");
+
+  size_t seqNum = 0;
+  for (size_t i = 1; i < beams.size(); ++i) {
+    const SingleBeamExpansion& beam = beams[i];
+    // create scores for all the candidates
+    MatrixPtr candidateScorePtr =
+        Matrix::create(beam.candidateScores.size(), 1, false, false);
+    candidateScorePtr->copyFrom(beam.candidateScores.data(),
+                                beam.candidateScores.size());
+
+    ostringstream paramName;
+    paramName << "candidate_scores_" << i;
+
+    if (beam.subSeqStartPos.size() > 1) {
+      seqNum = beam.subSeqStartPos.size() - 1;
+      config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                                  paramName.str(),
+                                  candidateScorePtr,
+                                  beam.seqStartPos,
+                                  beam.subSeqStartPos});
+    } else {
+      seqNum = beam.seqStartPos.size() - 1;
+      config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                                  paramName.str(),
+                                  candidateScorePtr,
+                                  beam.seqStartPos});
+    }
+    config.layerConfig.add_inputs();
+
+    // create indices for the selected candidates
+    MatrixPtr selectedCandidates =
+        Matrix::create(seqNum, beamSize, false, false);
+    selectedCandidates->copyFrom(beam.selectedIndices.data(),
+                                 beam.selectedIndices.size());
+    paramName.clear();
+    paramName << "selected_candidates_" << i;
+    config.inputDefs.push_back(
+        {INPUT_SELF_DEFINE_DATA, paramName.str(), selectedCandidates});
+    config.layerConfig.add_inputs();
+
+    // create the ground truth
+    paramName.clear();
+    paramName << "label_" << i;
+    config.inputDefs.push_back(
+        {INPUT_SELF_DEFINE_DATA, paramName.str(), beam.groundTruth});
+    config.layerConfig.add_inputs();
+  }
+
+  testLayerGrad(
+      config, "cross_entropy_over_beam", seqNum, false, useGpu, false);
+}
+
+TEST(Layer, CrossEntropyOverBeam) {
+  LOG(INFO) << "SEED = " << SEED;
+  const size_t beamSize = 1 + rand() % MAX_BEAM_SIZE;
+  LOG(INFO) << "beamSize = " << beamSize;
+
+  // TODO(caoying): test with random beam
expansions. + const size_t expansionCount = 3; + vector beams; + genRandomBeamExpansion(expansionCount, beamSize, beams); + + for (bool useGpu : {false, true}) + testCrossEntropyOverBeam(useGpu, beamSize, beams); +} + +int main(int argc, char** argv) { + initMain(argc, argv); + hl_start(); + hl_init(FLAGS_gpu_id); + FLAGS_thread_local_rand_use_global_seed = true; + srand(SEED); + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h index 666a8b8368e3e2ebc522902c176d7491d2920d2a..94ef561f066a127496e2849a419835e175c526d7 100644 --- a/paddle/math/Allocator.h +++ b/paddle/math/Allocator.h @@ -48,7 +48,13 @@ public: */ virtual void* alloc(size_t size) { void* ptr; +#ifdef PADDLE_USE_MKLDNN + // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp + // memory alignment + CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0); +#else CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0); +#endif CHECK(ptr) << "Fail to allocate CPU memory: size=" << size; return ptr; } diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt index bf28092e82b778dc904c5a2e271f76261cf5f6b6..68b5296228cd733dc3cb7ca0f762e0a69187dbff 100644 --- a/paddle/math/CMakeLists.txt +++ b/paddle/math/CMakeLists.txt @@ -14,6 +14,17 @@ # file(GLOB MATH_HEADERS . *.h) file(GLOB MATH_SOURCES . *.cpp) + +if(NOT WITH_MKLDNN) + set(DNN_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.h") + set(DNN_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.cpp") + list(REMOVE_ITEM MATH_HEADERS "${DNN_HEADER}") + list(REMOVE_ITEM MATH_SOURCES "${DNN_SOURCE}") + message(STATUS "Skip compiling with MKLDNNMatrix") +else() + message(STATUS "Compile with MKLDNNMatrix") +endif() + set(MATH_SOURCES "${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu" "${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu" diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0a355e2644cce572ce90ecf5c9d2a5b7b395bc61 --- /dev/null +++ b/paddle/math/MKLDNNMatrix.cpp @@ -0,0 +1,144 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "MKLDNNMatrix.h"
+
+using namespace mkldnn;  // NOLINT
+
+namespace paddle {
+
+MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
+  memory::desc md = pd.desc();
+  size_t ndims = md.data.ndims;
+  int* dims = md.data.dims;
+  CHECK(ndims > 0) << "Input dims should not be empty";
+  size_t cnts = 1;
+  for (size_t i = 0; i < ndims; ++i) {
+    cnts *= dims[i];
+  }
+
+  if (m == nullptr) {
+    size_t height = dims[0];
+    size_t width = cnts / dims[0];
+    m = Matrix::create(height, width, false, false);
+  }
+
+  CHECK(m) << "Matrix should not be empty";
+  CpuMatrixPtr cpuMatrix = std::dynamic_pointer_cast<CpuMatrix>(m);
+  CHECK(cpuMatrix) << "Only support create from CPU matrix yet";
+
+  CHECK_EQ(cnts, m->getElementCnt()) << "Count size does not match";
+  return std::make_shared<MKLDNNMatrix>(
+      m->getData(), m->getHeight(), m->getWidth(), pd);
+}
+
+MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
+                                     memory::dims dims,
+                                     memory::format fmt,
+                                     engine& eg,
+                                     mkldnn::memory::data_type dtype) {
+  return create(m, memory::primitive_desc(memory::desc(dims, dtype, fmt), eg));
+}
+
+void MKLDNNMatrix::reorderDataFrom(const MKLDNNMatrixPtr& m,
+                                   memory::format srcFmt,
+                                   memory::dims targetDim) {
+  memory::format dstFmt = getFormat();
+  if (srcFmt == dstFmt) {
+    return;
+  }
+  CHECK_EQ(getElementCnt(), m->getElementCnt()) << "sizes should be equal";
+  reorderOnce(getData(), m->getData(), srcFmt, dstFmt, targetDim);
+}
+
+void MKLDNNMatrix::reorderDataTo(const MKLDNNMatrixPtr& m,
+                                 memory::format dstFmt,
+                                 memory::dims targetDim) {
+  memory::format srcFmt = getFormat();
+  if (srcFmt == dstFmt) {
+    return;
+  }
+  CHECK_EQ(getElementCnt(), m->getElementCnt()) << "sizes should be equal";
+  reorderOnce(getData(), m->getData(), srcFmt, dstFmt, targetDim);
+}
+
+void MKLDNNMatrix::reorderOnce(void* srcData,
+                               void* dstData,
+                               memory::format srcFmt,
+                               memory::format dstFmt,
+                               memory::dims dm) {
+  CHECK(srcData);
+  CHECK(dstData);
+  MatrixPtr tmpSrc;
+  if (dstData == srcData) {
+    // inplace data
+    size_t sz = 1;
+    for (size_t i = 0; i < dm.size(); ++i) {
+      sz *= dm[i];
+    }
+    tmpSrc = Matrix::create(sz, 1, false, false);
+    tmpSrc->copyFrom((real*)srcData, sz);
+    srcData = tmpSrc->getData();
+  }
+
+  auto dtype = this->getDtype();
+  auto srcMD = memory::desc(dm, dtype, srcFmt);
+  auto dstMD = memory::desc(dm, dtype, dstFmt);
+
+  auto eg = this->getEngine();
+  auto src = memory(memory::primitive_desc(srcMD, eg), srcData);
+  auto dst = memory(memory::primitive_desc(dstMD, eg), dstData);
+
+  auto r = reorder(src, dst);
+  stream(stream::kind::eager).submit({r}).wait();
+}
+
+void MKLDNNMatrix::downSpatial() {
+  int fmt = getFormat();
+  if (!(fmt == memory::format::nchw || fmt == memory::format::oihw)) {
+    // only support nchw and oihw yet, later can support more like nhwc, ihwo
+    return;
+  }
+
+  // TODO(TJ): change H(height) and W(width) if support nhwc or more
+  const int H = 2, W = 3;
+  memory::dims srcDims = getDims();
+  if (srcDims[H] != 1 || srcDims[W] != 1) {
+    // can not down spatial
+    return;
+  }
+
+  memory::dims dstDims = memory::dims{srcDims[0], srcDims[1]};
+  memory::format dstFmt;
+  switch (fmt) {
+    case memory::format::nchw:
+      dstFmt = memory::format::nc;
+      break;
+    case memory::format::oihw:
+      dstFmt = memory::format::oi;
+      break;
+    default:
+      LOG(FATAL) << "unsupported format";
+  }
+  memory::desc md = memory::desc(dstDims, getDtype(), dstFmt);
+  memory::primitive_desc pd = memory::primitive_desc(md, getEngine());
+  mkldnn_primitive_t result;
+  mkldnn::error::wrap_c_api(
+      mkldnn_primitive_create(&result,
pd.get(), nullptr, nullptr),
+      "could not create a memory primitive");
+  reset(result);
+  set_data_handle(getData());
+}
+
+}  // namespace paddle
diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..e50f698b495713e6f15ab7a12a7ee7487662040f
--- /dev/null
+++ b/paddle/math/MKLDNNMatrix.h
@@ -0,0 +1,148 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "Matrix.h"
+#include "mkldnn.hpp"
+#include "paddle/parameter/Parameter.h"
+
+namespace paddle {
+
+class MKLDNNMatrix;
+typedef std::shared_ptr<MKLDNNMatrix> MKLDNNMatrixPtr;
+
+/**
+ * @brief MKLDNN Matrix.
+ *
+ */
+class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory {
+public:
+  MKLDNNMatrix(real* data,
+               size_t height,
+               size_t width,
+               mkldnn::memory::primitive_desc pd)
+      : CpuMatrix(data, height, width, false), mkldnn::memory(pd, data) {}
+
+  ~MKLDNNMatrix() {}
+
+  /**
+   * Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc
+   */
+  static MKLDNNMatrixPtr create(MatrixPtr m, mkldnn::memory::primitive_desc pd);
+
+  /**
+   * Create MKLDNNMatrix from a MatrixPtr and memory details info
+   */
+  static MKLDNNMatrixPtr create(
+      MatrixPtr m,
+      mkldnn::memory::dims dims,
+      mkldnn::memory::format fmt,
+      mkldnn::engine& eg,
+      mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32);
+
+public:
+  /**
+   * Reorder this MKLDNNMatrix from another format.
+   * Supports inplace reorder.
+   * @note: this function would only reorder the data layout;
+   *        it will NOT change the original dim or format info
+   */
+  void reorderDataFrom(const MKLDNNMatrixPtr& m,
+                       memory::format srcFmt,
+                       memory::dims targetDim);
+
+  /**
+   * Reorder this MKLDNNMatrix to another format.
+   * Supports inplace reorder.
+   * @note: this function would only reorder the data layout;
+   *        it will NOT change the dst dim or format info
+   */
+  void reorderDataTo(const MKLDNNMatrixPtr& m,
+                     memory::format dstFmt,
+                     memory::dims targetDim);
+
+  /**
+   * Dimensionality reduction.
+   * Change format "nchw --> nc" or "oihw --> oi" if the h and w are both 1
+   */
+  void downSpatial();
+
+  /**
+   * Update the memory data handle.
+   * Caution: This will not check the buffer size of the data;
+   * it should be covered by the user.
+   */
+  void updateData(void* data) { set_data_handle(data); }
+
+  /**
+   * Get primitive descriptor.
+   */
+  mkldnn::memory::primitive_desc getPrimitiveDesc() {
+    return this->get_primitive_desc();
+  }
+
+  /**
+   * Get memory descriptor.
+   */
+  mkldnn::memory::desc getMemoryDesc() { return getPrimitiveDesc().desc(); }
+
+  /**
+   * Get dimensions.
+   */
+  mkldnn::memory::dims getDims() {
+    mkldnn::memory::desc md = getMemoryDesc();
+    const int* src = md.data.dims;
+    int ndims = md.data.ndims;
+    mkldnn::memory::dims dst;
+    dst.resize(ndims);
+    for (int i = 0; i < ndims; ++i) {
+      dst[i] = src[i];
+    }
+    return dst;
+  }
+
+  /**
+   * Get format.
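+   * e.g. memory::format::nchw for a 4-D value, or memory::format::nc after
+   * downSpatial() has squeezed away the trailing 1x1 spatial dimensions.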
+ */ + mkldnn::memory::format getFormat() { + return (mkldnn::memory::format)(getMemoryDesc().data.format); + } + + /** + * Get memory data type. + */ + mkldnn::memory::data_type getDtype() { + return (mkldnn::memory::data_type)(getMemoryDesc().data.data_type); + } + + /** + * Get engine. + */ + mkldnn::engine getEngine() { return getPrimitiveDesc().get_engine(); } + +protected: + /** + * Do reorder once. + * Can support inplace. + */ + void reorderOnce(void* srcData, + void* dstData, + memory::format srcFmt, + memory::format dstFmt, + memory::dims dm); +}; + +} // namespace paddle diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp index 94c1bda5bb376c9851cf1d04c7297bc9709e3f24..8dbef0b22e7b2f14c62586f86e686356b6e9c68e 100644 --- a/paddle/parameter/Argument.cpp +++ b/paddle/parameter/Argument.cpp @@ -679,6 +679,7 @@ void Argument::reorganizeSeqInfo( const ICpuGpuVectorPtr subSeqStartPos, std::vector>& reorganizedSeqInfo) { CHECK(seqStartPos); + reorganizedSeqInfo.clear(); int seqNum = seqStartPos->getSize() - 1; int* seqStarts = seqStartPos->getMutableData(false); diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h index 321f4275d8e68d7d3fbbc19acf0afacf689474e5..04f12efaac15a21ef54ae71074b6d474e2b66c04 100644 --- a/paddle/parameter/Parameter.h +++ b/paddle/parameter/Parameter.h @@ -281,7 +281,11 @@ public: /** * @brief Set the format in header. */ - void setHeaderFormat(int32_t fmt) { headerFormat_ = fmt; } + void setHeaderFormat(int32_t fmt) { + CHECK(isHeaderFormatSupported(fmt)) << "Unsupported format version: " + << fmt; + headerFormat_ = fmt; + } /** * @brief Parameter Update Hook. diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp index 8616fd2d5aef666f16533fe062f3f40a7a2b202d..4203f2616456244df616ee2109436ab7caef9741 100644 --- a/paddle/pserver/LightNetwork.cpp +++ b/paddle/pserver/LightNetwork.cpp @@ -22,7 +22,6 @@ limitations under the License. 
*/ #include #include -#include #include #include diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index a40e2bc71d63265fcf40f403cb37854653696519..bc6e65f65eb5b6d95e384230807ba75c865f2234 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -1688,6 +1688,21 @@ class MultiClassCrossEntropySelfNormCostLayer(LayerBase): self.config.softmax_selfnorm_alpha = softmax_selfnorm_alpha +@config_layer('cross_entropy_over_beam') +class CrossEntropyOverBeamLayer(LayerBase): + def __init__(self, name, inputs, **xargs): + config_assert(len(inputs) % 3 == 0, "Error input number.") + super(CrossEntropyOverBeamLayer, self).__init__( + name, 'cross_entropy_over_beam', 0, inputs, **xargs) + input_num = len(inputs) / 3 + for i in range(input_num): + input_layer = self.get_input_layer(i * 3) + config_assert(input_layer.size == 1, ( + "Inputs for this layer are made up of " + "several triples, in which the first one is scores over " + "all candidate paths, whose size should be equal to 1.")) + + @config_layer('fc') class FCLayer(LayerBase): layer_type = 'fc' @@ -2386,6 +2401,7 @@ def define_cost(class_name, cost_type): define_cost('MultiClassCrossEntropy', 'multi-class-cross-entropy') +define_cost('CrossEntropyOverBeamCostLayer', 'cross_entropy_over_beam') define_cost('RankingCost', 'rank-cost') define_cost('AucValidation', 'auc-validation') define_cost('PnpairValidation', 'pnpair-validation') diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index c92764e1f9f9c6d864483adec020f15df1e97e8b..0506c70087f0fce920bee5ef73882707b0c7ff22 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import functools import collections import inspect @@ -106,6 +105,8 @@ __all__ = [ 'nce_layer', 'cross_entropy_with_selfnorm', 'cross_entropy', + 'BeamInput', + 'cross_entropy_over_beam', 'multi_binary_label_cross_entropy', 'sum_cost', 'rank_cost', @@ -227,6 +228,7 @@ class LayerType(object): HUBER_CLASSIFICATION = 'huber_classification' CROSS_ENTROPY = 'multi-class-cross-entropy' CROSS_ENTROPY_WITH_SELFNORM = 'multi_class_cross_entropy_with_selfnorm' + CROSS_ENTROPY_OVER_BEAM = 'cross_entropy_over_beam' SOFT_BIN_CLASS_CROSS_ENTROPY = 'soft_binary_class_cross_entropy' MULTI_BIN_LABEL_CROSS_ENTROPY = 'multi_binary_label_cross_entropy' SUM_COST = 'sum_cost' @@ -4217,8 +4219,12 @@ def __cost_input__(input, label, weight=None): """ inputs and parents for cost layers. """ - ipts = [Input(input.name), Input(label.name)] - parents = [input, label] + if isinstance(input, LayerOutput): + input = [input] + if isinstance(label, LayerOutput): + label = [label] + ipts = [Input(ipt.name) for ipt in (input + label)] + parents = [ipt for ipt in (input + label)] if weight is not None: assert weight.size == 1 ipts.append(Input(weight.name)) @@ -5205,17 +5211,6 @@ def warp_ctc_layer(input, building process, PaddlePaddle will clone the source codes, build and install it to :code:`third_party/install/warpctc` directory. - To use warp_ctc layer, you need to specify the path of :code:`libwarpctc.so`, - using following methods: - - 1. 
Set it in :code:`paddle.init` (python api) or :code:`paddle_init` (c api), - such as :code:`paddle.init(use_gpu=True, - warpctc_dir=your_paddle_source_dir/third_party/install/warpctc/lib)`. - - 2. Set environment variable LD_LIBRARY_PATH on Linux or DYLD_LIBRARY_PATH - on Mac OS. For instance, :code:`export - LD_LIBRARY_PATH=your_paddle_source_dir/third_party/install/warpctc/lib:$LD_LIBRARY_PATH`. - More details of CTC can be found by referring to `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with Recurrent Neural Networks