Merge branch 'develop' of upstream into refine_rank_loss_op

9907350d · Yibing Liu · da62d6cc · 990818f7 · 9907350d · 9907350d
13 changed file
--- a/doc/design/float16.md
+++ b/doc/design/float16.md
@@ -28,6 +28,51 @@ The goal of float16 is to serve as a key for the executor to find and run the co
 - [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors.
 - [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU).

+### CUDA version issue
+There are currently three versions of CUDA that supports `__half` data type, namely, CUDA 7.5, 8.0, and 9.0. 
+CUDA 7.5 and 8.0 define `__half` as a simple struct that has a `uint16_t` data (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)) as follows:
+```
+typedef struct __align__(2) {
+   unsigned short x;
+} __half;
+
+typedef __half half;
+```
+This struct does not define any overloaded arithmetic operators. So you have to directly use `__hadd` instead of `+` to correctly add two half types:
+```
+__global__ void Add() {
+  half a, b, c;
+  c = __hadd(a, b); // correct
+  c = a + b; // compiler error: no operator "+" matches these operands
+}
+```
+CUDA 9.0 provides a major update to the half data type. The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp).
+
+Essentially, CUDA 9.0 renames the original `__half` type in 7.5 and 8.0 as `__half_raw`, and defines a new `__half` class type that has constructors, conversion operators, and also provides overloaded arithmetic operators such as follows:
+```
+typedef struct __CUDA_ALIGN__(2) {
+    unsigned short x;
+} __half_raw;
+
+
+struct __CUDA_ALIGN__(2) __half {
+protected:
+    unsigned short __x;
+public:
+    // constructors and conversion operators from/to 
+    // __half_raw and other built-in data types
+}
+
+typedef __half half;
+
+__device__ __forceinline__ 
+__half operator+(const __half &lh, const __half &rh) { 
+    return __hadd(lh, rh); 
+}
+
+// Other overloaded operators
+``` 
+This new design makes `c = a + b` work correctly for CUDA half data type. 

 ## Implementation
 The float16 class holds a 16-bit `uint16_t` data internally.

--- a/doc/howto/optimization/cpu_profiling.md
+++ b/doc/howto/optimization/cpu_profiling.md
@@ -71,7 +71,7 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py

 ```

-可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python与C++混合代码的性能分析`来进行调优。而`sync_with_cpp`函数的总共耗时很长，每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息，了解其调用关系。
+可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长，每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息，了解其调用关系。

 ```text
 Called By:
@@ -121,7 +121,7 @@ python -m yep -v main.py

 1. 编译时指定`-g`生成调试信息。使用cmake的话，可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。
 2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。
-3. 运行性能分析的时候，先从单线程开始，再开启多线程，进而多机。毕竟如果单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。
+3. 运行性能分析的时候，先从单线程开始，再开启多线程，进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。

 ### 查看性能分析文件


--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -62,11 +62,11 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
 endif()

 if(NOT MOBILE_INFERENCE)
-################## test_Evaluator #######################
+    ################## test_Evaluator #######################
    add_unittest(test_Evaluator
        test_Evaluator.cpp)
      
-############### test_RecurrentGradientMachine ###############
+    ############### test_RecurrentGradientMachine ###############
    # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine
    # I will fix it.
    add_unittest_without_exec(test_RecurrentGradientMachine
@@ -77,7 +77,7 @@ if(NOT MOBILE_INFERENCE)
                ${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine
        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
      
-############### test_NetworkCompare ###############
+    ############### test_NetworkCompare ###############
    add_unittest_without_exec(test_NetworkCompare
        test_NetworkCompare.cpp)
    if(WITH_GPU)
@@ -89,34 +89,33 @@ if(NOT MOBILE_INFERENCE)
            COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false
            WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
    endif()
-endif()
-
-
-add_unittest_without_exec(test_PyDataProvider2
-        test_PyDataProvider2.cpp)
-
-add_test(NAME test_PyDataProvider2
-   COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
-        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
-)

-################# test_CompareSparse ##################
-add_unittest_without_exec(test_CompareSparse
+    ################# test_CompareSparse ##################
+    add_unittest_without_exec(test_CompareSparse
        test_CompareSparse.cpp)
-if(NOT ON_TRAVIS)
+    if(NOT ON_TRAVIS)
      add_test(NAME test_CompareSparse
        COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
              ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
                  ./.set_port.sh -p port -n 6
                      ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
-endif()
+    endif()

-################ test_CompareTwoNets ######################
-add_unittest_without_exec(test_CompareTwoNets
+    ################ test_CompareTwoNets ######################
+    add_unittest_without_exec(test_CompareTwoNets
        test_CompareTwoNets.cpp)
-add_test(NAME test_CompareTwoNets
+    add_test(NAME test_CompareTwoNets
      COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
            ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
            ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+endif()
+
+################ test_PyDataProvider2 ######################
+add_unittest_without_exec(test_PyDataProvider2
+        test_PyDataProvider2.cpp)
+add_test(NAME test_PyDataProvider2
+   COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
+)
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -191,6 +191,7 @@ set(DEPS_OPS
    sum_op
    pool_op
    maxout_op
+    unpool_op
    pool_with_index_op
    conv_op
    conv_transpose_op
@@ -235,6 +236,7 @@ op_library(adagrad_op DEPS selected_rows_functor)
 op_library(conv_op DEPS vol2col)
 op_library(pool_op DEPS pooling)
 op_library(maxout_op DEPS maxouting)
+op_library(unpool_op DEPS unpooling)
 op_library(pool_with_index_op DEPS pooling)
 op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
 op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op)

--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -13,8 +13,9 @@ if(WITH_GPU)
    nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
    nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
    nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
-    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
    nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context)
+    nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context)
+    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
 else()
    cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto)
    cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
@@ -26,8 +27,9 @@ else()
    cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
    cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
    cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
-    cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
    cc_library(maxouting SRCS maxouting.cc DEPS device_context)
+    cc_library(unpooling SRCS unpooling.cc DEPS device_context)
+    cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
 endif()

 cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)

--- a/paddle/operators/math/unpooling.cc
+++ b/paddle/operators/math/unpooling.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/unpooling.h"
+namespace paddle {
+namespace operators {
+namespace math {
+template <typename T>
+class Unpool2dMaxFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices, framework::Tensor* output) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
+    int input_feasize = input_height * input_width;
+    int output_feasize = output_height * output_width;
+    const T* input_data = input.data<T>();
+    const int* indices_data = indices.data<int>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    for (int b = 0; b < batch_size; ++b) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int i = 0; i < input_feasize; ++i) {
+          int index = indices_data[i];
+          PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!");
+          output_data[index] = input_data[i];
+        }
+        input_data += input_feasize;
+        indices_data += input_feasize;
+        output_data += output_feasize;
+      }
+    }
+  }
+};
+template <class T>
+class Unpool2dMaxGradFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    int input_feasize = input_height * input_width;
+    int output_feasize = output_height * output_width;
+    const int* indices_data = indices.data<int>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+
+    for (int b = 0; b < batch_size; ++b) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int i = 0; i < input_feasize; ++i) {
+          int index = indices_data[i];
+          PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!");
+          input_grad_data[i] = output_grad_data[index];
+        }
+        input_grad_data += input_feasize;
+        indices_data += input_feasize;
+        output_grad_data += output_feasize;
+      }
+    }
+  }
+};
+template class Unpool2dMaxGradFunctor<platform::CPUPlace, float>;
+template class Unpool2dMaxGradFunctor<platform::CPUPlace, double>;
+template class Unpool2dMaxFunctor<platform::CPUPlace, float>;
+template class Unpool2dMaxFunctor<platform::CPUPlace, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/unpooling.cu
+++ b/paddle/operators/math/unpooling.cu
+/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/unpooling.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+template <typename T>
+__global__ void KernelUnpool2dMax(const int nthreads, const T* input_data,
+                                  const int* indices_data,
+                                  const int input_height, const int input_width,
+                                  const int channels, T* output_data,
+                                  const int output_height,
+                                  const int output_width) {
+  int in_n_stride = input_height * input_width * channels;
+  int in_c_stride = input_height * input_width;
+  int out_n_stride = output_height * output_width * channels;
+  int out_c_stride = output_height * output_width;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (int i = index; i < nthreads; i += offset) {
+    int bidx = i / in_n_stride;
+    int boffset = i % in_n_stride;
+    int cidx = boffset / in_c_stride;
+    int out_offset = bidx * out_n_stride + cidx * out_c_stride;
+    int out_index = indices_data[i];
+    PADDLE_ASSERT(out_index < out_c_stride);
+    output_data[out_offset + out_index] = input_data[i];
+  }
+}
+template <typename T>
+__global__ void KernelUnpool2dMaxGrad(
+    const int nthreads, const T* input_data, const int* indices_data,
+    const int input_height, const int input_width, const int channels,
+    const T* output_data, const T* output_grad, const int output_height,
+    const int output_width, T* input_grad) {
+  int in_n_stride = input_height * input_width * channels;
+  int in_c_stride = input_height * input_width;
+  int out_n_stride = output_height * output_width * channels;
+  int out_c_stride = output_height * output_width;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (int i = index; i < nthreads; i += offset) {
+    int bidx = i / in_n_stride;
+    int boffset = i % in_n_stride;
+    int cidx = boffset / in_c_stride;
+    int out_offset = bidx * out_n_stride + cidx * out_c_stride;
+    int out_index = indices_data[i];
+    PADDLE_ASSERT(out_index < out_c_stride);
+    input_grad[i] = output_grad[out_offset + out_index];
+  }
+}
+/*
+ * All tensors are in NCHW format.
+ */
+template <typename T>
+class Unpool2dMaxFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices, framework::Tensor* output) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
+    const T* input_data = input.data<T>();
+    const int* indices_data = indices.data<int>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    int threads = 1024;
+    int grid = (input.numel() + threads - 1) / threads;
+    KernelUnpool2dMax<
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(input.numel(), input_data, indices_data,
+                              input_height, input_width, output_channels,
+                              output_data, output_height, output_width);
+  }
+};
+/*
+ * All tensors are in NCHW format.
+ */
+template <typename T>
+class Unpool2dMaxGradFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const T* input_data = input.data<T>();
+    const int* indices_data = indices.data<int>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    int threads = 1024;
+    int grid = (input.numel() + threads - 1) / threads;
+    KernelUnpool2dMaxGrad<
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(input.numel(), input_data, indices_data,
+                              input_height, input_width, output_channels,
+                              output_data, output_grad_data, output_height,
+                              output_width, input_grad_data);
+  }
+};
+template class Unpool2dMaxGradFunctor<platform::GPUPlace, float>;
+template class Unpool2dMaxGradFunctor<platform::GPUPlace, double>;
+template class Unpool2dMaxFunctor<platform::GPUPlace, float>;
+template class Unpool2dMaxFunctor<platform::GPUPlace, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/unpooling.h
+++ b/paddle/operators/math/unpooling.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+template <typename Place, typename T>
+class Unpool2dMaxFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices, framework::Tensor* output);
+};
+template <typename Place, class T>
+class Unpool2dMaxGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,
+                  framework::Tensor* input_grad);
+};
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/smooth_l1_loss_op.cc
+++ b/paddle/operators/smooth_l1_loss_op.cc
@@ -22,22 +22,20 @@ class SmoothL1LossOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");

    auto x_dims = ctx->GetInputDim("X");
    auto y_dims = ctx->GetInputDim("Y");
-    PADDLE_ENFORCE_EQ(x_dims, y_dims, "The shape of X and Y must be the same.");
+    PADDLE_ENFORCE_EQ(x_dims, y_dims);
    PADDLE_ENFORCE_GE(x_dims.size(), 2,
-                      "The tensor rank of X must be at least 2.");
+                      "The tensor rank of Input(X) should not be less than 2.");
    if (ctx->HasInput("InsideWeight")) {
      PADDLE_ENFORCE(ctx->HasInput("OutsideWeight"),
                     "If weights are provided, must specify both "
                     "inside and outside weights.");
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims,
-                        "The shape of InsideWeight must be same as X.");
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims,
-                        "The shape of OutsideWeight must be same as X.");
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims);
    }

    ctx->SetOutputDim("Diff", x_dims);
@@ -53,25 +51,29 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
                      framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
-             "The input tensor of smooth l1 loss op."
-             "The rank should be greater or equal to 2 with shape "
-             "[batch_size, value_dim1, value_dim2, ..., value_dimN]");
+             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
+             "The input value of smooth l1 loss op with shape "
+             "[batch_size, dim1, ..., dimN].");
    AddInput("Y",
-             "The target tensor of smooth l1 loss op "
-             "with the same shape as X.");
+             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
+             "The target value of smooth l1 loss op with same shape as X.");
    AddInput("InsideWeight",
-             "Optional input tensor of smooth l1 loss op with the same shape "
-             "as X. If provided, the result of (X - Y) will be multiplied "
+             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
+             "This input is optional and should have same shape with X. "
+             "If provided, the result of (X - Y) will be multiplied "
             "by this tensor element by element.")
        .AsDispensable();
    AddInput("OutsideWeight",
-             "Optinal input of smooth l1 loss op with the same shape as X."
-             "If provided, the output smooth l1 loss will be multiplied by "
-             "this tensor element by element.")
+             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
+             "This input is optional and should have same shape with X. "
+             "If provided, the out smooth l1 loss will be multiplied by this "
+             "tensor element by element.")
        .AsDispensable();
-    AddOutput("Diff", "Intermediate variable to cache InsideWeight*(X-Y).")
+    AddOutput("Diff", "Intermediate variable to cache InsideWeight * (X - Y).")
        .AsIntermediate();
-    AddOutput("Out", "Smooth l1 loss.");
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>) A tensor with rank be 2. "
+              "The output smooth l1 loss with shape [batch_size, 1].");
    AddAttr<AttrType>("sigma",
                      "Hyper parameter of smooth l1 loss op."
                      "A float scalar with default value 3.0.")
@@ -79,15 +81,23 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
    AddComment(R"DOC(
 Smooth L1 Loss Operator.

-This operator computes the smooth l1 loss for input and target.
-The operator takes the first dimension of input as the batch size.
+This operator computes the smooth l1 loss for X and Y.
+The operator takes the first dimension of X and Y as batch size.
 For each instance, it computes the smooth l1 loss element by element first
-and then sums all the losses. So the resulting output shape
-is [batch_size, 1].
+and then sums all the losses. So the shape of Out is [batch_size, 1].

 The equation is:
-loss = $$0.5 * (\sigma * (x-y))^2$$   if $$|x - y| < 1 /({\sigma}^2)$$
-       $$\frac{|x - y| - 0.5}{{\sigma}^2}$$ otherwise
+$$
+Out_{\sigma}(X, Y)_i = \begin{cases}
+0.5 * (\sigma * (X_i - Y_i)) ^ 2
+\quad |X_i - Y_i| \lt \frac{1} {{\sigma} ^ 2} \\
+\frac{|X_i - Y_i| - 0.5}{{\sigma}^2},
+\quad otherwise
+\end{cases}
+$$
+
+In the above equation, $Out_{\sigma}(X, Y)_i$, $X_i$ and $Y_i$ represent the ith
+element of Out, X and Y.

 )DOC");
  }

--- a/paddle/operators/unpool_op.cc
+++ b/paddle/operators/unpool_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/unpool_op.h"
+namespace paddle {
+namespace operators {
+
+class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Unpool2dOpMaker(framework::OpProto* proto,
+                  framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor of unpool operator. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddInput(
+        "Indices",
+        "(Tensor) The input tensor of the indices given out by MaxPool2d. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddOutput("Out",
+              "(Tensor) The output tensor of unpool operator."
+              "The format of output tensor is also NCHW."
+              "Where N is batch size, C is "
+              "the number of channels, H and W is the height and "
+              "width of feature.");
+    AddAttr<std::vector<int>>(
+        "ksize",
+        "(vector), the unpooling window size(height, width) "
+        "of unpooling operator.");
+    AddAttr<std::vector<int>>("strides",
+                              "(vector, default:{1, 1}), "
+                              "strides (height, width) of unpooling operator.")
+        .SetDefault({1, 1});
+    AddAttr<std::vector<int>>("paddings",
+                              "(vector defalut:{0,0}), "
+                              "paddings (height, width) of unpooling operator.")
+        .SetDefault({0, 0});
+    AddAttr<std::string>(
+        "unpooling_type",
+        "(string), unpooling type, can be \"max\" for max-unpooling ")
+        .InEnum({"max"});
+    AddComment(R"DOC(
+        "Input shape: $(N, C_{in}, H_{in}, W_{in})$
+        Output shape: $(N, C_{out}, H_{out}, W_{out})$
+        Where
+          $$
+            H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\
+            W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1]
+          $$
+        Paper: http://www.matthewzeiler.com/wp-content/uploads/2017
+        /07/iccv2011.pdf
+        )DOC");
+  }
+};
+
+int OutputSize(int input_size, int ksize, int padding, int stride) {
+  int output_size = (input_size - 1) * stride - 2 * padding + ksize;
+  return output_size;
+}
+
+class UnpoolOp : public framework::OperatorWithKernel {
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of UnpoolOp"
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Indices"),
+                   "Input(Indices) of UnpoolOp"
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of UnpoolOp should not be null.");
+    auto in_x_dims = ctx->GetInputDim("X");
+    auto in_y_dims = ctx->GetInputDim("Indices");
+    std::string unpooling_type =
+        ctx->Attrs().Get<std::string>("unpooling_type");
+    std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
+    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+    PADDLE_ENFORCE(in_x_dims.size() == 4,
+                   "Unpooling intput must be of 4-dimensional.");
+    PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims);
+    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
+    for (size_t i = 0; i < ksize.size(); ++i) {
+      output_shape.push_back(
+          OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i]));
+    }
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  }
+};
+
+class UnpoolOpGrad : public framework::OperatorWithKernel {
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Input(X@GRAD) should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad,
+            ops::UnpoolOpGrad);
+REGISTER_OP_CPU_KERNEL(unpool,
+                       ops::UnpoolKernel<paddle::platform::CPUPlace, float>,
+                       ops::UnpoolKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    unpool_grad, ops::UnpoolGradKernel<paddle::platform::CPUPlace, float>,
+    ops::UnpoolGradKernel<paddle::platform::CPUPlace, double>);
--- a/paddle/operators/unpool_op.cu.cc
+++ b/paddle/operators/unpool_op.cu.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/unpool_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(unpool,
+                       ops::UnpoolKernel<paddle::platform::GPUPlace, float>,
+                       ops::UnpoolKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(
+    unpool_grad, ops::UnpoolGradKernel<paddle::platform::GPUPlace, float>,
+    ops::UnpoolGradKernel<paddle::platform::GPUPlace, double>);
--- a/paddle/operators/unpool_op.h
+++ b/paddle/operators/unpool_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/unpooling.h"
+
+namespace paddle {
+namespace operators {
+template <typename Place, typename T>
+class UnpoolKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* in_y = context.Input<framework::Tensor>("Indices");
+    auto* out = context.Output<framework::Tensor>("Out");
+    std::string unpooling_type = context.Attr<std::string>("unpooling_type");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    T* output_data = out->mutable_data<T>(context.GetPlace());
+    if (output_data) {
+      math::SetConstant<Place, T> set_zero;
+      set_zero(context.device_context(), out, static_cast<T>(0));
+    }
+    math::Unpool2dMaxFunctor<Place, T> unpool2d_max_forward;
+    unpool2d_max_forward(context.device_context(), *in_x, *in_y, out);
+  }
+};
+template <typename Place, typename T>
+class UnpoolGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* in_y = context.Input<framework::Tensor>("Indices");
+    const framework::Tensor* out = context.Input<framework::Tensor>("Out");
+    const framework::Tensor* out_grad =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    framework::Tensor* in_x_grad =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    std::string unpooling_type = context.Attr<std::string>("unpooling_type");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+
+    auto& device_ctx = context.device_context();
+    math::SetConstant<Place, T> zero;
+    if (in_x_grad) {
+      in_x_grad->mutable_data<T>(context.GetPlace());
+      zero(device_ctx, in_x_grad, static_cast<T>(0));
+    }
+    math::Unpool2dMaxGradFunctor<Place, T> unpool2d_max_backward;
+    unpool2d_max_backward(context.device_context(), *in_x, *in_y, *out,
+                          *out_grad, in_x_grad);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/python/paddle/v2/fluid/tests/test_unpool_op.py
+++ b/python/paddle/v2/fluid/tests/test_unpool_op.py
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings):
+    s0, s1, s2, s3 = input.shape
+    out_hsize = (s2 - 1) * strides[0] - 2 * paddings[0] + ksize[0]
+    out_wsize = (s2 - 1) * strides[1] - 2 * paddings[1] + ksize[1]
+    out = np.zeros((s0, s1, out_hsize, out_wsize))
+    for nidx in xrange(s0):
+        for cidx in xrange(s1):
+            for h in xrange(s2):
+                for w in xrange(s3):
+                    index = indices[nidx, cidx, h, w]
+                    hidx = (index - index % out_wsize) / out_wsize
+                    widx = index % out_wsize
+                    out[nidx, cidx, int(hidx), int(widx)] = \
+                            input[nidx, cidx, h, w]
+
+    return out
+
+
+class TestUnpoolOp(OpTest):
+    def setUp(self):
+        self.op_type = "unpool"
+        self.init_test_case()
+        pre_input = np.random.random(self.shape).astype("float32")
+        nsize, csize, hsize, wsize = pre_input.shape
+        hsize_out = (hsize - self.ksize[0] + 2 * self.paddings[0]) / \
+                self.strides[0] + 1
+        wsize_out = (wsize - self.ksize[1] + 2 * self.paddings[1]) / \
+                self.strides[1] + 1
+        input = np.zeros((nsize, csize, hsize_out, wsize_out))
+        indices = np.zeros((nsize, csize, hsize_out, wsize_out))
+        for i in xrange(hsize_out):
+            for j in xrange(wsize_out):
+                r_start = np.max((i * self.strides[0] - self.paddings[0], 0))
+                r_end = np.min((i * self.strides[0] + self.ksize[0] - \
+                        self.paddings[0], hsize))
+                c_start = np.max((j * self.strides[1] - self.paddings[1], 0))
+                c_end = np.min((j * self.strides[1] + self.ksize[1] - \
+                        self.paddings[1], wsize))
+                for nidx in xrange(nsize):
+                    for cidx in xrange(csize):
+                        x_masked = pre_input[nidx, cidx, r_start:r_end, \
+                                c_start:c_end]
+                        input[nidx, cidx, i, j] = x_masked.max()
+                        arg = x_masked.argmax()
+                        indices[nidx, cidx, i, j] = \
+                                (r_start + arg / self.ksize[1]) * wsize + \
+                                c_start + arg % self.ksize[1]
+        output = self.unpool2d_forward_naive(input, indices, self.ksize, \
+                self.strides, self.paddings).astype("float32")
+        self.inputs = {
+            'X': input.astype('float32'),
+            'Indices': indices.astype('int32')
+        }
+        self.attrs = {
+            'strides': self.strides,
+            'paddings': self.paddings,
+            'ksize': self.ksize,
+            'unpooling_type': self.unpooling_type,
+        }
+        self.outputs = {'Out': output.astype('float32')}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+    def init_test_case(self):
+        self.unpool2d_forward_naive = unpool2dmax_forward_naive
+        self.unpooling_type = "max"
+        self.shape = [6, 4, 5, 5]
+        self.ksize = [3, 3]
+        self.strides = [2, 2]
+        self.paddings = [0, 0]
+
+
+if __name__ == '__main__':
+    unittest.main()