gpu test ok unpool2dmax

e2a5905e · sweetsky0901 · abb3357d · e2a5905e · e2a5905e · e2a5905e
6 changed files
--- a/paddle/operators/math/unpooling.cc
+++ b/paddle/operators/math/unpooling.cc
@@ -37,8 +37,6 @@ class Unpool2dMaxFunctor<platform::CPUPlace, T> {
    const T* input_data = input.data<T>();
    const T * indices_data = indices.data<T>();
    T* output_data = output->mutable_data<T>(context.GetPlace());
-    memset(output_data, 0, \
-           sizeof(T) * output_feasize * output_channels * batch_size);
    for (int b = 0; b < batch_size; ++b) {
      for (int c = 0; c < output_channels; ++c) {
        for (int i = 0; i < input_feasize; ++i) {

--- a/paddle/operators/math/unpooling.cu
+++ b/paddle/operators/math/unpooling.cu
@@ -22,38 +22,53 @@ namespace math {
 template <typename T>
 __global__ void KernelUnpool2dMax(const int nthreads,
                                  const T* input_data,
-                                  const int* indices_data,
+                                  const T* indices_data,
                                  const int input_height,
                                  const int input_width,
+                                  const int channels,
                                  T* output_data,
                                  const int output_height,
                                  const int output_width) {
+  int bsize = input_height * input_width * channels;
+  int csize = input_height * input_width;
+  int out_bsize = output_height * output_width * channels;
+  int out_csize = output_height * output_width;
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int offset = blockDim.x * gridDim.x;
  for (int i = index; i < nthreads; i += offset) {
-    int out_offset =  i / (input_height * input_width) \
+    int bidx = i / bsize;
-                      * output_height * output_width;
+    int boffset = i % bsize;
+    int cidx = boffset / csize;
+    int out_offset = bidx * out_bsize + cidx * out_csize;
    int out_index = indices_data[i];
    PADDLE_ASSERT(out_index < (output_height * output_width));
+    printf("-------%d------[%f]\n", out_offset + out_index, input_data[i]);
    output_data[out_offset + out_index] = input_data[i];
  }
 }
 template <typename T>
 __global__ void KernelUnpool2dMaxGrad(const int nthreads,
                                      const T* input_data,
-                                      const int* indices_data,
+                                      const T* indices_data,
                                      const int input_height,
                                      const int input_width,
+                                      const int channels,
                                      const T* output_data,
                                      const T* output_grad,
                                      const int output_height,
                                      const int output_width,
                                      T* input_grad) {
+    int bsize = input_height * input_width * channels;
+    int csize = input_height * input_width;
+    int out_bsize = output_height * output_width * channels;
+    int out_csize = output_height * output_width;
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int offset = blockDim.x * gridDim.x;
    for (int i = index; i < nthreads; i += offset) {
-        int out_offset =  i / (input_height * input_width) \
+      int bidx = i / bsize;
-                          * output_height * output_width;
+      int boffset = i % bsize;
+      int cidx = boffset / csize;
+      int out_offset = bidx * out_bsize + cidx * out_csize;
      int out_index = indices_data[i];
      PADDLE_ASSERT(out_index < (output_height * output_width));
      input_grad[i] = output_grad[out_offset + out_index];
@@ -78,8 +93,7 @@ class Unpool2dMaxFunctor<platform::GPUPlace, T> {
    const T* input_data = input.data<T>();
    const T* indices_data = indices.data<T>();
    T* output_data = output->mutable_data<T>(context.GetPlace());
+    int nthreads = batch_size * output_channels * input_height * input_width;
-    int nthreads =  output->numel();
    int blocks = (nthreads + 1024 - 1) / 1024;
    dim3 threads(1024, 1);
    dim3 grid(blocks, 1);
@@ -88,7 +102,7 @@ class Unpool2dMaxFunctor<platform::GPUPlace, T> {
        T><<<grid, threads, 0,
             reinterpret_cast<const platform::CUDADeviceContext&>(context)
                 .stream()>>>(nthreads, input_data, indices_data,
-                              input_height, input_width,
+                              input_height, input_width, output_channels,
                              output_data, output_height, output_width);
  }
 };
@@ -115,7 +129,7 @@ class Unpool2dMaxGradFunctor<platform::GPUPlace, T> {
    const T* output_data = output.data<T>();
    const T* output_grad_data = output_grad.data<T>();
    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
-    int nthreads =  output.numel();
+    int nthreads = batch_size * output_channels * input_height * input_width;
    int blocks = (nthreads + 1024 - 1) / 1024;
    dim3 threads(1024, 1);
    dim3 grid(blocks, 1);
@@ -125,7 +139,7 @@ class Unpool2dMaxGradFunctor<platform::GPUPlace, T> {
             reinterpret_cast<const platform::CUDADeviceContext&>(context)
                 .stream()>>>(
                              nthreads, input_data, indices_data,
-                              input_height, input_width,
+                              input_height, input_width, output_channels,
                              output_data, output_grad_data,
                              output_height, output_width,
                              input_grad_data);

--- a/paddle/operators/math/unpooling.h
+++ b/paddle/operators/math/unpooling.h
@@ -21,9 +21,6 @@ namespace paddle {
 namespace operators {
 namespace math {
-#define FLT_MAX \
-    __FLT_MAX__
 template <typename Place, typename T>
 class Unpool2dMaxFunctor {

--- a/paddle/operators/unpool_op.cc
+++ b/paddle/operators/unpool_op.cc
@@ -108,9 +108,6 @@ class UnpoolOpGrad : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
-    // PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must not be null.");
-    // PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-    //                           "Input(Out@GRAD) should not be null");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                                  "Input(X@GRAD) should not be null.");
    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));

--- a/paddle/operators/unpool_op.h
+++ b/paddle/operators/unpool_op.h
@@ -29,11 +29,16 @@ class UnpoolKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& context) const override {
    const Tensor* in_x = context.Input<Tensor>("X");
    const Tensor* in_y = context.Input<Tensor>("Y");
-    Tensor* out = context.Output<Tensor>("Out");
+    auto * out = context.Output<Tensor>("Out");
    std::string unpoolingtype = context.Attr<std::string>("unpoolingtype");
    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    T* output_data = out->mutable_data<T>(context.GetPlace());
+    if (output_data) {
+      math::SetConstant<Place, T> set_zero;
+      set_zero(context.device_context(), out, static_cast<T>(0));
+    }
    switch (ksize.size()) {
    case 2: {
      if (unpoolingtype == "max") {

--- a/python/paddle/v2/fluid/tests/test_unpool_op.py
+++ b/python/paddle/v2/fluid/tests/test_unpool_op.py
@@ -54,6 +54,8 @@ class TestUnpoolOp(OpTest):
        self.outputs = {'Out': output.astype('float32')}
    def test_check_output(self):
+        print self.inputs['X']
+        print self.inputs['Y']
        print self.outputs['Out']
        self.check_output()
@@ -63,7 +65,7 @@ class TestUnpoolOp(OpTest):
    def init_test_case(self):
        self.Unpool2d_forward_naive = unpool2dmax_forward_naive
        self.unpoolingtype = "max"
-        self.shape = [10, 2, 5, 5]
+        self.shape = [6, 4, 5, 5]
        self.ksize = [3, 3]
        self.strides = [2, 2]
        self.paddings = [0, 0]