Merge branch 'sweetsky0901-my_maxout_op' into develop

91278404 · wanghaox · 45eabb8c · fef617ae · 91278404 · 91278404
9 changed file
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -96,7 +96,7 @@ function(op_library TARGET)
        # It's enough to just adding one operator to pybind
        file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n")
    endif()
    # reduce_op contains several operators
    if ("${TARGET}" STREQUAL "reduce_op")
        set(pybind_flag 1)
@@ -138,6 +138,7 @@ set(DEPS_OPS
    softmax_with_cross_entropy_op
    sum_op
    pool_op
+    maxout_op
    pool_with_index_op
    nccl_op
    sequence_conv_op
@@ -149,6 +150,7 @@ op_library(cross_entropy_op DEPS cross_entropy)
 op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
 op_library(sum_op DEPS net_op selected_rows_functor)
 op_library(pool_op DEPS pooling)
+op_library(maxout_op DEPS maxouting)
 op_library(pool_with_index_op DEPS pooling)
 op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
 if(WITH_GPU)

--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -12,6 +12,7 @@ if(WITH_GPU)
    nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context)
    nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
    nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
+    nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context)
 else()
    cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator)
    cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
@@ -22,6 +23,7 @@ else()
    cc_library(context_project SRCS context_project.cc DEPS device_context)
    cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
    cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
+    cc_library(maxouting SRCS maxouting.cc DEPS device_context)
 endif()
 cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)

--- a/paddle/operators/math/maxouting.cc
+++ b/paddle/operators/math/maxouting.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/math/maxouting.h"
+namespace paddle {
+namespace operators {
+namespace math {
+/*
+ * CPU forward pass of maxout.
+ *
+ * The indexing below treats `input` as a contiguous 4-D buffer ordered as
+ * [batch, channel, height, width]. Each output channel is produced from
+ * `groups` consecutive input channels: for every spatial position the
+ * `groups` candidates are folded through `maxout_process`
+ * (initial -> compute -> finalize). `num_channels` is used as the input
+ * channel count, so the output holds num_channels / groups channels.
+ */
+template <typename MaxOutProcess, typename T>
+class MaxOutFunctor<platform::CPUPlace, MaxOutProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  int groups, int num_channels, MaxOutProcess maxout_process) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = num_channels / groups;
+    // Elements in one feature map and in one output sample.
+    const int map_size = input_height * input_width;
+    const int sample_size = map_size * output_channels;
+
+    const T* src = input.data<T>();
+    T* dst = output.mutable_data<T>(context.GetPlace());
+
+    for (int n = 0; n < batch_size; ++n) {
+      for (int oc = 0; oc < output_channels; ++oc) {
+        // Offset of output channel `oc` of sample `n` in the output buffer.
+        const int out_base = sample_size * n + map_size * oc;
+        // The input stores `groups` maps for every output map, so the group
+        // feeding this output channel starts `groups` times further in.
+        const int in_base = out_base * groups;
+        for (int pos = 0; pos < map_size; ++pos) {
+          T acc = maxout_process.initial();
+          for (int g = 0; g < groups; ++g) {
+            maxout_process.compute(acc, src[in_base + g * map_size + pos]);
+          }
+          maxout_process.finalize(acc, static_cast<T>(groups));
+          dst[out_base + pos] = acc;
+        }
+      }
+    }
+  }
+};
+/*
+ * CPU backward pass of maxout.
+ *
+ * For every output element the `groups` inputs that produced it are scanned
+ * in order. Inputs visited before the first one equal to the output get a
+ * zero gradient; that first matching input receives the output gradient and
+ * the scan stops. Inputs after the match are not written by this functor, so
+ * callers are expected to hand in a zero-filled `input_grad`.
+ *
+ * Tensors are indexed as contiguous [batch, channel, height, width] buffers;
+ * `num_channels` is used as the input channel count.
+ */
+template <class T>
+class MaxOutGradFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,
+                  int groups, int num_channels) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = num_channels / groups;
+    const int fea_size = input_height * input_width;
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    for (int i = 0; i < batch_size; i++) {
+      const int blen = fea_size * output_channels * i;
+      for (int c = 0; c < output_channels; ++c) {
+        const int clen = fea_size * c;
+        for (int f = 0; f < fea_size; f++) {
+          const int output_idx = blen + clen + f;
+          for (int g = 0; g < groups; g++) {
+            const int input_idx = (blen + clen) * groups + fea_size * g + f;
+            if (input_data[input_idx] == output_data[output_idx]) {
+              // First maximum of the group: it alone receives the gradient.
+              input_grad_data[input_idx] = output_grad_data[output_idx];
+              break;
+            }
+            input_grad_data[input_idx] = 0;
+          }
+        }
+      }
+    }
+  }
+};
+// Explicit instantiations of the CPU functors defined in this file, for the
+// float and double element types.
+template class MaxOutGradFunctor<platform::CPUPlace, float>;
+template class MaxOutGradFunctor<platform::CPUPlace, double>;
+template class MaxOutFunctor<platform::CPUPlace, MaxOut<float>, float>;
+template class MaxOutFunctor<platform::CPUPlace, MaxOut<double>, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/maxouting.cu
+++ b/paddle/operators/math/maxouting.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/math/maxouting.h"
+#include "paddle/platform/cuda_helper.h"
+namespace paddle {
+namespace operators {
+namespace math {
+// Forward maxout kernel. Every loop iteration produces one output element:
+// `index` is decoded into (sample, output channel, spatial position) and the
+// `groups` input values feeding that element are folded through
+// `maxout_process`. `channels` is the input channel count, so one output
+// sample spans input_height * input_width * channels / groups elements.
+template <typename MaxOutProcess, typename T>
+__global__ void KernelMaxOut(const int nthreads, const T* input_data,
+                             T* output_data, const int channels,
+                             const int input_height, const int input_width,
+                             int groups, MaxOutProcess maxout_process) {
+  const int sample_size = input_height * input_width * channels / groups;
+  const int map_size = input_height * input_width;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  while (index < nthreads) {
+    const int sample = index / sample_size;
+    const int within_sample = index % sample_size;
+    const int out_channel = within_sample / map_size;
+    const int pos = within_sample % map_size;
+    // First input element of the group feeding this output element.
+    const int in_base =
+        (sample * sample_size + out_channel * map_size) * groups + pos;
+    T acc = maxout_process.initial();
+    for (int g = 0; g < groups; ++g) {
+      maxout_process.compute(acc, input_data[in_base + g * map_size]);
+    }
+    maxout_process.finalize(acc, static_cast<T>(groups));
+    output_data[index] = acc;
+    // Advance by the total number of launched threads.
+    index += blockDim.x * gridDim.x;
+  }
+}
+// Backward maxout kernel. For each output element the `groups` inputs that
+// produced it are scanned in order; the first one equal to the output has the
+// output gradient atomically added to its slot in `input_grad`. Other slots
+// are never written here, so `input_grad` is only ever accumulated into.
+template <typename T>
+__global__ void KernelMaxoutGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, T* input_grad, const int channels,
+    const int input_height, const int input_width, int groups) {
+  const int sample_size = input_height * input_width * channels / groups;
+  const int map_size = input_height * input_width;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  while (index < nthreads) {
+    const int sample = index / sample_size;
+    const int within_sample = index % sample_size;
+    const int out_channel = within_sample / map_size;
+    const int pos = within_sample % map_size;
+    const int in_base =
+        (sample * sample_size + out_channel * map_size) * groups + pos;
+    // Index of the first input matching the output, or -1 when none does.
+    int hit = -1;
+    for (int g = 0; g < groups; ++g) {
+      const int candidate = in_base + g * map_size;
+      if (input_data[candidate] == output_data[index]) {
+        hit = candidate;
+        break;
+      }
+    }
+    if (hit != -1) {
+      platform::CudaAtomicAdd(input_grad + hit, output_grad[index]);
+    }
+    index += blockDim.x * gridDim.x;
+  }
+}
+/*
+ * GPU forward pass of maxout: launches KernelMaxOut on the CUDA stream of
+ * `context`, with one logical thread per output element.
+ *
+ * `input` and `output` are indexed as 4-D [batch, channel, height, width]
+ * tensors. The launch size is derived from num_channels / groups output
+ * channels, while the kernel is handed input.dims()[1] as its channel count.
+ * NOTE(review): the two agree only when num_channels == input.dims()[1];
+ * confirm that callers guarantee this.
+ */
+template <typename MaxOutProcess, typename T>
+class MaxOutFunctor<platform::GPUPlace, MaxOutProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  int groups, int num_channels,
+                  MaxOutProcess maxout_process) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = num_channels / groups;
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+
+    const T* input_data = input.data<T>();
+    T* output_data = output.mutable_data<T>(context.GetPlace());
+
+    // Launch configuration: 1024 threads per block, enough blocks to cover
+    // every output element.
+    const int threads_per_block = 1024;
+    const int nthreads =
+        batch_size * output_channels * output_height * output_width;
+    const int blocks = (nthreads + threads_per_block - 1) / threads_per_block;
+    dim3 threads(threads_per_block, 1);
+    dim3 grid(blocks, 1);
+
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+    KernelMaxOut<MaxOutProcess, T><<<grid, threads, 0, stream>>>(
+        nthreads, input_data, output_data, input_channels, input_height,
+        input_width, groups, maxout_process);
+  }
+};
+/*
+ * GPU backward pass of maxout: launches KernelMaxoutGrad on the CUDA stream
+ * of `context`, with one logical thread per output element.
+ *
+ * Tensors are indexed as 4-D [batch, channel, height, width]. The kernel only
+ * adds into `input_grad`, so the buffer should already hold the values the
+ * gradients are to be accumulated onto (zeros for a fresh gradient).
+ * `num_channels` is accepted for interface parity but not read here; the
+ * channel counts come from the tensor dimensions.
+ */
+template <typename T>
+class MaxOutGradFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,
+                  int groups, int num_channels) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+
+    // Launch configuration: 1024 threads per block, enough blocks to cover
+    // every output element.
+    const int threads_per_block = 1024;
+    const int nthreads =
+        batch_size * output_channels * output_height * output_width;
+    const int blocks = (nthreads + threads_per_block - 1) / threads_per_block;
+    dim3 threads(threads_per_block, 1);
+    dim3 grid(blocks, 1);
+
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+    KernelMaxoutGrad<T><<<grid, threads, 0, stream>>>(
+        nthreads, input_data, output_data, output_grad_data, input_grad_data,
+        input_channels, input_height, input_width, groups);
+  }
+};
+// Explicit instantiations of the GPU functors defined in this file, for the
+// float and double element types.
+template class MaxOutGradFunctor<platform::GPUPlace, float>;
+template class MaxOutGradFunctor<platform::GPUPlace, double>;
+template class MaxOutFunctor<platform::GPUPlace, MaxOut<float>, float>;
+template class MaxOutFunctor<platform::GPUPlace, MaxOut<double>, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/maxouting.h
+++ b/paddle/operators/math/maxouting.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/hostdevice.h"
+namespace paddle {
+namespace operators {
+namespace math {
+// Largest finite float; its negation seeds the max reduction in MaxOut.
+// Macros ignore namespaces, so this name is visible to every file including
+// this header. Only define it when no earlier header (for example <cfloat>)
+// has already provided FLT_MAX.
+// NOTE(review): __FLT_MAX__ is a compiler-predefined macro; confirm it exists
+// on every supported toolchain.
+// TODO: this probably belongs in a shared header rather than here.
+#ifndef FLT_MAX
+#define FLT_MAX __FLT_MAX__
+#endif
+/*
+ * \brief Element-wise reduction policy handed to MaxOutFunctor.
+ *
+ *   initial()  - seed of the reduction: -FLT_MAX converted to T.
+ *   compute()  - folds x into the running result y, keeping the larger one.
+ *   finalize() - intentionally empty; both arguments are ignored.
+ */
+template <class T>
+class MaxOut {
+ public:
+  DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
+  DEVICE inline void compute(T& y, const T& x) {
+    // y is kept only when it is strictly greater than x.
+    if (!(y > x)) {
+      y = x;
+    }
+  }
+  DEVICE inline void finalize(T& y, const T& group) {}
+};
+/*
+ * \brief Element-wise gradient policy: adds dy to dx when x equals y, and
+ *        adds dy * 0 otherwise. The `scale` argument is not used.
+ */
+template <class T>
+class MaxOutGrad {
+ public:
+  DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx,
+                             T scale) {
+    const bool is_max = (x == y);
+    dx += dy * is_max;
+  }
+};
+/*
+ * \brief Forward maxout functor, specialized per Place.
+ *
+ * This primary template only declares the call operator; the definitions are
+ * provided by the per-device specializations.
+ *
+ * \param context        Device context supplying the place (and, on GPU, the
+ *                       stream) the computation runs on.
+ * \param input          Input tensor, indexed as [batch, channel, height,
+ *                       width] by the specializations.
+ * \param output         Output tensor; its buffer is allocated by the functor.
+ * \param groups         Number of consecutive input channels reduced into one
+ *                       output channel.
+ * \param num_channels   Channel count of the input.
+ * \param maxout_compute Reduction policy (see MaxOut) applied to each group.
+ *
+ * When several inputs of a group tie for the maximum, the gradient functor
+ * declared after this class routes the gradient to the first of them.
+ */
+template <typename Place, typename MaxOutProcess, typename T>
+class MaxOutFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  int groups, int num_channels, MaxOutProcess maxout_compute);
+};
+/*
+ * \brief Backward maxout functor, specialized per Place.
+ *
+ * This primary template only declares the call operator; the definitions are
+ * provided by the per-device specializations.
+ *
+ * \param context      Device context the computation runs on.
+ * \param input        Forward input tensor.
+ * \param input_grad   Gradient with respect to `input`; written by the
+ *                     functor.
+ * \param output       Forward output tensor, compared against `input` to
+ *                     locate the element that produced each output value.
+ * \param output_grad  Gradient with respect to `output`.
+ * \param groups       Number of input channels reduced into one output
+ *                     channel.
+ * \param num_channels Channel count of the input.
+ */
+template <typename Place, class T>
+class MaxOutGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, int groups,
+                  int num_channels);
+};
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/maxout_op.cc
+++ b/paddle/operators/maxout_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *     Unless required by applicable law or agreed to in writing, software
+ *     distributed under the License is distributed on an "AS IS" BASIS,
+ *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *     See the License for the specific language governing permissions and
+ *     limitations under the License. */
+#include "paddle/operators/maxout_op.h"
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+// Proto maker: declares the inputs, outputs, attributes and user-facing
+// documentation of the maxout operator.
+class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MaxOutOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // Adjacent string literals are concatenated verbatim, so every fragment
+    // except the last ends with a space.
+    AddInput("X",
+        "(Tensor) The input tensor of maxout operator. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddOutput("Out",
+        "(Tensor) The output tensor of maxout operator. "
+        "The format of output tensor is also NCHW. "
+        "Where N is batch size, C is "
+        "the number of channels, H and W is the height and "
+        "width of feature.");
+    AddAttr<int>(
+        "groups",
+        R"DOC(The group number of input layer.
+        )DOC")
+        .SetDefault(2);
+    AddAttr<int>(
+        "num_channels",
+        R"DOC(The channel number of input layer.
+        )DOC")
+        .SetDefault(0);
+    AddComment(R"DOC(A layer to do max out on conv layer output.
+        - Input: output of a conv layer.
+        - Output: feature map size same as input. Channel is (input channel) / groups.
+        So groups should be larger than 1, and the num of channels should be able
+        to be divided by groups.
+        )DOC");
+  }
+};
+// Forward operator: validates the attributes against the input shape and
+// infers the shape of Out as [N, num_channels / groups, H, W].
+class MaxOutOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of maxoutOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of maxoutOp should not be null.");
+    auto in_x_dims = ctx->GetInputDim("X");
+    int groups = ctx->Attrs().Get<int>("groups");
+    int num_channels = ctx->Attrs().Get<int>("num_channels");
+    // groups must exceed 1; checking it first also keeps the modulo below
+    // from dividing by zero.
+    PADDLE_ENFORCE_GT(
+        groups, 1,
+        "in maxoutop  groups should be larger than 1");
+    // The kernels index the input buffer using num_channels, so it has to
+    // match the real channel dimension of X.
+    PADDLE_ENFORCE_EQ(static_cast<int>(in_x_dims[1]), num_channels,
+                      "num_channels should be equal to the channel "
+                      "dimension of Input(X)");
+    PADDLE_ENFORCE_EQ(num_channels % groups, 0,
+                      "the num of channels should be able "
+                      "to be divided by groups");
+    int out_num_channels = num_channels / groups;
+    // Batch size and spatial dimensions are carried over unchanged.
+    std::vector<int64_t> output_shape({in_x_dims[0], out_num_channels});
+    output_shape.push_back(in_x_dims[2]);
+    output_shape.push_back(in_x_dims[3]);
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  }
+};
+// Backward operator: the gradient of X has exactly the shape of X.
+class MaxOutOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    // X@GRAD is an output of the gradient operator, and the message says so.
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+}    // namespace operators
+}    // namespace paddle
+// Register the maxout / maxout_grad operator pair and their float CPU
+// kernels.
+namespace ops = paddle::operators;
+REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad,
+            ops::MaxOutOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    maxout, ops::MaxOutKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    maxout_grad, ops::MaxOutGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/maxout_op.cu
+++ b/paddle/operators/maxout_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#define EIGEN_USE_GPU
+#include "paddle/operators/maxout_op.h"
+// Register the float GPU kernels of maxout and maxout_grad.
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    maxout, ops::MaxOutKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    maxout_grad, ops::MaxOutGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/maxout_op.h
+++ b/paddle/operators/maxout_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/maxouting.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+// Forward kernel of the maxout operator: reads X together with the `groups`
+// and `num_channels` attributes and delegates the computation to
+// math::MaxOutFunctor, which fills Out.
+template <typename Place, typename T>
+class MaxOutKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* in_x = context.Input<Tensor>("X");
+    Tensor* out = context.Output<Tensor>("Out");
+    const int groups = context.template Attr<int>("groups");
+    const int num_channels = context.template Attr<int>("num_channels");
+
+    using Process = math::MaxOut<T>;
+    math::MaxOutFunctor<Place, Process, T> maxout_forward;
+    maxout_forward(context.device_context(), *in_x, *out, groups, num_channels,
+                   Process());
+  }
+};
+// Backward kernel of the maxout operator. When X@GRAD is requested, its
+// buffer is allocated, filled with zeros, and then handed to
+// math::MaxOutGradFunctor together with X, Out and Out@GRAD.
+template <typename Place, typename T>
+class MaxOutGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* in_x = context.Input<Tensor>("X");
+    const Tensor* out = context.Input<Tensor>("Out");
+    const Tensor* out_grad =
+        context.Input<Tensor>(framework::GradVarName("Out"));
+    Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+    const int groups = context.template Attr<int>("groups");
+    const int num_channels = context.template Attr<int>("num_channels");
+
+    // Nothing to compute when the gradient of X is not requested.
+    if (!in_x_grad) {
+      return;
+    }
+
+    // Zero-fill the gradient before the functor writes into it.
+    in_x_grad->mutable_data<T>(context.GetPlace());
+    auto flat_grad = framework::EigenVector<T>::Flatten(*in_x_grad);
+    flat_grad.device(context.GetEigenDevice<Place>()) =
+        flat_grad.constant(static_cast<T>(0));
+
+    math::MaxOutGradFunctor<Place, T> maxout_backward;
+    maxout_backward(context.device_context(), *in_x, *in_x_grad, *out,
+                    *out_grad, groups, num_channels);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/python/paddle/v2/framework/tests/test_maxout_op.py
+++ b/python/paddle/v2/framework/tests/test_maxout_op.py
+import unittest
+import numpy as np
+from op_test import OpTest
+def maxout_forward_naive_2sweetsky(input, groups, num_channels):
+    """Reference maxout: max over each run of `groups` consecutive channels.
+
+    `input` is a 4-D array (N, C, H, W); the result has shape
+    (N, C // groups, H, W). `num_channels` is accepted for signature
+    compatibility and is not used; the channel count comes from `input.shape`.
+    """
+    s0, s1, s2, s3 = input.shape
+    # Floor division keeps the new dimension an integer on Python 3 as well.
+    grouped = input.reshape(s0, s1 // groups, groups, s2, s3)
+    return grouped.max(axis=2)
+def maxout_forward_naive(input, groups, num_channels):
+    """Reference maxout: max over each run of `groups` consecutive channels.
+
+    `input` is a 4-D array (N, C, H, W); the result has shape
+    (N, C // groups, H, W). `num_channels` is accepted for signature
+    compatibility and is not used; the channel count comes from `input.shape`.
+    """
+    s0, s1, s2, s3 = input.shape
+    # Floor division keeps the new dimension an integer on Python 3 as well.
+    grouped = input.reshape(s0, s1 // groups, groups, s2, s3)
+    return grouped.max(axis=2)
+class TestMaxOut_Op(OpTest):
+    """Checks the maxout operator's output and gradient against NumPy."""
+
+    def setUp(self):
+        self.op_type = "maxout"
+        self.init_test_case()
+        input = np.random.random(self.shape).astype("float32")
+        output = self.MaxOut_forward_naive(input, self.groups,
+                self.num_channels).astype("float32")
+        self.inputs = {'X': input}
+        self.attrs = {'groups': self.groups, 'num_channels': self.num_channels}
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        # NOTE(review): a relative error of 0.5 is a very loose tolerance;
+        # consider tightening it once the gradient is known to be stable.
+        self.check_grad(['X'], 'Out', max_relative_error=0.5)
+
+    def init_test_case(self):
+        # Reference implementation plus the shape/attributes under test;
+        # six channels in groups of two give three output channels.
+        self.MaxOut_forward_naive = maxout_forward_naive
+        self.shape = [100, 6, 2, 2]
+        self.groups = 2
+        self.num_channels = 6
+# Run the test cases when this file is executed as a script.
+if __name__ == '__main__':
+    unittest.main()