diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp
index d4272c72f2404739a02d3458f00a771aa8d3d2c0..8dcd32b06764ef833e1856b68ed4e9de90262047 100644
--- a/paddle/function/DepthwiseConvOp.cpp
+++ b/paddle/function/DepthwiseConvOp.cpp
@@ -13,16 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "DepthwiseConvOp.h"
+#include "ConvOp.h"
 #include "GemmFunctor.h"
-#include "paddle/math/MemoryHandle.h"
+//#include "paddle/math/MemoryHandle.h"
 
 namespace paddle {
 
 template <class T>
 class DepthwiseConvFunctor<DEVICE_TYPE_CPU, T> {
 public:
-  void operator()(int outputSize,
-                  const T* inputData,
+  void operator()(const T* inputData,
                   const T* filterData,
                   int batchSize,
                   int outputChannels,
@@ -44,13 +44,13 @@ public:
 template <class T>
 class DepthwiseConvGradInputFunctor<DEVICE_TYPE_CPU, T> {
 public:
-  void operator()(int inputSize,
-                  const T* outputGrad,
+  void operator()(const T* outputGrad,
                   const T* filterData,
                   int batchSize,
                   int outputChannels,
                   int outputHeight,
                   int outputWidth,
+                  int inputChannels,
                   int inputHeight,
                   int inputWidth,
                   int filterHeight,
@@ -65,14 +65,13 @@ public:
 template <class T>
 class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_CPU, T> {
 public:
-  void operator()(int num_i,
-                  int colDataSize,
-                  const T* outputGrad,
+  void operator()(const T* outputGrad,
                   const T* inputData,
                   int batchSize,
                   int outputChannels,
                   int outputHeight,
                   int outputWidth,
+                  int inputChannels,
                   int inputHeight,
                   int inputWidth,
                   int filterHeight,
@@ -87,7 +86,7 @@ public:
 };
 
 /*
- * \brief Forward calculation of convolution.
+ * \brief Forward calculation of depthwise convolution.
  */
 template <DeviceType Device>
 class DepthwiseConvFunction : public ConvFunctionBase {
@@ -126,11 +125,9 @@ public:
     real* inputData = inputs[0].data<real>();
     real* filterData = inputs[1].data<real>();
     real* outputData = outputs[0].data<real>();
-    size_t outputSize = batchSize * outputChannels * outputHeight * outputWidth;
 
     DepthwiseConvFunctor<Device, real> depthwiseConv;
-    depthwiseConv(outputSize,
-                  inputData,
+    depthwiseConv(inputData,
                   filterData,
                   batchSize,
                   outputChannels,
@@ -149,7 +146,7 @@
 };
 
 /*
- * \brief Backward input calculation of convolution.
+ * \brief Backward input calculation of depthwise convolution.
  */
 template <DeviceType Device>
 class DepthwiseConvGradInputFunction : public ConvFunctionBase {
@@ -191,16 +188,14 @@ public:
     real* filterData = inputs[1].data<real>();
     real* inputGrad = outputs[0].data<real>();
 
-    size_t inputSize = batchSize * inputChannels * inputHeight * inputWidth;
-
     DepthwiseConvGradInputFunctor<Device, real> depthwiseConvGradInput;
-    depthwiseConvGradInput(inputSize,
-                           outputGrad,
+    depthwiseConvGradInput(outputGrad,
                            filterData,
                            batchSize,
                            outputChannels,
                            outputHeight,
                            outputWidth,
+                           inputChannels,
                            inputHeight,
                            inputWidth,
                            filterHeight,
@@ -214,7 +209,7 @@ public:
 };
 
 /*
- * \brief Backward filter calculation of convolution.
+ * \brief Backward filter calculation of depthwise convolution.
  */
 template <DeviceType Device>
 class DepthwiseConvGradFilterFunction : public ConvFunctionBase {
@@ -255,35 +250,31 @@ public:
     real* multiplierData = inputs[2].data<real>();
     real* filterGrad = outputs[0].data<real>();
 
-    size_t size =
+    int size =
         inputChannels * filterHeight * filterWidth * outputHeight * outputWidth;
-
     resizeBuffer<Device>(size);
     real* colData = reinterpret_cast<real*>(memory_->getBuf());
 
     DepthwiseConvGradFilterFunctor<Device, real> depthwiseConvGradFilter;
 
-    for (size_t i = 0; i < batchSize; i++) {
-      depthwiseConvGradFilter(i,
-                              size,
-                              outputGrad,
-                              inputData,
-                              batchSize,
-                              outputChannels,
-                              outputHeight,
-                              outputWidth,
-                              inputHeight,
-                              inputWidth,
-                              filterHeight,
-                              filterWidth,
-                              strideH(),
-                              strideW(),
-                              paddingH(),
-                              paddingW(),
-                              colData,
-                              multiplierData,
-                              filterGrad);
-    }
+    depthwiseConvGradFilter(outputGrad,
+                            inputData,
+                            batchSize,
+                            outputChannels,
+                            outputHeight,
+                            outputWidth,
+                            inputChannels,
+                            inputHeight,
+                            inputWidth,
+                            filterHeight,
+                            filterWidth,
+                            strideH(),
+                            strideW(),
+                            paddingH(),
+                            paddingW(),
+                            colData,
+                            multiplierData,
+                            filterGrad);
   }
 };
diff --git a/paddle/function/DepthwiseConvOp.h b/paddle/function/DepthwiseConvOp.h
index 44290682def458aa51789b3ab12e8c5ac2c6a802..da180b29b064702b78b7ba8c3d63f0114dec6d82 100644
--- a/paddle/function/DepthwiseConvOp.h
+++ b/paddle/function/DepthwiseConvOp.h
@@ -14,15 +14,36 @@ limitations under the License. */
 
 #pragma once
 
-#include "ConvOp.h"
+#include "TensorType.h"
 
 namespace paddle {
 
+/**
+ *\brief   Depthwise convolution forward. The outputData
+ *         of depthwise convolution is the same as that of ExpandConvLayer
+ *         when groups equals inputChannels in ExpandConvLayer.
+ *
+ * \param[in]   inputData         input data.
+ * \param[in]   filterData        the parameters of the depthwise conv layer.
+ * \param[in]   batchSize         batch size of input data.
+ * \param[in]   outputChannels    channels of outputData.
+ * \param[in]   outputHeight      height of outputData.
+ * \param[in]   outputWidth       width of outputData.
+ * \param[in]   inputHeight       height of inputData.
+ * \param[in]   inputWidth        width of inputData.
+ * \param[in]   filterHeight      height of filter.
+ * \param[in]   filterWidth       width of filter.
+ * \param[in]   strideH           stride size in height direction.
+ * \param[in]   strideW           stride size in width direction.
+ * \param[in]   paddingH          padding size in height direction.
+ * \param[in]   paddingW          padding size in width direction.
+ * \param[out]  outputData        output data.
+ *
+ */
 template <DeviceType Device, class T>
 class DepthwiseConvFunctor {
 public:
-  void operator()(int outputSize,
-                  const T* inputData,
+  void operator()(const T* inputData,
                   const T* filterData,
                   int batchSize,
                   int outputChannels,
@@ -39,16 +60,38 @@ public:
                   T* outputData);
 };
 
+/**
+ *\brief  Functor to compute the depthwise convolution backprop w.r.t input.
+ *
+ *
+ * \param[in]   outputGradData    the grad data of output.
+ * \param[in]   filterData        the parameters of the depthwise conv layer.
+ * \param[in]   batchSize         batch size of input data.
+ * \param[in]   outputChannels    channels of outputData.
+ * \param[in]   outputHeight      height of outputData.
+ * \param[in]   outputWidth       width of outputData.
+ * \param[in]   inputChannels     channels of input data.
+ * \param[in]   inputHeight       height of inputData.
+ * \param[in]   inputWidth        width of inputData.
+ * \param[in]   filterHeight      height of filter.
+ * \param[in]   filterWidth       width of filter.
+ * \param[in]   strideH           stride size in height direction.
+ * \param[in]   strideW           stride size in width direction.
+ * \param[in]   paddingH          padding size in height direction.
+ * \param[in]   paddingW          padding size in width direction.
+ * \param[out]  inputGrad         the grad data of input.
+ *
+ */
 template <DeviceType Device, class T>
 class DepthwiseConvGradInputFunctor {
 public:
-  void operator()(int inputSize,
-                  const T* outputGrad,
+  void operator()(const T* outputGrad,
                   const T* filterData,
                   int batchSize,
                   int outputChannels,
                   int outputHeight,
                   int outputWidth,
+                  int inputChannels,
                   int inputHeight,
                   int inputWidth,
                   int filterHeight,
@@ -60,17 +103,42 @@ public:
                   T* inputGrad);
 };
 
+/**
+ *\brief  Functor to compute the depthwise convolution backprop w.r.t filter.
+ *
+ * \param[in]   outputGradData    the grad data of output.
+ * \param[in]   inputData         input data.
+ * \param[in]   batchSize         batch size of input data.
+ * \param[in]   outputChannels    channels of outputData.
+ * \param[in]   outputHeight      height of outputData.
+ * \param[in]   outputWidth       width of outputData.
+ * \param[in]   inputChannels     channels of input data.
+ * \param[in]   inputHeight       height of inputData.
+ * \param[in]   inputWidth        width of inputData.
+ * \param[in]   filterHeight      height of filter.
+ * \param[in]   filterWidth       width of filter.
+ * \param[in]   strideH           stride size in height direction.
+ * \param[in]   strideW           stride size in width direction.
+ * \param[in]   paddingH          padding size in height direction.
+ * \param[in]   paddingW          padding size in width direction.
+ * \param[in]   colData           auxiliary data used when calculating filterGrad;
+ *                                size: inputChannels * filterHeight * filterWidth
+ *                                      * outputHeight * outputWidth.
+ * \param[in]   multiplierData    auxiliary data used when calculating filterGrad;
+ *                                size: outputHeight * outputWidth.
+ * \param[out]  filterGrad        the grad data of filter.
+ *
+ */
 template <DeviceType Device, class T>
 class DepthwiseConvGradFilterFunctor {
 public:
-  void operator()(int num_i,
-                  int colDataSize,
-                  const T* outputGrad,
+  void operator()(const T* outputGrad,
                   const T* inputData,
                   int batchSize,
                   int outputChannels,
                   int outputHeight,
                   int outputWidth,
+                  int inputChannels,
                   int inputHeight,
                   int inputWidth,
                   int filterHeight,
diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu
index 08fe9221ac036d9eea324e6ce050d36ee0452d6e..df9be80b3fa6644048985fc33b581bd96eebf198 100644
--- a/paddle/function/DepthwiseConvOpGpu.cu
+++ b/paddle/function/DepthwiseConvOpGpu.cu
@@ -12,12 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "ConvOp.h"
 #include "DepthwiseConvOp.h"
 #include "GemmFunctor.h"
-#include "paddle/math/MemoryHandle.h"
 
 namespace paddle {
+// CUDA kernel to compute the depthwise convolution forward pass
 template <typename T>
 __global__
 void ConvolutionDepthwiseForward(const int nthreads,
@@ -48,7 +47,7 @@ void ConvolutionDepthwiseForward(const int nthreads,
       for (int kw = 0; kw < filterWidth; ++kw) {
         const int h_in = -paddingH + h * strideH + kh;
         const int w_in = -paddingW + w * strideW + kw;
-        const int offset = ((n * outputChannels + c) * inputHeight + h_in) 
+        const int offset = ((n * outputChannels + c) * inputHeight + h_in)
             * inputWidth + w_in;
         value += (*weight) * inputData[offset];
         ++weight;
@@ -73,6 +72,7 @@ void ConvolutionDepthwiseForward(const int nthreads,
   }
 }
 
+// CUDA kernel to compute the depthwise convolution backprop w.r.t input.
 template <typename T>
 __global__
 void ConvolutionDepthwiseInputBackward(const int nthreads,
@@ -113,6 +113,7 @@ void ConvolutionDepthwiseInputBackward(const int nthreads,
   }
 }
 
+// CUDA kernel to compute the depthwise convolution backprop w.r.t filter.
 template <typename T>
 __global__
 void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads,
@@ -150,15 +151,14 @@ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads,
 template <class T>
 class DepthwiseConvFunctor<DEVICE_TYPE_GPU, T>{
 public:
-  void operator()(int outputSize,
-                  const T* inputData,
+  void operator()(const T* inputData,
                   const T* filterData,
                   int batchSize,
                   int outputChannels,
                   int outputHeight,
                   int outputWidth,
-                  int inputHeight,
-                  int inputWidth,
+                  int inputHeight,
+                  int inputWidth,
                   int filterHeight,
                   int filterWidth,
                   int strideH,
@@ -167,12 +167,14 @@ public:
                   int paddingW,
                   T* outputData){
 
+    int outputSize = batchSize * outputChannels * outputHeight * outputWidth;
+
     size_t blocks = (outputSize + 1024 -1) / 1024;
     size_t blockX = 512;
     size_t blockY = (blocks+512-1)/512;
     dim3 threads(1024, 1);
     dim3 grid(blockX, blockY);
-    
+
     ConvolutionDepthwiseForward<T>
         <<< grid, threads, 0, STREAM_DEFAULT >>>(
             outputSize,
@@ -182,8 +184,8 @@ public:
             outputChannels,
             outputHeight,
             outputWidth,
-            inputHeight,
-            inputWidth,
+            inputHeight,
+            inputWidth,
             filterHeight,
             filterWidth,
             strideH,
@@ -197,13 +199,13 @@ public:
 template <class T>
 class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, T>{
 public:
-  void operator()(int inputSize,
-                  const T* outputGrad,
+  void operator()(const T* outputGrad,
                   const T* filterData,
                   int batchSize,
                   int outputChannels,
                   int outputHeight,
                   int outputWidth,
+                  int inputChannels,
                   int inputHeight,
                   int inputWidth,
                   int filterHeight,
@@ -212,7 +214,9 @@ public:
                   int strideW,
                   int paddingH,
                   int paddingW,
-                  T* inputGrad){
+                  T* inputGrad){
+
+    int inputSize = batchSize * inputChannels * inputHeight * inputWidth;
 
     size_t blocks = (inputSize + 1024 -1) / 1024;
     size_t blockX = 512;
@@ -220,6 +224,7 @@ public:
     dim3 threads(1024, 1);
     dim3 grid(blockX, blockY);
 
+
     ConvolutionDepthwiseInputBackward<T>
         // NOLINT_NEXT_LINE(whitespace/operators)
         <<< grid, threads, 0, STREAM_DEFAULT >>>(
@@ -245,14 +250,13 @@ public:
 template <class T>
 class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, T> {
 public:
-  void operator()(int num_i,
-                  int colDataSize,
-                  const T* outputGrad,
+  void operator()(const T* outputGrad,
                   const T* inputData,
                   int batchSize,
                   int outputChannels,
                   int outputHeight,
                   int outputWidth,
+                  int inputChannels,
                   int inputHeight,
                   int inputWidth,
                   int filterHeight,
@@ -265,60 +269,65 @@ public:
                   T* multiplierData,
                   T* filterGrad){
 
+    int colDataSize = inputChannels * filterHeight * filterWidth * outputHeight * outputWidth;
+
     size_t blocks = (colDataSize + 1024 -1) / 1024;
     size_t blockX = 512;
     size_t blockY = (blocks+512-1)/512;
     dim3 threads(1024, 1);
     dim3 grid(blockX, blockY);
 
-    ConvolutionDepthwiseFilterBackward<T>
-        <<< grid, threads, 0, STREAM_DEFAULT >>>(
-            num_i,
-            colDataSize,
-            outputGrad,
-            inputData,
-            batchSize,
-            outputChannels,
-            outputHeight,
-            outputWidth,
-            inputHeight,
-            inputWidth,
-            filterHeight,
-            filterWidth,
-            strideH,
-            strideW,
-            paddingH,
-            paddingW,
-            colData
-        );
-    GemmFunctor<DEVICE_TYPE_GPU, T> gemm;
-    int M = colDataSize / outputHeight / outputWidth;
-    int N = 1;
-    int K = outputHeight * outputWidth;
-    gemm(CblasNoTrans,
-         CblasNoTrans,
-         M,
-         N,
-         K,
-         (T)1.0,
-         colData,
-         K,
-         multiplierData,
-         N,
-         (T)1.0,
-         filterGrad,
-         N);
+    for(int i = 0; i < batchSize; i++) {
+      ConvolutionDepthwiseFilterBackward<T>
+          <<< grid, threads, 0, STREAM_DEFAULT >>>(
+              i,
+              colDataSize,
+              outputGrad,
+              inputData,
+              batchSize,
+              outputChannels,
+              outputHeight,
+              outputWidth,
+              inputHeight,
+              inputWidth,
+              filterHeight,
+              filterWidth,
+              strideH,
+              strideW,
+              paddingH,
+              paddingW,
+              colData
+          );
+      GemmFunctor<DEVICE_TYPE_GPU, T> gemm;
+      int M = colDataSize / outputHeight / outputWidth;
+      int N = 1;
+      int K = outputHeight * outputWidth;
+      gemm(CblasNoTrans,
+           CblasNoTrans,
+           M,
+           N,
+           K,
+           (T)1.0,
+           colData,
+           K,
+           multiplierData,
+           N,
+           (T)1.0,
+           filterGrad,
+           N);
+    }  //gemv
   }
 };
 
 #ifdef PADDLE_TYPE_DOUBLE
-using real=double;
+template class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, double>;
+template class DepthwiseConvFunctor<DEVICE_TYPE_GPU, double>;
+template class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, double>;
 #else
-using real=float;
+template class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, float>;
+template class DepthwiseConvFunctor<DEVICE_TYPE_GPU, float>;
+template class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, float>;
 #endif
-template class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, real>;
-template class DepthwiseConvFunctor<DEVICE_TYPE_GPU, real>;
-template class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, real>;
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/DepthwiseConvLayer.cpp b/paddle/gserver/layers/DepthwiseConvLayer.cpp
index f07100d94978959d36327ecd6c54fb3f672b8fa1..8da3a52c2434048eaa7da636bc14ec9705b42592 100644
--- a/paddle/gserver/layers/DepthwiseConvLayer.cpp
+++ b/paddle/gserver/layers/DepthwiseConvLayer.cpp
@@ -15,14 +15,9 @@ limitations under the License. */
 #include "DepthwiseConvLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#include <iostream>
 
 namespace paddle {
 
-/*
- * The calculation of the exconvt(convolution transpose (deconv) operation)
- * is a swap of forward and backward of the calculation of exconv.
- * */
 REGISTER_LAYER(depthwise_conv, DepthwiseConvLayer);
 
 bool DepthwiseConvLayer::init(const LayerMap &layerMap,
@@ -76,11 +71,12 @@ bool DepthwiseConvLayer::init(const LayerMap &layerMap,
 #define BACKWARD_FILTER(i, inputs, outputs) \
   backward_[2 * i + 1]->calc(inputs, outputs)
 
+// compute the depthwise convolution forward pass
 void DepthwiseConvLayer::forward(PassType passType) {
   Layer::forward(passType);
 
   size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  // std::cout << "outputSize" << getOutputSize() <
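Note (illustration only, not part of the patch): the behaviour the reworked DepthwiseConvFunctor interface is expected to provide can be sketched as a plain CPU loop nest with the same argument order as operator() after this change. The helper name depthwiseConvForwardReference is invented for this sketch; it assumes the simplest case described in the new header comment (groups == inputChannels, so outputChannels == inputChannels with a channel multiplier of 1) and treats values outside the input as zero padding. The pointer arithmetic mirrors the ((n * outputChannels + c) * inputHeight + h_in) * inputWidth + w_in offset used by the ConvolutionDepthwiseForward kernel above.

// Hedged reference sketch of depthwise convolution forward.
// Assumption: outputChannels == inputChannels (channel multiplier 1).
template <class T>
void depthwiseConvForwardReference(const T* inputData,
                                   const T* filterData,
                                   int batchSize,
                                   int outputChannels,
                                   int outputHeight,
                                   int outputWidth,
                                   int inputHeight,
                                   int inputWidth,
                                   int filterHeight,
                                   int filterWidth,
                                   int strideH,
                                   int strideW,
                                   int paddingH,
                                   int paddingW,
                                   T* outputData) {
  for (int n = 0; n < batchSize; ++n) {
    for (int c = 0; c < outputChannels; ++c) {
      // Each output channel reads only input channel c and its own
      // filterHeight x filterWidth slice of filterData.
      const T* filter = filterData + c * filterHeight * filterWidth;
      const T* input =
          inputData + (n * outputChannels + c) * inputHeight * inputWidth;
      T* output =
          outputData + (n * outputChannels + c) * outputHeight * outputWidth;
      for (int h = 0; h < outputHeight; ++h) {
        for (int w = 0; w < outputWidth; ++w) {
          T value = 0;
          for (int kh = 0; kh < filterHeight; ++kh) {
            for (int kw = 0; kw < filterWidth; ++kw) {
              const int hIn = -paddingH + h * strideH + kh;
              const int wIn = -paddingW + w * strideW + kw;
              // Skip positions that fall into the zero padding.
              if (hIn >= 0 && hIn < inputHeight && wIn >= 0 && wIn < inputWidth) {
                value += filter[kh * filterWidth + kw] *
                         input[hIn * inputWidth + wIn];
              }
            }
          }
          output[h * outputWidth + w] = value;
        }
      }
    }
  }
}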