Merge remote-tracking branch 'upstream/develop' into develop

76973440 · zhangyang · 560f6f10 · ec186c12 · 76973440 · 76973440
32 changed files
--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -71,6 +71,8 @@ const char *G_OP_TYPE_SUM = "sum";

 const char *G_OP_TYPE_QUANTIZE = "quantize";
 const char *G_OP_TYPE_DEQUANTIZE = "dequantize";
+const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN = "fusion_dequant_add_bn";
+const char *G_OP_TYPE_FUSION_DEQUANT_BN_RELU = "fusion_dequant_bn_relu";
 const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU = "fusion_dequant_add_bn_relu";

 const char *G_OP_TYPE_TANH = "tanh";
@@ -136,6 +138,8 @@ std::unordered_map<
        {G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}},
        {G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}},
        {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_DEQUANT_ADD_BN, {{"X", "Scale"}, {"Y"}}},
+        {G_OP_TYPE_FUSION_DEQUANT_BN_RELU, {{"X", "Scale"}, {"Out"}}},
        {G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU, {{"X", "Scale"}, {"Out"}}},
        {G_OP_TYPE_TANH, {{"X"}, {"Out"}}},
        {G_OP_TYPE_FUSION_DECONV_RELU, {{"Input"}, {"Out"}}},

--- a/src/common/types.h
+++ b/src/common/types.h
@@ -138,6 +138,8 @@ extern const char *G_OP_TYPE_ELEMENTWISE_MUL;

 extern const char *G_OP_TYPE_QUANTIZE;
 extern const char *G_OP_TYPE_DEQUANTIZE;
+extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN;
+extern const char *G_OP_TYPE_FUSION_DEQUANT_BN_RELU;
 extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU;

 extern const char *G_OP_TYPE_TANH;

--- a/src/framework/cl/cl_image.h
+++ b/src/framework/cl/cl_image.h
@@ -56,6 +56,7 @@ class CLImage {
    tensor_dims_ = dim;
  }

+  bool isInit() const { return initialized_; }  // read-only flag query
  /*
   * need call SetTensorData first
   *

--- a/src/framework/load_ops.h
+++ b/src/framework/load_ops.h
@@ -233,6 +233,14 @@ LOAD_OP1(quantize, CPU);
 #ifdef DEQUANT_OP
 LOAD_OP1(dequantize, CPU);
 #endif
+#ifdef FUSION_DEQUANT_ADD_BN_OP
+LOAD_OP1(fusion_dequant_add_bn, CPU);
+LOAD_FUSION_MATCHER(fusion_dequant_add_bn);
+#endif
+#ifdef FUSION_DEQUANT_BN_RELU_OP
+LOAD_OP1(fusion_dequant_bn_relu, CPU);
+LOAD_FUSION_MATCHER(fusion_dequant_bn_relu);
+#endif
 #ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
 LOAD_OP1(fusion_dequant_add_bn_relu, CPU);
 LOAD_FUSION_MATCHER(fusion_dequant_add_bn_relu);

--- a/src/io/ios_io/PaddleMobileCPU.mm
+++ b/src/io/ios_io/PaddleMobileCPU.mm
@@ -95,7 +95,8 @@ static std::mutex shared_mutex;
         andModelParamsLen:(size_t)combinedParamsLen
      andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf {
  pam_->SetThreadNum(2);
-  return loaded_ = pam_->LoadCombinedMemory(modelLen, modelBuf, combinedParamsLen, combinedParamsBuf);
+  return loaded_ = pam_->LoadCombinedMemory(modelLen, modelBuf, combinedParamsLen,
+          const_cast<uint8_t*>(combinedParamsBuf));
 }

 - (BOOL)load:(NSString *)modelAndWeightPath{

--- a/src/operators/depthwise_conv_op.h
+++ b/src/operators/depthwise_conv_op.h
@@ -18,7 +18,7 @@ limitations under the License. */

 #include <string>
 #include "framework/operator.h"
-#include "operators/kernel/depthwise_conv_kernel.h"
+#include "operators/kernel/conv_kernel.h"

 namespace paddle_mobile {
 namespace operators {
@@ -26,19 +26,16 @@ namespace operators {
 template <typename DeviceType, typename T>
 class DepthwiseConvOp : public framework::OperatorWithKernel<
                            DeviceType, ConvParam<DeviceType>,
-                            operators::DepthwiseConvKernel<DeviceType, T>> {
+                            operators::ConvKernel<DeviceType, T>> {
 public:
  DepthwiseConvOp(const std::string &type, const VariableNameMap &inputs,
                  const VariableNameMap &outputs,
                  const framework::AttributeMap &attrs,
                  std::shared_ptr<framework::Scope> scope)
-      : framework::OperatorWithKernel<
-            DeviceType, ConvParam<DeviceType>,
-            operators::DepthwiseConvKernel<DeviceType, T>>(
+      : framework::OperatorWithKernel<DeviceType, ConvParam<DeviceType>,
+                                      operators::ConvKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
  void InferShape() const override;
-
- private:
 };

 }  // namespace operators

--- a/src/operators/fusion_conv_bn_add_relu_op.cpp
+++ b/src/operators/fusion_conv_bn_add_relu_op.cpp
@@ -55,6 +55,9 @@ REGISTER_FUSION_MATCHER(fusion_conv_bn_add_relu,
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
 #endif
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
+#endif
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
 #endif

--- a/src/operators/kernel/arm/depthwise_conv_kernel.cpp
+++ b/src/operators/kernel/arm/depthwise_conv_kernel.cpp
@@ -12,27 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifdef DEPTHWISECONV_OP
+#ifdef FUSION_DEQUANT_ADD_BN_OP

-#include "operators/kernel/depthwise_conv_kernel.h"
-#include "operators/kernel/central-arm-func/depthwise_conv_arm_func.h"
+#include "operators/fusion_dequant_add_bn_op.h"

 namespace paddle_mobile {
 namespace operators {

-template <>
-bool DepthwiseConvKernel<CPU, float>::Init(ConvParam<CPU> *param) {
-  return true;
+template <typename Dtype, typename T>
+void FusionDequantAddBNOp<Dtype, T>::InferShape() const {
+  // The output is resized to exactly the input's dims.
+  this->param_.output_->Resize(this->param_.input_->dims());
 }

-template <>
-void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
-  DepthwiseConvCompute<float>(param);
-}
-
-template class DepthwiseConvKernel<CPU, float>;
-
 }  // namespace operators
 }  // namespace paddle_mobile

+namespace ops = paddle_mobile::operators;
+REGISTER_FUSION_MATCHER(fusion_dequant_add_bn, ops::FusionDequantAddBNMatcher);
+
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(fusion_dequant_add_bn, ops::FusionDequantAddBNOp);
+#endif
+
 #endif
--- a/src/operators/fusion_dequant_add_bn_op.h
+++ b/src/operators/fusion_dequant_add_bn_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DEQUANT_ADD_BN_OP
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "framework/operator.h"
+#include "framework/program/program-optimize/fusion_op_register.h"
+#include "operators/kernel/dequant_add_bn_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+/* Fusion matcher for the three-node chain dequantize > elementwise_add >
+ * batchnorm, as wired up in the constructor. Type() reports
+ * G_OP_TYPE_FUSION_DEQUANT_ADD_BN as this matcher's op type.
+ */
+class FusionDequantAddBNMatcher : public framework::FusionOpMatcher {
+ public:
+  FusionDequantAddBNMatcher() {
+    node_ = framework::Node(G_OP_TYPE_DEQUANTIZE);  // head of the pattern
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
+        std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM);
+  }
+  /* Calls node->Folder() with this pattern's depth and type, a per-op table
+   * of key pairs, and the caller's removed_nodes list.
+   * NOTE(review): presumably each pair maps an input key of the original op
+   * to its key on the fused op (batchnorm "Scale" becoming "BNScale", etc.);
+   * confirm against framework::Node::Folder. */
+  void FolderNodes(
+      framework::Node *node,
+      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
+    node->Folder(node_.Depth(), Type(),
+                 {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
+                  {G_OP_TYPE_BATCHNORM,
+                   {{"Scale", "BNScale"},
+                    {"Mean", "BNMean"},
+                    {"Bias", "BNBias"},
+                    {"Variance", "BNVariance"}}}},
+                 removed_nodes);
+  }
+  /* Op type string reported by this matcher. */
+  std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_ADD_BN; }
+};
+/* Operator class for the fused dequantize / add / batch-norm op. It derives
+ * from framework::OperatorWithKernel parameterised with
+ * FusionDequantAddBNParam and FusionDequantAddBNKernel; the constructor only
+ * forwards its arguments to that base. InferShape() is declared here and
+ * defined out of line.
+ */
+template <typename DeviceType, typename T>
+class FusionDequantAddBNOp
+    : public framework::OperatorWithKernel<
+          DeviceType, FusionDequantAddBNParam<DeviceType>,
+          operators::FusionDequantAddBNKernel<DeviceType, T>> {
+ public:
+  FusionDequantAddBNOp(const std::string &type, const VariableNameMap &inputs,
+                       const VariableNameMap &outputs,
+                       const framework::AttributeMap &attrs,
+                       std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, FusionDequantAddBNParam<DeviceType>,
+            operators::FusionDequantAddBNKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+  // inference output shape
+  void InferShape() const override;
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/fusion_dequant_add_bn_relu_op.h
+++ b/src/operators/fusion_dequant_add_bn_relu_op.h
@@ -20,7 +20,7 @@ limitations under the License. */
 #include <vector>
 #include "framework/operator.h"
 #include "framework/program/program-optimize/fusion_op_register.h"
-#include "operators/kernel/dequant_add_bn_relu_kernel.h"
+#include "operators/kernel/dequant_bn_relu_kernel.h"
 #include "operators/op_param.h"

 namespace paddle_mobile {

--- a/src/operators/kernel/depthwise_conv_kernel.h
+++ b/src/operators/kernel/depthwise_conv_kernel.h
@@ -12,29 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifdef DEPTHWISECONV_OP
+#ifdef FUSION_DEQUANT_BN_RELU_OP

-#pragma once
-
-#include "framework/operator.h"
-#include "operators/math/im2col.h"
-#include "operators/math/math_function.h"
-#include "operators/math/vol2col.h"
-#include "operators/op_param.h"
+#include "operators/fusion_dequant_bn_relu_op.h"

 namespace paddle_mobile {
 namespace operators {

-using framework::OpKernelBase;
+template <typename Dtype, typename T>
+void FusionDequantBNReluOp<Dtype, T>::InferShape() const {
+  // The output is resized to exactly the input's dims.
+  this->param_.output_->Resize(this->param_.input_->dims());
+}

-template <typename DeviceType, typename T>
-class DepthwiseConvKernel
-    : public OpKernelBase<DeviceType, ConvParam<DeviceType>> {
- public:
-  void Compute(const ConvParam<DeviceType> &param);
-  bool Init(ConvParam<DeviceType> *param);
-};
 }  // namespace operators
 }  // namespace paddle_mobile

+namespace ops = paddle_mobile::operators;
+REGISTER_FUSION_MATCHER(fusion_dequant_bn_relu,
+                        ops::FusionDequantBNReluMatcher);
+
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(fusion_dequant_bn_relu, ops::FusionDequantBNReluOp);
+#endif
+
 #endif
--- a/src/operators/fusion_dequant_bn_relu_op.h
+++ b/src/operators/fusion_dequant_bn_relu_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DEQUANT_BN_RELU_OP
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "framework/operator.h"
+#include "framework/program/program-optimize/fusion_op_register.h"
+#include "operators/kernel/dequant_bn_relu_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+/* Fusion matcher for the three-node chain dequantize > batchnorm > relu, as
+ * wired up in the constructor. Type() reports
+ * G_OP_TYPE_FUSION_DEQUANT_BN_RELU as this matcher's op type.
+ */
+class FusionDequantBNReluMatcher : public framework::FusionOpMatcher {
+ public:
+  FusionDequantBNReluMatcher() {
+    node_ = framework::Node(G_OP_TYPE_DEQUANTIZE);  // head of the pattern
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
+        std::make_shared<framework::Node>(G_OP_TYPE_RELU);
+  }
+  /* Calls node->Folder() with this pattern's depth and type, a table of key
+   * pairs for the batchnorm op, and the caller's removed_nodes list.
+   * NOTE(review): presumably each pair maps a batchnorm input key to its key
+   * on the fused op ("Scale" becoming "BNScale", etc.); confirm against
+   * framework::Node::Folder. */
+  void FolderNodes(
+      framework::Node *node,
+      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
+    node->Folder(node_.Depth(), Type(),
+                 {{G_OP_TYPE_BATCHNORM,
+                   {{"Scale", "BNScale"},
+                    {"Mean", "BNMean"},
+                    {"Bias", "BNBias"},
+                    {"Variance", "BNVariance"}}}},
+                 removed_nodes);
+  }
+  /* Op type string reported by this matcher. */
+  std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_BN_RELU; }
+};
+/* Operator class for the fused dequantize / batch-norm / ReLU op. It derives
+ * from framework::OperatorWithKernel parameterised with
+ * FusionDequantBNReluParam and FusionDequantBNReluKernel; the constructor
+ * only forwards its arguments to that base. InferShape() is declared here
+ * and defined out of line.
+ */
+template <typename DeviceType, typename T>
+class FusionDequantBNReluOp
+    : public framework::OperatorWithKernel<
+          DeviceType, FusionDequantBNReluParam<DeviceType>,
+          operators::FusionDequantBNReluKernel<DeviceType, T>> {
+ public:
+  FusionDequantBNReluOp(const std::string &type, const VariableNameMap &inputs,
+                        const VariableNameMap &outputs,
+                        const framework::AttributeMap &attrs,
+                        std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, FusionDequantBNReluParam<DeviceType>,
+            operators::FusionDequantBNReluKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+  // inference output shape
+  void InferShape() const override;
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/kernel/arm/conv_kernel.cpp
+++ b/src/operators/kernel/arm/conv_kernel.cpp
@@ -22,41 +22,43 @@ namespace operators {

 template <>
 bool ConvKernel<CPU, float>::Init(ConvParam<CPU> *param) {
+  bool conv3x3 = param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+                 param->Filter()->dims()[2] == 3;
+  bool depth3x3 = conv3x3 && param->Groups() == param->Input()->dims()[1] &&
+                  param->Input()->dims()[1] == param->Output()->dims()[1];
  if (param->Filter()->type() == typeid(int8_t)) {
-    if (param->Groups() == param->Input()->dims()[1] &&
-        param->Input()->dims()[1] == param->Output()->dims()[1] &&
-        param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
-        param->Filter()->dims()[2] == 3 && param->Strides()[0] < 3 &&
+    if (depth3x3 && param->Strides()[0] < 3 &&
        param->Strides()[0] == param->Strides()[1]) {
      param->ExecMode() = ConvParam<CPU>::EXEC_DEPTHWISE3x3_INT8;
    } else {
      param->ExecMode() = ConvParam<CPU>::EXEC_GEMM_INT8;
    }
  } else {
-    if (param->Groups() == param->Input()->dims()[1] &&
-        param->Input()->dims()[1] == param->Output()->dims()[1] &&
-        param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
-        param->Filter()->dims()[2] == 3 && param->Strides()[0] == 1) {
+    if (depth3x3 && param->Strides()[0] == param->Strides()[1] &&
+        param->Strides()[0] == 1 && param->Paddings()[0] == 1 &&
+        param->Paddings()[0] == param->Paddings()[1]) {
      param->ExecMode() = ConvParam<CPU>::EXEC_DEPTHWISE3x3S1P1_FLOAT;
-    } else if (param->Groups() == param->Input()->dims()[1] &&
-               param->Input()->dims()[1] == param->Output()->dims()[1] &&
-               param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
-               param->Filter()->dims()[2] == 3) {
-      param->ExecMode() = ConvParam<CPU>::EXEC_DEPTHWISE3x3_FLOAT;
+    } else if (depth3x3 && param->Strides()[0] == param->Strides()[1] &&
+               param->Strides()[0] == 2 && param->Paddings()[0] == 0 &&
+               param->Paddings()[0] == param->Paddings()[1]) {
+      param->ExecMode() = ConvParam<CPU>::EXEC_DEPTHWISE3x3S2P0_FLOAT;
+    } else if (depth3x3 && param->Strides()[0] == param->Strides()[1] &&
+               param->Strides()[0] == 2 && param->Paddings()[0] == 1 &&
+               param->Paddings()[0] == param->Paddings()[1]) {
+      param->ExecMode() = ConvParam<CPU>::EXEC_DEPTHWISE3x3S2P1_FLOAT;
 #ifndef __aarch64__
-    } else if (param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
-               param->Strides()[0] == param->Strides()[1] &&
+    } else if (conv3x3 && param->Strides()[0] == param->Strides()[1] &&
               param->Dilations()[0] == param->Dilations()[1] &&
-               param->Filter()->dims()[2] == 3 && param->Strides()[0] == 1 &&
-               param->Dilations()[0] == 1 && param->Output()->dims()[1] >= 16 &&
+               param->Strides()[0] == 1 && param->Dilations()[0] == 1 &&
+               param->Output()->dims()[1] >= 16 &&
               param->Input()->dims()[1] >= 16 &&
               param->Input()->dims()[2] <= 140 /* refered from ncnn */) {
      param->ExecMode() = ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT;
      // transform weight
-      framework::Tensor *transformed_weight = new framework::Tensor;
+      framework::Tensor transformed_weight;
      operators::math::winograd_transform_weight<8, 3>(*param->Filter(),
-                                                       transformed_weight);
-      param->Filter() = transformed_weight;
+                                                       &transformed_weight);
+      framework::TensorCopy(transformed_weight, param->Filter());
 #endif
    } else {
      param->ExecMode() = ConvParam<CPU>::EXEC_GEMM_FLOAT;
@@ -78,9 +80,13 @@ void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
      math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
                                 nullptr, false);
      break;
-    case ConvParam<CPU>::EXEC_DEPTHWISE3x3_FLOAT:
-      math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
-                             param.Filter(), nullptr, param.Output(), false);
+    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2P1_FLOAT:
+      math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(),
+                                   param.Output(), nullptr, false);
+      break;
+    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2P0_FLOAT:
+      math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(),
+                                 nullptr, false);
      break;
    case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
      WinogradConv3x3<8, 3>(param);

--- a/src/operators/kernel/arm/dequant_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/dequant_add_bn_relu_kernel.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
+#ifdef FUSION_DEQUANT_ADD_BN_OP

-#include "operators/kernel/dequant_add_bn_relu_kernel.h"
+#include "operators/kernel/dequant_add_bn_kernel.h"
 #include <cmath>
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
 #include <arm_neon.h>
@@ -24,8 +24,8 @@ namespace paddle_mobile {
 namespace operators {

 template <>
-bool FusionDequantAddBNReluKernel<CPU, float>::Init(
-    FusionDequantAddBNReluParam<CPU> *param) {
+bool FusionDequantAddBNKernel<CPU, float>::Init(
+    FusionDequantAddBNParam<CPU> *param) {
  // elementwise add params
  const Tensor *bias = param->bias_;
  // batch norm params
@@ -49,8 +49,8 @@ bool FusionDequantAddBNReluKernel<CPU, float>::Init(
 }

 template <>
-void FusionDequantAddBNReluKernel<CPU, float>::Compute(
-    const FusionDequantAddBNReluParam<CPU> &param) {
+void FusionDequantAddBNKernel<CPU, float>::Compute(
+    const FusionDequantAddBNParam<CPU> &param) {
  const int32_t *input = param.input_->data<int32_t>();
  const float *bn_scale = param.bn_scale_->data<float>();
  const float *bn_bias = param.bn_bias_->data<float>();
@@ -78,7 +78,6 @@ void FusionDequantAddBNReluKernel<CPU, float>::Compute(
      remain = spatial_size & 0xF;
      float32x4_t __scale = vdupq_n_f32(scale);
      float32x4_t __bias = vdupq_n_f32(bias);
-      float32x4_t __zero = vdupq_n_f32(0.f);

      for (int k = 0; k < loop; ++k, x += 16, y += 16) {
        int32x4_t r0 = vld1q_s32(x);
@@ -93,10 +92,6 @@ void FusionDequantAddBNReluKernel<CPU, float>::Compute(
        f1 = vmlaq_f32(__bias, __scale, f1);
        f2 = vmlaq_f32(__bias, __scale, f2);
        f3 = vmlaq_f32(__bias, __scale, f3);
-        f0 = vmaxq_f32(__zero, f0);
-        f1 = vmaxq_f32(__zero, f1);
-        f2 = vmaxq_f32(__zero, f2);
-        f3 = vmaxq_f32(__zero, f3);
        vst1q_f32(y, f0);
        vst1q_f32(y + 4, f1);
        vst1q_f32(y + 8, f2);
@@ -104,7 +99,7 @@ void FusionDequantAddBNReluKernel<CPU, float>::Compute(
      }
 #endif  // __ARM_NEON__
      for (int k = 0; k < remain; ++k) {
-        y[k] = std::max(scale * x[k] + bias, 0.f);
+        y[k] = scale * x[k] + bias;
      }
    }
  }
@@ -113,4 +108,4 @@ void FusionDequantAddBNReluKernel<CPU, float>::Compute(
 }  // namespace operators
 }  // namespace paddle_mobile

-#endif  // FUSION_DEQUANT_ADD_BN_RELU_OP
+#endif  // FUSION_DEQUANT_ADD_BN_OP
--- a/src/operators/kernel/arm/dequant_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/dequant_bn_relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "operators/kernel/dequant_bn_relu_kernel.h"
+#include <cmath>
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include <arm_neon.h>
+#endif
+
+namespace paddle_mobile {
+namespace operators {
+
+#if defined(FUSION_DEQUANT_BN_RELU_OP) || defined(FUSION_DEQUANT_ADD_BN_RELU_OP)
+// Fused dequantize + batch-norm + ReLU over an int32 input tensor, writing
+// float output. For each (batch, channel) plane:
+//   y = max(bn_scale[c] * (activation_scale / weight_scale) * x + bn_bias[c], 0)
+// bn_scale_ / bn_bias_ are read as-is; the Init() specializations in this
+// file rewrite them before Compute() runs.
+// input_ is indexed as 4-D: dims()[0] batches, dims()[1] channels and
+// dims()[2] * dims()[3] elements per plane.
+void DequantBNReluCompute(const FusionDequantBNParam<CPU> *param) {
+  const int32_t *input = param->input_->data<int32_t>();
+  const float *bn_scale = param->bn_scale_->data<float>();
+  const float *bn_bias = param->bn_bias_->data<float>();
+  // dequantize params
+  const float activation_scale = param->activation_scale_->data<float>()[0];
+  const float weight_scale = param->weight_scale_;
+  const float dequant_scale = activation_scale / weight_scale;
+
+  float *output = param->output_->mutable_data<float>();
+  const int batch_size = param->input_->dims()[0];
+  const int channels = param->input_->dims()[1];
+  const size_t spatial_size =
+      param->input_->dims()[2] * param->input_->dims()[3];
+
+  #pragma omp parallel for collapse(2)
+  for (int batch = 0; batch < batch_size; ++batch) {
+    for (int c = 0; c < channels; ++c) {
+      const float scale = bn_scale[c] * dequant_scale;
+      const float bias = bn_bias[c];
+      // Widen before multiplying so the plane offset is computed in size_t
+      // rather than overflowing in int.
+      const size_t offset =
+          (static_cast<size_t>(batch) * channels + c) * spatial_size;
+      const int32_t *x = input + offset;
+      float *y = output + offset;
+      size_t remain = spatial_size;
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+      // 16 elements per iteration; the scalar loop below handles the tail.
+      const size_t loop = spatial_size >> 4;
+      remain = spatial_size & 0xF;
+      float32x4_t __scale = vdupq_n_f32(scale);
+      float32x4_t __bias = vdupq_n_f32(bias);
+      float32x4_t __zero = vdupq_n_f32(0.f);
+
+      for (size_t k = 0; k < loop; ++k, x += 16, y += 16) {
+        int32x4_t r0 = vld1q_s32(x);
+        int32x4_t r1 = vld1q_s32(x + 4);
+        int32x4_t r2 = vld1q_s32(x + 8);
+        int32x4_t r3 = vld1q_s32(x + 12);
+        float32x4_t f0 = vcvtq_f32_s32(r0);
+        float32x4_t f1 = vcvtq_f32_s32(r1);
+        float32x4_t f2 = vcvtq_f32_s32(r2);
+        float32x4_t f3 = vcvtq_f32_s32(r3);
+        // f = bias + scale * f
+        f0 = vmlaq_f32(__bias, __scale, f0);
+        f1 = vmlaq_f32(__bias, __scale, f1);
+        f2 = vmlaq_f32(__bias, __scale, f2);
+        f3 = vmlaq_f32(__bias, __scale, f3);
+        // ReLU
+        f0 = vmaxq_f32(__zero, f0);
+        f1 = vmaxq_f32(__zero, f1);
+        f2 = vmaxq_f32(__zero, f2);
+        f3 = vmaxq_f32(__zero, f3);
+        vst1q_f32(y, f0);
+        vst1q_f32(y + 4, f1);
+        vst1q_f32(y + 8, f2);
+        vst1q_f32(y + 12, f3);
+      }
+#endif  // __ARM_NEON__
+      // size_t counter: `remain` is unsigned, so avoid a signed/unsigned
+      // comparison (and int overflow for very large planes).
+      for (size_t k = 0; k < remain; ++k) {
+        y[k] = std::max(scale * x[k] + bias, 0.f);
+      }
+    }
+  }
+}
+#endif
+
+#ifdef FUSION_DEQUANT_BN_RELU_OP
+// Init() for the dequantize + batch-norm + ReLU kernel. Rewrites bn_scale_
+// and bn_bias_ in place so batch-norm becomes one scale and one offset per
+// channel:
+//   bn_scale[c] <- bn_scale[c] / sqrt(variance[c] + epsilon)
+//   bn_bias[c]  <- bn_bias[c] - bn_scale[c](new) * mean[c]
+// Always returns true.
+template <>
+bool FusionDequantBNReluKernel<CPU, float>::Init(
+    FusionDequantBNReluParam<CPU> *param) {
+  // batch-norm tensors; scale and bias are overwritten below
+  const Tensor *mean_tensor = param->bn_mean_;
+  const Tensor *variance_tensor = param->bn_variance_;
+  Tensor *scale_tensor = param->bn_scale_;
+  Tensor *offset_tensor = param->bn_bias_;
+  const float eps = param->epsilon_;
+
+  const float *mean = mean_tensor->data<float>();
+  const float *variance = variance_tensor->data<float>();
+  float *scale = scale_tensor->mutable_data<float>();
+  float *offset = offset_tensor->mutable_data<float>();
+
+  const auto channel_count = scale_tensor->numel();
+  for (int c = 0; c < channel_count; ++c) {
+    const float folded = scale[c] / (std::sqrt(variance[c] + eps));
+    scale[c] = folded;
+    offset[c] = offset[c] - folded * mean[c];
+  }
+  return true;
+}
+/* Compute() hands the param, by address, to DequantBNReluCompute(). */
+template <>
+void FusionDequantBNReluKernel<CPU, float>::Compute(
+    const FusionDequantBNReluParam<CPU> &param) {
+  DequantBNReluCompute(&param);
+}
+#endif  // FUSION_DEQUANT_BN_RELU_OP
+
+#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
+// Init() for the dequantize + add + batch-norm + ReLU kernel. Rewrites
+// bn_scale_ and bn_bias_ in place so the element-wise bias and the batch-norm
+// statistics collapse into one scale and one offset per channel:
+//   bn_scale[c] <- bn_scale[c] / sqrt(variance[c] + epsilon)
+//   bn_bias[c]  <- bn_scale[c](new) * (bias[c] - mean[c]) + bn_bias[c]
+// Always returns true.
+template <>
+bool FusionDequantAddBNReluKernel<CPU, float>::Init(
+    FusionDequantAddBNReluParam<CPU> *param) {
+  // element-wise add operand
+  const Tensor *add_tensor = param->bias_;
+  // batch-norm tensors; scale and bias are overwritten below
+  const Tensor *mean_tensor = param->bn_mean_;
+  const Tensor *variance_tensor = param->bn_variance_;
+  Tensor *scale_tensor = param->bn_scale_;
+  Tensor *offset_tensor = param->bn_bias_;
+  const float eps = param->epsilon_;
+
+  const float *add = add_tensor->data<float>();
+  const float *mean = mean_tensor->data<float>();
+  const float *variance = variance_tensor->data<float>();
+  float *scale = scale_tensor->mutable_data<float>();
+  float *offset = offset_tensor->mutable_data<float>();
+
+  const auto channel_count = scale_tensor->numel();
+  for (int c = 0; c < channel_count; ++c) {
+    const float folded = scale[c] / (std::sqrt(variance[c] + eps));
+    scale[c] = folded;
+    offset[c] = folded * (add[c] - mean[c]) + offset[c];
+  }
+  return true;
+}
+/* Compute() hands the param, by address, to DequantBNReluCompute(); the
+ * call relies on FusionDequantAddBNReluParam<CPU> converting to
+ * const FusionDequantBNParam<CPU>* (presumably a base class; TODO confirm). */
+template <>
+void FusionDequantAddBNReluKernel<CPU, float>::Compute(
+    const FusionDequantAddBNReluParam<CPU> &param) {
+  DequantBNReluCompute(&param);
+}
+#endif  // FUSION_DEQUANT_ADD_BN_RELU_OP
+
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/arm/quantize_kernel.cpp
+++ b/src/operators/kernel/arm/quantize_kernel.cpp
--- a/src/operators/kernel/central-arm-func/conv_add_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_arm_func.h
@@ -132,10 +132,10 @@ void ConvAddCompute(const FusionConvAddParam<CPU> &param) {
    //                               param.Output(), false);
    if (param.Paddings()[0] == 0) {
      math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(),
-                                 *param.Bias(), true);
+                                 param.Bias(), true);
    } else {
      math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(),
-                                   param.Output(), *param.Bias(), true);
+                                   param.Output(), param.Bias(), true);
    }
  } else {
    ConvAddBasic(param);

--- a/src/operators/kernel/central-arm-func/conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.h
@@ -164,31 +164,21 @@ template <typename Itype, typename Otype>
 inline void DepthwiseConv3x3(const ConvParam<CPU> &param) {
  const Tensor *input = param.Input();
  const Tensor *filter = param.Filter();
+  const std::vector<int> &paddings = param.Paddings();
+  const std::vector<int> &strides = param.Strides();
+  const int batch_size = input->dims()[0];
  Tensor *output = param.Output();
  output->mutable_data<Otype>();

-  const std::vector<int> &paddings = param.Paddings();
-  const std::vector<int> &strides = param.Strides();
-  const int batch_size = static_cast<int>(input->dims()[0]);
-  Tensor input_pad;
-  math::PadFunctor<CPU, Itype> pad;
  for (int i = 0; i < batch_size; i++) {
    Tensor in_batch = input->Slice(i, i + 1);
    Tensor out_batch = output->Slice(i, i + 1);
-    if (paddings[0] || paddings[1]) {
-      framework::DDim pad_shape = in_batch.dims();
-      pad_shape[2] += 2 * paddings[0];
-      pad_shape[3] += 2 * paddings[1];
-      input_pad.mutable_data<float>(pad_shape);
-      pad(in_batch, paddings[0], paddings[0], paddings[1], paddings[1],
-          &input_pad);
-    } else {
-      input_pad = in_batch;
-    }
    if (strides[0] == 1) {
-      math::DepthwiseConv3x3s1<Itype, Otype>(input_pad, *filter, &out_batch);
+      math::DepthwiseConv3x3S1<Itype, Otype>(in_batch, *filter, paddings,
+                                             &out_batch);
    } else if (strides[0] == 2) {
-      math::DepthwiseConv3x3s2<Itype, Otype>(input_pad, *filter, &out_batch);
+      math::DepthwiseConv3x3S2<Itype, Otype>(in_batch, *filter, paddings,
+                                             &out_batch);
    } else {
      // math::DepthwiseConv3x3<Itype, Otype>(input_pad, *filter,
      // &out_batch);

--- a/src/operators/kernel/cl/batchnorm_kernel.cpp
+++ b/src/operators/kernel/cl/batchnorm_kernel.cpp
@@ -77,15 +77,25 @@ void BatchNormKernel<GPU_CL, float>::Compute(
  auto new_scale = param.NewScale()->GetCLImage();
  auto new_bias = param.NewBias()->GetCLImage();
  const int out_width = default_work_size[1];
-
-  clSetKernelArg(kernel, 1, sizeof(int), &out_width);
-  clSetKernelArg(kernel, 2, sizeof(cl_mem), &input);
-  clSetKernelArg(kernel, 3, sizeof(cl_mem), &new_scale);
-  clSetKernelArg(kernel, 4, sizeof(cl_mem), &new_bias);
-  clSetKernelArg(kernel, 5, sizeof(cl_mem), &out);
-
-  //  cl_event out_event = param.OutputY()->GetClEvent();
-  //  cl_event wait_event = param.InputX()->GetClEvent();
+  DLOG << *param.InputX();
+  DLOG << *param.NewBias();
+  DLOG << *param.NewScale();
+  DLOG << default_work_size[0];
+  DLOG << default_work_size[1];
+  DLOG << default_work_size[2];
+  DLOG << out_width;
+  DLOG << *param.OutputY();
+  cl_int status;
+  clSetKernelArg(kernel, 0, sizeof(cl_int), &out_width);
+  CL_CHECK_ERRORS(status);
+  clSetKernelArg(kernel, 1, sizeof(cl_mem), &input);
+  CL_CHECK_ERRORS(status);
+  clSetKernelArg(kernel, 2, sizeof(cl_mem), &new_scale);
+  CL_CHECK_ERRORS(status);
+  clSetKernelArg(kernel, 3, sizeof(cl_mem), &new_bias);
+  CL_CHECK_ERRORS(status);
+  clSetKernelArg(kernel, 4, sizeof(cl_mem), &out);
+  CL_CHECK_ERRORS(status);
  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                         default_work_size.data(), NULL, 0, NULL, NULL);
 }

--- a/src/operators/kernel/cl/cl_kernel/conv_bn_add_relu_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_bn_add_relu_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+
+#define BATCH_NORM
+#define BIASE
+#define RELU
+
+#include "conv_kernel.inc.cl"
--- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
@@ -924,6 +924,387 @@ __kernel void conv_5x5(__private const int global_size_dim0,
    write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output);
 }

+/*
+ * convBNAdd_3x3
+ *
+ * 3x3 convolution producing one half4 (four output channels) per work-item,
+ * followed by the optional post-ops selected at compile time:
+ *   BATCH_NORM : output = output * new_scale[out_c] + new_biase[out_c]
+ *   BIASE      : output += bias read at the output position (element-wise add)
+ *   RELU       : output = activation(output)  (activation() is defined
+ *                elsewhere in this file)
+ *
+ * Work-item ids: dim0 = output channel block, dim1 = output x, dim2 = output
+ * row (named out_nh here). Ids outside global_size_dim0..2 return early.
+ *
+ * Image addressing used below:
+ *   input  : block i of the input_c channel blocks starts at x = i * input_width
+ *   filter : x = i * 3 + (tap % 3), y = out_c * 12 + k * 3 + (tap / 3), where
+ *            k in 0..3 selects which component of the output half4 is fed
+ *   output : x = out_c * global_size_dim1 + out_w, y = out_nh
+ *
+ * output_width and output_height are accepted but not read by this kernel.
+ */
+__kernel void convBNAdd_3x3(__private const int global_size_dim0,
+                                              __private const int global_size_dim1,
+                                              __private const int global_size_dim2,
+                                              __read_only image2d_t input_image,
+                                              __read_only image2d_t filter,
+
+#ifdef BIASE
+                                              __read_only image2d_t bias,
+#endif
+
+#ifdef BATCH_NORM
+                                              __read_only image2d_t new_scale,
+                                              __read_only image2d_t new_biase,
+#endif
+
+                                              __write_only image2d_t output_image,
+                                              __private const int stride,
+                                              __private const int offset,
+                                              __private const int input_c,
+                                              __private const int dilation,
+                                              __private const int input_width,/* of one block */
+                                              __private const int input_height,/* of one block */
+                                              __private const int output_width,
+                                              __private const int output_height) {
+
+    const int out_c = get_global_id(0);
+    const int out_w = get_global_id(1);
+    const int out_nh = get_global_id(2);
+
+    if (out_c >= global_size_dim0 ||
+        out_w >= global_size_dim1 ||
+        out_nh >= global_size_dim2) {
+        return;
+    }
+
+    /* Every read below passes integer (int2) coordinates. The OpenCL C
+       specification only defines the result of integer-coordinate image reads
+       for samplers with unnormalized coordinates and nearest filtering, so the
+       sampler must not request normalized coordinates. */
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
+                              CLK_ADDRESS_CLAMP           |
+                              CLK_FILTER_NEAREST;
+
+    /* Centre of the 3x3 window inside one input channel block. */
+    int2 in_pos_in_one_block;
+    in_pos_in_one_block.x = out_w * stride + offset;
+    in_pos_in_one_block.y = out_nh * stride + offset;
+
+    half4 output = (half4)0.0f;
+
+    half4 input[9];
+
+    for (int i = 0; i < input_c; ++i) {
+        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+
+        /* Gather the nine taps (row-major, input[4] is the centre). A tap
+           that falls outside the current channel block is replaced by zero:
+           select() takes its second operand where the most significant bit of
+           the mask component is set, and `cond << 15` moves the 0/1 test
+           result into the MSB of a 16-bit ushort. */
+        input[0] = select(read_imageh(input_image, sampler,
+                            (int2)(pos_in.x - dilation, pos_in.y - dilation)),
+                            (half4)(0.0f),
+                            (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
+
+        input[1] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x, pos_in.y - dilation)),
+                          (half4)(0.0f),
+                          (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
+
+        input[2] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x + dilation, pos_in.y - dilation)),
+                          (half4)(0.0f),
+                          (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
+
+        input[3] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x - dilation, pos_in.y)),
+                          (half4)(0.0f),
+                          (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
+
+        input[4] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x, pos_in.y)),
+                          (half4)(0.0f),
+                          (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
+
+        input[5] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x + dilation, pos_in.y)),
+                          (half4)(0.0f),
+                          (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
+
+        input[6] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x - dilation, pos_in.y + dilation)),
+                          (half4)(0.0f),
+                          (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
+
+        input[7] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x, pos_in.y + dilation)),
+                          (half4)(0.0f),
+                          (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
+
+        input[8] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x + dilation, pos_in.y + dilation)),
+                          (half4)(0.0f),
+                          (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
+
+        /* Accumulate the nine taps. This is the hand-unrolled form of
+           `for (j = 0; j < 9; ++j)`: for each tap j, four filter texels are
+           read (one per output component) and dotted with input[j]. The
+           statements are kept unrolled exactly as originally written. */
+        int j = 0;
+        int2 pos_of_weight;
+        pos_of_weight.x = i * 3 + j % 3;
+        pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+        half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
+        output.x += dot(input[j], weight_x);
+
+        pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+        half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
+        output.y += dot(input[j], weight_y);
+
+        pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+        half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
+        output.z += dot(input[j], weight_z);
+
+        pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+        half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
+        output.w += dot(input[j], weight_w);
+
+        j = 1;
+        pos_of_weight.x = i * 3 + j % 3;
+        pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+        weight_x = read_imageh(filter, sampler, pos_of_weight);
+        output.x += dot(input[j], weight_x);
+
+        pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+        weight_y = read_imageh(filter, sampler, pos_of_weight);
+        output.y += dot(input[j], weight_y);
+
+        pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+        weight_z = read_imageh(filter, sampler, pos_of_weight);
+        output.z += dot(input[j], weight_z);
+
+        pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+        weight_w = read_imageh(filter, sampler, pos_of_weight);
+        output.w += dot(input[j], weight_w);
+
+        j = 2;
+        pos_of_weight.x = i * 3 + j % 3;
+        pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+        weight_x = read_imageh(filter, sampler, pos_of_weight);
+        output.x += dot(input[j], weight_x);
+
+        pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+        weight_y = read_imageh(filter, sampler, pos_of_weight);
+        output.y += dot(input[j], weight_y);
+
+        pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+        weight_z = read_imageh(filter, sampler, pos_of_weight);
+        output.z += dot(input[j], weight_z);
+
+        pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+        weight_w = read_imageh(filter, sampler, pos_of_weight);
+        output.w += dot(input[j], weight_w);
+
+        j = 3;
+        pos_of_weight.x = i * 3 + j % 3;
+        pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+        weight_x = read_imageh(filter, sampler, pos_of_weight);
+        output.x += dot(input[j], weight_x);
+
+        pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+        weight_y = read_imageh(filter, sampler, pos_of_weight);
+        output.y += dot(input[j], weight_y);
+
+        pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+        weight_z = read_imageh(filter, sampler, pos_of_weight);
+        output.z += dot(input[j], weight_z);
+
+        pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+        weight_w = read_imageh(filter, sampler, pos_of_weight);
+        output.w += dot(input[j], weight_w);
+
+        j = 4;
+        pos_of_weight.x = i * 3 + j % 3;
+        pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+        weight_x = read_imageh(filter, sampler, pos_of_weight);
+        output.x += dot(input[j], weight_x);
+
+        pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+        weight_y = read_imageh(filter, sampler, pos_of_weight);
+        output.y += dot(input[j], weight_y);
+
+        pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+        weight_z = read_imageh(filter, sampler, pos_of_weight);
+        output.z += dot(input[j], weight_z);
+
+        pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+        weight_w = read_imageh(filter, sampler, pos_of_weight);
+        output.w += dot(input[j], weight_w);
+
+        j = 5;
+        pos_of_weight.x = i * 3 + j % 3;
+        pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+        weight_x = read_imageh(filter, sampler, pos_of_weight);
+        output.x += dot(input[j], weight_x);
+
+        pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+        weight_y = read_imageh(filter, sampler, pos_of_weight);
+        output.y += dot(input[j], weight_y);
+
+        pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+        weight_z = read_imageh(filter, sampler, pos_of_weight);
+        output.z += dot(input[j], weight_z);
+
+        pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+        weight_w = read_imageh(filter, sampler, pos_of_weight);
+        output.w += dot(input[j], weight_w);
+
+        j = 6;
+        pos_of_weight.x = i * 3 + j % 3;
+        pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+        weight_x = read_imageh(filter, sampler, pos_of_weight);
+        output.x += dot(input[j], weight_x);
+
+        pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+        weight_y = read_imageh(filter, sampler, pos_of_weight);
+        output.y += dot(input[j], weight_y);
+
+        pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+        weight_z = read_imageh(filter, sampler, pos_of_weight);
+        output.z += dot(input[j], weight_z);
+
+        pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+        weight_w = read_imageh(filter, sampler, pos_of_weight);
+        output.w += dot(input[j], weight_w);
+
+        j = 7;
+        pos_of_weight.x = i * 3 + j % 3;
+        pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+        weight_x = read_imageh(filter, sampler, pos_of_weight);
+        output.x += dot(input[j], weight_x);
+
+        pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+        weight_y = read_imageh(filter, sampler, pos_of_weight);
+        output.y += dot(input[j], weight_y);
+
+        pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+        weight_z = read_imageh(filter, sampler, pos_of_weight);
+        output.z += dot(input[j], weight_z);
+
+        pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+        weight_w = read_imageh(filter, sampler, pos_of_weight);
+        output.w += dot(input[j], weight_w);
+
+        j = 8;
+        pos_of_weight.x = i * 3 + j % 3;
+        pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+        weight_x = read_imageh(filter, sampler, pos_of_weight);
+        output.x += dot(input[j], weight_x);
+
+        pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+        weight_y = read_imageh(filter, sampler, pos_of_weight);
+        output.y += dot(input[j], weight_y);
+
+        pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+        weight_z = read_imageh(filter, sampler, pos_of_weight);
+        output.z += dot(input[j], weight_z);
+
+        pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+        weight_w = read_imageh(filter, sampler, pos_of_weight);
+        output.w += dot(input[j], weight_w);
+
+    }
+
+#ifdef BATCH_NORM
+    /* Per-channel-block scale and shift, both read at (out_c, 0). */
+    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+#endif
+
+#ifdef BIASE
+    /* Element-wise add: `bias` is read at the same position that is written. */
+    output += read_imageh(bias, sampler, (int2)(out_c * global_size_dim1 + out_w, out_nh));
+#endif
+
+#ifdef RELU
+    output = activation(output);
+#endif
+
+    write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output);
+}
+
+/*
+ * convBNAdd_1x1
+ *
+ * 1x1 convolution producing one half4 (four output channels) per work-item,
+ * followed by the optional post-ops selected at compile time:
+ *   BATCH_NORM : output = output * new_scale[out_c] + new_biase[out_c]
+ *   BIASE      : output += bias read at the output position (element-wise add)
+ *   RELU       : output = activation(output)  (activation() is defined
+ *                elsewhere in this file)
+ *
+ * Work-item ids: dim0 = output channel block, dim1 = output x, dim2 = output
+ * row (named out_nh here).
+ *
+ * Image addressing used below:
+ *   input  : block i of the input_c channel blocks starts at x = i * input_width
+ *   filter : x = out_c, y = i * 4 + c, where c in 0..3 is the input component
+ *            the texel is multiplied with
+ *   output : x = out_c * global_size_dim1 + out_w, y = out_nh
+ *
+ * dilation, input_height, output_width and output_height are accepted but not
+ * read by this kernel.
+ */
+__kernel void convBNAdd_1x1(__private const int global_size_dim0,
+                       __private const int global_size_dim1,
+                       __private const int global_size_dim2,
+                       __read_only image2d_t input_image,
+                       __read_only image2d_t filter,
+#ifdef BIASE
+                       __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                       __read_only image2d_t new_scale,
+                       __read_only image2d_t new_biase,
+#endif
+                       __write_only image2d_t output_image,
+                       __private const int stride,
+                       __private const int offset,
+                       __private const int input_c,
+                       __private const int dilation,
+                       __private const int input_width,/* of one block */
+                       __private const int input_height,/* of one block */
+                       __private const int output_width,
+                       __private const int output_height) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  /* Same guard as convBNAdd_3x3: work-items outside the logical range must
+     not write to the output image. */
+  if (out_c >= global_size_dim0 ||
+      out_w >= global_size_dim1 ||
+      out_nh >= global_size_dim2) {
+    return;
+  }
+
+  /* Every read below passes integer (int2) coordinates. The OpenCL C
+     specification only defines the result of integer-coordinate image reads
+     for samplers with unnormalized coordinates and nearest filtering, so the
+     sampler must not request normalized coordinates. */
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
+                            CLK_ADDRESS_CLAMP           |
+                            CLK_FILTER_NEAREST;
+
+  int2 stride_xy = (int2)(stride, stride);
+  int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
+  int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+
+  half4 output = 0.0f;
+
+  for (int i = 0; i < input_c; ++i) {
+    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+    half4 input = read_imageh(input_image, sampler, pos_in);
+
+    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
+    half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
+    half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
+    half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
+
+    /* Each input component scales the filter texel that holds its
+       contribution to the four output channels. */
+    output = mad(input.x, weight0, output);
+    output = mad(input.y, weight1, output);
+    output = mad(input.z, weight2, output);
+    output = mad(input.w, weight3, output);
+  }
+
+#ifdef BATCH_NORM
+  /* Per-channel-block scale and shift, both read at (out_c, 0). */
+  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+#endif
+
+#ifdef BIASE
+  /* Element-wise add: `bias` is read at the same position that is written. */
+  output += read_imageh(bias, sampler, (int2)(out_c * global_size_dim1 + out_w, out_nh));
+#endif
+
+#ifdef RELU
+  output = activation(output);
+#endif
+
+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos, output);
+}




--- a/src/operators/kernel/cl/conv_bn_add_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_bn_add_relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVBNADDRELU_OP
+
+#include "operators/kernel/conv_bn_add_relu_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+/*
+ * Prepares the fused conv + batch-norm + add + relu OpenCL kernel.
+ *
+ * Folds the batch-norm statistics into one scale and one shift per channel:
+ *   new_scale[i] = scale[i] / sqrt(variance[i] + epsilon)
+ *   new_bias[i]  = bias[i] - mean[i] * new_scale[i]
+ * Both are wrapped in CLImages and stored on `param` together with the
+ * sampling offset (filter_size / 2 - padding). The filter image is then
+ * initialised and the kernel matching the filter shape is registered:
+ * 1x1, depthwise 3x3, or regular 3x3.
+ *
+ * param: operator parameters; updated through SetNewScale, SetNewBias and
+ *        SetOffset.
+ * Returns true. Any other filter shape raises through
+ * PADDLE_MOBILE_THROW_EXCEPTION.
+ */
+template <>
+bool ConvBNAddReluKernel<GPU_CL, float>::Init(
+    FusionConvBNAddReluParam<GPU_CL> *param) {
+  // Only square filters with symmetric padding are handled below.
+  PADDLE_MOBILE_ENFORCE(
+      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+          param->Paddings()[0] == param->Paddings()[1],
+      "need equal");
+
+  const framework::CLImage *mean = param->InputMean();
+  const framework::CLImage *variance = param->InputVariance();
+  const framework::CLImage *scale = param->InputScale();
+  const framework::CLImage *bias = param->InputBias();
+
+  const float epsilon = param->Epsilon();
+
+  const int C = mean->numel();
+
+  auto mean_ptr = mean->data<float>();
+  auto variance_ptr = variance->data<float>();
+  auto scale_ptr = scale->data<float>();
+  auto bias_ptr = bias->data<float>();
+
+  float *new_scale_ptr = new float[C];
+  float *new_bias_ptr = new float[C];
+
+  // The inverse standard deviation is only needed within one iteration, so it
+  // is kept in a local instead of a stack array sized by the channel count
+  // (a variable-length array is not standard C++).
+  for (int i = 0; i < C; i++) {
+    const float inv_std =
+        1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
+    new_scale_ptr[i] = inv_std * scale_ptr[i];
+    new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std * scale_ptr[i];
+  }
+
+  framework::CLImage *new_scale = new framework::CLImage();
+  new_scale->SetTensorData(new_scale_ptr, variance->dims());
+  new_scale->InitCLImage(this->cl_helper_.CLContext(),
+                         cl_helper_.CLCommandQueue());
+
+  framework::CLImage *new_bias = new framework::CLImage();
+  new_bias->SetTensorData(new_bias_ptr, variance->dims());
+  new_bias->InitCLImage(this->cl_helper_.CLContext(),
+                        cl_helper_.CLCommandQueue());
+
+  // NOTE(review): the images are handed to `param` as raw pointers;
+  // presumably `param` takes ownership - confirm against op_param.h.
+  param->SetNewScale(new_scale);
+  param->SetNewBias(new_bias);
+
+  // Host staging buffers are released once both images have been initialised.
+  delete[](new_scale_ptr);
+  delete[](new_bias_ptr);
+
+  int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
+               static_cast<int>(param->Paddings()[1]);
+
+  param->SetOffset(offset);
+
+  if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
+    param->Filter()->InitNImage(cl_helper_.CLContext(),
+                                cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("convBNAdd_1x1", "conv_bn_add_relu_kernel.cl");
+    DLOG << " conv bn add relu conv 1x1";
+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] == 3) {
+    // Depthwise case: one input channel per filter, channel count preserved.
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("depth_convBNAdd_3x3",
+                               "conv_bn_add_relu_kernel.cl");
+    DLOG << " conv bn add relu depth_conv_3x3";
+
+  } else if (param->Filter()->dims()[2] == 3 &&
+             param->Filter()->dims()[3] == 3) {
+    param->Filter()->InitCLImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+
+    this->cl_helper_.AddKernel("convBNAdd_3x3", "conv_bn_add_relu_kernel.cl");
+    DLOG << " conv bn add relu conv_3x3";
+  } else {
+    PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
+  }
+
+  return true;
+}
+
+/*
+ * Binds the arguments of the kernel registered by Init() and enqueues it over
+ * the default work size of the output image.
+ *
+ * Argument order (indices 0..16): the three work-size dimensions, input,
+ * filter, bias, new_scale, new_bias and output images, then stride, offset,
+ * input channel-block count, dilation, input width/height and output
+ * width/height. Every OpenCL call is checked with CL_CHECK_ERRORS.
+ */
+template <>
+void ConvBNAddReluKernel<GPU_CL, float>::Compute(
+    const FusionConvBNAddReluParam<GPU_CL> &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
+
+  // Scalar copies of the work size; the kernel receives them as arguments.
+  int c_block = work_size[0];
+  int w = work_size[1];
+  int nh = work_size[2];
+
+  // Image handles.
+  auto input_image = param.Input()->GetCLImage();
+  auto filter_image = param.Filter()->GetCLImage();
+  auto bias_image = param.Bias()->GetCLImage();
+  auto scale_image = param.NewScale()->GetCLImage();
+  auto shift_image = param.NewBias()->GetCLImage();
+  auto output_image = param.Output()->GetCLImage();
+
+  // Convolution geometry.
+  int stride = param.Strides()[0];
+  int offset = param.Offset();
+  auto *folder = reinterpret_cast<framework::CLImageConverterFolder *>(
+      param.Input()->Converter());
+  int input_c = folder->GetCBlock();
+  int dilation = param.Dilations()[0];
+  int input_width = param.Input()->dims()[3];
+  int input_height = param.Input()->dims()[2];
+  int output_width = param.Output()->dims()[3];
+  int output_height = param.Output()->dims()[2];
+
+  cl_int status;
+  cl_uint arg = 0;  // next kernel argument index
+
+  status = clSetKernelArg(kernel, arg++, sizeof(int), &c_block);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, arg++, sizeof(int), &w);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, arg++, sizeof(int), &nh);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, arg++, sizeof(cl_mem), &input_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, arg++, sizeof(cl_mem), &filter_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, arg++, sizeof(cl_mem), &bias_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, arg++, sizeof(cl_mem), &scale_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, arg++, sizeof(cl_mem), &shift_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, arg++, sizeof(cl_mem), &output_image);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, arg++, sizeof(int), &stride);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, arg++, sizeof(int), &offset);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, arg++, sizeof(int), &input_c);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, arg++, sizeof(int), &dilation);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, arg++, sizeof(int), &input_width);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, arg++, sizeof(int), &input_height);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, arg++, sizeof(int), &output_width);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, arg++, sizeof(int), &output_height);
+  CL_CHECK_ERRORS(status);
+
+  // No explicit local size, no wait list and no completion event.
+  status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel,
+                                  work_size.size(), NULL, work_size.data(),
+                                  NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+}
+template class ConvBNAddReluKernel<GPU_CL, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/kernel/cl/depthwise_conv_kernel.cpp
+++ b/src/operators/kernel/cl/depthwise_conv_kernel.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef DEPTHWISECONV_OP
-
-#include "operators/kernel/depthwise_conv_kernel.h"
-#include "operators/kernel/central-arm-func/depthwise_conv_arm_func.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <>
-bool DepthwiseConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
-  DLOG << " depthwise conv kernel init begin ";
-  PADDLE_MOBILE_ENFORCE(
-      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
-          param->Paddings()[0] == param->Paddings()[1],
-      "need equal");
-  param->Filter()->InitCLImage(cl_helper_.CLContext(),
-                               this->cl_helper_.CLCommandQueue());
-  int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
-               static_cast<int>(param->Paddings()[1]);
-  param->SetOffset(offset);
-  this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
-  DLOG << " depthwise conv kernel init end ";
-  return true;
-}
-
-template <>
-void DepthwiseConvKernel<GPU_CL, float>::Compute(
-    const ConvParam<GPU_CL> &param) {
-  auto kernel = this->cl_helper_.KernelAt(0);
-  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
-  int c_block = default_work_size[0];
-  int w = default_work_size[1];
-  int nh = default_work_size[2];
-  auto input = param.Input()->GetCLImage();
-  auto filter = param.Filter()->GetCLImage();
-  auto output = param.Output()->GetCLImage();
-  int stride = param.Strides()[0];
-  int offset = param.Offset();
-  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
-                    param.Input()->Converter())
-                    ->GetCBlock();
-  int dilation = param.Dilations()[0];
-
-  int input_width = param.Input()->dims()[3];
-  int input_height = param.Input()->dims()[2];
-  int output_width = param.Output()->dims()[3];
-  int output_height = param.Output()->dims()[2];
-
-  cl_int status;
-
-  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
-  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
-  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
-  status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
-  status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
-  status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
-  status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
-  status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
-  status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
-  status = clSetKernelArg(kernel, 12, sizeof(int), &output_width);
-  status = clSetKernelArg(kernel, 13, sizeof(int), &output_height);
-
-  CL_CHECK_ERRORS(status);
-
-  //  cl_event out_event = param.Output()->GetClEvent();
-  //  cl_event wait_event = param.Input()->GetClEvent();
-
-  status = clEnqueueNDRangeKernel(
-      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
-      default_work_size.data(), NULL, 0, NULL, NULL);
-
-  CL_CHECK_ERRORS(status);
-}
-
-template class DepthwiseConvKernel<GPU_CL, float>;
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
+///* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. */
+//
+//#ifdef DEQUANT_OP
+//
+//#include "operators/kernel/dequantize_kernel.h"
+//
+// namespace paddle_mobile {
+// namespace operators {
+//
+// template <>
+// bool DequantizeKernel<GPU_CL, float>::Init(DequantizeParam<GPU_CL> *param) {
+//  DLOG << " depthwise conv kernel init begin ";
+//  PADDLE_MOBILE_ENFORCE(
+//      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+//          param->Paddings()[0] == param->Paddings()[1],
+//      "need equal");
+//  param->Filter()->InitCLImage(cl_helper_.CLContext(),
+//                               this->cl_helper_.CLCommandQueue());
+//  int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
+//               static_cast<int>(param->Paddings()[1]);
+//  param->SetOffset(offset);
+//  this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
+//  DLOG << " depthwise conv kernel init end ";
+//  return true;
+//}
+//
+// template <>
+// void DequantizeKernel<GPU_CL, float>::Compute(
+//    const DequantizeParam<GPU_CL> &param) {
+//  auto kernel = this->cl_helper_.KernelAt(0);
+//  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
+//  int c_block = default_work_size[0];
+//  int w = default_work_size[1];
+//  int nh = default_work_size[2];
+//  auto input = param.Input()->GetCLImage();
+//  auto filter = param.Filter()->GetCLImage();
+//  auto output = param.Output()->GetCLImage();
+//  int stride = param.Strides()[0];
+//  int offset = param.Offset();
+//  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
+//                    param.Input()->Converter())
+//                    ->GetCBlock();
+//  int dilation = param.Dilations()[0];
+//
+//  int input_width = param.Input()->dims()[3];
+//  int input_height = param.Input()->dims()[2];
+//  int output_width = param.Output()->dims()[3];
+//  int output_height = param.Output()->dims()[2];
+//
+//  cl_int status;
+//
+//  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+//  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
+//  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
+//  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+//  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+//  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
+//  status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
+//  status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
+//  status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
+//  status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
+//  status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
+//  status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
+//  status = clSetKernelArg(kernel, 12, sizeof(int), &output_width);
+//  status = clSetKernelArg(kernel, 13, sizeof(int), &output_height);
+//
+//  CL_CHECK_ERRORS(status);
+//
+//  //  cl_event out_event = param.Output()->GetClEvent();
+//  //  cl_event wait_event = param.Input()->GetClEvent();
+//
+//  status = clEnqueueNDRangeKernel(
+//      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
+//      NULL, default_work_size.data(), NULL, 0, NULL, NULL);
+//
+//  CL_CHECK_ERRORS(status);
+//}
+//
+// template class DepthwiseConvKernel<GPU_CL, float>;
+//
+//}  // namespace operators
+//}  // namespace paddle_mobile
+//
+//#endif
--- a/src/operators/kernel/cl/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/cl/elementwise_add_kernel.cpp
@@ -24,7 +24,11 @@ bool ElementwiseAddKernel<GPU_CL, float>::Init(
    ElementwiseAddParam<GPU_CL> *param) {
  DLOG << "-----init add-----";
  CLImage *bias = (CLImage *)(param->InputY());
-  bias->InitCLImage(cl_helper_.CLContext(), this->cl_helper_.CLCommandQueue());
+  if (!bias->isInit()) {
+    bias->InitCLImage(cl_helper_.CLContext(),
+                      this->cl_helper_.CLCommandQueue());
+  }
+
  DLOG << " bias: " << *bias;
  if (bias->dims().size() == 4) {
    this->cl_helper_.AddKernel("elementwise_add", "elementwise_add_kernel.cl");

--- a/src/operators/kernel/dequant_add_bn_relu_kernel.h
+++ b/src/operators/kernel/dequant_add_bn_relu_kernel.h
@@ -14,7 +14,7 @@ limitations under the License. */

 #pragma once

-#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
+#ifdef FUSION_DEQUANT_ADD_BN_OP

 #include "framework/operator.h"
 #include "operators/op_param.h"
@@ -23,12 +23,12 @@ namespace paddle_mobile {
 namespace operators {

 template <typename DeviceType, typename T>
-class FusionDequantAddBNReluKernel
+class FusionDequantAddBNKernel
    : public framework::OpKernelBase<DeviceType,
-                                     FusionDequantAddBNReluParam<DeviceType>> {
+                                     FusionDequantAddBNParam<DeviceType>> {
 public:
-  void Compute(const FusionDequantAddBNReluParam<DeviceType> &param);
-  bool Init(FusionDequantAddBNReluParam<DeviceType> *param);
+  void Compute(const FusionDequantAddBNParam<DeviceType> &param);
+  bool Init(FusionDequantAddBNParam<DeviceType> *param);
 };

 }  // namespace operators

--- a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h
@@ -12,42 +12,35 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifdef DEPTHWISECONV_OP
-
 #pragma once
-#include <vector>
-#include "operators/kernel/central-arm-func/conv_arm_func.h"
-#include "operators/math/depthwise_conv3x3.h"
+
+#include "framework/operator.h"
 #include "operators/op_param.h"

 namespace paddle_mobile {
 namespace operators {

-template <typename P>
-void DepthwiseConvCompute(const ConvParam<CPU> &param) {
-  Tensor Bias;
-  Bias.mutable_data<float>({param.Groups()});
-  if (param.Groups() == param.Input()->dims()[1] &&
-      param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
-    math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
-                               &Bias, false);
-  } else if (param.Groups() == param.Input()->dims()[1] &&
-             param.Input()->dims()[1] == param.Output()->dims()[1] &&
-             param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
-    //    math::DepthwiseConv3x3(param.Input(), param.Strides(),
-    //    param.Paddings(),
-    //                           param.Filter(), &Bias, param.Output(), false);
-    math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), param.Output(),
-                                 Bias, false);
-
-  } else {
-    GemmConv<float, float>(param);
-  }
-}
+#ifdef FUSION_DEQUANT_BN_RELU_OP  // class exists only when this op is enabled
+template <typename DeviceType, typename T>
+class FusionDequantBNReluKernel
+    : public framework::OpKernelBase<DeviceType,
+                                     FusionDequantBNReluParam<DeviceType>> {
+ public:  // both members are declared here without bodies
+  void Compute(const FusionDequantBNReluParam<DeviceType> &param);
+  bool Init(FusionDequantBNReluParam<DeviceType> *param);
+};
+#endif  // FUSION_DEQUANT_BN_RELU_OP
+
+#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP  // class exists only when op is enabled
+template <typename DeviceType, typename T>
+class FusionDequantAddBNReluKernel
+    : public framework::OpKernelBase<DeviceType,
+                                     FusionDequantAddBNReluParam<DeviceType>> {
+ public:  // both members are declared here without bodies
+  void Compute(const FusionDequantAddBNReluParam<DeviceType> &param);
+  bool Init(FusionDequantAddBNReluParam<DeviceType> *param);
+};
+#endif  // FUSION_DEQUANT_ADD_BN_RELU_OP

 }  // namespace operators
 }  // namespace paddle_mobile
-
-#endif
--- a/src/operators/math/depthwise_conv3x3.cpp
+++ b/src/operators/math/depthwise_conv3x3.cpp
@@ -1272,13 +1272,16 @@ void DepthwiseConvAddBNRelu3x3s2p1(const framework::Tensor *input,

 void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
                            const framework::Tensor *filter,
-                            framework::Tensor *output, framework::Tensor bias,
+                            framework::Tensor *output, framework::Tensor *bias,
                            bool if_bias) {
 #if __ARM_NEON
  const float *input_data = input->data<float>();
  const float *filter_data = filter->data<float>();
  float *output_data = output->data<float>();
-  const float *bias_data = bias.data<float>();
+  const float *bias_data = nullptr;  // stays null when if_bias is false
+  if (if_bias) {  // bias is only dereferenced when the caller supplies one
+    bias_data = bias->data<float>();
+  }

  const int in_h = static_cast<int>(input->dims()[2]);
  const int in_w = static_cast<int>(input->dims()[3]);
@@ -1905,7 +1908,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const framework::Tensor *input,

 void DepthwiseConv3x3s2p0(const framework::Tensor *input,
                          const framework::Tensor *filter,
-                          framework::Tensor *output, framework::Tensor bias,
+                          framework::Tensor *output, framework::Tensor *bias,
                          bool if_bias) {
 #if __ARM_NEON

@@ -1925,7 +1928,7 @@ void DepthwiseConv3x3s2p0(const framework::Tensor *input,
    for (int c = 0; c < input_channel; c++) {
      const float *filter_data = filter->data<float>() + c * 9;
      const float *input_data = input->data<float>() + c * inhxw;
-      const float *bias_data = bias.data<float>() + c;
+      const float *bias_data = bias->data<float>() + c;
      float *output_data = output->data<float>() + c * outhxw;
      float w00 = filter_data[0];
      float w01 = filter_data[1];

--- a/src/operators/math/depthwise_conv3x3.h
+++ b/src/operators/math/depthwise_conv3x3.h
@@ -50,7 +50,7 @@ void DepthwiseConvAddBNRelu3x3s2p1(const framework::Tensor *input,

 void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
                            const framework::Tensor *filter,
-                            framework::Tensor *output, framework::Tensor bias,
+                            framework::Tensor *output, framework::Tensor *bias,
                            bool if_bias);

 void DepthwiseConvAddBNRelu3x3s2p1v2(const framework::Tensor *input,
@@ -62,7 +62,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const framework::Tensor *input,

 void DepthwiseConv3x3s2p0(const framework::Tensor *input,
                          const framework::Tensor *filter,
-                          framework::Tensor *output, framework::Tensor bias,
+                          framework::Tensor *output, framework::Tensor *bias,
                          bool if_bias);

 // TODO(hjchen2) need to be implemented
@@ -70,16 +70,19 @@ void DepthwiseConv3x3s2p0(const framework::Tensor *input,
 // void DepthwiseConv3x3(const framework::Tensor *input,
 //                      const framework::Tensor *filter,
 //                      const std::vector<int> &strides,
+//                      const std::vector<int> &paddings,
 //                      framework::Tensor *output);

 template <typename Itype, typename Otype>
-void DepthwiseConv3x3s1(const framework::Tensor &input,
+void DepthwiseConv3x3S1(const framework::Tensor &input,
                        const framework::Tensor &filter,
+                        const std::vector<int> &paddings,
                        framework::Tensor *output);

 template <typename Itype, typename Otype>
-void DepthwiseConv3x3s2(const framework::Tensor &input,
+void DepthwiseConv3x3S2(const framework::Tensor &input,
                        const framework::Tensor &filter,
+                        const std::vector<int> &paddings,
                        framework::Tensor *output);

 }  // namespace math

--- a/src/operators/math/depthwise_conv3x3_int8.cpp
+++ b/src/operators/math/depthwise_conv3x3_int8.cpp
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -405,9 +405,9 @@ class ConvParam : public OpParam {

  const RType *Input() const { return input_; }

-  RType *&Filter() const { return filter_; }
+  RType *Filter() const { return filter_; }

-  RType *&Output() const { return output_; }
+  RType *Output() const { return output_; }

  const vector<int> &Strides() const { return strides_; }

@@ -419,6 +419,8 @@ class ConvParam : public OpParam {
    EXEC_INVALID = 0,
    EXEC_GEMM_FLOAT,
    EXEC_DEPTHWISE3x3S1P1_FLOAT,
+    EXEC_DEPTHWISE3x3S2P0_FLOAT,
+    EXEC_DEPTHWISE3x3S2P1_FLOAT,
    EXEC_DEPTHWISE3x3_FLOAT,
    EXEC_WINOGRAD3X3_FLOAT,
    EXEC_WINOGRAD5X5_FLOAT,
@@ -439,8 +441,8 @@ class ConvParam : public OpParam {

 private:
  RType *input_;
-  mutable RType *output_;
-  mutable RType *filter_;
+  RType *output_;
+  RType *filter_;
  vector<int> strides_;
  vector<int> paddings_;
  vector<int> dilations_;
@@ -2573,7 +2575,9 @@ class DequantizeParam : public OpParam {
  DequantizeParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
                  const AttributeMap &attrs, const Scope &scope) {
    input_ = InputXFrom<GType>(inputs, scope);
+    if (outputs.count("Out")) {
      output_ = OutFrom<GType>(outputs, scope);
+    }
    activation_scale_ = OpParam::GetVarValue<GType>("Scale", inputs, scope);
    // dequantization is performed as x = x / static_scale / online_scale
    if (HasAttr("weight_scale", attrs)) {
@@ -2593,20 +2597,19 @@ class DequantizeParam : public OpParam {
 };
 #endif

-#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
+#if defined(FUSION_DEQUANT_ADD_BN_OP) ||      \
+    defined(FUSION_DEQUANT_ADD_BN_RELU_OP) || \
+    defined(FUSION_DEQUANT_BN_RELU_OP) || defined(FUSION_DEQUANT_BN_OP)
 template <typename Dtype>
-class FusionDequantAddBNReluParam : public DequantizeParam<Dtype> {
+class FusionDequantBNParam : public DequantizeParam<Dtype> {
  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
  typedef typename DtypeTensorTrait<Dtype>::rtype RType;

 public:
-  FusionDequantAddBNReluParam(const VariableNameMap &inputs,
+  FusionDequantBNParam(const VariableNameMap &inputs,
                       const VariableNameMap &outputs,
                       const AttributeMap &attrs, const Scope &scope)
      : DequantizeParam<Dtype>(inputs, outputs, attrs, scope) {
-    // element wise add params
-    axis_ = OpParam::GetAttr<int>("axis", attrs);
-    bias_ = OpParam::InputYFrom<GType>(inputs, scope);
    // batch norm params
    bn_mean_ = OpParam::GetVarValue<GType>("BNMean", inputs, scope);
    bn_variance_ = OpParam::GetVarValue<GType>("BNVariance", inputs, scope);
@@ -2614,21 +2617,83 @@ class FusionDequantAddBNReluParam : public DequantizeParam<Dtype> {
    bn_bias_ = OpParam::GetVarValue<GType>("BNBias", inputs, scope);
    epsilon_ = OpParam::GetAttr<float>("epsilon", attrs);
    // output
-    output_ = OpParam::OutFrom<GType>(outputs, scope);
+    if (outputs.count("Y")) {
+      this->output_ = OpParam::OutputYFrom<GType>(outputs, scope);
+    }
  }

 public:
-  // elementwise add
-  int axis_;
-  RType *bias_;
  // batch norm
  RType *bn_mean_;
  RType *bn_variance_;
  RType *bn_scale_;
  RType *bn_bias_;
  float epsilon_;
+};
+#endif
+
+#if defined(FUSION_DEQUANT_ADD_BN_RELU_OP) || defined(FUSION_DEQUANT_ADD_BN_OP)
+template <typename Dtype>
+class FusionDequantAddBNParam : public FusionDequantBNParam<Dtype> {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  FusionDequantAddBNParam(const VariableNameMap &inputs,
+                          const VariableNameMap &outputs,
+                          const AttributeMap &attrs, const Scope &scope)
+      : FusionDequantBNParam<Dtype>(inputs, outputs, attrs, scope) {
+    // element wise add params
+    axis_ = OpParam::GetAttr<int>("axis", attrs);
+    bias_ = OpParam::InputYFrom<GType>(inputs, scope);
    // output
-  RType *output_;
+    if (outputs.count("Y")) {  // NOTE(review): repeats the base ctor's lookup
+      this->output_ = OpParam::OutputYFrom<GType>(outputs, scope);
+    }
+  }
+
+ public:
+  // elementwise add
+  int axis_;
+  RType *bias_;
+};
+#endif  // FUSION_DEQUANT_ADD_BN_RELU_OP || FUSION_DEQUANT_ADD_BN_OP
+
+#ifdef FUSION_DEQUANT_BN_RELU_OP
+template <typename Dtype>
+class FusionDequantBNReluParam : public FusionDequantBNParam<Dtype> {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  FusionDequantBNReluParam(const VariableNameMap &inputs,
+                           const VariableNameMap &outputs,
+                           const AttributeMap &attrs, const Scope &scope)
+      : FusionDequantBNParam<Dtype>(inputs, outputs, attrs, scope) {
+    // output: "Out" is bound last, so it takes precedence over a "Y" entry
+    if (outputs.count("Out")) {
+      this->output_ = OpParam::OutFrom<GType>(outputs, scope);
+    }
+  }
+};
+#endif  // FUSION_DEQUANT_BN_RELU_OP
+
+#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
+template <typename Dtype>
+class FusionDequantAddBNReluParam : public FusionDequantAddBNParam<Dtype> {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  FusionDequantAddBNReluParam(const VariableNameMap &inputs,
+                              const VariableNameMap &outputs,
+                              const AttributeMap &attrs, const Scope &scope)
+      : FusionDequantAddBNParam<Dtype>(inputs, outputs, attrs, scope) {
+    // output: "Out" is bound last, so it takes precedence over a "Y" entry
+    if (outputs.count("Out")) {
+      this->output_ = OpParam::OutFrom<GType>(outputs, scope);
+    }
+  }
 };
 #endif


--- a/test/operators/test_quantize_op.cpp
+++ b/test/operators/test_quantize_op.cpp
@@ -44,25 +44,19 @@ struct Round<round::RoundTowardsZero> {
 template <>
 struct Round<round::RoundToEven> {
  int8_t operator()(float x) {
-    int8_t ret = 0;
    float v = std::round(x);
-    int32_t q = (int32_t)v;
-    if (abs(abs(q - x) - 0.5) > 0) {
-      ret = q;
-    } else {
-      if (abs(q) % 2 == 0) {
-        ret = q;
-      } else {
-        ret = q + ((q > 0) ? -1 : 1);
+    int32_t q = static_cast<int32_t>(v);  // v: ties rounded away from zero
+    if (std::fabs(q - x) == 0.5f) {  // exact tie; test against x (q - v is 0)
+      if (abs(q) % 2 != 0) {  // odd result on a tie is not round-to-even
+        q = q + ((q > 0) ? -1 : 1);  // step back to the even neighbour
      }
    }
-    return ret;
+    return static_cast<int8_t>(q);
  }
 };

 template <round::RoundType T>
-static void quantize(const Tensor *input, const float scale, const int pad,
-                     const int8_t pad_val, Tensor *output) {
+static void quantize(const Tensor *input, const float scale, Tensor *output) {
  int batch_size = input->dims()[0];
  int channels = input->dims()[1];
  int input_h = input->dims()[2];
@@ -77,29 +71,9 @@ static void quantize(const Tensor *input, const float scale, const int pad,
  for (int nc = 0; nc < batch_size * channels; ++nc) {
    const float *xh = x + nc * input_spatial;
    int8_t *yh = y + nc * output_spatial;
-    // pad top
-    for (int h = 0; h < pad; ++h, yh += output_w) {
-      for (int w = 0; w < output_w; ++w) {
-        yh[w] = pad_val;
-      }
-    }
    for (int h = 0; h < input_h; ++h, yh += output_w, xh += input_w) {
-      // pad left
-      for (int w = 0; w < pad; ++w) {
-        yh[w] = pad_val;
-      }
      for (int w = 0; w < input_w; ++w) {
-        yh[w + pad] = Round<T>()(xh[w] * scale);
-      }
-      // pad right
-      for (int w = 0; w < pad; ++w) {
-        yh[pad + input_w + w] = pad_val;
-      }
-    }
-    // pad bottom
-    for (int h = 0; h < pad; ++h, yh += output_w) {
-      for (int w = 0; w < output_w; ++w) {
-        yh[w] = pad_val;
+        yh[w] = Round<T>()(xh[w] * scale);
      }
    }
  }
@@ -120,19 +94,14 @@ static float find_abs_max(const Tensor *input) {

 int TestQuqntizeOp(int argc, char *argv[]) {
  if (argc < 5) {
-    std::cout
-        << "Usage: ./test-quantize-op batch_size channel height width [pad]"
+    std::cout << "Usage: ./test-quantize-op batch_size channel height width"
              << std::endl;
    return 1;
  }
-  int pad = 0;
  int batch_size = atoi(argv[1]);
  int channel = atoi(argv[2]);
  int height = atoi(argv[3]);
  int width = atoi(argv[4]);
-  if (argc == 6) {
-    pad = atoi(argv[5]);
-  }
  std::cout << "batch_size: " << batch_size << ", channel: " << channel
            << ", height: " << height << ", width: " << width << std::endl;
  framework::DDim dim =
@@ -153,7 +122,6 @@ int TestQuqntizeOp(int argc, char *argv[]) {
  auto output_scale_var = scope.get()->Var("output_scale");

  framework::AttributeMap attrs;
-  attrs["paddings"].Set<vector<int>>(std::vector<int>({pad, pad}));
  auto *op = new operators::QuantizeOp<CPU, float>("quantize", inputs, outputs,
                                                   attrs, scope);
  op->InferShape();
@@ -172,9 +140,9 @@ int TestQuqntizeOp(int argc, char *argv[]) {
  framework::Tensor output_cmp;
  output_cmp.Resize(output->dims());
  float scale = 127 / output_scale_cmp;
-  // quantize<round::RoundToEven>(input, scale, pad, 0, &output_cmp);
-  // quantize<round::RoundAwayZero>(input, scale, pad, 0, &output_cmp);
-  quantize<round::RoundTowardsZero>(input, scale, pad, 0, &output_cmp);
+  // quantize<round::RoundToEven>(input, scale, &output_cmp);
+  // quantize<round::RoundAwayZero>(input, scale, &output_cmp);
+  quantize<round::RoundTowardsZero>(input, scale, &output_cmp);
  int8_t *output_cmp_data = output_cmp.data<int8_t>();
  for (int i = 0; i < output->numel(); ++i) {
    PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i],

--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -249,7 +249,9 @@ if(NOT FOUND_MATCH)
  set(SUM_OP ON)
  set(QUANT_OP ON)
  set(DEQUANT_OP ON)
-  set(FUSION_DEQUANT_ADD_BN_RELU ON)
+  set(FUSION_DEQUANT_ADD_BN_OP ON)
+  set(FUSION_DEQUANT_BN_RELU_OP ON)
+  set(FUSION_DEQUANT_ADD_BN_RELU_OP ON)
 endif()

  # option(BATCHNORM_OP "" ON)
@@ -451,10 +453,17 @@ endif()
 if (DEQUANT_OP)
  add_definitions(-DDEQUANT_OP)
 endif()
-if (FUSION_DEQUANT_ADD_BN_RELU)
+if (FUSION_DEQUANT_ADD_BN_OP)
+  add_definitions(-DFUSION_DEQUANT_ADD_BN_OP)
+endif()
+if (FUSION_DEQUANT_BN_RELU_OP)
+  add_definitions(-DFUSION_DEQUANT_BN_RELU_OP)
+endif()
+if (FUSION_DEQUANT_ADD_BN_RELU_OP)
  add_definitions(-DFUSION_DEQUANT_ADD_BN_RELU_OP)
 endif()

+
 if (TANH_OP)
  add_definitions(-DTANH_OP)
 endif()
@@ -467,3 +476,4 @@ endif()
 if (FUSION_DECONVADDRELU_OP)
  add_definitions(-DFUSION_DECONVADDRELU_OP)
 endif()
+