diff --git a/src/common/types.cpp b/src/common/types.cpp
index 510313d9fee0940d7162ea2c6b09426f6d9ce17a..8f284b3fe1115bd8cec78430a405289aae98e898 100644
--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -73,6 +73,8 @@ const char *G_OP_TYPE_QUANTIZE = "quantize";
 const char *G_OP_TYPE_DEQUANTIZE = "dequantize";
 extern const char *G_OP_TYPE_TANH = "tanh";
 extern const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu";
+extern const char *G_OP_TYPE_FUSION_DECONV_ADD = "fusion_deconv_add";
+extern const char *G_OP_TYPE_FUSION_DECONV_ADD_RELU = "fusion_deconv_add_relu";
 
 std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
@@ -133,5 +135,7 @@ std::unordered_map<
         {G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}},
         {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}},
         {G_OP_TYPE_TANH, {{"X"}, {"Out"}}},
-        {G_OP_TYPE_FUSION_DECONV_RELU, {{"Input"}, {"Out"}}}};
+        {G_OP_TYPE_FUSION_DECONV_RELU, {{"Input"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_DECONV_ADD, {{"Input"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_DECONV_ADD_RELU, {{"Input"}, {"Out"}}}};
 }  // namespace paddle_mobile
diff --git a/src/common/types.h b/src/common/types.h
index 4cd35ac91084f6518858c97cf4c0e8da5b09555b..e9c0f81232dab7583c57fb036b58601aa26ec3c9 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -142,6 +142,9 @@ extern const char *G_OP_TYPE_DEQUANTIZE;
 extern const char *G_OP_TYPE_TANH;
 extern const char *G_OP_TYPE_FUSION_DECONV_RELU;
 
+extern const char *G_OP_TYPE_FUSION_DECONV_ADD;
+extern const char *G_OP_TYPE_FUSION_DECONV_ADD_RELU;
+
 extern std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
     op_input_output_key;
diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp
index bd463b0bd2aba2ef031a3fc5182ff92910c18f5b..98061dbbb768414fa2689a3b8fb35d559f4614a5 100644
--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -280,6 +280,9 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
     clock_gettime(CLOCK_MONOTONIC, &ts);
     profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
 #endif
+    if (loddable_) {
+      ops[i]->InferShape();
+    }
     // to Run
     ops[i]->Run();
 #ifdef PADDLE_MOBILE_PROFILE
diff --git a/src/framework/loader.cpp b/src/framework/loader.cpp
index 5587d0698fa2b9a04532deae618545d15ecd631f..3a8541e2139945f15605643092d2e9df4ecbefe7 100644
--- a/src/framework/loader.cpp
+++ b/src/framework/loader.cpp
@@ -43,15 +43,21 @@ void Loader<Dtype, P>::InitMemoryFromProgram(
           tensor->Resize(make_ddim(dim));
         } else {
           auto dim = var_desc->Tensor_desc().Dims();
-          PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0");
+          //          PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0");
           //          dim[0] = 1;
-          for (auto &d : dim) {
-            if (d < 0) {
-              d *= -1;
+          if (dim.size() == 0) {
+            auto tensor = var->GetMutable<LoDTensor>();
+            framework::DDim dDim = {0};
+            tensor->Resize(dDim);
+          } else {
+            for (auto &d : dim) {
+              if (d < 0) {
+                d *= -1;
+              }
             }
+            auto tensor = var->GetMutable<LoDTensor>();
+            tensor->Resize(make_ddim(dim));
           }
-          auto tensor = var->GetMutable<LoDTensor>();
-          tensor->Resize(make_ddim(dim));
         }
       } else {
         // TODO(codeWorm): some.
diff --git a/src/io/jni/paddle_mobile_jni.cpp b/src/io/jni/paddle_mobile_jni.cpp
index 56d522b1560d2c4cbeab04eeceaa598543273806..93b14f01e3b1c8f70a573471f53ad80cdfc25746 100644
--- a/src/io/jni/paddle_mobile_jni.cpp
+++ b/src/io/jni/paddle_mobile_jni.cpp
@@ -405,9 +405,9 @@ Java_com_baidu_paddle_PML_predictLod(JNIEnv *env, jclass thiz, jlongArray buf) {
   ANDROIDLOGE("predict nlp size %d", count);
 
   result = env->NewLongArray(count);
-
   env->SetLongArrayRegion(result, 0, count, vec_result->data<int64_t>());
 
+  env->ReleaseLongArrayElements(buf, ddim_ptr, 0);
   return result;
 }
 
diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp
index 8e4a72dcadf1bc5105e1fc5f9b8c96bfb6d9aa3d..6a773da00ff6541d55a6a9d04ca470fed5de81a1 100644
--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
@@ -122,9 +122,12 @@ void PaddleMobile<Dtype, P>::Clear() {
   executor_ = nullptr;
   loader_ = nullptr;
 }
-
 template <typename Dtype, Precision P>
-double PaddleMobile<Dtype, P>::GetPredictTime() {
+double PaddleMobile<Dtype, P>::GetPredictTime() {}
+
+#ifdef PADDLE_MOBILE_CPU
+template <>
+double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() {
   int m = 32;
   int n = 224 * 224;
   int k = 27;
@@ -147,8 +150,8 @@ double PaddleMobile<Dtype, P>::GetPredictTime() {
   }
   paddle_mobile::operators::math::Gemm gemm;
   auto time1 = paddle_mobile::time();
-  //  gemm.Sgemm(m, n, k, static_cast<float>(1), a, lda, b, ldb,
-  //             static_cast<float>(0), c, ldc, false, nullptr);
+  gemm.Sgemm(m, n, k, static_cast<float>(1), a, lda, b, ldb,
+             static_cast<float>(0), c, ldc, false, nullptr);
   auto time2 = paddle_mobile::time();
   double cost = paddle_mobile::time_diff(time1, time2);
   paddle_mobile::memory::Free(a);
@@ -156,6 +159,7 @@ double PaddleMobile<Dtype, P>::GetPredictTime() {
   paddle_mobile::memory::Free(c);
   return cost;
 }
+#endif
 
 template <typename Dtype, Precision P>
 PaddleMobile<Dtype, P>::~PaddleMobile() {
diff --git a/src/operators/feed_op.cpp b/src/operators/feed_op.cpp
index c3211b9fa9cc4b973788af4104c7ebe7bea2f54f..ac707d22696dd0a62902137607fb64c141341d77 100644
--- a/src/operators/feed_op.cpp
+++ b/src/operators/feed_op.cpp
@@ -21,7 +21,13 @@ template <typename DeviceType, typename T>
 void FeedOp<DeviceType, T>::InferShape() const {
   auto out_dims = this->param_.Out()->dims();
   out_dims[0] = this->param_.BatchSize();
-  this->param_.Out()->Resize(out_dims);
+  auto input_dims = this->param_.InputX()->dims();
+  DLOG << input_dims.size();
+  if (input_dims.size() == 4) {
+    this->param_.Out()->Resize(input_dims);
+  } else {
+    this->param_.Out()->Resize(out_dims);
+  }
 }
 
 }  // namespace operators
diff --git a/src/operators/fusion_conv_add_op.cpp b/src/operators/fusion_conv_add_op.cpp
index 1b32ec39b65f8b16fd8967be3f45f4b31db5ca16..731bb631bb98490d580e0c6fe28c24312f6ccb57 100644
--- a/src/operators/fusion_conv_add_op.cpp
+++ b/src/operators/fusion_conv_add_op.cpp
@@ -61,5 +61,7 @@ REGISTER_OPERATOR_MALI_GPU(fusion_conv_add, ops::FusionConvAddOp);
 #ifdef PADDLE_MOBILE_CL
 REGISTER_OPERATOR_CL(fusion_conv_add, ops::FusionConvAddOp);
 #endif
-
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(fusion_conv_add, ops::FusionConvAddOp);
+#endif
 #endif
diff --git a/src/operators/fusion_conv_add_relu_op.h b/src/operators/fusion_conv_add_relu_op.h
index 22ba67c617ecdb0f3be2f5757504b6ba530b092c..c4cd61016c6da1100819b3531bd41466726f292a 100644
--- a/src/operators/fusion_conv_add_relu_op.h
+++ b/src/operators/fusion_conv_add_relu_op.h
@@ -29,8 +29,9 @@ namespace operators {
 class FusionConvAddReluOpMatcher : public framework::FusionOpMatcher {
  public:
   FusionConvAddReluOpMatcher() {
-    //    node_ = framework::Node(G_OP_TYPE_FUSION_CONV_ADD);
-    //    node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU);
+    node_ = framework::Node(G_OP_TYPE_CONV);
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
+        std::make_shared<framework::Node>(G_OP_TYPE_RELU);
   }
 
   void FolderNodes(
diff --git a/src/operators/fusion_deconv_add_op.cpp b/src/operators/fusion_deconv_add_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..99af70c1c05c166481f522282bee11895546afa5
--- /dev/null
+++ b/src/operators/fusion_deconv_add_op.cpp
@@ -0,0 +1,33 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DECONVADD_OP
+
+#include "operators/fusion_deconv_add_op.h"
+
+namespace paddle_mobile {
+namespace operators {}
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+REGISTER_FUSION_MATCHER(fusion_deconv_add, ops::FusionDeconvAddMatcher);
+#ifdef PADDLE_MOBILE_CPU
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(fusion_deconv_add, ops::FusionDeconvAddOp);
+#endif
+
+#endif
diff --git a/src/operators/fusion_deconv_add_op.h b/src/operators/fusion_deconv_add_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..45d6d33aad683b1d45480c304cf44ef10349cdf3
--- /dev/null
+++ b/src/operators/fusion_deconv_add_op.h
@@ -0,0 +1,108 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_DECONVADD_OP
+#pragma once
+#include <string>
+#include <vector>
+
+#include "framework/operator.h"
+#include "framework/program/program-optimize/fusion_op_register.h"
+#include "operators/kernel/deconv_add_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+using std::string;
+using std::vector;
+class FusionDeconvAddMatcher : public framework::FusionOpMatcher {
+ public:
+  FusionDeconvAddMatcher() {
+    node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE);
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD);
+  }
+
+  void FolderNodes(
+      framework::Node *node,
+      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
+    node->Folder(node_.Depth(), Type(),
+                 {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}}, removed_nodes);
+  }
+
+  std::string Type() { return G_OP_TYPE_FUSION_DECONV_ADD; }
+};
+
+template <typename DeviceType, typename T>
+class FusionDeconvAddOp : public framework::OperatorWithKernel<
+                              DeviceType, FusionDeconvAddParam<DeviceType>,
+                              operators::DeconvAddKernel<DeviceType, T>> {
+ public:
+  FusionDeconvAddOp(const string &type, const VariableNameMap &inputs,
+                    const VariableNameMap &outputs,
+                    const framework::AttributeMap &attrs,
+                    std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, FusionDeconvAddParam<DeviceType>,
+            operators::DeconvAddKernel<DeviceType, T>>(type, inputs, outputs,
+                                                       attrs, scope) {}
+
+  void InferShape() const {
+    auto input = this->param_.Input();
+    auto in_dims = input->dims();
+
+    auto filter = this->param_.Filter();
+    auto filter_dims = filter->dims();
+
+    std::vector<int> strides = this->param_.Strides();
+    std::vector<int> paddings = this->param_.Paddings();
+    std::vector<int> dilations = this->param_.Dilations();
+
+    int groups = this->param_.Groups();
+
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() == 4 || in_dims.size() == 5,
+        "ConvTransposeOp intput should be 4-D or 5-D tensor.");
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() == filter_dims.size(),
+        "ConvTransposeOp input dimension and filter dimension "
+        "should be the same.");
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() - strides.size() == 2U,
+        "ConvTransposeOp input dimension and strides dimension should "
+        "be consistent.");
+    PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(),
+                          "ConvTransposeOp paddings dimension and strides "
+                          "dimension should be the same.");
+    PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(),
+                          "ConvTransposeOp paddings dimension and dilations "
+                          "dimension should be the same.");
+    PADDLE_MOBILE_ENFORCE(
+        in_dims[1] == filter_dims[0],
+        "In ConvTransposeOp, The number of input channels should "
+        "be equal to the number of filter's channels.");
+
+    std::vector<int64_t> output_shape({in_dims[0], filter_dims[1] * groups});
+    for (size_t i = 0; i < strides.size(); ++i) {
+      auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
+      output_shape.push_back((in_dims[i + 2] - 1) * strides[i] -
+                             2 * paddings[i] + filter_extent);
+    }
+    this->param_.Output()->Resize(framework::make_ddim(output_shape));
+  }
+
+ protected:
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif  // FUSION_DECONV_ADD_OP
diff --git a/src/operators/fusion_deconv_add_relu_op.cpp b/src/operators/fusion_deconv_add_relu_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cb76eda2ea942d8852217f02a6ed54a60c3b4cc4
--- /dev/null
+++ b/src/operators/fusion_deconv_add_relu_op.cpp
@@ -0,0 +1,34 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DECONVADDRELU_OP
+
+#include "operators/fusion_deconv_add_relu_op.h"
+
+namespace paddle_mobile {
+namespace operators {}
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+REGISTER_FUSION_MATCHER(fusion_deconv_add_relu,
+                        ops::FusionDeconvAddReluMatcher);
+#ifdef PADDLE_MOBILE_CPU
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(fusion_deconv_add_relu, ops::FusionDeconvAddReluOp);
+#endif
+
+#endif
diff --git a/src/operators/fusion_deconv_add_relu_op.h b/src/operators/fusion_deconv_add_relu_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..eeef6d3958746edfdc114192c3b923db1e102ced
--- /dev/null
+++ b/src/operators/fusion_deconv_add_relu_op.h
@@ -0,0 +1,110 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_DECONVADDRELU_OP
+#pragma once
+#include <string>
+#include <vector>
+
+#include "framework/operator.h"
+#include "framework/program/program-optimize/fusion_op_register.h"
+#include "operators/kernel/deconv_add_relu_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+using std::string;
+using std::vector;
+class FusionDeconvAddReluMatcher : public framework::FusionOpMatcher {
+ public:
+  FusionDeconvAddReluMatcher() {
+    node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE);
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
+        std::make_shared<framework::Node>(G_OP_TYPE_RELU);
+  }
+
+  void FolderNodes(
+      framework::Node *node,
+      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
+    node->Folder(node_.Depth(), Type(),
+                 {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}}, removed_nodes);
+  }
+
+  std::string Type() { return G_OP_TYPE_FUSION_DECONV_ADD_RELU; }
+};
+
+template <typename DeviceType, typename T>
+class FusionDeconvAddReluOp
+    : public framework::OperatorWithKernel<
+          DeviceType, FusionDeconvAddReluParam<DeviceType>,
+          operators::DeconvAddReluKernel<DeviceType, T>> {
+ public:
+  FusionDeconvAddReluOp(const string &type, const VariableNameMap &inputs,
+                        const VariableNameMap &outputs,
+                        const framework::AttributeMap &attrs,
+                        std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, FusionDeconvAddReluParam<DeviceType>,
+            operators::DeconvAddReluKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+
+  void InferShape() const {
+    auto input = this->param_.Input();
+    auto in_dims = input->dims();
+
+    auto filter = this->param_.Filter();
+    auto filter_dims = filter->dims();
+
+    std::vector<int> strides = this->param_.Strides();
+    std::vector<int> paddings = this->param_.Paddings();
+    std::vector<int> dilations = this->param_.Dilations();
+
+    int groups = this->param_.Groups();
+
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() == 4 || in_dims.size() == 5,
+        "ConvTransposeOp intput should be 4-D or 5-D tensor.");
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() == filter_dims.size(),
+        "ConvTransposeOp input dimension and filter dimension "
+        "should be the same.");
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() - strides.size() == 2U,
+        "ConvTransposeOp input dimension and strides dimension should "
+        "be consistent.");
+    PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(),
+                          "ConvTransposeOp paddings dimension and strides "
+                          "dimension should be the same.");
+    PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(),
+                          "ConvTransposeOp paddings dimension and dilations "
+                          "dimension should be the same.");
+    PADDLE_MOBILE_ENFORCE(
+        in_dims[1] == filter_dims[0],
+        "In ConvTransposeOp, The number of input channels should "
+        "be equal to the number of filter's channels.");
+
+    std::vector<int64_t> output_shape({in_dims[0], filter_dims[1] * groups});
+    for (size_t i = 0; i < strides.size(); ++i) {
+      auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
+      output_shape.push_back((in_dims[i + 2] - 1) * strides[i] -
+                             2 * paddings[i] + filter_extent);
+    }
+    this->param_.Output()->Resize(framework::make_ddim(output_shape));
+  }
+
+ protected:
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif  // FUSION_DECONV_ADD_RELU_OP
diff --git a/src/operators/kernel/central-arm-func/conv_add_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_arm_func.h
index d71bc235977236fbd0dd332df556ea4bd41eacf4..bacaa866b12957cfc300049c56bb9648fd360770 100644
--- a/src/operators/kernel/central-arm-func/conv_add_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_arm_func.h
@@ -115,6 +115,7 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
 
 template <typename P>
 void ConvAddCompute(const FusionConvAddParam<CPU> &param) {
+  param.Output()->mutable_data<float>();
   if (param.Groups() == param.Input()->dims()[1] &&
       param.Input()->dims()[1] == param.Output()->dims()[1] &&
       param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
diff --git a/src/operators/kernel/cl/cl_kernel/conv_bn_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_bn_relu_kernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..3aba8c2606d96812cdad72f570b27af60f632ae5
--- /dev/null
+++ b/src/operators/kernel/cl/cl_kernel/conv_bn_relu_kernel.cl
@@ -0,0 +1,18 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define BATCH_NORM
+#define RELU
+
+#include "conv_kernel.inc.cl"
diff --git a/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl b/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl
index 64bb1845b0bd2c04c8761845b90dbed9e391a77b..f6014b732398cccd025a39cfb4a824b3154fcd66 100644
--- a/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl
@@ -20,7 +20,8 @@ __kernel void fetch(__private const int in_height,
                     __global float* out,
                     __private const int size_ch,
                     __private const int size_block,
-                    __private const int size_batch) {
+                    __private const int size_batch,
+                    __private const int C) {
   const int in_c = get_global_id(0);
   const int in_w = get_global_id(1);
   const int in_nh = get_global_id(2);
@@ -35,9 +36,17 @@ __kernel void fetch(__private const int in_height,
 
   const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
   out[index] = convert_float(in.x);
-  out[index + size_ch] = convert_float(in.y);
+  if(C - 4 * in_c>=2){
+   out[index + size_ch] = convert_float(in.y);
+  }
+  if(C - 4 * in_c>=3){
   out[index + size_ch * 2] = convert_float(in.z);
-  out[index + size_ch * 3] = convert_float(in.w);
+  }
+
+  if(C - 4 * in_c>=4){
+   out[index + size_ch * 3] = convert_float(in.w);
+  }
+
 }
 
 __kernel void fetch_2d(__private const int in_height,
diff --git a/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl b/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..053c7b3f06249f426c4a2203ba9c89362ded6a08
--- /dev/null
+++ b/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl
@@ -0,0 +1,100 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+__kernel void prior_box(__private const int global_size_dim0,
+                        __private const int global_size_dim1,
+                        __private const int global_size_dim2,
+                        __global float *box_width,
+                        __global float *box_height,
+                        __write_only image2d_t output_image,
+                        __private const float step_width,
+                        __private const float step_height,
+                        __private const float offset,
+                        __private const int img_width,
+                        __private const int img_height,
+                        __private const int num_priors,
+                        __private const int C){
+
+
+                        const int out_c = get_global_id(0);
+                        const int out_nh = get_global_id(1);
+                        const int out_n = out_nh/num_priors;
+                        const int out_h = out_nh%num_priors;
+
+                        if (out_c >= global_size_dim0 ||out_nh >= global_size_dim2) {
+                             return;
+                         }
+                        const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                                                  CLK_ADDRESS_CLAMP          |
+                                                  CLK_FILTER_NEAREST;
+                        int2 output_pos;
+                        output_pos.x = out_c * 4;
+                        output_pos.y = out_nh;
+                        float center_x0 = (offset + out_c * 4) * step_width;
+                        float center_x1 = (offset + out_c * 4 + 1) * step_width;
+                        float center_x2 = (offset + out_c * 4 + 2) * step_width;
+                        float center_x3 = (offset + out_c * 4 + 3) * step_width;
+                        float center_y = (out_n + offset) * step_height;
+
+                        half4 output[4];
+                        output[0].x = convert_half((center_x0 - box_width[out_h]) / img_width);
+                        output[1].x = convert_half((center_y - box_height[out_h]) / img_height);
+                        output[2].x = convert_half((center_x0 + box_width[out_h]) / img_width);
+                        output[3].x = convert_half((center_y + box_height[out_h]) / img_height);
+
+                        if(C - 4 * out_c>=2){
+                        output[0].y = convert_half((center_x1 - box_width[out_h]) / img_width);
+                        output[1].y = convert_half((center_y - box_height[out_h]) / img_height);
+                        output[2].y = convert_half((center_x1 + box_width[out_h]) / img_width);
+                        output[3].y = convert_half((center_y + box_height[out_h]) / img_height);
+                        }else{
+                         output[0].y = 0.0f;
+                         output[1].y = 0.0f;
+                         output[2].y = 0.0f;
+                         output[3].y = 0.0f;
+                        }
+                        if(C - 4 * out_c>=3){
+                        output[0].z = convert_half((center_x2 - box_width[out_h]) / img_width);
+                        output[1].z = convert_half((center_y - box_height[out_h]) / img_height);
+                        output[2].z = convert_half((center_x2 + box_width[out_h]) / img_width);
+                        output[3].z = convert_half((center_y + box_height[out_h]) / img_height);
+                        }else{
+                        output[0].z = 0.0f;
+                        output[1].z = 0.0f;
+                        output[2].z = 0.0f;
+                        output[3].z = 0.0f;
+                        }
+                        if(C - 4 * out_c>=4){
+                        output[0].w = convert_half((center_x3 - box_width[out_h]) / img_width);
+                        output[1].w = convert_half((center_y - box_height[out_h]) / img_height);
+                        output[2].w = convert_half((center_x3 + box_width[out_h]) / img_width);
+                        output[3].w = convert_half((center_y + box_height[out_h]) / img_height);
+                        }else{
+                        output[0].z = 0.0f;
+                        output[1].z = 0.0f;
+                        output[2].z = 0.0f;
+                        output[3].z = 0.0f;
+                        }
+                        output[0] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[0]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
+                        output[1] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[1]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
+                        output[2] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[2]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
+                        output[3] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[3]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
+                        write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output[0]);
+                        write_imageh(output_image, (int2)(output_pos.x + 2, output_pos.y), output[1]);
+                        write_imageh(output_image, (int2)(output_pos.x + 3, output_pos.y), output[2]);
+                        write_imageh(output_image, (int2)(output_pos.x + 4, output_pos.y), output[3]);
+
+}
\ No newline at end of file
diff --git a/src/operators/kernel/cl/conv_add_kernel.cpp b/src/operators/kernel/cl/conv_add_kernel.cpp
index 7e30c6d31db645fb5d18bf70ef5b6876a5f683da..3292cc7ccd2febc4d1e5b8f5e4991f8348b25196 100644
--- a/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -68,10 +68,10 @@ void ConvAddKernel<GPU_CL, float>::Compute(
   int nh = default_work_size[2];
   auto input = param.Input()->GetCLImage();
   auto filter = param.Filter()->GetCLImage();
-  DLOG << "---yangfei30---";
-  DLOG << *param.Filter();
-  DLOG << param.Paddings();
   auto biase = param.Bias()->GetCLImage();
+  param.Output()->InitEmptyImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue(),
+                                 param.Output()->dims());
   auto output = param.Output()->GetCLImage();
   int stride = param.Strides()[0];
   int offset = param.Offset();
diff --git a/src/operators/kernel/cl/conv_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_bn_relu_kernel.cpp
index be2da60de0bf429656978d696f8c0067b74559ad..945f84358ff2f5a61ece0c8f96d39b3c10c01c1e 100644
--- a/src/operators/kernel/cl/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_bn_relu_kernel.cpp
@@ -22,12 +22,185 @@ namespace operators {
 template <>
 bool ConvBNReluKernel<GPU_CL, float>::Init(
     FusionConvBNReluParam<GPU_CL> *param) {
+  PADDLE_MOBILE_ENFORCE(
+      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+          param->Paddings()[0] == param->Paddings()[1],
+      "need equal");
+  const framework::CLImage *mean = param->InputMean();
+  const framework::CLImage *variance = param->InputVariance();
+  const framework::CLImage *scale = param->InputScale();
+  const framework::CLImage *bias = param->InputBias();
+  const float epsilon = param->Epsilon();
+
+  const int C = mean->numel();
+
+  auto mean_ptr = mean->data<float>();
+  auto variance_ptr = variance->data<float>();
+  auto scale_ptr = scale->data<float>();
+  auto bias_ptr = bias->data<float>();
+
+  float inv_std_ptr[C];
+  for (int i = 0; i < C; i++) {
+    inv_std_ptr[i] =
+        1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
+  }
+  float *new_scale_ptr = new float[C];
+  float *new_bias_ptr = new float[C];
+
+  for (int i = 0; i < C; i++) {
+    new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
+    new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
+  }
+
+  framework::CLImage *new_scale = new framework::CLImage();
+
+  //  for (int j = 0; j < C; ++j) {
+  //    DLOG << " new scale - " << j << new_scale_ptr[j];
+  //  }
+  //
+  //  for (int j = 0; j < C; ++j) {
+  //    DLOG << " new bias - " << j << new_bias_ptr[j];
+  //  }
+
+  new_scale->SetTensorData(new_scale_ptr, variance->dims());
+  new_scale->InitCLImage(this->cl_helper_.CLContext(),
+                         cl_helper_.CLCommandQueue());
+
+  //  DLOG << " climage - y bias: " << *(param->Bias());
+  //
+  //  DLOG << " climage - new scale: " << *new_scale;
+
+  framework::CLImage *new_bias = new framework::CLImage();
+
+  new_bias->SetTensorData(new_bias_ptr, variance->dims());
+  new_bias->InitCLImage(this->cl_helper_.CLContext(),
+                        cl_helper_.CLCommandQueue());
+
+  //  DLOG << " climage - new bias: " << *new_bias;
+  //
+  //  DLOG << " climage - filter: " << *(param->Filter());
+
+  param->SetNewScale(new_scale);
+  param->SetNewBias(new_bias);
+
+  delete[](new_scale_ptr);
+  delete[](new_bias_ptr);
+
+  PADDLE_MOBILE_ENFORCE(
+      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+          param->Paddings()[0] == param->Paddings()[1],
+      "need equal");
+
+  int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
+               static_cast<int>(param->Paddings()[1]);
+
+  param->SetOffset(offset);
+
+  if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
+    param->Filter()->InitNImage(cl_helper_.CLContext(),
+                                cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("conv_1x1", "conv_bn_relu_kernel.cl");
+    DLOG << " conv bn relu conv 1x1";
+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] == 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_bn_relu_kernel.cl");
+    DLOG << " conv bn relu depth_conv_3x3";
+
+  } else if (param->Filter()->dims()[2] == 3 &&
+             param->Filter()->dims()[3] == 3) {
+    param->Filter()->InitCLImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+
+    this->cl_helper_.AddKernel("conv_3x3", "conv_bn_relu_kernel.cl");
+    DLOG << " conv bn relu conv_3x3";
+  } else {
+    PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
+  }
   return true;
 }
 
 template <>
 void ConvBNReluKernel<GPU_CL, float>::Compute(
-    const FusionConvBNReluParam<GPU_CL> &param) {}
+    const FusionConvBNReluParam<GPU_CL> &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
+  int c_block = default_work_size[0];
+  int w = default_work_size[1];
+  int nh = default_work_size[2];
+  auto input = param.Input()->GetCLImage();
+  auto filter = param.Filter()->GetCLImage();
+  auto new_scale = param.NewScale()->GetCLImage();
+  auto new_bias = param.NewBias()->GetCLImage();
+  auto output = param.Output()->GetCLImage();
+  int stride = param.Strides()[0];
+  int offset = param.Offset();
+  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
+                    param.Input()->Converter())
+                    ->GetCBlock();
+  int dilation = param.Dilations()[0];
+  int input_width = param.Input()->dims()[3];
+  int input_height = param.Input()->dims()[2];
+  int output_width = param.Output()->dims()[3];
+  int output_height = param.Output()->dims()[2];
+
+  cl_int status;
+
+  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &new_scale);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_bias);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &output);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 8, sizeof(int), &stride);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 9, sizeof(int), &offset);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 10, sizeof(int), &input_c);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 11, sizeof(int), &dilation);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 12, sizeof(int), &input_width);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 13, sizeof(int), &input_height);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 14, sizeof(int), &output_width);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 15, sizeof(int), &output_height);
+  CL_CHECK_ERRORS(status);
+
+  status = clEnqueueNDRangeKernel(
+      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
+      default_work_size.data(), NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+}
 template class ConvBNReluKernel<GPU_CL, float>;
 
 }  // namespace operators
diff --git a/src/operators/kernel/cl/dwconv_bn_relu_kernel.cpp b/src/operators/kernel/cl/dwconv_bn_relu_kernel.cpp
index 48e08d69ec22f128885bb7aa9165e0898ca67b7a..31cb654d75c9c83e132a52d740b0b2db1c3a2ccf 100644
--- a/src/operators/kernel/cl/dwconv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/dwconv_bn_relu_kernel.cpp
@@ -22,12 +22,151 @@ namespace operators {
 template <>
 bool DWConvBNReluKernel<GPU_CL, float>::Init(
     FusionDWConvBNReluParam<GPU_CL> *param) {
+  PADDLE_MOBILE_ENFORCE(
+      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+          param->Paddings()[0] == param->Paddings()[1],
+      "need equal");
+  const framework::CLImage *mean = param->InputMean();
+  const framework::CLImage *variance = param->InputVariance();
+  const framework::CLImage *scale = param->InputScale();
+  const framework::CLImage *bias = param->InputBias();
+  const float epsilon = param->Epsilon();
+
+  const int C = mean->numel();
+
+  auto mean_ptr = mean->data<float>();
+  auto variance_ptr = variance->data<float>();
+  auto scale_ptr = scale->data<float>();
+  auto bias_ptr = bias->data<float>();
+
+  float inv_std_ptr[C];
+  for (int i = 0; i < C; i++) {
+    inv_std_ptr[i] =
+        1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
+  }
+  float *new_scale_ptr = new float[C];
+  float *new_bias_ptr = new float[C];
+
+  for (int i = 0; i < C; i++) {
+    new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
+    new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
+  }
+
+  framework::CLImage *new_scale = new framework::CLImage();
+
+  new_scale->SetTensorData(new_scale_ptr, variance->dims());
+  new_scale->InitCLImage(this->cl_helper_.CLContext(),
+                         cl_helper_.CLCommandQueue());
+
+  framework::CLImage *new_bias = new framework::CLImage();
+
+  new_bias->SetTensorData(new_bias_ptr, variance->dims());
+  new_bias->InitCLImage(this->cl_helper_.CLContext(),
+                        cl_helper_.CLCommandQueue());
+
+  param->SetNewScale(new_scale);
+  param->SetNewBias(new_bias);
+
+  delete[](new_scale_ptr);
+  delete[](new_bias_ptr);
+
+  PADDLE_MOBILE_ENFORCE(
+      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+          param->Paddings()[0] == param->Paddings()[1],
+      "need equal");
+
+  int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
+               static_cast<int>(param->Paddings()[1]);
+
+  param->SetOffset(offset);
+
+  param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                               cl_helper_.CLCommandQueue());
+  this->cl_helper_.AddKernel("depth_conv_3x3", "conv_bn_relu_kernel.cl");
+  DLOG << " conv bn relu depth_conv_3x3";
+
   return true;
 }
 
 template <>
 void DWConvBNReluKernel<GPU_CL, float>::Compute(
-    const FusionDWConvBNReluParam<GPU_CL> &param) {}
+    const FusionDWConvBNReluParam<GPU_CL> &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
+  int c_block = default_work_size[0];
+  int w = default_work_size[1];
+  int nh = default_work_size[2];
+  auto input = param.Input()->GetCLImage();
+  auto filter = param.Filter()->GetCLImage();
+  auto new_scale = param.NewScale()->GetCLImage();
+  auto new_bias = param.NewBias()->GetCLImage();
+  auto output = param.Output()->GetCLImage();
+  int stride = param.Strides()[0];
+  int offset = param.Offset();
+  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
+                    param.Input()->Converter())
+                    ->GetCBlock();
+  int dilation = param.Dilations()[0];
+  int input_width = param.Input()->dims()[3];
+  int input_height = param.Input()->dims()[2];
+  int output_width = param.Output()->dims()[3];
+  int output_height = param.Output()->dims()[2];
+
+  cl_int status;
+
+  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &new_scale);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_bias);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &output);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 8, sizeof(int), &stride);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 9, sizeof(int), &offset);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 10, sizeof(int), &input_c);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 11, sizeof(int), &dilation);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 12, sizeof(int), &input_width);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 13, sizeof(int), &input_height);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 14, sizeof(int), &output_width);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 15, sizeof(int), &output_height);
+  CL_CHECK_ERRORS(status);
+
+  status = clEnqueueNDRangeKernel(
+      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
+      default_work_size.data(), NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+}
 template class DWConvBNReluKernel<GPU_CL, float>;
 
 }  // namespace operators
diff --git a/src/operators/kernel/cl/feed_kernel.cpp b/src/operators/kernel/cl/feed_kernel.cpp
index 78f04357a23c70595595cc24489fd96e994162fb..941a6cb815541d1eca30ccc193161838ce28da80 100644
--- a/src/operators/kernel/cl/feed_kernel.cpp
+++ b/src/operators/kernel/cl/feed_kernel.cpp
@@ -28,6 +28,8 @@ template <>
 void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
   auto kernel = this->cl_helper_.KernelAt(0);
   cl_int status;
+  param.Out()->InitEmptyImage(cl_helper_.CLContext(),
+                              cl_helper_.CLCommandQueue(), param.Out()->dims());
   auto output = param.Out();
   const Tensor *input = param.InputX();
   //  DLOG << *input;
diff --git a/src/operators/kernel/cl/fetch_kernel.cpp b/src/operators/kernel/cl/fetch_kernel.cpp
index 31c1d4179cbdfc8145d90bee2353be821e65b40b..8ea0b3ad3d33f0352fba7697fc08ad7a2039e6ab 100644
--- a/src/operators/kernel/cl/fetch_kernel.cpp
+++ b/src/operators/kernel/cl/fetch_kernel.cpp
@@ -27,8 +27,6 @@ bool FetchKernel<GPU_CL, float>::Init(FetchParam<GPU_CL> *param) {
   } else {
     this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
   }
-  auto *out = param->Out();
-  out->mutable_data<float>();
   return true;
 }
 
@@ -39,7 +37,7 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
 
   auto input = param.InputX()->GetCLImage();
   auto *out = param.Out();
-
+  out->mutable_data<float>();
   const auto &dim = param.InputX()->dims();
   size_t new_dims[] = {1, 1, 1, 1};
 
@@ -70,9 +68,11 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
     int size_ch = in_height * in_width;
     int size_block = size_ch * 4;
     int size_batch = size_ch * C;
+    int out_c = new_dims[1];
     clSetKernelArg(kernel, 4, sizeof(int), &size_ch);
     clSetKernelArg(kernel, 5, sizeof(int), &size_block);
     clSetKernelArg(kernel, 6, sizeof(int), &size_batch);
+    clSetKernelArg(kernel, 7, sizeof(int), &out_c);
   }
 
   //  cl_event wait_event = param.InpdutX()->GetClEvent();
@@ -93,6 +93,8 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
   //            << "ms" << std::endl;
 
   memcpy(out->data<float>(), out_cl_tensor.Data<float>(), out->memory_size());
+  DLOG << *param.InputX();
+  DLOG << *out;
 }
 
 template class FetchKernel<GPU_CL, float>;
diff --git a/src/operators/kernel/cl/prior_box_kernel.cpp b/src/operators/kernel/cl/prior_box_kernel.cpp
index f8d8c51fcaca214e6248b83d9e135670fa28fe94..1f8843787bc68c8be681e4c2a79053714b76dc4e 100644
--- a/src/operators/kernel/cl/prior_box_kernel.cpp
+++ b/src/operators/kernel/cl/prior_box_kernel.cpp
@@ -15,18 +15,165 @@ limitations under the License. */
 #ifdef PRIORBOX_OP
 
 #include "operators/kernel/prior_box_kernel.h"
-
+#include "framework/cl/cl_tensor.h"
 namespace paddle_mobile {
 namespace operators {
 
 template <>
 bool PriorBoxKernel<GPU_CL, float>::Init(PriorBoxParam<GPU_CL> *param) {
+  this->cl_helper_.AddKernel("prior_box", "prior_box_kernel.cl");
   return true;
 }
 
 template <>
 void PriorBoxKernel<GPU_CL, float>::Compute(
-    const PriorBoxParam<GPU_CL> &param) {}
+    const PriorBoxParam<GPU_CL> &param) {
+  const auto *input_ = param.Input();
+  const auto &input_dims = input_->dims();
+
+  const auto &input_image_dims = param.InputImage()->dims();
+
+  const auto &min_sizes = param.MinSizes();
+  const auto &max_sizes = param.MaxSizes();
+  const auto &variances = param.Variances();
+  const auto &input_aspect_ratio = param.AspectRatios();
+  const bool &flip = param.Flip();
+  const bool &clip = param.Clip();
+  const float &step_w = param.StepW();
+  const float &step_h = param.StepH();
+  const float &offset = param.Offset();
+  const int C = param.OutputBoxes()->dims()[1];
+
+  auto output_boxes = param.OutputBoxes()->GetCLImage();
+  auto output_variances = param.OutputVariances()->GetCLImage();
+
+  std::vector<float> aspect_ratios;
+  ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
+
+  auto img_width = input_image_dims[3];
+  auto img_height = input_image_dims[2];
+
+  auto feature_width = input_dims[3];
+  auto feature_height = input_dims[2];
+
+  float step_width, step_height;
+  /// 300 / 19
+  if (step_w == 0 || step_h == 0) {
+    step_width = static_cast<float>(img_width) / feature_width;
+    step_height = static_cast<float>(img_height) / feature_height;
+  } else {
+    step_width = step_w;
+    step_height = step_h;
+  }
+
+  int num_priors = aspect_ratios.size() * min_sizes.size();
+  if (!max_sizes.empty()) {
+    num_priors += max_sizes.size();
+  }
+
+  float *box_width = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * num_priors));
+  float *box_height = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * num_priors));
+  int idx = 0;
+  for (size_t s = 0; s < min_sizes.size(); ++s) {
+    auto min_size = min_sizes[s];
+    if (param.MinMaxAspectRatiosOrder()) {
+      box_width[idx] = box_height[idx] = min_size / 2.;
+      idx++;
+      if (max_sizes.size() > 0) {
+        auto max_size = max_sizes[s];
+        box_width[idx] = box_height[idx] = sqrt(min_size * max_size) / 2.;
+        idx++;
+      }
+      for (float ar : aspect_ratios) {
+        if (fabs(ar - 1.) < 1e-6) {
+          continue;
+        }
+        box_width[idx] = min_size * sqrt(ar) / 2.;
+        box_height[idx] = min_size / sqrt(ar) / 2.;
+        idx++;
+      }
+
+    } else {
+      for (float ar : aspect_ratios) {
+        box_width[idx] = min_size * sqrt(ar) / 2.;
+        box_height[idx] = min_size / sqrt(ar) / 2.;
+        idx++;
+      }
+      if (!max_sizes.empty()) {
+        auto max_size = max_sizes[s];
+        box_width[idx] = box_height[idx] = sqrt(min_size * max_size) / 2.;
+        idx++;
+      }
+    }
+  }
+  cl_int status;
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size =
+      this->cl_helper_.DefaultWorkSize(*param.OutputBoxes());
+  int c_block = default_work_size[0];
+  int w = default_work_size[1];
+  int nh = default_work_size[2];
+
+  std::vector<int64_t> box_shape({1, 1, 1, num_priors});
+  framework::DDim ddim = framework::make_ddim(box_shape);
+
+  framework::CLTensor box_width_cl_tensor(this->cl_helper_.CLContext(),
+                                          this->cl_helper_.CLCommandQueue());
+  box_width_cl_tensor.Resize(ddim);
+  cl_mem box_width_Buffer =
+      box_width_cl_tensor.mutable_with_data<float>(box_width);
+
+  framework::CLTensor box_height_cl_tensor(this->cl_helper_.CLContext(),
+                                           this->cl_helper_.CLCommandQueue());
+  box_height_cl_tensor.Resize(ddim);
+  cl_mem box_height_Buffer =
+      box_height_cl_tensor.mutable_with_data<float>(box_height);
+
+  DLOG << "c_block:" << c_block;
+  DLOG << "w:" << w;
+  DLOG << "nh:" << nh;
+  DLOG << "step_width:" << step_width;
+  DLOG << "step_height:" << step_height;
+  DLOG << "offset:" << offset;
+  DLOG << "img_width:" << img_width;
+  DLOG << "img_height:" << img_height;
+  DLOG << "num_priors:" << num_priors;
+  DLOG << "C:" << C;
+  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &box_width_Buffer);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &box_height_Buffer);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output_boxes);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 6, sizeof(float), &step_width);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 7, sizeof(float), &step_height);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 8, sizeof(float), &offset);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 9, sizeof(int), &img_width);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 10, sizeof(int), &img_height);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 11, sizeof(int), &num_priors);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 12, sizeof(int), &C);
+  CL_CHECK_ERRORS(status);
+  size_t global_work_size[2] = {c_block, nh};
+  status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
+                                  NULL, global_work_size, NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+  paddle_mobile::memory::Free(box_width);
+  paddle_mobile::memory::Free(box_height);
+}
 template class PriorBoxKernel<GPU_CL, float>;
 
 }  // namespace operators
diff --git a/src/operators/kernel/deconv_add_kernel.h b/src/operators/kernel/deconv_add_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..61170f95e2f38319a454eb18461a171347ffed7a
--- /dev/null
+++ b/src/operators/kernel/deconv_add_kernel.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DECONVADD_OP
+
+#pragma once
+
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using framework::OpKernelBase;
+
+template <typename DeviceType, typename T>
+class DeconvAddKernel
+    : public OpKernelBase<DeviceType, FusionDeconvAddParam<DeviceType>> {
+ public:
+  void Compute(const FusionDeconvAddParam<DeviceType> &param);
+
+  bool Init(FusionDeconvAddParam<DeviceType> *param);
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/deconv_add_relu_kernel.h b/src/operators/kernel/deconv_add_relu_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc48272157f6e8a5cba4fd09f8acca1b54e90c12
--- /dev/null
+++ b/src/operators/kernel/deconv_add_relu_kernel.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DECONVADDRELU_OP
+
+#pragma once
+
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using framework::OpKernelBase;
+
+template <typename DeviceType, typename T>
+class DeconvAddReluKernel
+    : public OpKernelBase<DeviceType, FusionDeconvAddReluParam<DeviceType>> {
+ public:
+  void Compute(const FusionDeconvAddReluParam<DeviceType> &param);
+
+  bool Init(FusionDeconvAddReluParam<DeviceType> *param);
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/conv_add_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..22841e705c255433bebeab479a2e2b8d3a3b7187
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/conv_add_kernel.cpp
@@ -0,0 +1,61 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADD_OP
+
+#include "operators/kernel/conv_add_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
+  bool relu_enabled = false;
+  auto input = const_cast<Tensor *>(param->Input());
+  const Tensor *bias = param->Bias();
+  auto bias_ptr = bias->data<float>();
+  auto filter = const_cast<Tensor *>(param->Filter());
+  auto out = param->Output();
+
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
+                        "Output channel should be equal to bias number");
+  int channel = out->dims()[1];
+  auto bs_ptr =
+      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
+  for (int i = 0; i < channel; i++) {
+    bs_ptr[i + channel] = 1;
+    bs_ptr[i] = bias_ptr[i];
+  }
+
+  fpga::format_conv_data(filter, out, bs_ptr, param->Groups());
+
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
+                       param->Groups(), param->Strides()[0],
+                       param->Strides()[1], param->Paddings()[0],
+                       param->Paddings()[1], bs_ptr);
+  param->SetFpgaArgs(conv_arg);
+  return true;
+}
+
+template <>
+void ConvAddKernel<FPGA, float>::Compute(
+    const FusionConvAddParam<FPGA> &param) {
+  fpga::ComputeFpgaConv(param.FpgaArgs());
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..39d7e818976b56eaea8649392784e7b5dc8b7e1f
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp
@@ -0,0 +1,36 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DECONVADD_OP
+
+#include "operators/kernel/deconv_add_kernel.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
+  return true;
+}
+
+template <>
+void DeconvAddKernel<FPGA, float>::Compute(
+    const FusionDeconvAddParam<FPGA> &param) {}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ef2556208a8650a86522264f40f42cb596ec4190
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DECONVADDRELU_OP
+
+#include "operators/kernel/deconv_add_relu_kernel.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool DeconvAddReluKernel<FPGA, float>::Init(
+    FusionDeconvAddReluParam<FPGA> *param) {
+  return true;
+}
+
+template <>
+void DeconvAddReluKernel<FPGA, float>::Compute(
+    const FusionDeconvAddReluParam<FPGA> &param) {}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/split_kernel.cpp b/src/operators/kernel/fpga/V2/split_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..faa1da9186d2a74961450925dea6e3d0f98856bc
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/split_kernel.cpp
@@ -0,0 +1,30 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SPLIT_OP
+
+#include "operators/kernel/split_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA>* param) {
+  return true;
+}
+template <>
+void SplitKernel<FPGA, float>::Compute(const SplitParam<FPGA>& param) {}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
diff --git a/src/operators/kernel/fpga/V2/transpose2_kernel.cpp b/src/operators/kernel/fpga/V2/transpose2_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..585cc52947fa5de991fee446ba3c0098ae99d0af
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/transpose2_kernel.cpp
@@ -0,0 +1,36 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef TRANSPOSE2_OP
+
+#include "operators/kernel/transpose2_kernel.h"
+#include "operators/kernel/central-arm-func/transpose2_arm_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool Transpose2Kernel<FPGA, float>::Init(Transpose2Param<FPGA> *param) {
+  return true;
+}
+
+template <>
+void Transpose2Kernel<FPGA, float>::Compute(
+    const Transpose2Param<FPGA> &param) {
+  // Transpose2Compute<float>(param);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index 0862ed9b69faa079b2bf841b014f451f9b44e855..121f8be09299d8c6ee8f452b289423b4aad8cf34 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -2234,7 +2234,10 @@ class ConvTransposeParam : public OpParam {
                      const Scope &scope) {
     filter_ = FilterFrom<GType>(inputs, scope);
     input_ = InputFrom<GType>(inputs, scope);
-    output_ = OutputFrom<GType>(outputs, scope);
+    // output_ = OutputFrom<GType>(outputs, scope);
+    if (outputs.count("Output")) {
+      output_ = OpParam::OutputFrom<GType>(outputs, scope);
+    }
     strides_ = GetAttr<vector<int>>("strides", attrs);
     paddings_ = GetAttr<vector<int>>("paddings", attrs);
     dilations_ = GetAttr<vector<int>>("dilations", attrs);
@@ -2275,6 +2278,38 @@ class ConvTransposeParam : public OpParam {
 #endif
 };
 #endif
+#ifdef FUSION_DECONVADD_OP
+template <typename Dtype>
+class FusionDeconvAddParam : public ConvTransposeParam<Dtype> {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  FusionDeconvAddParam(const VariableNameMap &inputs,
+                       const VariableNameMap &outputs,
+                       const AttributeMap &attrs, const Scope &scope)
+      : ConvTransposeParam<Dtype>(inputs, outputs, attrs, scope) {
+    bias_ = OpParam::InputYFrom<GType>(inputs, scope);
+    axis_ = OpParam::GetAttr<int>("axis", attrs);
+    output_ = OpParam::OutFrom<GType>(outputs, scope);
+  }
+  RType *Bias() const { return bias_; }
+
+  const int &Axis() const { return axis_; }
+
+  RType *Output() const { return output_; }
+
+ protected:
+  RType *bias_;
+  int axis_;
+  RType *output_;
+};
+#endif
+
+#ifdef FUSION_DECONVADDRELU_OP
+template <typename Dtype>
+using FusionDeconvAddReluParam = FusionDeconvAddParam<Dtype>;
+#endif
 
 #ifdef FUSION_DECONVRELU_OP
 template <typename Dtype>
diff --git a/src/operators/split_op.cpp b/src/operators/split_op.cpp
index 52732b41288fdc94a7dfc07ef6cfc8d12a969b7b..b440c0be436f333cc46e320c026f67b6020b8aab 100644
--- a/src/operators/split_op.cpp
+++ b/src/operators/split_op.cpp
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #ifdef SPLIT_OP
-
 #include "operators/split_op.h"
+#include <vector>
 
 namespace paddle_mobile {
 namespace operators {
@@ -83,5 +83,8 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(split, ops::SplitOp);
 #endif
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(split, ops::SplitOp);
+#endif
 
 #endif  // SPLIT_OP
diff --git a/src/operators/tanh_op.cpp b/src/operators/tanh_op.cpp
index 454cdfa26942eda225a811317e907b1989bcf61b..dd6f9083afd6919cfa3320e5e20275a785adf092 100644
--- a/src/operators/tanh_op.cpp
+++ b/src/operators/tanh_op.cpp
@@ -29,7 +29,7 @@ void TanhOp<DeviceType, T>::InferShape() const {
 
 namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_FPGA
-REGISTER_OPERATOR_FPGA(Tanh, ops::TanhOp);
+REGISTER_OPERATOR_FPGA(tanh, ops::TanhOp);
 #endif
 
 #endif
diff --git a/src/operators/transpose2_op.cpp b/src/operators/transpose2_op.cpp
index 64d07991f60b4057e3d2841afa1bfe6483f31a88..03db27a9a2f8fc8974a1b1c97b1d71782388103e 100644
--- a/src/operators/transpose2_op.cpp
+++ b/src/operators/transpose2_op.cpp
@@ -60,5 +60,8 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(transpose2, ops::Transpose2Op);
 #endif
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(transpose2, ops::Transpose2Op);
+#endif
 
 #endif  // TRANSPOSE_OP
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 38dc540e206ade4adb1427bf2121475217b2d730..3202e9248ba55afb438a4b4fc894095531aa534f 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -66,6 +66,10 @@ list(FIND NET "FPGA_NET_V1" CON)
 if (CON GREATER -1)
     ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
     target_link_libraries(test-resnet50 paddle-mobile)
+
+    ADD_EXECUTABLE(test-densebox net/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h)
+    target_link_libraries(test-densebox paddle-mobile)
+
     set(FOUND_MATCH ON)
 endif ()
 
@@ -76,6 +80,10 @@ if (CON GREATER -1)
 
     ADD_EXECUTABLE(test-pe fpga/test_pe.cpp)
     target_link_libraries(test-pe paddle-mobile)
+
+    ADD_EXECUTABLE(test-densebox net/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h)
+    target_link_libraries(test-densebox paddle-mobile)
+
     set(FOUND_MATCH ON)
 endif ()
 
diff --git a/tools/op.cmake b/tools/op.cmake
index ae1ac1a4ffd4a5a563c8a7be0b90c9f26a6b0f70..5901a23a1ff50c357d69cfff63cdfd543dbf8f9d 100644
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -127,6 +127,8 @@ endif()
 list(FIND NET "FPGA_NET_V2" CON)
 if (CON GREATER -1)
   message("FPGA_NET_V2 enabled")
+  set(FEED_OP ON)
+  set(FUSION_CONVADDRELU_OP ON)
   set(FUSION_ELEMENTWISEADDRELU_OP ON)
   set(FUSION_FC_OP ON)
   set(POOL_OP ON)
@@ -135,9 +137,16 @@ if (CON GREATER -1)
   set(FUSION_CONVBN_OP ON)
   set(CONV_TRANSPOSE_OP ON)
   set(FUSION_DECONVRELU_OP ON)
-  set(SLICE_OP ON)
+  #set(SLICE_OP ON)
   set(TANH_OP ON)
   set(ELEMENTWISEADD_OP ON)
+  set(TRANSPOSE2_OP ON)
+  set(FUSION_CONVADD_OP ON)
+  set(SPLIT_OP ON)
+  set(FUSION_DECONVADD_OP ON)
+  set(FUSION_DECONVADDRELU_OP ON)
+
+
   set(FOUND_MATCH ON)
 endif()
 
@@ -452,4 +461,10 @@ if (TANH_OP)
 endif()
 if (FUSION_DECONVRELU_OP)
   add_definitions(-DFUSION_DECONVRELU_OP)
+endif()
+if (FUSION_DECONVADD_OP)
+  add_definitions(-DFUSION_DECONVADD_OP)
+endif()
+if (FUSION_DECONVADDRELU_OP)
+  add_definitions(-DFUSION_DECONVADDRELU_OP)
 endif()
\ No newline at end of file