Merge pull request #1419 from codeWorm2015/metal_oc_pb

Metal oc pb

Merge pull request #1419 from codeWorm2015/metal_oc_pb
Metal oc pb
83c54c9c · Ray Liu · GitHub · efc27022 · 659a6b2b · 83c54c9c
72 changed files
--- a/src/common/log.h
+++ b/src/common/log.h
@@ -85,18 +85,18 @@ struct Print {
 private:
  void print(LogLevel level) {
-    buffer_ << std::endl;
+    // buffer_ << std::endl;
    if (level == kLOG_ERROR) {
-      std::cerr << buffer_.str();
+      std::cerr << buffer_.str() << std::endl;
    } else {
-      std::cout << buffer_.str();
+      std::cout << buffer_.str() << std::endl;
    }
  }
  std::ostringstream buffer_;
 };
 struct ToLog {
-  ToLog(LogLevel level = kLOG_DEBUG, const std::string &info = "")
+  explicit ToLog(LogLevel level = kLOG_DEBUG, const std::string &info = "")
      : level_(level) {
    unsigned blanks =
        (unsigned)(level > kLOG_DEBUG ? (level - kLOG_DEBUG) * 4 : 1);
@@ -175,11 +175,8 @@ struct Print {
  friend struct ToLog;
  template <typename T>
  Print &operator<<(T const &value) {
-    Print p = Print();
+    return *this;
-    return p;
  }
- private:
 };
 struct ToLog {

--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -77,6 +77,12 @@ const char *G_OP_TYPE_CAST = "cast";
 const char *G_OP_TYPE_LOG = "log";
 const char *G_OP_TYPE_LOD_RESET = "lod_reset";
 const char *G_OP_TYPE_LESS_THAN = "less_than";
+const char *G_OP_TYPE_LOGICAL_AND = "logical_and";
+const char *G_OP_TYPE_LOGICAL_OR = "logical_or";
+const char *G_OP_TYPE_LOGICAL_NOT = "logical_not";
+const char *G_OP_TYPE_LOGICAL_XOR = "logical_xor";
+const char *G_OP_TYPE_WRITE_TO_ARRAY = "write_to_array";
+const char *G_OP_TYPE_READ_FROM_ARRAY = "read_from_array";
 const char *G_OP_TYPE_QUANTIZE = "quantize";
 const char *G_OP_TYPE_DEQUANTIZE = "dequantize";
@@ -181,5 +187,11 @@ std::unordered_map<
        {G_OP_TYPE_NORM, {{"X"}, {"Out", "Norm"}}},
        {G_OP_TYPE_LOG, {{"X"}, {"Out"}}},
        {G_OP_TYPE_LOD_RESET, {{"X", "Y"}, {"Out"}}},
-        {G_OP_TYPE_LESS_THAN, {{"X", "Y"}, {"Out"}}}};
+        {G_OP_TYPE_LESS_THAN, {{"X", "Y"}, {"Out"}}},
+        {G_OP_TYPE_LOGICAL_AND, {{"X", "Y"}, {"Out"}}},
+        {G_OP_TYPE_LOGICAL_OR, {{"X", "Y"}, {"Out"}}},
+        {G_OP_TYPE_LOGICAL_XOR, {{"X", "Y"}, {"Out"}}},
+        {G_OP_TYPE_LOGICAL_NOT, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_WRITE_TO_ARRAY, {{"X", "I"}, {"Out"}}},
+        {G_OP_TYPE_READ_FROM_ARRAY, {{"X", "I"}, {"Out"}}}};
 }  // namespace paddle_mobile
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -131,9 +131,12 @@ extern const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU;
 extern const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU;
 extern const char *G_OP_TYPE_FUSION_CONV_BN_RELU;
+extern const char *G_OP_TYPE_GRU;
+extern const char *G_OP_TYPE_GRU_UNIT;
 extern const char *G_OP_TYPE_LRN;
 extern const char *G_OP_TYPE_MUL;
 extern const char *G_OP_TYPE_MULTICLASS_NMS;
+extern const char *G_OP_TYPE_NORM;
 extern const char *G_OP_TYPE_POOL2D;
 extern const char *G_OP_TYPE_PRIOR_BOX;
 extern const char *G_OP_TYPE_RELU;
@@ -163,6 +166,12 @@ extern const char *G_OP_TYPE_CAST;
 extern const char *G_OP_TYPE_LOG;
 extern const char *G_OP_TYPE_LOD_RESET;
 extern const char *G_OP_TYPE_LESS_THAN;
+extern const char *G_OP_TYPE_LOGICAL_AND;
+extern const char *G_OP_TYPE_LOGICAL_OR;
+extern const char *G_OP_TYPE_LOGICAL_NOT;
+extern const char *G_OP_TYPE_LOGICAL_XOR;
+extern const char *G_OP_TYPE_WRITE_TO_ARRAY;
+extern const char *G_OP_TYPE_READ_FROM_ARRAY;
 extern const char *G_OP_TYPE_QUANTIZE;
 extern const char *G_OP_TYPE_DEQUANTIZE;

--- a/src/fpga/V1/api.cpp
+++ b/src/fpga/V1/api.cpp
@@ -151,6 +151,30 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
  filter_tensor->reset_data_ptr(new_data);
 }
+// Repack the filter held by `filter_tensor` for the depthwise-deconvolution
+// path. A float copy of the filter is staged in fpga_malloc'ed memory, run
+// through deconv_NC_convert and DWDconv_format_filter (which may replace the
+// staged pointer), and finally installed as the tensor's data pointer.
+// `scale_ptr` and `stride` are forwarded to DWDconv_format_filter unchanged.
+void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
+                           int stride) {
+  auto shape = filter_tensor->dims();
+  auto leading = shape[0];
+  auto kernel_h = shape[2];
+  auto kernel_w = shape[3];
+  auto source = filter_tensor->data<float>();
+  // Only dims 0, 2 and 3 contribute to the staged size.
+  size_t staged_bytes = leading * kernel_h * kernel_w * sizeof(float);
+  auto staged = static_cast<float *>(fpga_malloc(staged_bytes));
+  fpga_copy(staged, source, staged_bytes);
+  int plane = kernel_h * kernel_w;
+  deconv_filter::deconv_NC_convert(&staged, leading, 1, plane);
+  // After the NC conversion dims[1] is passed as `num` and dims[0] as
+  // `channel`.
+  auto converted_num = shape[1];
+  int channel = shape[0];
+  deconv_filter::DWDconv_format_filter(&staged, converted_num, channel,
+                                       kernel_h, kernel_w, scale_ptr, stride);
+  // The tensor's dims are left as they were; only the data pointer changes.
+  filter_tensor->reset_data_ptr(staged);
+}
 void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
  filter_tensor->scale[0] = float(max_value / 127.0);  // NOLINT
  filter_tensor->scale[1] = float(127.0 / max_value);  // NOLINT
@@ -243,6 +267,17 @@ void format_dwconv_data(framework::Tensor *filter_tensor,
  format_bias_array(bias_ptr, channel);
  format_fp16_ofm(ofm_tensor);
 }
+// Prepare filter, bias/scale array and output tensor for a depthwise
+// deconvolution. The filter is formatted with the scale pointer located
+// `sub_conv_n * channel` floats into `*bs_ptr`, where `channel` is
+// ofm_tensor->dims()[1]; afterwards the bias array and the fp16 output
+// feature map are formatted. `group` is accepted but not used here.
+void format_DWDeconv_data(framework::Tensor *filter_tensor,
+                          framework::Tensor *ofm_tensor, float **bs_ptr,
+                          int group, int sub_conv_n) {
+  int channel = ofm_tensor->dims()[1];
+  float *scale_ptr = *bs_ptr + sub_conv_n * channel;
+  format_DWDconv_filter(filter_tensor, scale_ptr, sub_conv_n);
+  format_bias_array(bs_ptr, channel);
+  format_fp16_ofm(ofm_tensor);
+}
 void expand_conv_arg(ConvArgs *arg) {
  ConvArgs args = *arg;
@@ -770,6 +805,7 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
  auto filter_ptr = filter->data<float>();
  auto input_ptr = input->data<float>();
  auto output_ptr = out->mutable_data<float>();
+  arg->sub_conv_num = 1;
  arg->relu_enabled = relu_enabled;
  arg->bias_address = bias_ptr;
  arg->filter_address = filter_ptr;
@@ -788,5 +824,109 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
  arg->output.scale_address = out->scale;
 }  // end dwconv arg fill
+// Fill `arg` with everything ComputeDWDeconv needs to run a depthwise
+// deconvolution as `stride_w` stride-1 depthwise sub-convolutions.
+//
+// For every sub-convolution this allocates (fpga_malloc) a private copy of
+// its filter slice, an output buffer and a two-float scale buffer. Each of
+// those buffers is also registered in arg->vector_dw_conv_space with a
+// deleter that calls fpga_free, so they live as long as `arg` does.
+//
+// NOTE(review): only stride_w, padding_w and filter->dims()[3] are consulted;
+// stride_h and padding_h are ignored, so square kernels with equal
+// strides/paddings appear to be assumed -- confirm with callers.
+void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
+                       framework::Tensor *out, framework::Tensor *filter,
+                       bool relu_enabled, int stride_h, int stride_w,
+                       int padding_h, int padding_w, float *bias_ptr) {
+  auto filter_ptr = filter->data<float>();
+  auto input_ptr = input->data<float>();
+  // Return value unused: the output pointer is re-read after format_fp16_ofm
+  // below. The call is kept because mutable_data may allocate the tensor
+  // (TODO confirm).
+  out->mutable_data<float>();
+  auto deleter = [](void *p) { fpga_free(p); };
+  arg->group_num = static_cast<uint32_t>(filter->dims()[0]);
+  arg->sub_conv_num = static_cast<uint32_t>(stride_w);
+  arg->filter_num = static_cast<uint32_t>(filter->dims()[0]);
+  int sub_conv_num = stride_w;
+  int sub_pad = deconv_filter::deconv_calc_sub_pad(
+      static_cast<int>(filter->dims()[3]), padding_w, stride_w);
+  auto sub_filter_width =
+      static_cast<uint32_t>(deconv_filter::deconv_get_sub_filter_axis(
+          static_cast<int>(filter->dims()[3]), stride_w));
+  auto sub_output_width =
+      static_cast<uint32_t>(deconv_filter::deconv_get_sub_out_axis(
+          static_cast<int>(input->dims()[3]), sub_pad, sub_filter_width));
+  auto sub_output_height =
+      static_cast<uint32_t>(deconv_filter::deconv_get_sub_out_axis(
+          static_cast<int>(input->dims()[2]), sub_pad, sub_filter_width));
+  arg->sub_output_width = sub_output_width;
+  arg->sub_output_height = sub_output_height;
+  arg->omit_size = static_cast<uint32_t>(deconv_filter::deconv_get_omit(
+      stride_w, static_cast<int>(filter->dims()[3]), padding_w));
+  auto sub_channels = static_cast<int>(input->dims()[1]);
+  uint32_t omit_size = arg->omit_size;
+  // Final output extent: the interleaved sub-outputs minus `omit_size`
+  // rows/columns trimmed from each edge.
+  int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size;
+  int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size;
+  framework::DDim dims_out_new = framework::make_ddim(
+      {1, arg->filter_num, real_out_height, real_out_width});
+  fpga::format_fp16_ofm(out, dims_out_new);
+  auto out_ptr = out->data<float>();
+  /*====For Addition
+  arg->output.address =
+      (half *)out_ptr +  // NOLINT
+      omit_size * sizeof(half) *
+          (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT));
+          */
+  arg->output.address = out_ptr;
+  arg->output.scale_address = out->scale;
+  // Number of 16-bit elements in one sub-convolution's filter slice.
+  int filter_offset = sub_filter_width * sub_filter_width *
+                      align_to_x(sub_channels, FILTER_ELEMENT_ALIGNMENT) *
+                      arg->sub_conv_num;
+  for (int i = 0; i < sub_conv_num; ++i) {
+    // Work through a local handle so the entry being filled is always the one
+    // just appended, regardless of what dw_conv_args held on entry.
+    auto conv = std::make_shared<DWconvArgs>();
+    arg->dw_conv_args.push_back(conv);
+    conv->sub_conv_num = sub_conv_num;
+    conv->relu_enabled = relu_enabled;
+    conv->bias_address = bias_ptr;
+    // Private copy of the i-th filter slice.
+    conv->filter_address = fpga_malloc(filter_offset * sizeof(int16_t));
+    memcpy(conv->filter_address,
+           (reinterpret_cast<half *>(filter_ptr) + i * filter_offset),
+           filter_offset * sizeof(int16_t));
+    arg->vector_dw_conv_space.push_back(std::shared_ptr<char>(
+        reinterpret_cast<char *>(conv->filter_address), deleter));
+    conv->kernel.height = sub_filter_width;
+    conv->kernel.width = sub_filter_width;
+    conv->kernel.stride_h = static_cast<uint32_t>(1);
+    conv->kernel.stride_w = static_cast<uint32_t>(1);
+    conv->image.address = input_ptr;
+    conv->image.channels = static_cast<uint32_t>(input->dims()[1]);
+    conv->image.height = static_cast<uint32_t>(input->dims()[2]);
+    conv->image.width = static_cast<uint32_t>(input->dims()[3]);
+    conv->image.pad_height = sub_pad;
+    conv->image.pad_width = sub_pad;
+    conv->image.scale_address = input->scale;
+    // Per-sub-conv output: sub_output_height rows, each row aligned to
+    // IMAGE_ALIGNMENT elements.
+    conv->output.address =
+        fpga_malloc(sub_output_height *
+                    align_to_x(sub_output_width * sub_channels * sub_conv_num,
+                               IMAGE_ALIGNMENT) *
+                    sizeof(int16_t));
+    conv->output.scale_address =
+        static_cast<float *>(fpga_malloc(2 * sizeof(float)));
+    arg->vector_dw_conv_space.push_back(std::shared_ptr<char>(
+        reinterpret_cast<char *>(conv->output.address), deleter));
+    arg->vector_dw_conv_space.push_back(std::shared_ptr<char>(
+        reinterpret_cast<char *>(conv->output.scale_address), deleter));
+  }
+}  // end DWDeconv arg fill
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/V1/api.h
+++ b/src/fpga/V1/api.h
@@ -57,6 +57,10 @@ void fill_dwconv_arg(struct DWconvArgs* arg, framework::Tensor* input,
                     framework::Tensor* out, framework::Tensor* filter,
                     bool relu_enabled, int stride_h, int stride_w,
                     int padding_h, int padding_w, float* bias_ptr);
+void fill_DWDeconv_arg(struct DWDeconvArgs* arg, framework::Tensor* input,
+                       framework::Tensor* out, framework::Tensor* filter,
+                       bool relu_enabled, int stride_h, int stride_w,
+                       int padding_h, int padding_w, float* bs_ptr);
 void format_deconv_filter(framework::Tensor* filter_tensor, float max_value,
                          int group_num, int stride);
@@ -69,6 +73,10 @@ void format_deconv_data(framework::Tensor* filter_tensor,
 void format_dwconv_data(framework::Tensor* filter_tensor,
                        framework::Tensor* ofm_tensor, float* scale_ptr,
                        float** bias_ptr);
+void format_DWDeconv_data(framework::Tensor* filter_tensor,
+                          framework::Tensor* ofm_tensor, float** bs_ptr,
+                          int group, int sub_conv_n);
 template <typename Dtype>
 void savefile(std::string filename, void* buffer, int dataSize, Dtype tmp) {
  float data;

--- a/src/fpga/V1/deconv_filter.cpp
+++ b/src/fpga/V1/deconv_filter.cpp
@@ -21,15 +21,6 @@ limitations under the License. */
 #include "fpga/V1/api.h"
 // #include "fpga_api.h"
-// just for test
-//#include <string>
-//#include "deconv.h"
-//#include "deconv_api.h"
-// using namespace std;
-// using namespace paddle_mobile::fpga;
-// using namespace baidu::fpga::deconv::api;
-// namespace api = baidu::fpga::deconv::api;
 namespace paddle_mobile {
 namespace fpga {
 namespace deconv_filter {
@@ -42,7 +33,8 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width,
  float* tmp = *data_in;
  int data_size = num * channel * width * height;
  int hw_len = height * width;
-  auto tmp_data = (float*)fpga_malloc(data_size * sizeof(float));
+  auto tmp_data =
+      reinterpret_cast<float*>(fpga_malloc(data_size * sizeof(float)));
  for (int i = 0; i < num; ++i) {
    for (int j = 0; j < channel; ++j) {
      for (int k = 0; k < hw_len; ++k) {
@@ -97,9 +89,10 @@ int deconv_get_omit(int stride, int filter_width, int pad) {
  return (stride - idx);
 }
-void deconv_get_sub_filter(char** data_in, int height, int width,
+template <typename T>
-                           int sub_conv_n, int kernel_num, int channel) {
+void deconv_get_sub_filter(T** data_in, int height, int width, int sub_conv_n,
-  char* ptr_tmp = *data_in;
+                           int kernel_num, int channel) {
+  T* ptr_tmp = *data_in;
  int sub_num = kernel_num * sub_conv_n;
  int sub_h = height / sub_conv_n;
  int sub_w = width / sub_conv_n;
@@ -107,7 +100,8 @@ void deconv_get_sub_filter(char** data_in, int height, int width,
  int sub_filter_size =
      kernel_num * sub_h * sub_w * channel * sub_conv_n * sub_conv_n;
-  char* ptr_sub_filter = (char*)fpga_malloc(sub_filter_size * sizeof(char));
+  T* ptr_sub_filter =
+      reinterpret_cast<T*>(fpga_malloc(sub_filter_size * sizeof(T)));
  for (int idx = 0; idx < sub_conv_n; ++idx) {
    for (int nn = 0; nn < sub_num; ++nn) {
      int ni = nn % kernel_num;
@@ -124,7 +118,7 @@ void deconv_get_sub_filter(char** data_in, int height, int width,
          fpga_copy(
              ptr_sub_filter + idx * sub_h * sub_w * channel * sub_num + sidx,
-              (*data_in) + kidx, channel * sizeof(char));
+              (*data_in) + kidx, channel * sizeof(T));
          // for (int cc =0; cc < channel; ++cc) {
          //     ptr_sub_filter[idx*sub_h*sub_w*channel*sub_num + sidx + cc] =
          //     (*data_in)[kidx + cc];
@@ -140,7 +134,7 @@ void deconv_get_sub_filter(char** data_in, int height, int width,
 void deconv_NC_convert(float** filter_in, int kernel_num, int channels,
                       int hw) {
  float* tmp = *filter_in;
-  float* ptr_filter = (float*)(paddle_mobile::fpga::fpga_malloc(
+  float* ptr_filter = reinterpret_cast<float*>(paddle_mobile::fpga::fpga_malloc(
      hw * kernel_num * channels * sizeof(float)));
  for (int c = 0; c < channels; ++c) {
@@ -188,7 +182,8 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
  result2);
  }*/
-  deconv_get_sub_filter(quantize_data, height, width, stride, num, channel);
+  deconv_get_sub_filter<char>(quantize_data, height, width, stride, num,
+                              channel);
  /*{
     char result2 = (char)0;
     string filename = "sub_filter_filter_data";
@@ -212,10 +207,12 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
                                ((residual == 0) ? div_num : (div_num - 1)) +
                            align_to_x(residual, FILTER_NUM_ALIGNMENT);
-  char** ptr_ptr_data = (char**)fpga_malloc(sub_conv_n * sizeof(char*));
+  char** ptr_ptr_data =
+      reinterpret_cast<char**>(fpga_malloc(sub_conv_n * sizeof(char*)));
  int origin_offset = sub_chw * sub_num;
  for (int i = 0; i < sub_conv_n; ++i) {
-    (ptr_ptr_data)[i] = (char*)fpga_malloc(origin_offset * sizeof(char));
+    (ptr_ptr_data)[i] =
+        reinterpret_cast<char*>(fpga_malloc(origin_offset * sizeof(char)));
    fpga_copy((ptr_ptr_data)[i], (*quantize_data) + origin_offset * i,
              origin_offset * sizeof(char));
@@ -233,8 +230,8 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
  int align_offset =
      align_to_x(sub_chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment;
-  char* ptr_space = (char*)fpga_malloc(sub_conv_n * align_offset *
+  char* ptr_space = reinterpret_cast<char*>(fpga_malloc(
-                                       sizeof(char));  // continuous space
+      sub_conv_n * align_offset * sizeof(char)));  // continuous space
  for (int i = 0; i < sub_conv_n; ++i) {
    char* ptr_tmp = (ptr_ptr_data)[i];
@@ -251,7 +248,7 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
    fpga_copy(ptr_space + i * align_offset, ptr_tmp, align_offset);
    fpga_free(ptr_tmp);
  }
-  *data_in = (float*)ptr_space;
+  *data_in = reinterpret_cast<float*>(ptr_space);
  /*    {
        char result2 = (char)0;
@@ -262,6 +259,22 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
  fpga_flush(ptr_space, sub_conv_n * align_offset * sizeof(char));
 }
+// Run the depthwise-deconvolution filter pipeline on `*data_in` in place:
+// inverse the filter, quantize to fp16, convert to HWN, split into
+// per-stride sub-filters, align each element, then flush the result.
+// From quantize_to_fp16 onwards the buffer behind `*data_in` is treated as
+// int16_t data.
+void DWDconv_format_filter(float** data_in, int num, int channel, int height,
+                           int width, float* scale_ptr, int stride) {
+  deconv_inverse_filter(data_in, num, channel, width, height);
+  filter::quantize_to_fp16(data_in, channel, height, width, scale_ptr);
+  // Same storage, now viewed as 16-bit elements.
+  int16_t** fp16_data = reinterpret_cast<int16_t**>(data_in);
+  filter::convert_to_hwn(fp16_data, channel, height, width);
+  deconv_get_sub_filter<int16_t>(fp16_data, height, width, stride, num,
+                                 channel);
+  filter::align_element_n(fp16_data, channel, height, width);
+  const size_t flush_bytes = align_to_x(channel, FILTER_ELEMENT_ALIGNMENT) *
+                             height * width * sizeof(int16_t);
+  fpga_flush(*fp16_data, flush_bytes);
+}
 }  // namespace deconv_filter
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/V1/deconv_filter.h
+++ b/src/fpga/V1/deconv_filter.h
@@ -24,11 +24,15 @@ int deconv_calc_sub_pad(int filter_axis, int pad, int stride);
 int deconv_get_sub_filter_axis(int filter_axis, int stride);
 int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis);
 int deconv_get_omit(int stride, int filter_width, int pad);
-void deconv_get_sub_filter(char** data_in, int height, int width,
-                           int sub_conv_n, int kernel_num, int channel);
+template <typename T>
+void deconv_get_sub_filter(T** data_in, int height, int width, int sub_conv_n,
+                           int kernel_num, int channel);
 void deconv_format_filter(float** data_in, int num, int channel, int height,
                          int width, int group_num, float max, int stride);
 void deconv_NC_convert(float** filter_in, int kernel_num, int channels, int hw);
+void DWDconv_format_filter(float** data_in, int num, int channel, int height,
+                           int width, float* scale_ptr, int stride);
 }  // namespace deconv_filter
 }  // namespace fpga

--- a/src/fpga/V1/filter.cpp
+++ b/src/fpga/V1/filter.cpp
@@ -346,6 +346,16 @@ void format_dwconv_filter(float **data_in, int num, int height, int width,
  fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) *
                                 height * width * sizeof(int16_t));
 }
+// Format a depthwise-deconvolution filter in place: quantize `*data_in` to
+// fp16, convert it to HWN order, align each element, and flush the resulting
+// 16-bit buffer.
+void format_DWDeconv_filter(float **data_in, int num, int height, int width,
+                            float *scale_ptr) {
+  quantize_to_fp16(data_in, num, height, width, scale_ptr);
+  // Same storage, now viewed as 16-bit elements.
+  int16_t **fp16_data = reinterpret_cast<int16_t **>(data_in);
+  convert_to_hwn(fp16_data, num, height, width);
+  align_element_n(fp16_data, num, height, width);
+  const size_t flush_bytes = align_to_x(num, FILTER_ELEMENT_ALIGNMENT) *
+                             height * width * sizeof(int16_t);
+  fpga_flush(*fp16_data, flush_bytes);
+}
 }  // namespace filter
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/V1/pe.cpp
+++ b/src/fpga/V1/pe.cpp
@@ -18,7 +18,6 @@ limitations under the License. */
 #include "fpga/V1/image.h"
 #include "fpga/common/config.h"
 #include "fpga/common/driver.h"
 #ifdef COST_TIME_PRINT
 #include <sys/time.h>
 #include <time.h>
@@ -163,6 +162,7 @@ using namespace std;     // NOLINT
 #define REG_DWCONV_FILTER_BASE_ADDR 0xe08
 #define REG_DWCONV_FILTER_SHAPE 0xe10
 #define REG_DWCONV_FILTER_N_ALIGN 0xe18
+#define REG_DWCONV_FILTER_SUBNUMBER 0xe20
 #define REG_DWCONV_CMD 0xe00
 int ComputeFpgaConv(const struct SplitConvArgs &args) {
@@ -591,6 +591,20 @@ int PerformBypass(const struct BypassArgs &args) {
  return 0;
 }  // PerformBypass
+// Return the value of REG_HARDWARE_STATUS, read while holding the PE mutex,
+// when built with PADDLE_MOBILE_ZU5; return 0 in every other build.
+uint64_t FPGAVersion() {
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============FPGAVersion===========";
+#endif
+#ifdef PADDLE_MOBILE_ZU5
+  pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
+  uint64_t fpga_ver = reg_readq(REG_HARDWARE_STATUS);
+  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
+  return fpga_ver;
+#else
+  return 0;
+#endif
+}  // FPGAVersion
 int ComputeFPGAConcat(const struct ConcatArgs &args) {
 #ifdef FPGA_PRINT_MODE
  DLOG << "=============ComputeFpgaConcat===========";
@@ -655,6 +669,45 @@ void deconv_post_process(const struct DeconvArgs &args) {
  fpga_flush(args.output.address,
             num * align_deconv_row_len * deconv_h * sizeof(int16_t));
 }
+// Assemble the final deconvolution output from the per-sub-convolution
+// output buffers.
+//
+// Output row `hh` (before trimming) is taken from row `hh / sub_conv_n` of
+// sub-convolution `sub_conv_n - 1 - hh % sub_conv_n`. `omit_size` rows are
+// dropped at the top and bottom, and `omit_size * channel` elements are
+// skipped at the start of every kept row. Destination rows are packed with a
+// stride of `align_deconv_row_len` elements.
+void DWDeconv_post_process(const struct DWDeconvArgs &args) {
+  int sub_conv_n = args.sub_conv_num;
+  int sub_height = args.sub_output_height;
+  int sub_width = args.sub_output_width;
+  int omit_size = args.omit_size;
+  int channel = args.filter_num;
+  int num = 1;
+  int origin_h = sub_height * sub_conv_n;
+  int origin_w = sub_width * sub_conv_n;
+  int align_origin_w = align_to_x(origin_w * channel, IMAGE_ALIGNMENT);
+  int deconv_h = origin_h - 2 * omit_size;
+  int deconv_w = origin_w - 2 * omit_size;
+  int deconv_row_len = deconv_w * channel;
+  int align_deconv_row_len = align_to_x(deconv_row_len, IMAGE_ALIGNMENT);
+  // Each sub-conv buffer holds sub_height rows of align_origin_w elements
+  // (see the allocation in fill_DWDeconv_arg), and only those rows are read
+  // below, so invalidate exactly that range rather than origin_h rows.
+  for (int idx = 0; idx < sub_conv_n; ++idx) {
+    paddle_mobile::fpga::fpga_invalidate(
+        args.dw_conv_args[idx]->output.address,
+        align_origin_w * sub_height * sizeof(int16_t));
+  }
+  int deconv_idx = 0;
+  for (int nn = 0; nn < num; ++nn) {
+    for (int hh = 0; hh < origin_h; ++hh) {
+      // Rows inside the omitted top/bottom border are not emitted.
+      if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue;
+      int hx = (hh % sub_conv_n);
+      int hi = (hh / sub_conv_n);
+      auto sub_t = reinterpret_cast<int16_t *>(
+          args.dw_conv_args[sub_conv_n - hx - 1]->output.address);
+      int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w +
+                  omit_size * channel);
+      fpga_copy(reinterpret_cast<int16_t *>(args.output.address) + deconv_idx,
+                sub_t + sidx, sizeof(int16_t) * deconv_row_len);
+      deconv_idx += align_deconv_row_len;
+    }
+  }
+  fpga_flush(args.output.address,
+             num * align_deconv_row_len * deconv_h * sizeof(int16_t));
+}
 int ComputeFpgaDeconv(const struct DeconvArgs &args) {
 #ifdef FPGA_PRINT_MODE
@@ -792,17 +845,21 @@ int ComputeDWConv(const struct DWconvArgs &args) {
      align_to_x((uint64_t)args.image.channels, IMAGE_ALIGNMENT);
  uint64_t filter_amount_per_row_align =
      filter_N_align * (uint64_t)args.kernel.width;
-  uint64_t filter_amount_align = filter_N_align * (uint64_t)args.kernel.width *
+  uint64_t sub_filter_amount_align = filter_N_align *
-                                 (uint64_t)args.kernel.height;
+                                     (uint64_t)args.kernel.width *
+                                     (uint64_t)args.kernel.height;
+  uint64_t filter_amount_align =
+      sub_filter_amount_align * (uint64_t)args.sub_conv_num;
  uint32_t output_height = (uint32_t)(
      (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
          args.kernel.stride_h +
      1);
  uint32_t output_width = (uint32_t)(
-      (args.image.width + args.image.pad_width * 2 - args.kernel.width) /
+      ((args.image.width + args.image.pad_width * 2 - args.kernel.width) /
-          args.kernel.stride_w +
+           args.kernel.stride_w +
-      1);
+       1) *
+      args.sub_conv_num);
  uint64_t image_amount_per_row =
      align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
@@ -845,12 +902,15 @@ int ComputeDWConv(const struct DWconvArgs &args) {
  /*restart scale*/
  reg_writeq(output_scale, REG_SCALE_PARAMETER);
  reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR);
  reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR);
  reg_writeq((bias_physical_address << 32 | filter_physical_address),
             REG_DWCONV_FILTER_BASE_ADDR);
  reg_writeq(filter_amount_per_row_align | (filter_amount_align << 32),
             REG_DWCONV_FILTER_SHAPE);
+  reg_writeq(sub_filter_amount_align | (((uint64_t)args.sub_conv_num) << 32),
+             REG_DWCONV_FILTER_SUBNUMBER);
  reg_writeq(filter_N_align, REG_DWCONV_FILTER_N_ALIGN);
  reg_writeq(
@@ -904,10 +964,88 @@ int ComputeDWConv(const struct DWconvArgs &args) {
  output_scale = reg_readq(REG_SCALE_PARAMETER);
  output_scale = (output_scale << 32) | (output_scale >> 32);
  fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
+  DLOG << "output_scale:" << output_scale;
  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
  return ret;
 #endif
  return 0;
 }
+// Run a depthwise deconvolution: execute every prepared sub-convolution with
+// ComputeDWConv, copy the largest sub-convolution output scale into
+// args.output.scale_address, then merge the sub-outputs with
+// DWDeconv_post_process. Always returns 0; the return values of
+// ComputeDWConv are not inspected.
+int ComputeDWDeconv(const struct DWDeconvArgs &args) {
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============ComputeDWDeconv===========";
+  DLOG << "   filter_num:" << args.filter_num
+       << "   group_num:" << args.group_num << "omit_size:" << args.omit_size
+       << "sub_output_width: " << args.sub_output_width
+       << "sub_output_height: " << args.sub_output_height
+       << "   sub_conv_num:" << args.sub_conv_num;
+  DLOG << "args.output.address: " << args.output.address
+       << "args.output.scale_address: " << args.output.scale_address;
+#endif
+  int sub_conv_num = args.sub_conv_num;
+#ifdef COST_TIME_PRINT
+  timeval start, end;
+  long dif_sec, dif_usec;  // NOLINT
+#endif
+  for (int i = 0; i < sub_conv_num; i++) {
+#ifdef COST_TIME_PRINT
+    gettimeofday(&start, NULL);
+#endif
+    ComputeDWConv(*args.dw_conv_args[i]);
+#ifdef COST_TIME_PRINT
+    gettimeofday(&end, NULL);
+    dif_sec = end.tv_sec - start.tv_sec;
+    dif_usec = end.tv_usec - start.tv_usec;
+    std::cout << "deconv basic_conv: " << i << " times:  "
+              << "    cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
+              << std::endl;
+#endif
+  }
+  // Every sub-convolution writes its scale into its own buffer, so the
+  // overall output scale has to be gathered here even when there is only one
+  // sub-convolution.
+  float max_scale = -1.0f;
+#ifdef COST_TIME_PRINT
+  gettimeofday(&start, NULL);
+#endif
+  for (int i = 0; i < sub_conv_num; i++) {
+    float *sub_scale = args.dw_conv_args[i]->output.scale_address;
+    paddle_mobile::fpga::fpga_invalidate(sub_scale, 2 * sizeof(float));
+    if (sub_scale[0] > max_scale) {
+      // Track the running maximum so a later, smaller scale cannot overwrite
+      // a larger one.
+      max_scale = sub_scale[0];
+      args.output.scale_address[0] = sub_scale[0];
+      args.output.scale_address[1] = sub_scale[1];
+    }
+  }
+#ifdef COST_TIME_PRINT
+  gettimeofday(&end, NULL);
+  dif_sec = end.tv_sec - start.tv_sec;
+  dif_usec = end.tv_usec - start.tv_usec;
+  std::cout << "deconv scale  "
+            << "    cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
+            << std::endl;
+#endif
+#ifdef COST_TIME_PRINT
+  gettimeofday(&start, NULL);
+#endif
+  DWDeconv_post_process(args);
+#ifdef COST_TIME_PRINT
+  gettimeofday(&end, NULL);
+  dif_sec = end.tv_sec - start.tv_sec;
+  dif_usec = end.tv_usec - start.tv_usec;
+  std::cout << "deconv_post_process  "
+            << "    cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
+            << std::endl;
+#endif
+  return 0;
+}  // ComputeDWDeconv
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/common/fpga_common.cpp
+++ b/src/fpga/common/fpga_common.cpp
@@ -76,7 +76,7 @@ int32_t convertmantissa(int32_t i) {
 }
 float fp16_2_fp32(int16_t fp16_num) {
-  int16_t se_fp16 = fp16_num >> 10;
+  int16_t se_fp16 = (fp16_num >> 10) & 0x3f;
  int16_t m_fp16 = fp16_num & 0x3ff;
  int32_t e_fp32 = 0;
  int16_t offset = 0;
@@ -94,7 +94,7 @@ float fp16_2_fp32(int16_t fp16_num) {
    e_fp32 = 0x80000000;
    offset = 0;
  } else if (se_fp16 < 63) {
-    e_fp32 = 0x80000000 + (se_fp16 - 32) << 23;
+    e_fp32 = 0x80000000 + ((se_fp16 - 32) << 23);
    offset = 1024;
  } else {  // se_fp16 == 63
    e_fp32 = 0xC7800000;

--- a/src/fpga/common/fpga_common.h
+++ b/src/fpga/common/fpga_common.h
@@ -229,6 +229,7 @@ struct DeconvArgs {
  std::vector<std::shared_ptr<SplitConvArgs>> split_conv_args;
 };
 struct DWconvArgs {
+  uint32_t sub_conv_num;
  bool relu_enabled;
  void* bias_address;
  void* filter_address;
@@ -236,6 +237,19 @@ struct DWconvArgs {
  struct ImageInputArgs image;
  struct ImageOutputArgs output;
 };
+struct DWDeconvArgs {  // Arguments consumed by ComputeDWDeconv.
+  uint32_t sub_conv_num;  // Number of sub-convolutions (entries used in dw_conv_args).
+  uint32_t group_num;
+  uint32_t filter_num;  // Used as the channel count when merging sub-outputs.
+  uint32_t omit_size;  // Rows/columns trimmed from each edge of the merged output.
+  uint32_t sub_output_width;  // Output width of a single sub-convolution.
+  uint32_t sub_output_height;  // Output height of a single sub-convolution.
+  struct ImageOutputArgs output;  // Final (merged) output buffer and its scale.
+  std::vector<std::shared_ptr<DWconvArgs>> dw_conv_args;  // One DWconvArgs per sub-convolution.
+  std::vector<std::shared_ptr<char>> vector_dw_conv_space;  // Keeps per-sub-conv buffers alive.
+};
 // static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x;
 // }
 static inline uint32_t align_to_x(int64_t num, int64_t x) {

--- a/src/fpga/common/pe.h
+++ b/src/fpga/common/pe.h
@@ -18,6 +18,7 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace fpga {
+uint64_t FPGAVersion();
 int PerformBypass(const struct BypassArgs& args);
 int ComputeBasicConv(const struct ConvArgs& args);
 int ComputeFpgaPool(const struct PoolingArgs& args);
@@ -28,5 +29,7 @@ int ComputeFPGAConcat(const struct ConcatArgs& args);
 int ComputeFPGASplit(const struct SplitArgs& args);
 int ComputeFpgaDeconv(const struct DeconvArgs& args);
 int ComputeDWConv(const struct DWconvArgs& args);
+int ComputeDWDeconv(const struct DWDeconvArgs& args);
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/framework/attribute.h
+++ b/src/framework/attribute.h
@@ -90,6 +90,10 @@ class Attribute {
        attr.Set<int64_t>(attr_desc->l);
        break;
      }
+      case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK: {
+        attr.Set<int>(attr_desc->block_idx);
+        break;
+      }
      default:
        PADDLE_MOBILE_THROW_EXCEPTION("attr type not support");
    }

--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -65,6 +65,7 @@ Executor<Device, T>::Executor(const Program<Device> &program,
    for (int j = 0; j < ops.size(); ++j) {
      std::shared_ptr<OpDesc> op_desc = ops[j];
      DLOG << "create op: " << op_desc->Type();
      auto op_handler = OpRegistry<Device>::CreateOp(
          op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(),
          op_desc->GetAttrMap(), program_.scope);

--- a/src/framework/load_ops.h
+++ b/src/framework/load_ops.h
@@ -168,6 +168,9 @@ LOAD_FUSION_MATCHER(fusion_conv_bn_relu);
 #ifdef GRU_OP
 LOAD_OP1(gru, CPU);
 #endif
+#ifdef GRU_UNIT_OP
+LOAD_OP1(gru_unit, CPU);
+#endif
 #ifdef FUSION_CONVADDBN_OP
 LOAD_OP2(fusion_conv_add_bn, CPU, FPGA);
 LOAD_FUSION_MATCHER(fusion_conv_add_bn);
@@ -189,6 +192,9 @@ LOAD_OP1(crf_decoding, CPU);
 #ifdef MUL_OP
 LOAD_OP2(mul, CPU, MALI_GPU);
 #endif
+#ifdef NORM_OP
+LOAD_OP1(norm, CPU);
+#endif
 #ifdef RELU_OP
 LOAD_OP2(relu, CPU, MALI_GPU);
 LOAD_OP1(relu6, CPU);
@@ -279,3 +285,24 @@ LOAD_OP1(lod_reset, CPU);
 #ifdef LESS_THAN_OP
 LOAD_OP1(less_than, CPU);
 #endif
+#ifdef LOGICAL_AND_OP
+LOAD_OP1(logical_and, CPU);
+#endif
+#ifdef LOGICAL_OR_OP
+LOAD_OP1(logical_or, CPU);
+#endif
+#ifdef LOGICAL_NOT_OP
+LOAD_OP1(logical_not, CPU);
+#endif
+#ifdef LOGICAL_XOR_OP
+LOAD_OP1(logical_xor, CPU);
+#endif
+#ifdef WHILE_OP
+LOAD_OP1(while, CPU);
+#endif
+#ifdef WRITE_TO_ARRAY_OP
+LOAD_OP1(write_to_array, CPU);
+#endif
+#ifdef READ_FROM_ARRAY_OP
+LOAD_OP1(read_from_array, CPU);
+#endif
--- a/src/framework/lod_tensor.h
+++ b/src/framework/lod_tensor.h
@@ -176,6 +176,8 @@ LoDTensor LodExpand(const LoDTensor &source, const LoD &lod, size_t level) {
  return tensor;
 }
+using LoDTensorArray = std::vector<LoDTensor>;
 // Get the absolute offset of a lod[start_level][start_idx:end_idx] and
 // relative length of details for every levels(i.e., [start_level: ]).
 //

--- a/src/framework/program/op_desc.cpp
+++ b/src/framework/program/op_desc.cpp
@@ -41,9 +41,7 @@ OpDesc::OpDesc(PaddleMobile__Framework__Proto__OpDesc *desc) {
  for (int k = 0; k < desc->n_attrs; ++k) {
    PaddleMobile__Framework__Proto__OpDesc__Attr *attr = desc->attrs[k];
    std::string attr_name(attr->name);
-    if (attr->type != PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK) {
+    attrs_[attr_name] = Attribute::GetAttrValue(attr);
-      attrs_[attr_name] = Attribute::GetAttrValue(attr);
-    }
  }
 }

--- a/src/io/ios_io/PaddleMobileCPU.h
+++ b/src/io/ios_io/PaddleMobileCPU.h
@@ -44,6 +44,11 @@
 */
 @property  (assign, nonatomic) BOOL optimize;
+/**
+ @b Whether memory is initialized at prediction time; used to handle variable-sized input
+ */
+@property  (assign, nonatomic) BOOL loadWhenPredict;
 @end
 @interface PaddleMobileCPU : NSObject

--- a/src/io/ios_io/PaddleMobileCPU.mm
+++ b/src/io/ios_io/PaddleMobileCPU.mm
@@ -73,6 +73,8 @@ static std::mutex shared_mutex;
 - (instancetype)initWithConfig:(PaddleMobileCPUConfig *)config {
  if (self = [super init]) {
+    paddle_mobile::PaddleMobileConfigInternal configInternal;
+    configInternal.load_when_predict = config.loadWhenPredict;
    pam_ = new paddle_mobile::PaddleMobile<paddle_mobile::CPU, float>();
    _config = config;
  }

--- a/src/io/jni/PML.java
+++ b/src/io/jni/PML.java
@@ -7,15 +7,7 @@ public class PML {
     * @param modelDir model dir
     * @return isloadsuccess
     */
-    public static native boolean load(String modelDir);
+    // lodMode is a primitive boolean so that it matches the jboolean parameter
+    // of Java_com_baidu_paddle_PML_load; a boxed Boolean would be handed to
+    // native code as an object reference (jobject) instead of a jboolean.
+    public static native boolean load(String modelDir, boolean lodMode);
-    /**
-     * load seperated model
-     *
-     * @param modelDir model dir
-     * @return isloadsuccess
-     */
-    public static native boolean loadnlp(String modelDir);
    /**
     * load combined model
@@ -24,7 +16,7 @@ public class PML {
     * @param paramPath param file path
     * @return isloadsuccess
     */
-    public static native boolean loadCombined(String modelPath, String paramPath);
+    // lodMode is a primitive boolean so that it matches the jboolean parameter
+    // of Java_com_baidu_paddle_PML_loadCombined; a boxed Boolean would be
+    // handed to native code as an object reference (jobject).
+    public static native boolean loadCombined(String modelPath, String paramPath, boolean lodMode);
    /**
     * load model and qualified params
@@ -32,7 +24,7 @@ public class PML {
     * @param modelDir qualified model dir
     * @return isloadsuccess
     */
-    public static native boolean loadQualified(String modelDir);
+    // lodMode is a primitive boolean so that it matches the jboolean parameter
+    // of Java_com_baidu_paddle_PML_loadQualified; a boxed Boolean would be
+    // handed to native code as an object reference (jobject).
+    public static native boolean loadQualified(String modelDir, boolean lodMode);
    /**
     * load model and qualified combined params
@@ -41,7 +33,7 @@ public class PML {
     * @param paramPath qualified param path
     * @return isloadsuccess
     */
-    public static native boolean loadCombinedQualified(String modelPath, String paramPath);
+    // lodMode is a primitive boolean so that it matches the jboolean parameter
+    // of Java_com_baidu_paddle_PML_loadCombinedQualified; a boxed Boolean would
+    // be handed to native code as an object reference (jobject).
+    public static native boolean loadCombinedQualified(String modelPath, String paramPath, boolean lodMode);
    /**
     * predict image
@@ -52,9 +44,12 @@ public class PML {
     */
    public static native float[] predictImage(float[] buf, int[] ddims);
    public static native float[] predictYuv(byte[] buf, int imgWidth, int imgHeight, int[] ddims, float[] meanValues);
+    // predict with variable length input
+    // support only one input and one output currently
+    // NOTE(review): paddle_mobile_jni.h declares
+    // Java_com_baidu_paddle_PML_predictLod with jlongArray for both the
+    // argument and the return value, which corresponds to long[] rather than
+    // float[] - confirm which side is intended.
+    public static native float[] predictLod(float[] buf);
    /**
     * clear model data
     */
@@ -66,6 +61,4 @@ public class PML {
     * @param threadCount threadCount
     */
    public static native void setThread(int threadCount);
 }
--- a/src/io/jni/paddle_mobile_jni.cpp
+++ b/src/io/jni/paddle_mobile_jni.cpp
@@ -39,7 +39,7 @@ using framework::Tensor;
 using paddle_mobile::CPU;
 using std::string;
-extern const char *ANDROID_LOG_TAG =
+const char *ANDROID_LOG_TAG =
    "paddle_mobile LOG built on " __DATE__ " " __TIME__;
 paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
 static std::mutex shared_mutex;
@@ -55,51 +55,31 @@ string jstring2cppstring(JNIEnv *env, jstring jstr) {
 JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
                                                          jclass thiz,
-                                                          jstring modelPath) {
+                                                          jstring modelPath,
+                                                          jboolean lodMode) {
  std::lock_guard<std::mutex> lock(shared_mutex);
  ANDROIDLOGI("load invoked");
  bool optimize = true;
  bool isLoadOk = false;
 #ifdef ENABLE_EXCEPTION
  try {
    isLoadOk = getPaddleMobileInstance()->Load(
-        jstring2cppstring(env, modelPath), optimize);
+        jstring2cppstring(env, modelPath), optimize, false, 1,
+        static_cast<bool>(lodMode));
  } catch (paddle_mobile::PaddleMobileException &e) {
    ANDROIDLOGE("jni got an PaddleMobileException! ", e.what());
    isLoadOk = false;
  }
 #else
  isLoadOk = getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath),
-                                             optimize);
+                                             optimize, false, 1,
-#endif
+                                             static_cast<bool>(lodMode));
-  return static_cast<jboolean>(isLoadOk);
-}
-JNIEXPORT jboolean JNICALL
-Java_com_baidu_paddle_PML_loadnlp(JNIEnv *env, jclass thiz, jstring modelPath) {
-  std::lock_guard<std::mutex> lock(shared_mutex);
-  ANDROIDLOGI("load invoked");
-  bool optimize = true;
-  bool isLoadOk = false;
-#ifdef ENABLE_EXCEPTION
-  try {
-    isLoadOk = getPaddleMobileInstance()->Load(
-        jstring2cppstring(env, modelPath), optimize, false, 1, true);
-  } catch (paddle_mobile::PaddleMobileException &e) {
-    ANDROIDLOGE("jni got an PaddleMobileException! ", e.what());
-    isLoadOk = false;
-  }
-#else
-  isLoadOk = getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath),
-                                             optimize, false, 1, true);
 #endif
  return static_cast<jboolean>(isLoadOk);
 }
 JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadQualified(
-    JNIEnv *env, jclass thiz, jstring modelPath) {
+    JNIEnv *env, jclass thiz, jstring modelPath, jboolean lodMode) {
  std::lock_guard<std::mutex> lock(shared_mutex);
  ANDROIDLOGI("loadQualified invoked");
@@ -110,21 +90,24 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadQualified(
 #ifdef ENABLE_EXCEPTION
  try {
    isLoadOk = getPaddleMobileInstance()->Load(
-        jstring2cppstring(env, modelPath), optimize, qualified);
+        jstring2cppstring(env, modelPath), optimize, qualified, 1,
+        static_cast<bool>(lodMode));
  } catch (paddle_mobile::PaddleMobileException &e) {
    ANDROIDLOGE("jni got an PaddleMobileException! ", e.what());
    isLoadOk = false;
  }
 #else
  isLoadOk = getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath),
-                                             optimize, qualified);
+                                             optimize, qualified, 1,
+                                             static_cast<bool>(lodMode));
 #endif
  return static_cast<jboolean>(isLoadOk);
 }
 JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined(
-    JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath) {
+    JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath,
+    jboolean lodMode) {
  std::lock_guard<std::mutex> lock(shared_mutex);
  ANDROIDLOGI("loadCombined invoked");
  bool optimize = true;
@@ -134,21 +117,22 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined(
  try {
    isLoadOk = getPaddleMobileInstance()->Load(
        jstring2cppstring(env, modelPath), jstring2cppstring(env, paramPath),
-        optimize);
+        optimize, false, 1, static_cast<bool>(lodMode));
  } catch (paddle_mobile::PaddleMobileException &e) {
    ANDROIDLOGE("jni got an PaddleMobileException! ", e.what());
    isLoadOk = false;
  }
 #else
-  isLoadOk = getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath),
+  isLoadOk = getPaddleMobileInstance()->Load(
-                                             jstring2cppstring(env, paramPath),
+      jstring2cppstring(env, modelPath), jstring2cppstring(env, paramPath),
-                                             optimize);
+      optimize, false, 1, static_cast<bool>(lodMode));
 #endif
  return static_cast<jboolean>(isLoadOk);
 }
 JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombinedQualified(
-    JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath) {
+    JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath,
+    jboolean lodMode) {
  std::lock_guard<std::mutex> lock(shared_mutex);
  ANDROIDLOGI("loadCombinedQualified invoked");
  bool optimize = true;
@@ -159,15 +143,15 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombinedQualified(
  try {
    isLoadOk = getPaddleMobileInstance()->Load(
        jstring2cppstring(env, modelPath), jstring2cppstring(env, paramPath),
-        optimize, qualified);
+        optimize, qualified, 1, static_cast<bool>(lodMode));
  } catch (paddle_mobile::PaddleMobileException &e) {
    ANDROIDLOGE("jni got an PaddleMobileException! ", e.what());
    isLoadOk = false;
  }
 #else
-  isLoadOk = getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath),
+  isLoadOk = getPaddleMobileInstance()->Load(
-                                             jstring2cppstring(env, paramPath),
+      jstring2cppstring(env, modelPath), jstring2cppstring(env, paramPath),
-                                             optimize, qualified);
+      optimize, qualified, 1, static_cast<bool>(lodMode));
 #endif
  return static_cast<jboolean>(isLoadOk);
 }

--- a/src/io/jni/paddle_mobile_jni.h
+++ b/src/io/jni/paddle_mobile_jni.h
@@ -26,24 +26,27 @@ namespace jni {
 */
 JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
                                                          jclass thiz,
-                                                          jstring modelPath);
+                                                          jstring modelPath,
+                                                          jboolean lodMode);
 /**
 * load separated qualified model for android
 */
 JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadQualified(
-    JNIEnv *env, jclass thiz, jstring modelPath);
+    JNIEnv *env, jclass thiz, jstring modelPath, jboolean lodMode);
 /**
 * load combined model  for android
 */
 JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined(
-    JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath);
+    JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath,
+    jboolean lodMode);
 /**
 * load combined qualified model for android
 */
 JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombinedQualified(
-    JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath);
+    JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath,
+    jboolean lodMode);
 /**
 * object detection for anroid
@@ -61,8 +64,8 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictYuv(
 /**
 * object detection for anroid
 */
-JNIEXPORT jfloatArray JNICALL
+JNIEXPORT jlongArray JNICALL
-Java_com_baidu_paddle_PML_predict(JNIEnv *env, jclass thiz, jfloatArray buf);
+Java_com_baidu_paddle_PML_predictLod(JNIEnv *env, jclass thiz, jlongArray buf);
 /**
 * setThreadCount for multithread

--- a/src/operators/controlflow/tensor_array_read_write_op.cpp
+++ b/src/operators/controlflow/tensor_array_read_write_op.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "operators/controlflow/tensor_array_read_write_op.h"
+namespace paddle_mobile {
+namespace operators {
+#ifdef WRITE_TO_ARRAY_OP
+// InferShape for WriteToArrayOp has an empty body: no shape is computed or
+// assigned here.
+template <typename Dtype, typename T>
+void WriteToArrayOp<Dtype, T>::InferShape() const {}
+#endif  // WRITE_TO_ARRAY_OP
+#ifdef READ_FROM_ARRAY_OP
+// InferShape for ReadFromArrayOp has an empty body: no shape is computed or
+// assigned here.
+template <typename Dtype, typename T>
+void ReadFromArrayOp<Dtype, T>::InferShape() const {}
+#endif  // READ_FROM_ARRAY_OP
+}  // namespace operators
+}  // namespace paddle_mobile
+namespace ops = paddle_mobile::operators;
+// Registration of the tensor-array operators under the names "write_to_array"
+// and "read_from_array". Each line is compiled only when PADDLE_MOBILE_CPU
+// and the matching per-operator macro are both defined.
+#ifdef PADDLE_MOBILE_CPU
+#ifdef WRITE_TO_ARRAY_OP
+REGISTER_OPERATOR_CPU(write_to_array, ops::WriteToArrayOp);
+#endif  // WRITE_TO_ARRAY_OP
+#ifdef READ_FROM_ARRAY_OP
+REGISTER_OPERATOR_CPU(read_from_array, ops::ReadFromArrayOp);
+#endif  // READ_FROM_ARRAY_OP
+#endif  // PADDLE_MOBILE_CPU
--- a/src/operators/controlflow/tensor_array_read_write_op.h
+++ b/src/operators/controlflow/tensor_array_read_write_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <string>
+#include "framework/operator.h"
+#include "operators/kernel/tensor_array_read_write_kernel.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+#ifdef WRITE_TO_ARRAY_OP
+// Declares the WriteToArray operator from its parameter type
+// (WriteToArrayParam) and kernel type (WriteToArrayKernel) via the
+// DECLARE_OPERATOR macro, which is defined outside this file.
+DECLARE_OPERATOR(WriteToArray, WriteToArrayParam, WriteToArrayKernel);
+#endif  // WRITE_TO_ARRAY_OP
+#ifdef READ_FROM_ARRAY_OP
+// Declares the ReadFromArray operator from its parameter type
+// (ReadFromArrayParam) and kernel type (ReadFromArrayKernel) via the
+// DECLARE_OPERATOR macro, which is defined outside this file.
+DECLARE_OPERATOR(ReadFromArray, ReadFromArrayParam, ReadFromArrayKernel);
+#endif  // READ_FROM_ARRAY_OP
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/controlflow/while_op.cpp
+++ b/src/operators/controlflow/while_op.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "operators/controlflow/while_op.h"
+namespace paddle_mobile {
+namespace operators {
+#ifdef WHILE_OP
+// InferShape for WhileOp is not implemented yet: the body holds only a TODO
+// and performs no work.
+template <typename Dtype, typename T>
+void WhileOp<Dtype, T>::InferShape() const {
+  // TODO(hjchen2)
+}
+#endif  // WHILE_OP
+}  // namespace operators
+}  // namespace paddle_mobile
+namespace ops = paddle_mobile::operators;
+// Registration of the while operator under the name "while"; compiled only
+// when PADDLE_MOBILE_CPU and WHILE_OP are both defined.
+#ifdef PADDLE_MOBILE_CPU
+#ifdef WHILE_OP
+REGISTER_OPERATOR_CPU(while, ops::WhileOp);
+#endif  // WHILE_OP
+#endif  // PADDLE_MOBILE_CPU
--- a/src/operators/controlflow/while_op.h
+++ b/src/operators/controlflow/while_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <string>
+#include "framework/operator.h"
+#include "operators/kernel/while_kernel.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+#ifdef WHILE_OP
+// Declares the While operator from its parameter type (WhileParam) and
+// kernel type (WhileKernel) via the DECLARE_OPERATOR macro, which is defined
+// outside this file.
+DECLARE_OPERATOR(While, WhileParam, WhileKernel);
+#endif  // WHILE_OP
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDBNRELU_OP
 #include "operators/kernel/conv_add_bn_relu_kernel.h"
-#include <math.h>
+#include <cmath>
 #include "operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h"
 namespace paddle_mobile {

--- a/src/operators/kernel/arm/conv_bn_add_relu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_bn_add_relu_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVBNADDRELU_OP
 #include "operators/kernel/conv_bn_add_relu_kernel.h"
+#include <cmath>
 #include "operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h"
 namespace paddle_mobile {

--- a/src/operators/kernel/arm/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_bn_relu_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVBNRELU_OP
 #include "operators/kernel/conv_bn_relu_kernel.h"
+#include <cmath>
 #include "operators/kernel/central-arm-func/conv_bn_relu_arm_func.h"
 namespace paddle_mobile {

--- a/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_DWCONVBNRELU_OP
 #include "operators/kernel/dwconv_bn_relu_kernel.h"
+#include <cmath>
 #include "operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h"
 namespace paddle_mobile {

--- a/src/operators/kernel/arm/logical_kernel.cpp
+++ b/src/operators/kernel/arm/logical_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "operators/kernel/logical_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+// Binary functor: returns `a && b` as a bool.
+template <typename T>
+struct LogicalAndFunctor {
+  bool operator()(const T& a, const T& b) const { return a && b; }
+};
+// Binary functor: returns `a || b` as a bool.
+template <typename T>
+struct LogicalOrFunctor {
+  bool operator()(const T& a, const T& b) const { return a || b; }
+};
+// Unary functor: returns `!a` as a bool.
+template <typename T>
+struct LogicalNotFunctor {
+  bool operator()(const T& a) const { return !a; }
+};
+// Binary functor: returns true when exactly one of `a` and `b` is truthy,
+// expressed as "either one, but not both".
+template <typename T>
+struct LogicalXorFunctor {
+  bool operator()(const T& a, const T& b) const {
+    return (a || b) && !(a && b);
+  }
+};
+// Applies a default-constructed Functor to each of the inputX->numel()
+// elements of inputX, read as T, and writes the results through
+// output->data<T>(). Nothing here allocates the output; the caller in this
+// file calls mutable_data<bool>() on the output before invoking this helper.
+template <typename T, typename Functor>
+void UnaryLogicalCompute(const Tensor* inputX, Tensor* output) {
+  Functor func;
+  std::transform(inputX->data<T>(), inputX->data<T>() + inputX->numel(),
+                 output->data<T>(), func);
+}
+// Applies a default-constructed Functor pairwise to the elements of inputX
+// and inputY, read as T, and writes the results through output->data<T>().
+// The element count is taken from inputX->numel() only; inputY's own size is
+// not checked here. Nothing here allocates the output; the callers in this
+// file call mutable_data<bool>() on the output before invoking this helper.
+template <typename T, typename Functor>
+void BinaryLogicalCompute(const Tensor* inputX, const Tensor* inputY,
+                          Tensor* output) {
+  Functor func;
+  std::transform(inputX->data<T>(), inputX->data<T>() + inputX->numel(),
+                 inputY->data<T>(), output->data<T>(), func);
+}
+#ifdef LOGICAL_AND_OP
+// Init performs no setup and always reports success.
+template <>
+bool LogicalAndKernel<CPU, float>::Init(LogicalBinaryParam<CPU>* param) {
+  return true;
+}
+// Element-wise logical AND of InputX and InputY into Out. Although the kernel
+// is specialised for <CPU, float>, all three tensors are accessed as bool.
+template <>
+void LogicalAndKernel<CPU, float>::Compute(
+    const LogicalBinaryParam<CPU>& param) {
+  auto* inputX = param.InputX();
+  auto* inputY = param.InputY();
+  auto* out = param.Out();
+  // Called before the helper writes through out->data<bool>().
+  out->mutable_data<bool>();
+  BinaryLogicalCompute<bool, LogicalAndFunctor<bool>>(inputX, inputY, out);
+}
+#endif
+#ifdef LOGICAL_OR_OP
+// Init performs no setup and always reports success.
+template <>
+bool LogicalOrKernel<CPU, float>::Init(LogicalBinaryParam<CPU>* param) {
+  return true;
+}
+// Element-wise logical OR of InputX and InputY into Out. Although the kernel
+// is specialised for <CPU, float>, all three tensors are accessed as bool.
+template <>
+void LogicalOrKernel<CPU, float>::Compute(
+    const LogicalBinaryParam<CPU>& param) {
+  auto* inputX = param.InputX();
+  auto* inputY = param.InputY();
+  auto* out = param.Out();
+  // Called before the helper writes through out->data<bool>().
+  out->mutable_data<bool>();
+  BinaryLogicalCompute<bool, LogicalOrFunctor<bool>>(inputX, inputY, out);
+}
+#endif
+#ifdef LOGICAL_NOT_OP
+// Init performs no setup and always reports success.
+template <>
+bool LogicalNotKernel<CPU, float>::Init(LogicalUnaryParam<CPU>* param) {
+  return true;
+}
+// Element-wise logical NOT of InputX into Out. Although the kernel is
+// specialised for <CPU, float>, both tensors are accessed as bool.
+template <>
+void LogicalNotKernel<CPU, float>::Compute(
+    const LogicalUnaryParam<CPU>& param) {
+  auto* inputX = param.InputX();
+  auto* out = param.Out();
+  // Called before the helper writes through out->data<bool>().
+  out->mutable_data<bool>();
+  UnaryLogicalCompute<bool, LogicalNotFunctor<bool>>(inputX, out);
+}
+#endif
+#ifdef LOGICAL_XOR_OP
+// Init performs no setup and always reports success.
+template <>
+bool LogicalXorKernel<CPU, float>::Init(LogicalBinaryParam<CPU>* param) {
+  return true;
+}
+// Element-wise logical XOR of InputX and InputY into Out. Although the kernel
+// is specialised for <CPU, float>, all three tensors are accessed as bool.
+template <>
+void LogicalXorKernel<CPU, float>::Compute(
+    const LogicalBinaryParam<CPU>& param) {
+  auto* inputX = param.InputX();
+  auto* inputY = param.InputY();
+  auto* out = param.Out();
+  // Called before the helper writes through out->data<bool>().
+  out->mutable_data<bool>();
+  BinaryLogicalCompute<bool, LogicalXorFunctor<bool>>(inputX, inputY, out);
+}
+#endif
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/arm/quantize_kernel.cpp
+++ b/src/operators/kernel/arm/quantize_kernel.cpp
@@ -167,7 +167,7 @@ float find_abs_max(const Tensor *input) {
  max_abs = vmaxvq_f32(__max);
 #endif
  for (size_t i = 0; i < remain; ++i) {
-    max_abs = std::max(max_abs, std::abs(x[i]));
+    max_abs = std::max(max_abs, fabs(x[i]));
  }
  return max_abs;
 }

--- a/src/operators/kernel/arm/tensor_array_read_write_kernel.cpp
+++ b/src/operators/kernel/arm/tensor_array_read_write_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "operators/kernel/tensor_array_read_write_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+#ifdef WRITE_TO_ARRAY_OP
+// Init performs no setup and always reports success; `param` is unused.
+template <>
+bool WriteToArrayKernel<CPU, float>::Init(WriteToArrayParam<CPU> *param) {
+  return true;
+}
+// Stores the input tensor into the output array at the position given by the
+// first int64_t of the index tensor, growing the array when that position is
+// not yet present.
+template <>
+void WriteToArrayKernel<CPU, float>::Compute(
+    const WriteToArrayParam<CPU> &param) {
+  int64_t offset = param.index_->data<int64_t>()[0];
+  // Grow to offset + 1 elements so that at(offset) below is in range;
+  // resizing to exactly `offset` would leave the slot one past the end.
+  if (offset >= param.output_->size()) {
+    param.output_->resize(offset + 1);
+  }
+  framework::LoDTensor *out_tensor = &(param.output_->at(offset));
+  // The LoD is always propagated; the data copy is skipped when the input
+  // reports a memory size of zero.
+  out_tensor->set_lod(param.input_->lod());
+  if (param.input_->memory_size() > 0) {
+    TensorCopy(*(param.input_), out_tensor);
+  }
+}
+#endif  // WRITE_TO_ARRAY_OP
+#ifdef READ_FROM_ARRAY_OP
+// Init performs no setup and always reports success; `param` is unused.
+template <>
+bool ReadFromArrayKernel<CPU, float>::Init(ReadFromArrayParam<CPU> *param) {
+  return true;
+}
+// Copies the array element selected by the index tensor into the output.
+template <>
+void ReadFromArrayKernel<CPU, float>::Compute(
+    const ReadFromArrayParam<CPU> &param) {
+  // The element position is the first int64_t stored in the index tensor.
+  int64_t offset = param.index_->data<int64_t>()[0];
+  // Copy only when the position is inside the array; otherwise the output is
+  // left untouched and no error is reported.
+  if (offset < param.input_->size()) {
+    TensorCopy(param.input_->at(offset), param.output_);
+  }
+}
+#endif  // READ_FROM_ARRAY_OP
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/arm/while_kernel.cpp
+++ b/src/operators/kernel/arm/while_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "operators/kernel/while_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+#ifdef WHILE_OP
+// Init performs no setup and always reports success; `param` is unused.
+template <>
+bool WhileKernel<CPU, float>::Init(WhileParam<CPU> *param) {
+  return true;
+}
+// Compute is not implemented yet: the body holds only a TODO, so running
+// this kernel performs no work.
+template <>
+void WhileKernel<CPU, float>::Compute(const WhileParam<CPU> &param) {
+  // TODO(hjchen2)
+}
+#endif  // WHILE_OP
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/central-arm-func/gru_unit_arm_func.h
+++ b/src/operators/kernel/central-arm-func/gru_unit_arm_func.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "operators/math/gemm.h"
 #include "operators/math/math_function.h"
 #include "operators/op_param.h"
 namespace paddle_mobile {
 namespace operators {

--- a/src/operators/kernel/cl/batchnorm_kernel.cpp
+++ b/src/operators/kernel/cl/batchnorm_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef BATCHNORM_OP
 #include "operators/kernel/batchnorm_kernel.h"
+#include <cmath>
 namespace paddle_mobile {
 namespace operators {

--- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDBNRELU_OP
 #include "operators/kernel/conv_add_bn_relu_kernel.h"
+#include <cmath>
 #include "framework/cl/cl_image.h"
 #include "framework/cl/cl_tool.h"

--- a/src/operators/kernel/cl/conv_bn_add_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_bn_add_relu_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVBNADDRELU_OP
 #include "operators/kernel/conv_bn_add_relu_kernel.h"
+#include <cmath>
 namespace paddle_mobile {
 namespace operators {

--- a/src/operators/kernel/cl/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_bn_relu_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVBNRELU_OP
 #include "operators/kernel/conv_bn_relu_kernel.h"
+#include <cmath>
 namespace paddle_mobile {
 namespace operators {

--- a/src/operators/kernel/cl/dwconv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/dwconv_bn_relu_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_DWCONVBNRELU_OP
 #include "operators/kernel/dwconv_bn_relu_kernel.h"
+#include <cmath>
 namespace paddle_mobile {
 namespace operators {

--- a/src/operators/kernel/cl/relu_kernel.cpp
+++ b/src/operators/kernel/cl/relu_kernel.cpp
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #ifdef RELU_OP
-#include "operators/kernel/relu_kernel.h"
+#include "operators/kernel/activation_kernel.h"
 namespace paddle_mobile {
 namespace operators {

--- a/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp
@@ -15,7 +15,8 @@ limitations under the License. */
 #ifdef FUSION_CONVADDBN_OP
 #include "operators/kernel/conv_add_bn_kernel.h"
-#include <math.h>
+#include <cmath>
 namespace paddle_mobile {
 namespace operators {

--- a/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDBNRELU_OP
 #include "operators/kernel/conv_add_bn_relu_kernel.h"
-#include <math.h>
+#include <cmath>
 namespace paddle_mobile {
 namespace operators {

--- a/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp
--- a/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp
--- a/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp
@@ -49,13 +49,23 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
                        "filter width should be equal to filter height ");
  PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
                        "filter axis should be the multiple of stride axis ");
-  fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
+  if (param->Groups() == channel) {
-  fpga::DeconvArgs deconv_arg = {0};
+    fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
-  fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled,
+                               sub_conv_n);
-                        param->Groups(), param->Strides()[0],
+    fpga::DWDeconvArgs DWDeconv_arg = {0};
-                        param->Strides()[1], param->Paddings()[0],
+    fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, relu_enabled,
-                        param->Paddings()[1], bs_ptr);
+                            param->Strides()[0], param->Strides()[1],
-  param->SetFpgaArgs(deconv_arg);
+                            param->Paddings()[0], param->Paddings()[1], bs_ptr);
+    param->SetFpgaArgs(DWDeconv_arg);
+  } else {
+    fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
+    fpga::DeconvArgs deconv_arg = {0};
+    fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled,
+                          param->Groups(), param->Strides()[0],
+                          param->Strides()[1], param->Paddings()[0],
+                          param->Paddings()[1], bs_ptr);
+    param->SetFpgaArgs(deconv_arg);
+  }
  return true;
 }
@@ -63,7 +73,11 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
 template <>
 void DeconvAddKernel<FPGA, float>::Compute(
    const FusionDeconvAddParam<FPGA> &param) {
-  fpga::ComputeFpgaDeconv(param.FpgaArgs());
+  if (param.Groups() == param.Output()->dims()[1]) {
+    fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
+  } else {
+    fpga::ComputeFpgaDeconv(param.FpgaArgs());
+  }
 }
 }  // namespace operators

--- a/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp
@@ -50,20 +50,35 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
                        "filter width should be equal to filter height ");
  PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
                        "filter axis should be the multiple of stride axis ");
-  fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
+  if (param->Groups() == channel) {
-  fpga::DeconvArgs deconv_arg = {0};
+    fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
-  fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled,
+                               sub_conv_n);
-                        param->Groups(), param->Strides()[0],
+    fpga::DWDeconvArgs DWDeconv_arg = {0};
-                        param->Strides()[1], param->Paddings()[0],
+    fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, relu_enabled,
-                        param->Paddings()[1], bs_ptr);
+                            param->Strides()[0], param->Strides()[1],
-  param->SetFpgaArgs(deconv_arg);
+                            param->Paddings()[0], param->Paddings()[1], bs_ptr);
+    param->SetFpgaArgs(DWDeconv_arg);
+  } else {
+    fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
+    fpga::DeconvArgs deconv_arg = {0};
+    fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled,
+                          param->Groups(), param->Strides()[0],
+                          param->Strides()[1], param->Paddings()[0],
+                          param->Paddings()[1], bs_ptr);
+    param->SetFpgaArgs(deconv_arg);
+  }
  return true;
 }
 template <>
 void DeconvAddReluKernel<FPGA, float>::Compute(
     const FusionDeconvAddReluParam<FPGA> &param) {
-  fpga::ComputeFpgaDeconv(param.FpgaArgs());
+  // Group count equal to dims()[1] of the output selects the DWDeconv args
+  // stored by Init(); anything else runs the regular deconv args.
+  // NOTE(review): must agree with the `Groups() == channel` test in Init().
+  if (param.Groups() == param.Output()->dims()[1]) {
+    fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
+  } else {
+    fpga::ComputeFpgaDeconv(param.FpgaArgs());
+  }
 }
 }  // namespace operators

--- a/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef SIGMOID_OP
+#include "operators/kernel/activation_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+using framework::DDim;
+using framework::Tensor;
+// Sets up the FPGA bypass consumed by Compute().
+//  - formats the output tensor with format_fp32_ofm;
+//  - allocates a float staging tensor ({1, dims[1]} for a rank-2 input,
+//    {1, dims[2], dims[3], dims[1]} for a rank-4 input) and formats it too;
+//  - fills BypassArgs (FP16/HWC input, FP32/CHW output) with the input data
+//    pointer and the staging buffer, then stores both on `param`.
+// Any other rank is only logged; Init still returns true.
+template <>
+bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
+  auto input = const_cast<Tensor *>(param->InputX());
+  auto input_ptr = input->data<float>();
+  auto out = param->Out();
+  fpga::format_fp32_ofm(out);
+  const auto &in_dims = input->dims();
+  const bool is_4d = (in_dims.size() == 4);
+  // Ownership of this tensor passes to `param` through SetFloatInput() below.
+  auto float_input = new Tensor;
+  if (in_dims.size() == 2) {
+    float_input->mutable_data<float>({1, in_dims[1]});
+  } else if (is_4d) {
+    float_input->mutable_data<float>(
+        {1, in_dims[2], in_dims[3], in_dims[1]});
+  } else {
+    // The message previously said "softmax", a leftover from the kernel this
+    // file was copied from.
+    DLOG << "wrong dimension of sigmoid input";
+  }
+  fpga::format_fp32_ofm(float_input);
+  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
+  args.input_layout_type = fpga::LAYOUT_HWC;
+  args.output_layout_type = fpga::LAYOUT_CHW;
+  args.input_data_type = fpga::DATA_TYPE_FP16;
+  args.output_data_type = fpga::DATA_TYPE_FP32;
+  args.image.address = input_ptr;
+  // Rank-2 inputs are treated as a 1x1 image with dims[1] channels.
+  args.image.height = is_4d ? static_cast<uint32_t>(in_dims[2]) : 1;
+  args.image.width = is_4d ? static_cast<uint32_t>(in_dims[3]) : 1;
+  args.image.channels = static_cast<uint32_t>(in_dims[1]);
+  args.output.address = float_input->data<float>();
+  args.output.scale_address = float_input->scale;
+  param->SetFloatInput(float_input);
+  param->SetFpgaArgs(args);
+  return true;
+}
+// Logistic function 1 / (1 + e^(-a)). The arithmetic uses double literals and
+// the result is converted back to T on return.
+template <typename T>
+T Sigmoid(const T a) {
+  const T negated = -1.0f * a;
+  const auto denominator = 1.0 + exp(negated);
+  return 1.0 / denominator;
+}
+// Applies Sigmoid<T> to each of the input->numel() elements of `input` and
+// writes the results into `output`'s mutable buffer.
+template <typename T>
+void sigmoidFuntor(Tensor *input, Tensor *output) {
+  auto *src = input->data<T>();
+  auto *dst = output->mutable_data<T>();
+  const auto count = input->numel();
+  for (int idx = 0; idx < count; ++idx) {
+    dst[idx] = Sigmoid<T>(src[idx]);
+  }
+}
+// Executes sigmoid for the FPGA build: runs the bypass configured in Init(),
+// calls fpga_invalidate on the float staging buffer (presumably so the CPU
+// reads the bypass result - confirm), applies the sigmoid element-wise on the
+// CPU, then calls fpga_flush on the output buffer.
+template <>
+void SigmoidKernel<FPGA, float>::Compute(const SigmoidParam<FPGA> &param) {
+  Tensor *in_x = param.FloatInput();
+  Tensor *out = param.Out();
+  fpga::PerformBypass(param.FpgaArgs());
+  fpga::fpga_invalidate((void *)in_x->data<float>(),  // NOLINT
+                        in_x->numel() * sizeof(float));
+  // TODO(review): note inherited from the softmax kernel ("0 should be squeezed before the input") - confirm it applies to sigmoid  // NOLINT
+  sigmoidFuntor<float>(in_x, out);
+  fpga::fpga_flush(out->data<float>(), out->memory_size());
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V1/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/softmax_kernel.cpp
@@ -22,7 +22,7 @@ namespace operators {
 template <>
 bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
-  auto input = const_cast<Tensor *>(param->InputX());
+  auto input = const_cast<LoDTensor *>(param->InputX());
  auto input_ptr = input->data<float>();
  auto out = param->Out();
  fpga::format_fp32_ofm(out);

--- a/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDBN_OP
 #include "operators/kernel/conv_add_bn_kernel.h"
+#include <cmath>
 namespace paddle_mobile {
 namespace operators {

--- a/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDBNRELU_OP
 #include "operators/kernel/conv_add_bn_relu_kernel.h"
+#include <cmath>
 namespace paddle_mobile {
 namespace operators {

--- a/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVBN_OP
 #include "operators/kernel/conv_bn_kernel.h"
+#include <cmath>
 namespace paddle_mobile {
 namespace operators {

--- a/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVBNRELU_OP
 #include "operators/kernel/conv_bn_relu_kernel.h"
+#include <cmath>
 #include "fpga/V2/filter.h"
 namespace paddle_mobile {

--- a/src/operators/kernel/logical_kernel.h
+++ b/src/operators/kernel/logical_kernel.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+// Kernel declarations for the logical operators. Each declaration is compiled
+// only when the matching operator macro is defined. The binary operators use
+// LogicalBinaryParam; logical_not uses LogicalUnaryParam.
+#ifdef LOGICAL_AND_OP
+DECLARE_KERNEL(LogicalAnd, LogicalBinaryParam);
+#endif  // LOGICAL_AND_OP
+#ifdef LOGICAL_OR_OP
+DECLARE_KERNEL(LogicalOr, LogicalBinaryParam);
+#endif  // LOGICAL_OR_OP
+#ifdef LOGICAL_NOT_OP
+DECLARE_KERNEL(LogicalNot, LogicalUnaryParam);
+#endif  // LOGICAL_NOT_OP
+#ifdef LOGICAL_XOR_OP
+DECLARE_KERNEL(LogicalXor, LogicalBinaryParam);
+#endif  // LOGICAL_XOR_OP
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/tensor_array_read_write_kernel.h
+++ b/src/operators/kernel/tensor_array_read_write_kernel.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+#ifdef WRITE_TO_ARRAY_OP
+DECLARE_KERNEL(WriteToArray, WriteToArrayParam);
+#endif  // WRITE_TO_ARRAY_OP
+#ifdef READ_FROM_ARRAY_OP
+DECLARE_KERNEL(ReadFromArray, ReadFromArrayParam);
+#endif  // READ_FROM_ARRAY_OP
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/while_kernel.h
+++ b/src/operators/kernel/while_kernel.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+#ifdef WHILE_OP
+// Parameters of the `while` operator.
+// The constructor looks up the "Condition" input tensor in `scope` and reads
+// the integer "sub_block" attribute. It also keeps copies of the input and
+// output name maps and a reference to the scope it was built with.
+template <typename Dtype>
+class WhileParam : public OpParam {
+ public:
+  WhileParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
+             const AttributeMap &attrs, const Scope &scope)
+      : inputs_(inputs), outputs_(outputs), scope_(scope) {
+    cond_ =
+        OpParam::GetVarValue<framework::LoDTensor>("Condition", inputs, scope);
+    sub_block_ = OpParam::GetAttr<int>("sub_block", attrs);
+  }
+  // Tensor bound to the "Condition" input.
+  framework::LoDTensor *cond_;
+  // Value of the "sub_block" attribute.
+  int sub_block_;
+  const VariableNameMap inputs_;
+  const VariableNameMap outputs_;
+  // Borrowed, not copied: the previous `const Scope scope_` duplicated the
+  // whole scope object for every param instance.
+  // NOTE(review): the referenced scope must outlive this param - confirm the
+  // owning operator keeps it alive.
+  const Scope &scope_;
+};
+DECLARE_KERNEL(While, WhileParam);
+#endif  // WHILE_OP
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/logical_op.cpp
+++ b/src/operators/logical_op.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "operators/logical_op.h"
+namespace paddle_mobile {
+namespace operators {
+// Defines OpName##Op<Dtype, T>::InferShape(): the output tensor is resized to
+// the dims of input X.
+#define DEFINE_LOGICAL_INFERSHAPE(OpName)                   \
+  template <typename Dtype, typename T>                     \
+  void OpName##Op<Dtype, T>::InferShape() const {           \
+    const auto &input_dims = this->param_.InputX()->dims(); \
+    this->param_.Out()->Resize(input_dims);                 \
+  }
+#ifdef LOGICAL_AND_OP
+DEFINE_LOGICAL_INFERSHAPE(LogicalAnd);
+#endif  // LOGICAL_AND_OP
+#ifdef LOGICAL_OR_OP
+DEFINE_LOGICAL_INFERSHAPE(LogicalOr);
+#endif  // LOGICAL_OR_OP
+#ifdef LOGICAL_NOT_OP
+DEFINE_LOGICAL_INFERSHAPE(LogicalNot);
+#endif  // LOGICAL_NOT_OP
+#ifdef LOGICAL_XOR_OP
+DEFINE_LOGICAL_INFERSHAPE(LogicalXor);
+#endif  // LOGICAL_XOR_OP
+}  // namespace operators
+}  // namespace paddle_mobile
+namespace ops = paddle_mobile::operators;
+#ifdef LOGICAL_AND_OP
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(logical_and, ops::LogicalAndOp);
+#endif
+#endif  // LOGICAL_AND_OP
+#ifdef LOGICAL_OR_OP
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(logical_or, ops::LogicalOrOp);
+#endif
+#endif  // LOGICAL_OR_OP
+#ifdef LOGICAL_NOT_OP
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(logical_not, ops::LogicalNotOp);
+#endif
+#endif  // LOGICAL_NOT_OP
+#ifdef LOGICAL_XOR_OP
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(logical_xor, ops::LogicalXorOp);
+#endif
+#endif  // LOGICAL_XOR_OP
--- a/src/operators/logical_op.h
+++ b/src/operators/logical_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <string>
+#include "framework/operator.h"
+#include "operators/kernel/logical_kernel.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+// Operator declarations for the logical ops, each pairing a param class with
+// its kernel and compiled only when the matching operator macro is defined.
+#ifdef LOGICAL_AND_OP
+DECLARE_OPERATOR(LogicalAnd, LogicalBinaryParam, LogicalAndKernel);
+#endif  // LOGICAL_AND_OP
+#ifdef LOGICAL_OR_OP
+DECLARE_OPERATOR(LogicalOr, LogicalBinaryParam, LogicalOrKernel);
+#endif  // LOGICAL_OR_OP
+#ifdef LOGICAL_NOT_OP
+DECLARE_OPERATOR(LogicalNot, LogicalUnaryParam, LogicalNotKernel);
+#endif  // LOGICAL_NOT_OP
+#ifdef LOGICAL_XOR_OP
+DECLARE_OPERATOR(LogicalXor, LogicalBinaryParam, LogicalXorKernel);
+#endif  // LOGICAL_XOR_OP
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <cstring>
 #include <string>
 #include "common/log.h"
 #include "memory/t_malloc.h"

--- a/src/operators/math/quantize.h
+++ b/src/operators/math/quantize.h
@@ -40,8 +40,8 @@ template <>
 inline int8_t Round<ROUND_NEAREST_TO_EVEN>(const float &x) {
  float v = std::round(x);
  int32_t q = static_cast<int32_t>(v);
-  if (std::abs(std::abs(q - v) - 0.5) <= 0) {
+  if (fabs(fabs(q - v) - 0.5) <= 0) {
-    if (std::abs(q) % 2 != 0) {
+    if (abs(q) % 2 != 0) {
      q = q + ((q > 0) ? -1 : 1);
    }
  }

--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -1078,6 +1078,20 @@ class SigmoidParam : public OpParam {
 private:
  RType *input_x_;
  RType *out_;
+#ifdef PADDLE_MOBILE_FPGA
+ private:
+  // Float staging tensor installed by SetFloatInput(); owned by this param.
+  std::shared_ptr<RType> float_input_x_;
+  // Bypass configuration stored by SetFpgaArgs().
+  fpga::BypassArgs fpga_bypass_args;
+ public:
+  // Returns the staging tensor, or the original input X when none was set.
+  RType *FloatInput() const {
+    return float_input_x_ == nullptr ? input_x_ : float_input_x_.get();
+  }
+  // Takes ownership of `input`: the pointer is handed to a shared_ptr, so the
+  // caller must not delete it.
+  void SetFloatInput(Tensor *input) { float_input_x_.reset(input); }
+  // Accessors for the stored bypass configuration.
+  const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
+  void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
+#endif
 };
 #endif
@@ -2357,10 +2371,17 @@ class ConvTransposeParam : public OpParam {
 private:
+  // Two independent argument sets; a kernel's Init() fills exactly one of them
+  // through the matching SetFpgaArgs() overload.
  fpga::DeconvArgs fpga_conv_args;
+  fpga::DWDeconvArgs fpga_DWDeconv_args;
 public:
  const fpga::DeconvArgs &FpgaArgs() const { return fpga_conv_args; }
+  // NOTE(review): spelled `FpgaDWDconvArgs` (no 'e' after "DWD"); the FPGA
+  // deconv kernels call it under this name, so a rename must touch them too.
+  const fpga::DWDeconvArgs &FpgaDWDconvArgs() const {
+    return fpga_DWDeconv_args;
+  }
  void SetFpgaArgs(const fpga::DeconvArgs &args) { fpga_conv_args = args; }
+  // Overload selected by argument type; stores the DWDeconv configuration.
+  void SetFpgaArgs(const fpga::DWDeconvArgs &args) {
+    fpga_DWDeconv_args = args;
+  }
 #endif
 };
@@ -2942,5 +2963,112 @@ class CompareParam : public OpParam {
 };
 #endif  // LESS_THAN_OP
+#if defined(LOGICAL_AND_OP) || defined(LOGICAL_OR_OP) || defined(LOGICAL_XOR_OP)
+// Parameters shared by the binary logical operators: resolves the "X" and "Y"
+// inputs and the "Out" output from the scope at construction time.
+template <typename Dtype>
+class LogicalBinaryParam : public OpParam {
+  using GType = typename DtypeTensorTrait<Dtype>::gtype;
+  using RType = typename DtypeTensorTrait<Dtype>::rtype;
+ public:
+  LogicalBinaryParam(const VariableNameMap &inputs,
+                     const VariableNameMap &outputs, const AttributeMap &attrs,
+                     const Scope &scope)
+      : input_x_(InputXFrom<GType>(inputs, scope)),
+        input_y_(InputYFrom<GType>(inputs, scope)),
+        output_(OutFrom<GType>(outputs, scope)) {}
+  // Read-only views of the two operands.
+  const GType *InputX() const { return input_x_; }
+  const GType *InputY() const { return input_y_; }
+  // Writable result tensor.
+  GType *Out() const { return output_; }
+  // Members stay public, as in the other param classes of this change.
+  GType *input_x_;
+  GType *input_y_;
+  GType *output_;
+};
+#endif  // LOGICAL_AND_OP LOGICAL_OR_OP LOGICAL_XOR_OP
+#ifdef LOGICAL_NOT_OP
+// Parameters of the unary logical operator: resolves the "X" input and the
+// "Out" output from the scope at construction time.
+template <typename Dtype>
+class LogicalUnaryParam : public OpParam {
+  using GType = typename DtypeTensorTrait<Dtype>::gtype;
+  using RType = typename DtypeTensorTrait<Dtype>::rtype;
+ public:
+  LogicalUnaryParam(const VariableNameMap &inputs,
+                    const VariableNameMap &outputs, const AttributeMap &attrs,
+                    const Scope &scope)
+      : input_x_(InputXFrom<GType>(inputs, scope)),
+        output_(OutFrom<GType>(outputs, scope)) {}
+  // Read-only operand.
+  const GType *InputX() const { return input_x_; }
+  // Writable result tensor.
+  GType *Out() const { return output_; }
+  GType *input_x_;
+  GType *output_;
+};
+#endif  // LOGICAL_NOT_OP
+// #ifdef WHILE_OP
+// template <typename Dtype>
+// class WhileParam : public OpParam {
+//  public:
+//   WhileParam(const VariableNameMap &inputs,
+//              const VariableNameMap &outputs, const AttributeMap &attrs,
+//              const Scope &scope) {
+//     cond_ = OpParam::GetVarValue<framework::LoDTensor>("Condition", inputs,
+//     scope); block_desc_ = OpParam::GetAttr<framework::BlockDesc
+//     *>("sub_block", attrs);
+//   }
+//
+//  public:
+//   framework::LoDTensor *cond_;
+//   const framework::BlockDesc *block_desc_;
+// };
+// #endif  // WHILE_OP
+#ifdef WRITE_TO_ARRAY_OP
+// Parameters of write_to_array: the tensor to store ("X"), the index tensor
+// ("I") and the destination tensor array ("Out"), all looked up in the scope.
+template <typename Dtype>
+class WriteToArrayParam : public OpParam {
+ public:
+  WriteToArrayParam(const VariableNameMap &inputs,
+                    const VariableNameMap &outputs, const AttributeMap &attrs,
+                    const Scope &scope)
+      : input_(OpParam::GetVarValue<framework::LoDTensor>("X", inputs, scope)),
+        index_(OpParam::GetVarValue<framework::LoDTensor>("I", inputs, scope)),
+        output_(OpParam::GetVarValue<framework::LoDTensorArray>("Out", outputs,
+                                                                scope)) {}
+  framework::LoDTensor *input_;
+  framework::LoDTensor *index_;
+  framework::LoDTensorArray *output_;
+};
+#endif
+#ifdef READ_FROM_ARRAY_OP
+// Parameters of read_from_array: the source tensor array ("X"), the index
+// tensor ("I") and the destination tensor ("Out"), all looked up in the scope.
+template <typename Dtype>
+class ReadFromArrayParam : public OpParam {
+ public:
+  ReadFromArrayParam(const VariableNameMap &inputs,
+                     const VariableNameMap &outputs, const AttributeMap &attrs,
+                     const Scope &scope)
+      : input_(OpParam::GetVarValue<framework::LoDTensorArray>("X", inputs,
+                                                               scope)),
+        index_(OpParam::GetVarValue<framework::LoDTensor>("I", inputs, scope)),
+        output_(
+            OpParam::GetVarValue<framework::LoDTensor>("Out", outputs, scope)) {}
+  framework::LoDTensorArray *input_;
+  framework::LoDTensor *index_;
+  framework::LoDTensor *output_;
+};
+#endif
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/src/operators/reshape_op.cpp
+++ b/src/operators/reshape_op.cpp
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -30,6 +30,10 @@ if (CON GREATER -1)
    target_link_libraries(test-mobilenet-combine paddle-mobile)
    set(FOUND_MATCH ON)
+    # gen test
+    ADD_EXECUTABLE(test-mobilenetgpu  net/test_mobilenet_GPU.cpp test_helper.h  test_include.h)
+    target_link_libraries(test-mobilenetgpu paddle-mobile)
 endif ()
 list(FIND NET "yolo" CON)
@@ -417,4 +421,20 @@ if (NOT FOUND_MATCH)
    ADD_EXECUTABLE(test-vgg16ssd net/test_vgg16ssd.cpp test_helper.h test_include.h)
    target_link_libraries(test-vgg16ssd paddle-mobile)
+    # gen test
+    ADD_EXECUTABLE(test-logical-and-op operators/test_logical_and_op.cpp test_helper.h test_include.h)
+    target_link_libraries(test-logical-and-op paddle-mobile)
+    # gen test
+    ADD_EXECUTABLE(test-logical-or-op operators/test_logical_or_op.cpp test_helper.h test_include.h)
+    target_link_libraries(test-logical-or-op paddle-mobile)
+    # gen test
+    ADD_EXECUTABLE(test-logical-not-op operators/test_logical_not_op.cpp test_helper.h test_include.h)
+    target_link_libraries(test-logical-not-op paddle-mobile)
+    # gen test
+    ADD_EXECUTABLE(test-logical-xor-op operators/test_logical_xor_op.cpp test_helper.h test_include.h)
+    target_link_libraries(test-logical-xor-op paddle-mobile)
 endif ()
--- a/test/net/test_mobilenet_GPU.cpp
+++ b/test/net/test_mobilenet_GPU.cpp
@@ -25,11 +25,11 @@ int main() {
  paddle_mobile.SetCLPath("/data/local/tmp/bin");
 #endif
-  auto isok =
+  //  auto isok =
-      paddle_mobile.Load(std::string(g_mobilenet_mul) + "/model",
+  //      paddle_mobile.Load(std::string(g_mobilenet_mul) + "/model",
-                         std::string(g_mobilenet_mul) + "/params", true);
+  //                         std::string(g_mobilenet_mul) + "/params", true);
-  //  auto isok = paddle_mobile.Load(std::string(g_mobilenet_mul), true);
+  auto isok = paddle_mobile.Load(std::string(g_mobilenet), true);
  if (isok) {
    auto time2 = paddle_mobile::time();
    std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"

--- a/test/operators/test_logical_and_op.cpp
+++ b/test/operators/test_logical_and_op.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "../test_include.h"
+#include "operators/logical_op.h"
+namespace paddle_mobile {
+// Reference implementation used by the test: stores x[i] && y[i] in `output`
+// for each of the inputX->numel() elements. `output` is read through data<>(),
+// so its buffer must already be allocated by the caller.
+void LogicalAnd(const framework::Tensor *inputX,
+                const framework::Tensor *inputY, framework::Tensor *output) {
+  const bool *lhs = inputX->data<bool>();
+  const bool *rhs = inputY->data<bool>();
+  bool *result = output->data<bool>();
+  for (int idx = 0; idx < inputX->numel(); ++idx) {
+    result[idx] = lhs[idx] && rhs[idx];
+  }
+}
+// Builds a logical_and operator over two random bool tensors of shape
+// `input_shape`, runs it, and compares every output element with the
+// reference LogicalAnd(). On the first mismatch both values are logged and
+// the process exits with status 1; otherwise 0 is returned.
+int TestLogicalAndOp(const std::vector<int> &input_shape) {
+  framework::DDim input_dims = framework::make_ddim(input_shape);
+  VariableNameMap inputs;
+  VariableNameMap outputs;
+  auto scope = std::make_shared<framework::Scope>();
+  inputs["X"] = std::vector<std::string>({"inputX"});
+  inputs["Y"] = std::vector<std::string>({"inputY"});
+  outputs["Out"] = std::vector<std::string>({"output"});
+  auto x_var = scope.get()->Var("inputX");
+  auto x = x_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<bool>(x, input_dims, 0, 1);
+  auto y_var = scope.get()->Var("inputY");
+  auto y = y_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<bool>(y, input_dims, 0, 1);
+  auto output_var = scope.get()->Var("output");
+  framework::AttributeMap attrs;
+  auto *op = new operators::LogicalAndOp<CPU, float>("logical_and", inputs,
+                                                     outputs, attrs, scope);
+  op->InferShape();
+  op->Init();
+  op->Run();
+  auto output = output_var->template Get<framework::LoDTensor>();
+  framework::Tensor output_cmp;
+  bool *output_cmp_data = output_cmp.mutable_data<bool>(output->dims());
+  LogicalAnd(x, y, &output_cmp);
+  const bool *output_data = output->data<bool>();
+  for (int i = 0; i < output->numel(); ++i) {
+    if (output_data[i] != output_cmp_data[i]) {
+      LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i]
+                     << ", output_cmp_data[" << i
+                     << "] = " << output_cmp_data[i];
+      delete op;
+      exit(1);
+    }
+  }
+  // Free the operator on the success path as well; it used to be released
+  // only right before exit(1).
+  delete op;
+  // The function is declared int, so it must not fall off the end.
+  return 0;
+}
+}  // namespace paddle_mobile
+// Runs the logical_and check on three tensor shapes. A mismatch terminates
+// the process inside TestLogicalAndOp, so reaching the log line means success.
+int main() {
+  const std::vector<std::vector<int>> shapes = {
+      {1, 1, 2, 3}, {1, 3, 11, 12}, {1, 16, 32, 32}};
+  for (const auto &shape : shapes) {
+    paddle_mobile::TestLogicalAndOp(shape);
+  }
+  DLOG << "test logical_and op pass.";
+  return 0;
+}
--- a/test/operators/test_logical_not_op.cpp
+++ b/test/operators/test_logical_not_op.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "../test_include.h"
+#include "operators/logical_op.h"
+namespace paddle_mobile {
+// Reference implementation used by the test: stores !x[i] in `output` for each
+// of the inputX->numel() elements. `output` is read through data<>(), so its
+// buffer must already be allocated by the caller.
+void LogicalNot(const framework::Tensor *inputX, framework::Tensor *output) {
+  const bool *src = inputX->data<bool>();
+  bool *result = output->data<bool>();
+  for (int idx = 0; idx < inputX->numel(); ++idx) {
+    result[idx] = !src[idx];
+  }
+}
+// Builds a logical_not operator over a random bool tensor of shape
+// `input_shape`, runs it, and compares every output element with the
+// reference LogicalNot(). On the first mismatch both values are logged and
+// the process exits with status 1; otherwise 0 is returned.
+int TestLogicalNotOp(const std::vector<int> &input_shape) {
+  framework::DDim input_dims = framework::make_ddim(input_shape);
+  VariableNameMap inputs;
+  VariableNameMap outputs;
+  auto scope = std::make_shared<framework::Scope>();
+  inputs["X"] = std::vector<std::string>({"inputX"});
+  outputs["Out"] = std::vector<std::string>({"output"});
+  auto x_var = scope.get()->Var("inputX");
+  auto x = x_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<bool>(x, input_dims, 0, 1);
+  auto output_var = scope.get()->Var("output");
+  framework::AttributeMap attrs;
+  auto *op = new operators::LogicalNotOp<CPU, float>("logical_not", inputs,
+                                                     outputs, attrs, scope);
+  op->InferShape();
+  op->Init();
+  op->Run();
+  auto output = output_var->template Get<framework::LoDTensor>();
+  framework::Tensor output_cmp;
+  bool *output_cmp_data = output_cmp.mutable_data<bool>(output->dims());
+  LogicalNot(x, &output_cmp);
+  const bool *output_data = output->data<bool>();
+  for (int i = 0; i < output->numel(); ++i) {
+    if (output_data[i] != output_cmp_data[i]) {
+      LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i]
+                     << ", output_cmp_data[" << i
+                     << "] = " << output_cmp_data[i];
+      delete op;
+      exit(1);
+    }
+  }
+  // Free the operator on the success path as well; it used to be released
+  // only right before exit(1).
+  delete op;
+  // The function is declared int, so it must not fall off the end.
+  return 0;
+}
+}  // namespace paddle_mobile
+// Runs the logical_not check on three tensor shapes. A mismatch terminates
+// the process inside TestLogicalNotOp, so reaching the log line means success.
+int main() {
+  const std::vector<std::vector<int>> shapes = {
+      {1, 1, 2, 3}, {1, 3, 11, 12}, {1, 16, 32, 32}};
+  for (const auto &shape : shapes) {
+    paddle_mobile::TestLogicalNotOp(shape);
+  }
+  DLOG << "test logical_not op pass.";
+  return 0;
+}
--- a/test/operators/test_logical_or_op.cpp
+++ b/test/operators/test_logical_or_op.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "../test_include.h"
+#include "operators/logical_op.h"
+namespace paddle_mobile {
+// Reference implementation used by the test: stores x[i] || y[i] in `output`
+// for each of the inputX->numel() elements. `output` is read through data<>(),
+// so its buffer must already be allocated by the caller.
+void LogicalOr(const framework::Tensor *inputX, const framework::Tensor *inputY,
+               framework::Tensor *output) {
+  const bool *lhs = inputX->data<bool>();
+  const bool *rhs = inputY->data<bool>();
+  bool *result = output->data<bool>();
+  for (int idx = 0; idx < inputX->numel(); ++idx) {
+    result[idx] = lhs[idx] || rhs[idx];
+  }
+}
+// Builds a logical_or operator over two random bool tensors of shape
+// `input_shape`, runs it, and compares every output element with the
+// reference LogicalOr(). On the first mismatch both values are logged and the
+// process exits with status 1; otherwise 0 is returned.
+int TestLogicalOrOp(const std::vector<int> &input_shape) {
+  framework::DDim input_dims = framework::make_ddim(input_shape);
+  VariableNameMap inputs;
+  VariableNameMap outputs;
+  auto scope = std::make_shared<framework::Scope>();
+  inputs["X"] = std::vector<std::string>({"inputX"});
+  inputs["Y"] = std::vector<std::string>({"inputY"});
+  outputs["Out"] = std::vector<std::string>({"output"});
+  auto x_var = scope.get()->Var("inputX");
+  auto x = x_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<bool>(x, input_dims, 0, 1);
+  auto y_var = scope.get()->Var("inputY");
+  auto y = y_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<bool>(y, input_dims, 0, 1);
+  auto output_var = scope.get()->Var("output");
+  framework::AttributeMap attrs;
+  auto *op = new operators::LogicalOrOp<CPU, float>("logical_or", inputs,
+                                                    outputs, attrs, scope);
+  op->InferShape();
+  op->Init();
+  op->Run();
+  auto output = output_var->template Get<framework::LoDTensor>();
+  framework::Tensor output_cmp;
+  bool *output_cmp_data = output_cmp.mutable_data<bool>(output->dims());
+  LogicalOr(x, y, &output_cmp);
+  const bool *output_data = output->data<bool>();
+  for (int i = 0; i < output->numel(); ++i) {
+    if (output_data[i] != output_cmp_data[i]) {
+      LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i]
+                     << ", output_cmp_data[" << i
+                     << "] = " << output_cmp_data[i];
+      delete op;
+      exit(1);
+    }
+  }
+  // Free the operator on the success path as well; it used to be released
+  // only right before exit(1).
+  delete op;
+  // The function is declared int, so it must not fall off the end.
+  return 0;
+}
+}  // namespace paddle_mobile
+// Runs the logical_or check on three tensor shapes. A mismatch terminates the
+// process inside TestLogicalOrOp, so reaching the log line means success.
+int main() {
+  const std::vector<std::vector<int>> shapes = {
+      {1, 1, 2, 3}, {1, 3, 11, 12}, {1, 16, 32, 32}};
+  for (const auto &shape : shapes) {
+    paddle_mobile::TestLogicalOrOp(shape);
+  }
+  DLOG << "test logical_or op pass.";
+  return 0;
+}
--- a/test/operators/test_logical_xor_op.cpp
+++ b/test/operators/test_logical_xor_op.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "../test_include.h"
+#include "operators/logical_op.h"
+namespace paddle_mobile {
+// Reference implementation used by the test: stores the exclusive-or of x[i]
+// and y[i] in `output` for each of the inputX->numel() elements. `output` is
+// read through data<>(), so its buffer must already be allocated by the caller.
+void LogicalXor(const framework::Tensor *inputX,
+                const framework::Tensor *inputY, framework::Tensor *output) {
+  const bool *lhs = inputX->data<bool>();
+  const bool *rhs = inputY->data<bool>();
+  bool *result = output->data<bool>();
+  for (int idx = 0; idx < inputX->numel(); ++idx) {
+    const bool a = lhs[idx];
+    const bool b = rhs[idx];
+    // For bools, "either but not both" is the same as inequality.
+    result[idx] = (a != b);
+  }
+}
+// Builds a logical_xor operator over two random bool tensors of shape
+// `input_shape`, runs it, and compares every output element with the
+// reference LogicalXor(). On the first mismatch both values are logged and
+// the process exits with status 1; otherwise 0 is returned.
+int TestLogicalXorOp(const std::vector<int> &input_shape) {
+  framework::DDim input_dims = framework::make_ddim(input_shape);
+  VariableNameMap inputs;
+  VariableNameMap outputs;
+  auto scope = std::make_shared<framework::Scope>();
+  inputs["X"] = std::vector<std::string>({"inputX"});
+  inputs["Y"] = std::vector<std::string>({"inputY"});
+  outputs["Out"] = std::vector<std::string>({"output"});
+  auto x_var = scope.get()->Var("inputX");
+  auto x = x_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<bool>(x, input_dims, 0, 1);
+  auto y_var = scope.get()->Var("inputY");
+  auto y = y_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<bool>(y, input_dims, 0, 1);
+  auto output_var = scope.get()->Var("output");
+  framework::AttributeMap attrs;
+  auto *op = new operators::LogicalXorOp<CPU, float>("logical_xor", inputs,
+                                                     outputs, attrs, scope);
+  op->InferShape();
+  op->Init();
+  op->Run();
+  auto output = output_var->template Get<framework::LoDTensor>();
+  framework::Tensor output_cmp;
+  bool *output_cmp_data = output_cmp.mutable_data<bool>(output->dims());
+  LogicalXor(x, y, &output_cmp);
+  const bool *output_data = output->data<bool>();
+  for (int i = 0; i < output->numel(); ++i) {
+    if (output_data[i] != output_cmp_data[i]) {
+      LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i]
+                     << ", output_cmp_data[" << i
+                     << "] = " << output_cmp_data[i];
+      delete op;
+      exit(1);
+    }
+  }
+  // Free the operator on the success path as well; it used to be released
+  // only right before exit(1).
+  delete op;
+  // The function is declared int, so it must not fall off the end.
+  return 0;
+}
+}  // namespace paddle_mobile
+// Runs the logical_xor check on three tensor shapes. A mismatch terminates
+// the process inside TestLogicalXorOp, so reaching the log line means success.
+int main() {
+  const std::vector<std::vector<int>> shapes = {
+      {1, 1, 2, 3}, {1, 3, 11, 12}, {1, 16, 32, 32}};
+  for (const auto &shape : shapes) {
+    paddle_mobile::TestLogicalXorOp(shape);
+  }
+  DLOG << "test logical_xor op pass.";
+  return 0;
+}
--- a/test/test_helper.h
+++ b/test/test_helper.h
@@ -83,6 +83,26 @@ void SetupTensor(paddle_mobile::framework::Tensor *input,
  }
 }
+// Specialization of SetupTensor for boolean tensors.
+//
+// Allocates `input` with shape `dims` and fills it with bool values:
+//   - if `lower == upper`, every element is set to that value;
+//   - otherwise each element is drawn independently as `u > 0.5`, with `u`
+//     taken from a uniform distribution over [0, 1).
+//
+// The generator seed starts at 100 and is incremented on every call, so
+// successive calls yield different data while a whole run stays reproducible.
+//
+// Declared `inline` because an explicit full specialization defined in a
+// header is an ordinary function definition; without it, including this
+// header from more than one translation unit causes a multiple-definition
+// link error.
+template <>
+inline void SetupTensor<bool>(paddle_mobile::framework::Tensor *input,
+                              paddle_mobile::framework::DDim dims, bool lower,
+                              bool upper) {
+  static unsigned int seed = 100;
+  std::mt19937 rng(seed++);
+  std::uniform_real_distribution<double> uniform_dist(0, 1);
+  bool *input_ptr = input->mutable_data<bool>(dims);
+  if (lower == upper) {
+    // Degenerate range: constant fill.
+    for (int i = 0; i < input->numel(); ++i) {
+      input_ptr[i] = lower;
+    }
+  } else {
+    for (int i = 0; i < input->numel(); ++i) {
+      input_ptr[i] = uniform_dist(rng) > 0.5;
+    }
+  }
+}
 template <typename T>
 T *CreateInput(Tensor *input, DDim dims, T low, T up) {
  SetupTensor<T>(input, dims, static_cast<float>(low), static_cast<float>(up));

--- a/tools/ci_build.sh
+++ b/tools/ci_build.sh
@@ -26,6 +26,7 @@ function print_usage() {
  ${BLUE}ios${NONE}: run build for apple ios platform
  ${BLUE}linux_armv7${NONE}: run build for linux armv7 platform
  ${BLUE}linux_armv8${NONE}: run build for linux armv8 platform
+  ${BLUE}fpga${NONE}: run build for fpga platform
  "
  echo "\n${RED}Network${NONE}: optional, for deep compressing the framework size
  ${BLUE}googlenet${NONE}: build only googlenet support
@@ -146,6 +147,7 @@ function build_ios_armv8_cpu_only() {
    -DIOS_PLATFORM=OS \
    -DIOS_ARCH="${IOS_ARCH}" \
    -DIS_IOS=true \
+    -DUSE_OPENMP=OFF \
    -DGPU_MALI=OFF \
    -DGPU_CL=OFF \
    -DFPGA=OFF
@@ -163,6 +165,7 @@ function build_ios_armv8_gpu() {
    -DIOS_PLATFORM=OS \
    -DIOS_ARCH="${IOS_ARCH}" \
    -DIS_IOS=true \
+    -DUSE_OPENMP=OFF \
    -DGPU_MALI=OFF \
    -DGPU_CL=ON \
    -DFPGA=OFF
@@ -217,11 +220,19 @@ function build_ios() {
 }
 function build_linux_armv7() {
-  check_ndk
  build_linux_armv7_cpu_only
  # build_linux_armv7_gpu
 }
+# Builds the FPGA target inside the paddle-mobile:dev docker image.
+# Moves to the parent directory, builds the image from ./Dockerfile when
+# `docker images` does not list it yet, then runs tools/docker_build_fpga.sh
+# in a container with the current directory mounted at /workspace.
+function build_linux_fpga() {
+  # Abort instead of running docker from an unexpected directory.
+  cd .. || return 1
+  image=$(docker images paddle-mobile:dev | grep 'paddle-mobile')
+  if [[ -z "${image}" ]]; then
+    docker build -t paddle-mobile:dev - < Dockerfile
+  fi
+  # Quote the mount argument so paths containing spaces are not word-split.
+  docker run --rm -v "$(pwd):/workspace" paddle-mobile:dev bash /workspace/tools/docker_build_fpga.sh
+}
 function main() {
  local CMD=$1
  init
@@ -238,6 +249,9 @@ function main() {
    linux_armv7)
      build_linux_armv7
      ;;
+    fpga)
+      build_linux_fpga
+      ;;
    *)
      print_usage
      exit 0

--- a/tools/docker_build_fpga.sh
+++ b/tools/docker_build_fpga.sh
+# Installs the compiler toolchain, then configures and builds the project in
+# /workspace/build with the FPGA backend enabled and the CPU backend disabled.
+apt-get update
+apt-get install -y gcc g++ cmake
+# -p: do not report an error when build/ is left over from a previous run.
+cd /workspace && mkdir -p build
+cd build && cmake .. -DCPU=OFF -DFPGA=ON && make -j4
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -281,6 +281,13 @@ if(NOT FOUND_MATCH)
  set(TANH_OP ON)
  set(LOD_RESET_OP ON)
  set(LESS_THAN_OP ON)
+  set(LOGICAL_AND_OP ON)
+  set(LOGICAL_OR_OP ON)
+  set(LOGICAL_NOT_OP ON)
+  set(LOGICAL_XOR_OP ON)
+  set(WHILE_OP ON)
+  set(WRITE_TO_ARRAY_OP ON)
+  set(READ_FROM_ARRAY_OP ON)
 endif()
  # option(BATCHNORM_OP "" ON)
@@ -530,6 +537,18 @@ endif()
 if (LESS_THAN_OP)
  add_definitions(-DLESS_THAN_OP)
 endif()
+# Export a -D<NAME> compile definition for each enabled logical operator.
+foreach(_logical_op_flag
+        LOGICAL_AND_OP LOGICAL_OR_OP LOGICAL_NOT_OP LOGICAL_XOR_OP)
+  if (${_logical_op_flag})
+    add_definitions(-D${_logical_op_flag})
+  endif()
+endforeach()
 if (TANH_OP)
  add_definitions(-DTANH_OP)
@@ -543,3 +562,13 @@ endif()
 if (FUSION_DECONVADDRELU_OP)
  add_definitions(-DFUSION_DECONVADDRELU_OP)
 endif()
+# Export a -D<NAME> compile definition for the while / tensor-array operators
+# that are enabled.
+foreach(_array_op_flag WHILE_OP WRITE_TO_ARRAY_OP READ_FROM_ARRAY_OP)
+  if (${_array_op_flag})
+    add_definitions(-D${_array_op_flag})
+  endif()
+endforeach()