Commit 3b3abcbb authored by baolei.an

[LITE][BM] support faceboxes and behavior image,test=develop

Parent 1fe164fd
......@@ -35,7 +35,7 @@ lite_cc_library(subgraph_bridge_assign_value_op_bm SRCS assign_value_op.cc DEPS
lite_cc_library(subgraph_bridge_shape_op_bm SRCS shape_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_split_op_bm SRCS split_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_matmul_op_bm SRCS matmul_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_density_prior_box_op_bm SRCS density_prior_box_op.cc DEPS ${bm_subgraph_bridge_deps})
set(bm_subgraph_bridges
subgraph_bridge_registry
......@@ -69,4 +69,5 @@ set(bm_subgraph_bridges
subgraph_bridge_shape_op_bm
subgraph_bridge_split_op_bm
subgraph_bridge_matmul_op_bm
subgraph_bridge_density_prior_box_op_bm
CACHE INTERNAL "bm_subgraph_bridges")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_if.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
typedef struct __tag_st_priorbox_param {
std::vector<float> fixed_sizes;
std::vector<float> fixed_ratios;
std::vector<int> densities;
std::vector<float> variances;
// Default to 0 so the kernel falls back to the feature-map-derived step when
// the step_w / step_h attributes are absent.
float step_w{0.f};
float step_h{0.f};
float offset;
int prior_num;
bool clip;
bool flatten_to_2d;
} st_priorbox_param;
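// Computes the density prior boxes on the host. The returned buffer holds
// feature_h * feature_w * prior_num * 4 box coordinates followed by the same
// number of variance values; the converter feeds it to the BM compiler as
// constant data.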
float* compute_density_priorbox_kernel(OpLite* op, st_priorbox_param* param) {
auto op_info = op->op_info();
auto scope = op->scope();
// inputs
auto in_var_name = op_info->Input("Input").front();
auto in = scope->FindVar(in_var_name)->GetMutable<lite::Tensor>();
auto in_dims = in->dims();
auto img_var_name = op_info->Input("Image").front();
auto img = scope->FindVar(img_var_name)->GetMutable<lite::Tensor>();
auto img_dims = img->dims();
// outputs
auto boxes_var_name = op_info->Output("Boxes").front();
auto boxes = scope->FindVar(boxes_var_name)->GetMutable<lite::Tensor>();
auto var_var_name = op_info->Output("Variances").front();
auto var = scope->FindVar(var_var_name)->GetMutable<lite::Tensor>();
auto img_width = img_dims[3];
auto img_height = img_dims[2];
auto feature_width = in_dims[3];
auto feature_height = in_dims[2];
float step_width, step_height;
if (param->step_w == 0.f || param->step_h == 0.f) {
step_width = static_cast<float>(img_width) / feature_width;
step_height = static_cast<float>(img_height) / feature_height;
} else {
step_width = param->step_w;
step_height = param->step_h;
}
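// Each density value d contributes d * d anchor centers per fixed ratio, so
// the prior count per feature-map cell is sum_i(fixed_ratios.size() * densities[i]^2).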
int num_priors = 0;
for (size_t i = 0; i < param->densities.size(); ++i) {
num_priors += (param->fixed_ratios.size()) * (pow(param->densities[i], 2));
}
param->prior_num = num_priors;
DDim shape_out({feature_height, feature_width, num_priors, 4});
int32_t channel_size = feature_height * feature_width * num_priors * 4;
boxes->Resize(shape_out);
var->Resize(shape_out);
int step_average = static_cast<int>((step_width + step_height) * 0.5);
std::vector<float> sqrt_fixed_ratios;
for (size_t i = 0; i < param->fixed_ratios.size(); i++) {
sqrt_fixed_ratios.push_back(sqrt(param->fixed_ratios[i]));
}
float* cpu_data =
static_cast<float*>(malloc(sizeof(float) * boxes->data_size() * 2));
CHECK(cpu_data != nullptr);
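// Layout of cpu_data: the first channel_size floats are the box coordinates
// (optionally clipped to [0, 1]), the second channel_size floats are the
// per-box variances filled in below.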
float* b_t = cpu_data;
for (int h = 0; h < feature_height; ++h) {
for (int w = 0; w < feature_width; ++w) {
float center_x = (w + param->offset) * step_width;
float center_y = (h + param->offset) * step_height;
for (size_t s = 0; s < param->fixed_sizes.size(); ++s) {
auto fixed_size = param->fixed_sizes[s];
int density = param->densities[s];
int shift = step_average / density;
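// Each cell is subdivided into a density x density grid; `shift` is the
// spacing between anchor centers inside the step_average window.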
// Generate density prior boxes with fixed ratios.
for (size_t r = 0; r < param->fixed_ratios.size(); ++r) {
float box_width_ratio = fixed_size * sqrt_fixed_ratios[r];
float box_height_ratio = fixed_size / sqrt_fixed_ratios[r];
float density_center_x = center_x - step_average / 2. + shift / 2.;
float density_center_y = center_y - step_average / 2. + shift / 2.;
for (int di = 0; di < density; ++di) {
for (int dj = 0; dj < density; ++dj) {
float center_x_temp = density_center_x + dj * shift;
float center_y_temp = density_center_y + di * shift;
b_t[0] = std::max(
(center_x_temp - box_width_ratio / 2.) / img_width, 0.);
b_t[1] = std::max(
(center_y_temp - box_height_ratio / 2.) / img_height, 0.);
b_t[2] = std::min(
(center_x_temp + box_width_ratio / 2.) / img_width, 1.);
b_t[3] = std::min(
(center_y_temp + box_height_ratio / 2.) / img_height, 1.);
b_t += 4;
}
}
}
}
}
}
if (param->clip) {
for (int32_t d = 0; d < channel_size; ++d) {
cpu_data[d] = std::min(std::max(cpu_data[d], 0.f), 1.f);
}
}
float* ptr = cpu_data + channel_size;
int count = 0;
for (int32_t h = 0; h < feature_height; ++h) {
for (int32_t w = 0; w < feature_width; ++w) {
for (int32_t i = 0; i < param->prior_num; ++i) {
for (int j = 0; j < 4; ++j) {
ptr[count] = param->variances[j];
++count;
}
}
}
}
return cpu_data;
}
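// Converter flow: precompute the priors on the host, pass them to a BM
// priorbox layer as constant data, split the layer output into boxes and
// variances, and reshape both halves to the op's output dims.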
int DensityPriorBoxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
// inputs
auto in_var_name = op_info->Input("Input").front();
auto in = scope->FindVar(in_var_name)->GetMutable<lite::Tensor>();
auto in_dims = in->dims();
auto img_var_name = op_info->Input("Image").front();
auto img = scope->FindVar(img_var_name)->GetMutable<lite::Tensor>();
auto img_dims = img->dims();
std::vector<int32_t> i_input_shape_data(in_dims.size());
for (size_t i = 0; i < in_dims.size(); i++) {
i_input_shape_data[i] = static_cast<int32_t>(in_dims[i]);
}
// outputs
auto boxes_var_name = op_info->Output("Boxes").front();
auto boxes = scope->FindVar(boxes_var_name)->GetMutable<lite::Tensor>();
auto var_var_name = op_info->Output("Variances").front();
// param
st_priorbox_param param;
param.clip = op_info->GetAttr<bool>("clip");
param.flatten_to_2d = op_info->GetAttr<bool>("flatten_to_2d");
param.fixed_sizes = op_info->GetAttr<std::vector<float>>("fixed_sizes");
param.fixed_ratios = op_info->GetAttr<std::vector<float>>("fixed_ratios");
param.variances = op_info->GetAttr<std::vector<float>>("variances");
param.densities = op_info->GetAttr<std::vector<int>>("densities");
param.offset = op_info->GetAttr<float>("offset");
if (op_info->HasAttr("step_w")) {
param.step_w = op_info->GetAttr<float>("step_w");
}
if (op_info->HasAttr("step_h")) {
param.step_h = op_info->GetAttr<float>("step_h");
}
float* cpu_data = compute_density_priorbox_kernel(op, &param);
auto boxes_dims = boxes->dims();
std::vector<int32_t> i_pri_out_shape_data(3);
i_pri_out_shape_data[0] = 1;
i_pri_out_shape_data[1] = 2;
i_pri_out_shape_data[2] = boxes->data_size();
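// The priorbox layer emits a single [1, 2, N] tensor (N = boxes->data_size());
// index 0 along dim 1 holds the boxes, index 1 the variances.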
auto bm_priorbox_name = lite::subgraph::bm::UniqueName("bm_priorbox");
add_priorbox_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_input_shape_data[0]),
in_dims.size(),
static_cast<const char*>(in_var_name.c_str()),
const_cast<const int*>(&i_pri_out_shape_data[0]),
3,
static_cast<const char*>(bm_priorbox_name.c_str()),
static_cast<const float*>(cpu_data),
0,
nullptr,
0,
nullptr,
0,
nullptr,
0,
0,
0,
nullptr,
0,
0,
0.f,
0.f,
0.f);
int32_t* shape[2];
int32_t dim[2];
const char* name[2];
int32_t dim_size = 3;
dim[0] = dim_size;
dim[1] = dim_size;
std::vector<int32_t> i_split_shape_data(dim_size);
for (int32_t i = 0; i < dim_size; i++) {
i_split_shape_data[i] = i_pri_out_shape_data[i];
}
i_split_shape_data[1] /= 2;
shape[0] = &i_split_shape_data[0];
shape[1] = &i_split_shape_data[0];
// Keep the unique names alive in local strings; calling c_str() on the
// temporary returned by UniqueName() would leave dangling pointers.
auto bm_boxes_name = lite::subgraph::bm::UniqueName("bm_boxes");
auto bm_boxes_var_name = lite::subgraph::bm::UniqueName("bm_boxes_var");
name[0] = static_cast<const char*>(bm_boxes_name.c_str());
name[1] = static_cast<const char*>(bm_boxes_var_name.c_str());
int split_size[2];
split_size[0] = shape[0][1];
split_size[1] = shape[1][1];
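// Split the [1, 2, N] tensor along dim 1 into two [1, 1, N] halves: boxes
// first, variances second.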
add_tf_split_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_pri_out_shape_data[0]),
3,
static_cast<const char*>(bm_priorbox_name.c_str()),
2,
shape,
dim,
name,
3,
1,
split_size,
2);
// final output
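// Reshape each [1, 1, N] half back to the 4-D dims computed by the host
// kernel; boxes and variances share the same output shape.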
std::vector<int32_t> i_output_shape_data(boxes_dims.size());
for (size_t i = 0; i < boxes_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int32_t>(boxes_dims[i]);
}
add_reshape_layer_v2(graph->GetCompilerHandle(),
name[0],
shape[0],
3,
static_cast<const char*>(boxes_var_name.c_str()),
const_cast<const int*>(&i_output_shape_data[0]),
boxes_dims.size());
add_reshape_layer_v2(graph->GetCompilerHandle(),
name[1],
shape[1],
3,
static_cast<const char*>(var_var_name.c_str()),
const_cast<const int*>(&i_output_shape_data[0]),
boxes_dims.size());
graph->AddNode(boxes_var_name);
graph->AddNode(var_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(density_prior_box,
kBM,
paddle::lite::subgraph::bm::DensityPriorBoxConverter);
......@@ -24,6 +24,48 @@ namespace lite {
namespace subgraph {
namespace bm {
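// When both inputs are compile-time constants, fold the elementwise op on the
// host and return the result so it can be registered as a constant tensor.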
float* compute_elementwise_both_const(OpLite* op) {
auto op_info = op->op_info();
auto scope = op->scope();
auto op_type = op_info->Type();
// input
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto x_dims = x->dims();
auto y_var_name = op_info->Input("Y").front();
auto y = scope->FindVar(y_var_name)->GetMutable<lite::Tensor>();
auto y_dims = y->dims();
// output
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
float* cpu_data =
static_cast<float*>(malloc(sizeof(float) * output->data_size()));
CHECK(cpu_data != nullptr);
CHECK_EQ(x_dims.size(), y_dims.size());
const float* y_data = const_cast<const float*>(y->mutable_data<float>());
const float* x_data = const_cast<const float*>(x->mutable_data<float>());
if (op_type == "elementwise_mul") {
for (size_t i = 0; i < output->data_size(); i++) {
cpu_data[i] = x_data[i] * y_data[i];
}
} else if (op_type == "elementwise_add") {
for (size_t i = 0; i < output->data_size(); i++) {
cpu_data[i] = x_data[i] + y_data[i];
}
} else if (op_type == "elementwise_sub") {
for (size_t i = 0; i < output->data_size(); i++) {
cpu_data[i] = x_data[i] - y_data[i];
}
} else if (op_type == "elementwise_div") {
for (size_t i = 0; i < output->data_size(); i++) {
cpu_data[i] = x_data[i] / y_data[i];
}
}
return cpu_data;
}
int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
......@@ -41,21 +83,20 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto x_dims = x->dims();
name[0] = static_cast<const char*>(x_var_name.c_str());
dim[0] = x_dims.size();
const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
std::vector<int32_t> i_x_shape_data(x_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int>(x_shape_data[i]);
i_x_shape_data[i] = static_cast<int>(x_dims[i]);
}
shape[0] = &i_x_shape_data[0];
bool x_is_const = !graph->HasNode(x_var_name);
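// An input without a corresponding graph node is treated as constant data
// read from the scope.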
auto y_var_name = op_info->Input("Y").front();
auto y = scope->FindVar(y_var_name)->GetMutable<lite::Tensor>();
auto y_dims = y->dims();
name[1] = static_cast<const char*>(y_var_name.c_str());
dim[1] = y_dims.size();
const int64_t* y_shape_data = const_cast<const int64_t*>(&y_dims.data()[0]);
std::vector<int32_t> i_y_shape_data(y_dims.size());
for (size_t i = 0; i < y_dims.size(); i++) {
i_y_shape_data[i] = static_cast<int>(y_shape_data[i]);
i_y_shape_data[i] = static_cast<int>(y_dims[i]);
}
shape[1] = &i_y_shape_data[0];
bool y_is_const = !graph->HasNode(y_var_name);
......@@ -86,46 +127,56 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
const float* x_data = const_cast<const float*>(x->mutable_data<float>());
auto unique_op_name = lite::subgraph::bm::UniqueName("expand_ndims");
std::vector<int32_t> i_expand_shape_data(3);
if (y_is_const) {
if (dim[0] == dim[1] || 2 == dim[0]) {
bm_add_const_tensor(graph->GetCompilerHandle(),
name[1],
shape[1],
dim[1],
static_cast<bm_data_type_t>(DTYPE_FP32),
static_cast<const void*>(y_data));
} else if (1 == dim[1] && 1 == axis) {
add_expand_ndims_layer(graph->GetCompilerHandle(),
name[1],
shape[1],
dim[1],
static_cast<const float*>(y_data),
-1,
2,
static_cast<const char*>(unique_op_name.c_str()));
name[1] = static_cast<const char*>(unique_op_name.c_str());
dim[1] = 3;
i_expand_shape_data[0] = i_y_shape_data[0];
i_expand_shape_data[1] = 1;
i_expand_shape_data[2] = 1;
shape[1] = &i_expand_shape_data[0];
y_data = nullptr;
if (x_is_const && y_is_const) {
float* cpu_data = compute_elementwise_both_const(op);
bm_add_const_tensor(graph->GetCompilerHandle(),
static_cast<const char*>(output_var_name.c_str()),
const_cast<const int*>(&i_output_shape_data[0]),
output_dims.size(),
static_cast<bm_data_type_t>(DTYPE_FP32),
static_cast<const void*>(cpu_data));
} else {
if (y_is_const) {
if (dim[0] == dim[1] || 2 == dim[0]) {
bm_add_const_tensor(graph->GetCompilerHandle(),
name[1],
shape[1],
dim[1],
static_cast<bm_data_type_t>(DTYPE_FP32),
static_cast<const void*>(y_data));
} else if (1 == dim[1] && 1 == axis) {
add_expand_ndims_layer(
graph->GetCompilerHandle(),
name[1],
shape[1],
dim[1],
static_cast<const float*>(y_data),
-1,
2,
static_cast<const char*>(unique_op_name.c_str()));
name[1] = static_cast<const char*>(unique_op_name.c_str());
dim[1] = 3;
i_expand_shape_data[0] = i_y_shape_data[0];
i_expand_shape_data[1] = 1;
i_expand_shape_data[2] = 1;
shape[1] = &i_expand_shape_data[0];
y_data = nullptr;
}
}
add_binary_layer_v2(graph->GetCompilerHandle(),
name[0],
shape[0],
dim[0],
0,
static_cast<const float*>(x_data),
name[1],
shape[1],
dim[1],
0,
static_cast<const float*>(y_data),
static_cast<const char*>(output_var_name.c_str()),
op_code);
}
add_binary_layer_v2(graph->GetCompilerHandle(),
name[0],
shape[0],
dim[0],
0,
static_cast<const float*>(x_data),
name[1],
shape[1],
dim[1],
0,
static_cast<const float*>(y_data),
static_cast<const char*>(output_var_name.c_str()),
op_code);
delete[] shape;
delete[] name;
delete[] dim;
......
......@@ -36,46 +36,46 @@ int MatMulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto x_dims = x->dims();
const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
std::vector<int32_t> i_x_shape_data(x_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int>(x_shape_data[i]);
i_x_shape_data[i] = static_cast<int>(x_dims[i]);
}
auto y_var_name = op_info->Input("Y").front();
auto y = scope->FindVar(y_var_name)->GetMutable<lite::Tensor>();
auto y_dims = y->dims();
const int64_t* y_shape_data = const_cast<const int64_t*>(&y_dims.data()[0]);
std::vector<int32_t> i_y_shape_data(y_dims.size());
for (size_t i = 0; i < y_dims.size(); i++) {
i_y_shape_data[i] = static_cast<int>(y_shape_data[i]);
i_y_shape_data[i] = static_cast<int>(y_dims[i]);
}
// output
auto output_var_name = op_info->Output("Out").front();
auto out = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto out_dims = out->dims();
std::vector<int32_t> i_out_shape_data(out_dims.size());
for (size_t i = 0; i < out_dims.size(); i++) {
i_out_shape_data[i] = static_cast<int>(out_dims[i]);
}
bool transpose_x = op_info->GetAttr<bool>("transpose_X");
bool transpose_y = op_info->GetAttr<bool>("transpose_Y");
float alpha = op_info->GetAttr<float>("alpha");
CHECK_EQ(alpha, 1.f);
CHECK_EQ(transpose_x, 0);
CHECK_EQ(transpose_y, 0);
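// Only the plain case is supported (alpha == 1, no transposes); the op is
// lowered to a BM batch-matmul layer below.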
LOG(INFO) << x_dims << " " << y_dims << " " << alpha << " " << transpose_x
<< " " << transpose_y;
#if 0
add_const_binary_layer(graph->GetCompilerHandle(),
const float* y_data = const_cast<const float*>(y->mutable_data<float>());
const float* x_data = const_cast<const float*>(x->mutable_data<float>());
add_batch_matmul_layer(graph->GetCompilerHandle(),
static_cast<const char*>(x_var_name.c_str()),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
scale,
static_cast<const char*>(unique_op_scale_name.c_str()),
BINARY_MUL,
0);
add_const_binary_layer(graph->GetCompilerHandle(),
static_cast<const char*>(unique_op_scale_name.c_str()),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
bias,
static_cast<const char*>(output_var_name.c_str()),
BINARY_ADD,
0);
#endif
0,
x_data,
static_cast<const char*>(y_var_name.c_str()),
const_cast<const int*>(&i_y_shape_data[0]),
y_dims.size(),
0,
y_data,
static_cast<const char*>(output_var_name.c_str()));
graph->AddNode(output_var_name);
return SUCCESS;
}
......
......@@ -60,3 +60,4 @@ USE_SUBGRAPH_BRIDGE(split, kBM);
USE_SUBGRAPH_BRIDGE(matmul, kBM);
USE_SUBGRAPH_BRIDGE(max_pool2d_with_index, kBM);
USE_SUBGRAPH_BRIDGE(sigmoid, kBM);
USE_SUBGRAPH_BRIDGE(density_prior_box, kBM);