提交 975cd45d 编写于 作者: S shipengchao 提交者: MaxwellDing

add roi align x86 kernel

上级 d6791276
......@@ -27,7 +27,7 @@ void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
#ifdef LITE_WITH_MLU
fusion::FcFuser fuser(false);
fuser(graph.get());
#elif
#else
fusion::FcFuser fuser(true);
fuser(graph.get());
#endif
......
......@@ -854,9 +854,11 @@ void ModifyValidPlaces(SSAGraph* graph, bool use_mlu_cast) {
for (auto& place : v_places) {
prec_set.insert(place.precision);
}
#ifdef LITE_WITH_MLU
if (lite::TargetWrapperMlu::UseFirstConv()) {
prec_set.insert(PRECISION(kInt8));
}
#endif
for (auto& prec : prec_set) {
v_places.emplace_back(TARGET(kX86), prec, DATALAYOUT(kNHWC));
}
......
......@@ -70,6 +70,7 @@ add_kernel(search_fc_compute_x86 X86 basic SRCS search_fc_compute.cc DEPS ${lite
add_kernel(matmul_compute_x86 X86 basic SRCS matmul_compute.cc DEPS ${lite_kernel_deps} blas)
add_kernel(yolo_box_compute_x86 X86 basic SRCS yolo_box_compute.cc DEPS ${lite_kernel_deps})
add_kernel(roi_align_compute_x86 X86 basic SRCS roi_align_compute.cc DEPS ${lite_kernel_deps})
add_kernel(interpolate_compute_x86 X86 basic SRCS interpolate_compute.cc DEPS ${lite_kernel_deps})
lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86)
......@@ -111,5 +112,6 @@ lite_cc_test(test_sequence_arithmetic_compute_x86 SRCS sequence_arithmetic_compu
lite_cc_test(test_leaky_relu_compute_x86 SRCS leaky_relu_compute_test.cc DEPS activation_compute_x86)
lite_cc_test(test_yolo_box_compute_x86 SRCS yolo_box_compute_test.cc DEPS
yolo_box_compute_x86)
# lite_cc_test(test_roi_align_compute_x86 SRCS roi_align_compute_test.cc DEPS roi_align_compute_x86)
lite_cc_test(test_nearest_interp_comute_x86 SRCS interpolate_compute_test.cc
DEPS interpolate_compute_x86)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/x86/roi_align_compute.h"
#include <algorithm>
#include <cmath>
#include <string>
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
// Number of table entries stored per sampling point: one flat offset and one
// weight for each of the four neighbouring pixels used by bilinear
// interpolation.
static constexpr int kROISize = 4;

// Fills the lookup tables used to bilinearly sample one ROI.
//
// Sampling points are visited in (ph, pw, iy, ix) order. For each point,
// kROISize consecutive entries are written to both tables:
//   pre_pos: flat offsets (row * width + col) of the low/low, low/high,
//            high/low and high/high neighbours inside one height x width
//            plane;
//   pre_w:   the matching interpolation weights.
// A point that lies more than one pixel before the plane, or beyond
// height/width, gets all offsets and weights set to 0.
//
// Both tensors must already be resized to hold at least
// pooled_height * pooled_width * iy_upper * ix_upper * kROISize elements;
// no bounds checking is done here.
template <class T>
void PreCalcForBilinearInterpolate(const int height,
                                   const int width,
                                   const int pooled_height,
                                   const int pooled_width,
                                   const int iy_upper,
                                   const int ix_upper,
                                   T roi_ymin,
                                   T roi_xmin,
                                   T bin_size_h,
                                   T bin_size_w,
                                   int roi_bin_grid_h,
                                   int roi_bin_grid_w,
                                   Tensor* pre_pos,
                                   Tensor* pre_w) {
  int* pos_table = pre_pos->mutable_data<int>();
  T* weight_table = pre_w->mutable_data<T>();
  int sample = 0;  // running index of the sampling point being written

  for (int ph = 0; ph < pooled_height; ++ph) {
    for (int pw = 0; pw < pooled_width; ++pw) {
      for (int iy = 0; iy < iy_upper; ++iy) {
        // Vertical coordinate of this row of sample points. It is shared by
        // the whole ix loop, so the clamping applied below carries over to
        // the following ix iterations.
        T y = roi_ymin + ph * bin_size_h +
              static_cast<T>(iy + .5f) * bin_size_h /
                  static_cast<T>(roi_bin_grid_h);

        for (int ix = 0; ix < ix_upper; ++ix) {
          // Horizontal coordinate of the sample point.
          T x = roi_xmin + pw * bin_size_w +
                static_cast<T>(ix + .5f) * bin_size_w /
                    static_cast<T>(roi_bin_grid_w);

          int* const pos = pos_table + sample * kROISize;
          T* const weight = weight_table + sample * kROISize;
          ++sample;

          // Sample points outside the feature map contribute nothing.
          const bool outside =
              y < -1.0 || y > height || x < -1.0 || x > width;
          if (outside) {
            for (int k = 0; k < kROISize; ++k) {
              pos[k] = 0;
              weight[k] = 0;
            }
            continue;
          }

          // Clamp slightly negative coordinates onto the first row/column.
          if (y <= 0) {
            y = 0;
          }
          if (x <= 0) {
            x = 0;
          }

          int y_low = static_cast<int>(y);
          int x_low = static_cast<int>(x);
          int y_high;
          int x_high;

          // On the last row/column both neighbours collapse onto the edge.
          if (y_low >= height - 1) {
            y_low = height - 1;
            y_high = y_low;
            y = static_cast<T>(y_low);
          } else {
            y_high = y_low + 1;
          }
          if (x_low >= width - 1) {
            x_low = width - 1;
            x_high = x_low;
            x = static_cast<T>(x_low);
          } else {
            x_high = x_low + 1;
          }

          // Fractional distances to the low neighbours and their complements.
          T ly = y - y_low;
          T lx = x - x_low;
          T hy = 1. - ly;
          T hx = 1. - lx;

          const int row_low = y_low * width;
          const int row_high = y_high * width;
          pos[0] = row_low + x_low;
          pos[1] = row_low + x_high;
          pos[2] = row_high + x_low;
          pos[3] = row_high + x_high;

          weight[0] = hy * hx;
          weight[1] = hy * lx;
          weight[2] = ly * hx;
          weight[3] = ly * lx;
        }
      }
    }
  }
}
// Runs RoIAlign for every ROI row and writes the pooled result to Out.
//
// X is indexed as a 4-D tensor (dims 1..3 are read as channels, height and
// width). Each row of ROIs is read as (xmin, ymin, xmax, ymax) and scaled by
// spatial_scale. The last LoD level of ROIs gives, per batch item, the
// [begin, end) range of ROI rows belonging to it. For every ROI and channel,
// each of the pooled_height x pooled_width bins is the mean of
// roi_bin_grid_h x roi_bin_grid_w bilinearly interpolated samples.
//
// Returns without touching Out when there are no ROIs.
void RoiAlignCompute::Run() {
  auto& param = Param<operators::RoiAlignParam>();
  auto* in = param.X;
  auto* rois = param.ROIs;
  auto* out = param.Out;
  float spatial_scale = param.spatial_scale;
  int pooled_height = param.pooled_height;
  int pooled_width = param.pooled_width;
  int sampling_ratio = param.sampling_ratio;

  auto in_dims = in->dims();
  // int batch_size = in_dims[0];
  int channels = in_dims[1];
  int height = in_dims[2];
  int width = in_dims[3];
  auto rois_dims = rois->dims();
  int rois_num = rois_dims[0];
  auto out_dims = out->dims();
  if (rois_num == 0) {
    return;
  }

  // Element strides of the dense row-major layouts of X, ROIs and Out.
  DDim in_stride({static_cast<int>(in_dims[1] * in_dims[2] * in_dims[3]),
                  static_cast<int>(in_dims[2] * in_dims[3]),
                  static_cast<int>(in_dims[3]),
                  1});
  DDim roi_stride({static_cast<int>(rois_dims[1]), 1});
  DDim out_stride({static_cast<int>(out_dims[1] * out_dims[2] * out_dims[3]),
                   static_cast<int>(out_dims[2] * out_dims[3]),
                   static_cast<int>(out_dims[3]),
                   1});

  auto* input_data = in->data<float>();

  // Expand the LoD offsets into a per-ROI batch index.
  Tensor roi_batch_id_list;
  roi_batch_id_list.Resize({rois_num});
  int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>();
  auto rois_lod = rois->lod().back();
  int rois_batch_size = rois_lod.size() - 1;
  // NOTE(review): the consistency checks below are disabled, so ROI rows not
  // covered by the LoD offsets never get a batch id assigned here. Confirm
  // that the op validates the LoD upstream.
  // CHECK_OR_FALSE(rois_batch_size == batch_size);
  // int rois_num_with_lod = rois_lod[rois_batch_size];
  // CHECK_OR_FALSE(rois_num_with_lod == rois_num);
  for (int n = 0; n < rois_batch_size; ++n) {
    for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
      roi_batch_id_data[i] = n;
    }
  }

  auto* output_data = out->mutable_data<float>();
  auto* rois_data = rois->data<float>();
  for (int n = 0; n < rois_num; ++n) {
    int roi_batch_id = roi_batch_id_data[n];

    // ROI corners in feature-map coordinates.
    float roi_xmin = rois_data[0] * spatial_scale;
    float roi_ymin = rois_data[1] * spatial_scale;
    float roi_xmax = rois_data[2] * spatial_scale;
    float roi_ymax = rois_data[3] * spatial_scale;

    // Degenerate ROIs are widened to at least one pixel in each direction.
    float roi_width = std::max(roi_xmax - roi_xmin, 1.0f);
    float roi_height = std::max(roi_ymax - roi_ymin, 1.0f);
    float bin_size_h = roi_height / pooled_height;
    float bin_size_w = roi_width / pooled_width;
    const float* batch_data = input_data + roi_batch_id * in_stride[0];

    // Samples per bin: the fixed sampling_ratio when positive, otherwise
    // derived from the bin size.
    int roi_bin_grid_h =
        (sampling_ratio > 0)
            ? sampling_ratio
            : static_cast<int>(std::ceil(roi_height / pooled_height));
    int roi_bin_grid_w =
        (sampling_ratio > 0)
            ? sampling_ratio
            : static_cast<int>(std::ceil(roi_width / pooled_width));
    const int samples_per_bin = roi_bin_grid_h * roi_bin_grid_w;
    const float count = samples_per_bin;

    // One table row per sampling point. Sized in integer arithmetic from the
    // exact number of points PreCalcForBilinearInterpolate writes, instead of
    // truncating a float product.
    Tensor pre_pos;
    Tensor pre_w;
    int pre_size = pooled_height * pooled_width * samples_per_bin;
    pre_pos.Resize({pre_size, kROISize});
    pre_w.Resize({pre_size, kROISize});
    PreCalcForBilinearInterpolate<float>(height,
                                         width,
                                         pooled_height,
                                         pooled_width,
                                         roi_bin_grid_h,
                                         roi_bin_grid_w,
                                         roi_ymin,
                                         roi_xmin,
                                         bin_size_h,
                                         bin_size_w,
                                         roi_bin_grid_h,
                                         roi_bin_grid_w,
                                         &pre_pos,
                                         &pre_w);
    const int* pre_pos_data = pre_pos.data<int>();
    const float* pre_w_data = pre_w.data<float>();

    // The offsets/weights are channel independent, so the same tables are
    // replayed for each channel plane.
    for (int c = 0; c < channels; c++) {
      int pre_calc_index = 0;
      for (int ph = 0; ph < pooled_height; ph++) {
        for (int pw = 0; pw < pooled_width; pw++) {
          const int pool_index = ph * pooled_width + pw;
          float output_val = 0;
          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
              for (int i = 0; i < kROISize; i++) {
                int pos = pre_pos_data[pre_calc_index * kROISize + i];
                float w = pre_w_data[pre_calc_index * kROISize + i];
                output_val += w * batch_data[pos];
              }
              pre_calc_index += 1;
            }
          }
          // Average over all samples of the bin.
          output_val /= count;
          output_data[pool_index] = output_val;
        }
      }
      batch_data += in_stride[1];
      output_data += out_stride[1];
    }
    rois_data += roi_stride[0];
  }
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
// Registers RoiAlignCompute as the "roi_align" kernel for target kX86,
// precision kFloat and layout kNCHW under the alias "def". The inputs "X" and
// "ROIs" and the output "Out" are all bound as kX86 tensors.
REGISTER_LITE_KERNEL(roi_align,
kX86,
kFloat,
kNCHW,
paddle::lite::kernels::x86::RoiAlignCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
.BindInput("ROIs", {LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include "lite/core/kernel.h"
#include "lite/operators/roi_align_op.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
// Kernel class for roi_align on the kX86 target with kFloat precision.
// It holds no state of its own; the computation lives in Run(), which is
// defined out of line.
class RoiAlignCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
 public:
  // Parameter struct this kernel reads its inputs, outputs and attributes
  // from.
  using param_t = operators::RoiAlignParam;

  virtual ~RoiAlignCompute() = default;

  // Executes the kernel once using the currently bound parameters.
  void Run() override;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -55,7 +55,7 @@ if(LITE_BUILD_EXTRA)
lite_cc_test(test_kernel_affine_channel_compute SRCS affine_channel_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
#lite_cc_test(test_kernel_generate_proposals_compute SRCS generate_proposals_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
#lite_cc_test(test_kernel_roi_align_compute SRCS roi_align_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_roi_align_compute SRCS roi_align_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
......
......@@ -120,6 +120,13 @@ TEST(RoiAlign, precision) {
// The unit test for roi_align needs the params,
// which are obtained by running the model with Paddle.
LOG(INFO) << "test roi align op";
#ifdef LITE_WITH_X86
Place place(TARGET(kX86));
std::unique_ptr<arena::TestCase> tester(
new RoiAlignComputeTester(place, "def"));
arena::Arena arena(std::move(tester), place, 2e-4);
arena.TestPrecision();
#endif
#ifdef LITE_WITH_ARM
Place place(TARGET(kARM));
std::unique_ptr<arena::TestCase> tester(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册