Unverified commit d81fbe8d authored by SPC, committed by GitHub

Add roi_align MLU kernel (#103)

1. Add roi_align MLU kernel
2. Guard the test for the MLU roi_align kernel with LITE_BUILD_EXTRA
3. Modify lite.cmake to fix a compile error when a prebuilt MLU kernel object file (kernelxx.o) is listed among the kernel sources
Parent 328d2da4
@@ -413,7 +413,9 @@ function(add_kernel TARGET device level)
   if ("${device}" STREQUAL "MLU")
     if (NOT LITE_WITH_MLU)
       foreach(src ${args_SRCS})
-        file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+        if (NOT (src MATCHES ".*\\.o"))
+          file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+        endif()
       endforeach()
       return()
     endif()
@@ -446,7 +448,13 @@ function(add_kernel TARGET device level)
   # the source list will collect for paddle_use_kernel.h code generation.
   foreach(src ${args_SRCS})
-    file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+    if (LITE_WITH_MLU)
+      if (NOT (src MATCHES ".*\\.o"))
+        file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+      endif()
+    else()
+      file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+    endif()
   endforeach()
   lite_cc_library(${TARGET} SRCS ${args_SRCS}
...
@@ -7,5 +7,11 @@ add_kernel(subgraph_compute_mlu MLU basic SRCS subgraph_compute.cc DEPS ${lite_k
 add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${target_wrapper_mlu})
 add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps})
 # depend on transpose function in backend/x86/math/math_function
+add_kernel(roi_align_compute_mlu MLU extra SRCS roi_align_compute.cc mlu_kernel/roi_align_kernel.o DEPS ${lite_kernel_deps})
+if(LITE_BUILD_EXTRA)
+    lite_cc_test(test_roi_align_compute_mlu SRCS roi_align_compute_test.cc DEPS roi_align_compute_mlu)
+endif()
 add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_function} ${target_wrapper_mlu})
 add_kernel(cast_compute_mlu MLU basic SRCS cast_compute.cc DEPS ${lite_kernel_deps} ${target_wrapper_mlu})
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/roi_align_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace mlu {
void RoiAlignCompute::Run() {
auto& mlu_context = this->ctx_->template As<MLUContext>();
auto& exec_queue = mlu_context.exec_queue();
this->Run(exec_queue);
}
void RoiAlignCompute::Run(const cnrtQueue_t& exec_queue) {
auto& param = this->Param<param_t>();
auto* rois = param.ROIs;
auto rois_dims = rois->dims();
int rois_num = rois_dims[0];
if (rois_num == 0) {
return;
}
auto* in = param.X;
auto* out = param.Out;
float spatial_scale = param.spatial_scale;
int pooled_height = param.pooled_height;
int pooled_width = param.pooled_width;
int sampling_ratio = param.sampling_ratio;
half spatial_scale_half;
cnrtConvertFloatToHalf(&spatial_scale_half, spatial_scale);
auto in_dims = in->dims();
// int batch_size = in_dims[0];
int channels = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
auto out_dims = out->dims();
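// derive a per-ROI batch index from the LoD offsets; the kernel consumes it
// as its roi_ind argument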
std::vector<int> roi_ind_vec(rois_num);
auto rois_lod = rois->lod().back();
for (int n = 0, rois_batch_size = rois_lod.size() - 1; n < rois_batch_size;
++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_ind_vec[i] = n;
}
}
auto* input_data = in->data<float>();
auto* output_data = out->mutable_data<float>();
auto* rois_data = rois->data<float>();
std::vector<half> input_tmp_vec(in_dims.production());
std::vector<half> rois_tmp_vec(rois_dims.production());
std::vector<half> output_tmp_vec(out_dims.production());
std::vector<int> nchw2nhwc_dimorder{0, 2, 3, 1};
std::vector<int> tmp_in_dims;
for (int i = 0; i < in_dims.size(); i++) {
tmp_in_dims.emplace_back(static_cast<int>(in_dims[i]));
}
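// the MLU kernel expects NHWC float16 input, so transpose NCHW -> NHWC and
// cast fp32 -> fp16 on the host first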
cnrtTransOrderAndCast(const_cast<float*>(input_data),
CNRT_FLOAT32,
input_tmp_vec.data(),
CNRT_FLOAT16,
NULL,
tmp_in_dims.size(),
tmp_in_dims.data(),
nchw2nhwc_dimorder.data());
cnrtCastDataType(const_cast<float*>(rois_data),
CNRT_FLOAT32,
const_cast<half*>(rois_tmp_vec.data()),
CNRT_FLOAT16,
rois_dims.production(),
NULL);
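// allocate device buffers and copy the converted input, ROIs and batch
// indices to the MLU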
void *input_mlu_data = nullptr, *rois_mlu_data = nullptr,
*roi_batch_id_mlu_data = nullptr, *output_mlu_data = nullptr;
cnrtMalloc(&input_mlu_data,
input_tmp_vec.size() * sizeof(input_tmp_vec.front()));
cnrtMemcpy(input_mlu_data,
input_tmp_vec.data(),
input_tmp_vec.size() * sizeof(input_tmp_vec.front()),
CNRT_MEM_TRANS_DIR_HOST2DEV);
cnrtMalloc(&rois_mlu_data,
rois_tmp_vec.size() * sizeof(rois_tmp_vec.front()));
cnrtMemcpy(rois_mlu_data,
rois_tmp_vec.data(),
rois_tmp_vec.size() * sizeof(rois_tmp_vec.front()),
CNRT_MEM_TRANS_DIR_HOST2DEV);
cnrtMalloc(&roi_batch_id_mlu_data,
roi_ind_vec.size() * sizeof(roi_ind_vec.front()));
cnrtMemcpy(roi_batch_id_mlu_data,
roi_ind_vec.data(),
roi_ind_vec.size() * sizeof(roi_ind_vec.front()),
CNRT_MEM_TRANS_DIR_HOST2DEV);
// malloc output memory on device
cnrtMalloc(&output_mlu_data,
output_tmp_vec.size() * sizeof(output_tmp_vec.front()));
// prepare kernel params
cnrtKernelParamsBuffer_t params;
cnrtGetKernelParamsBuffer(&params);
cnrtKernelParamsBufferAddParam(
params, &input_mlu_data, sizeof(input_mlu_data));
cnrtKernelParamsBufferAddParam(params, &rois_mlu_data, sizeof(rois_mlu_data));
cnrtKernelParamsBufferAddParam(
params, &roi_batch_id_mlu_data, sizeof(roi_batch_id_mlu_data));
cnrtKernelParamsBufferAddParam(
params, &output_mlu_data, sizeof(output_mlu_data));
cnrtKernelParamsBufferAddParam(params, &height, sizeof(height));
cnrtKernelParamsBufferAddParam(params, &width, sizeof(width));
cnrtKernelParamsBufferAddParam(params, &channels, sizeof(channels));
cnrtKernelParamsBufferAddParam(params, &pooled_height, sizeof(pooled_height));
cnrtKernelParamsBufferAddParam(params, &pooled_width, sizeof(pooled_width));
cnrtKernelParamsBufferAddParam(params, &rois_num, sizeof(rois_num));
cnrtKernelParamsBufferAddParam(
params, &spatial_scale_half, sizeof(spatial_scale_half));
cnrtKernelParamsBufferAddParam(
params, &sampling_ratio, sizeof(sampling_ratio));
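// launch a single BLOCK-type task (task dimension 1 x 1 x 1)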
cnrtDim3_t task_dims;
task_dims.x = 1, task_dims.y = 1, task_dims.z = 1;
cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_BLOCK;
// invoke kernel and sync to compute on MLU
CNRT_CALL(cnrtInvokeKernel_V2(reinterpret_cast<void*>(&roi_align_kernel),
task_dims,
params,
func_type,
exec_queue));
CNRT_CALL(cnrtSyncQueue(exec_queue));
cnrtMemcpy(output_tmp_vec.data(),
output_mlu_data,
output_tmp_vec.size() * sizeof(output_tmp_vec.front()),
CNRT_MEM_TRANS_DIR_DEV2HOST);
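// the kernel produces NHWC float16 output; build the NHWC dims, then
// transpose back to NCHW and cast to fp32 into the output tensor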
std::vector<int> tmp_out_dims;
for (int i = 0; i < out_dims.size(); i++) {
// out_dims = {N, C, H, W}, tmp_out_dims = {N, H, W, C}
tmp_out_dims.emplace_back(out_dims[nchw2nhwc_dimorder[i]]);
}
std::vector<int> nhwc2nchw_dimorder{0, 3, 1, 2};
cnrtTransOrderAndCast(output_tmp_vec.data(),
CNRT_FLOAT16,
output_data,
CNRT_FLOAT32,
NULL,
tmp_out_dims.size(),
tmp_out_dims.data(),
nhwc2nchw_dimorder.data());
// release resources
cnrtDestroyKernelParamsBuffer(params);
cnrtFree(input_mlu_data);
cnrtFree(rois_mlu_data);
cnrtFree(roi_batch_id_mlu_data);
cnrtFree(output_mlu_data);
}
} // namespace mlu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(roi_align,
kMLU,
kFloat,
kNCHW,
paddle::lite::kernels::mlu::RoiAlignCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindInput("ROIs",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kFloat))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/kernel.h"
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/mlu/roi_align_kernel.h"
#include "lite/operators/layout_op.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace mlu {
class RoiAlignCompute
: public KernelLite<TARGET(kMLU), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
public:
using param_t = operators::RoiAlignParam;
void Run() override;
void Run(const cnrtQueue_t& exec_queue);
std::string doc() const override { return "Mlu roi align"; }
virtual ~RoiAlignCompute() = default;
};
} // namespace mlu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/roi_align_compute.h"
#include <gtest/gtest.h>
#include <memory>
#include <utility>
#include <vector>
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace mlu {
TEST(roi_align_mlu, retrive_op) {
auto roi_align =
KernelRegistry::Global().Create<TARGET(kMLU), PRECISION(kFloat)>(
"roi_align");
ASSERT_FALSE(roi_align.empty());
ASSERT_TRUE(roi_align.front());
}
TEST(roi_align_mlu, init) {
RoiAlignCompute roi_align;
ASSERT_EQ(roi_align.precision(), PRECISION(kFloat));
ASSERT_EQ(roi_align.target(), TARGET(kMLU));
}
TEST(roi_align_mlu, run_test) {
constexpr int ROI_SIZE = 4;
// image_height * spatial_scale == featuremap_height; the same holds for width
constexpr int batch_size = 2, channels = 3, featuremap_height = 9,
featuremap_width = 16, pooled_height = 2, pooled_width = 1,
num_rois = 3, sampling_rate = 2;
constexpr float spatial_scale = 0.5;
lite::Tensor x, rois, out;
x.Resize(
lite::DDim({batch_size, channels, featuremap_height, featuremap_width}));
rois.Resize(lite::DDim({num_rois, ROI_SIZE}));
// here lod uses the offset representation: [0, 1), [1, num_rois)
rois.set_lod({{0, 1, num_rois}});
out.Resize(lite::DDim({num_rois, channels, pooled_height, pooled_width}));
auto x_data = x.mutable_data<float>();
auto rois_data = rois.mutable_data<float>();
auto out_data = out.mutable_data<float>();
// {0.0, 1.0, ...}
std::iota(x_data, x_data + x.dims().production(), 0.0f);
std::iota(rois_data, rois_data + rois.dims().production(), 0.25f);
RoiAlignCompute roi_align_op;
operators::RoiAlignParam param;
param.X = &x;
param.ROIs = &rois;
param.Out = &out;
param.pooled_height = pooled_height;
param.pooled_width = pooled_width;
param.spatial_scale = spatial_scale;
param.sampling_ratio = sampling_rate;
// std::unique_ptr<KernelContext> ctx(new KernelContext);
// ctx->As<MLUContext>();
// roi_align_op.SetContext(std::move(ctx));
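// initialize CNRT, bind device 0 and create an execution queue for the
// kernel launch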
CNRT_CALL(cnrtInit(0));
// cnrtInvokeFuncParam_t forward_param;
// u32_t affinity = 1;
// int data_param = 1;
// forward_param.data_parallelism = &data_param;
// forward_param.affinity = &affinity;
// forward_param.end = CNRT_PARAM_END;
cnrtDev_t dev_handle;
CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, 0));
CNRT_CALL(cnrtSetCurrentDevice(dev_handle));
cnrtQueue_t queue;
CNRT_CALL(cnrtCreateQueue(&queue));
roi_align_op.SetParam(param);
roi_align_op.Run(queue);
CNRT_CALL(cnrtDestroyQueue(queue));
std::vector<float> ref_results = {14.625,
22.625,
158.625,
166.625,
302.625,
310.625,
480.625,
488.625,
624.625,
632.625,
768.625,
776.625,
514.625,
522.625,
658.625,
666.625,
802.625,
810.625};
for (int i = 0; i < out.dims().production(); i++) {
EXPECT_NEAR(out_data[i], ref_results[i], (4e-3f * ref_results[i]));
}
}
} // namespace mlu
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(roi_align, kMLU, kFloat, kNCHW, def);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef LITE_KERNELS_MLU_ROI_ALIGN_KERNEL_H_
#define LITE_KERNELS_MLU_ROI_ALIGN_KERNEL_H_
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
typedef uint16_t half;
/**
 * @brief Region of interest (ROI) align uses bilinear interpolation to turn
 * feature-map regions of uneven size into a fixed-size output. Each ROI is
 * divided into a pooled_height x pooled_width grid of equally sized bins
 * whose positions are kept unchanged. In each bin, sampling_ratio points are
 * taken (if it is -1, every point inside the bin is used); each point is
 * computed directly by bilinear interpolation, and the average of the sampled
 * points is taken as the value of that bin.
 *
 * @param[in] input: 4-D tensor of shape [N, H, W, C]; N is the batch size, C
 * is the number of input channels, H the feature height and W the feature
 * width. Data type is float16
 * @param[in] rois: 2-D tensor of shape [rois_num, 4]. ROIs (regions of
 * interest) to be pooled, e.g. [[x1, y1, x2, y2], ...], where (x1, y1) is the
 * top-left corner and (x2, y2) is the bottom-right corner. Data type is
 * float16
 * @param[in] roi_ind: 1-D tensor of shape [rois_num] with values in [0,
 * batch). roi_ind[i] specifies the image that the i-th ROI refers to. Data
 * type is int
 * @param[out] output: 4-D tensor of shape [rois_num, pooled_height,
 * pooled_width, C]
 * @param[in] height: The height of the input
 * @param[in] width: The width of the input
 * @param[in] channels: The number of channels of the input
 * @param[in] pooled_height: Output height after pooling
 * @param[in] pooled_width: Output width after pooling
 * @param[in] rois_num: The number of ROIs
 * @param[in] spatial_scale: Multiplicative spatial scale factor that maps ROI
 * coordinates to the scale used by the operation; image_height *
 * spatial_scale == featuremap_height, and likewise for the width
 * @param[in] sampling_ratio: The number of sampling points per interpolation
 * bin. If it is <= 0, it adapts to the ROI width and pooled_width, and
 * likewise for the height
 * @retval void
 */
void roi_align_kernel(half *input,
half *rois,
int *roi_ind,
half *output,
const int height,
const int width,
const int channels,
const int pooled_height,
const int pooled_width,
const int rois_num,
const half spatial_scale,
const int sampling_ratio);
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // LITE_KERNELS_MLU_ROI_ALIGN_KERNEL_H_
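For reference, the sampling-and-averaging scheme the doc comment above describes can be sketched on the host roughly as follows. This is only an illustrative float32 sketch over a single NCHW channel plane, not the MLU kernel implementation (which operates on NHWC float16 data); the helper names BilinearAt and RoiAlignRef are invented for this example.

// A schematic float32 reference of ROI align on one channel plane. Sketch
// only; it is NOT the MLU kernel, and BilinearAt / RoiAlignRef are made-up
// names for this illustration.
#include <algorithm>
#include <cmath>

// Bilinear interpolation of one H x W plane at a fractional (y, x) position.
static float BilinearAt(const float* plane, int H, int W, float y, float x) {
  if (y < -1.f || y > H || x < -1.f || x > W) return 0.f;  // outside the map
  y = std::max(y, 0.f);
  x = std::max(x, 0.f);
  int y0 = static_cast<int>(y), x0 = static_cast<int>(x);
  int y1 = std::min(y0 + 1, H - 1), x1 = std::min(x0 + 1, W - 1);
  y0 = std::min(y0, H - 1);
  x0 = std::min(x0, W - 1);
  float ly = y - y0, lx = x - x0, hy = 1.f - ly, hx = 1.f - lx;
  return hy * hx * plane[y0 * W + x0] + hy * lx * plane[y0 * W + x1] +
         ly * hx * plane[y1 * W + x0] + ly * lx * plane[y1 * W + x1];
}

// Pools one ROI of one channel plane into a pooled_h x pooled_w grid of bins.
static void RoiAlignRef(const float* plane, int H, int W,
                        const float roi[4],  // {x1, y1, x2, y2} in image coords
                        float spatial_scale, int pooled_h, int pooled_w,
                        int sampling_ratio, float* out) {
  const float roi_x = roi[0] * spatial_scale, roi_y = roi[1] * spatial_scale;
  const float roi_w = std::max(roi[2] * spatial_scale - roi_x, 1.f);
  const float roi_h = std::max(roi[3] * spatial_scale - roi_y, 1.f);
  const float bin_h = roi_h / pooled_h, bin_w = roi_w / pooled_w;
  // sampling_ratio <= 0 adapts the sampling grid to the bin size.
  const int grid_h = sampling_ratio > 0
                         ? sampling_ratio
                         : static_cast<int>(std::ceil(roi_h / pooled_h));
  const int grid_w = sampling_ratio > 0
                         ? sampling_ratio
                         : static_cast<int>(std::ceil(roi_w / pooled_w));
  for (int ph = 0; ph < pooled_h; ++ph) {
    for (int pw = 0; pw < pooled_w; ++pw) {
      float sum = 0.f;
      for (int iy = 0; iy < grid_h; ++iy) {
        for (int ix = 0; ix < grid_w; ++ix) {
          const float y = roi_y + ph * bin_h + (iy + 0.5f) * bin_h / grid_h;
          const float x = roi_x + pw * bin_w + (ix + 0.5f) * bin_w / grid_w;
          sum += BilinearAt(plane, H, W, y, x);
        }
      }
      out[ph * pooled_w + pw] = sum / (grid_h * grid_w);  // average of samples
    }
  }
}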
@@ -13,9 +13,11 @@
 // limitations under the License.
 #include "lite/kernels/x86/roi_align_compute.h"
 #include <cmath>
 #include <string>
 #include <vector>
 #include "lite/core/op_registry.h"
 #include "lite/core/tensor.h"
 #include "lite/core/type_system.h"
...