diff --git a/cmake/lite.cmake b/cmake/lite.cmake
index 9a633409cd4d1c5e650a4794fcf30b9154c8638a..66abeab483beacc3d466f626be3b0659516c4162 100644
--- a/cmake/lite.cmake
+++ b/cmake/lite.cmake
@@ -413,7 +413,9 @@ function(add_kernel TARGET device level)
   if ("${device}" STREQUAL "MLU")
     if (NOT LITE_WITH_MLU)
       foreach(src ${args_SRCS})
-        file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+        if (NOT (src MATCHES ".*\\.o"))
+          file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+        endif()
       endforeach()
       return()
     endif()
@@ -446,7 +448,13 @@ function(add_kernel TARGET device level)
 
   # the source list will collect for paddle_use_kernel.h code generation.
   foreach(src ${args_SRCS})
-    file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+    if (LITE_WITH_MLU)
+      if (NOT (src MATCHES ".*\\.o"))
+        file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+      endif()
+    else()
+      file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+    endif()
   endforeach()
 
   lite_cc_library(${TARGET} SRCS ${args_SRCS}
diff --git a/lite/kernels/mlu/CMakeLists.txt b/lite/kernels/mlu/CMakeLists.txt
index 5557f86c589951b514b0d44f55c8be8f2a825e0f..eed70d7fa6c39c7de1e03f2fc03a824ab50c1f3a 100644
--- a/lite/kernels/mlu/CMakeLists.txt
+++ b/lite/kernels/mlu/CMakeLists.txt
@@ -7,5 +7,11 @@ add_kernel(subgraph_compute_mlu MLU basic SRCS subgraph_compute.cc DEPS ${lite_k
 add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${target_wrapper_mlu})
 add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps})
 # depend on transpose function in backend/x86/math/math_function
+add_kernel(roi_align_compute_mlu MLU extra SRCS roi_align_compute.cc mlu_kernel/roi_align_kernel.o DEPS ${lite_kernel_deps})
+
+if(LITE_BUILD_EXTRA)
+  lite_cc_test(test_roi_align_compute_mlu SRCS roi_align_compute_test.cc DEPS roi_align_compute_mlu)
+endif()
+
 add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_function} ${target_wrapper_mlu})
 add_kernel(cast_compute_mlu MLU basic SRCS cast_compute.cc DEPS ${lite_kernel_deps} ${target_wrapper_mlu})
diff --git a/lite/kernels/mlu/mlu_kernel/roi_align_kernel.o b/lite/kernels/mlu/mlu_kernel/roi_align_kernel.o
new file mode 100644
index 0000000000000000000000000000000000000000..d43056577e894203a0cfdd0f612478e22b7718e1
Binary files /dev/null and b/lite/kernels/mlu/mlu_kernel/roi_align_kernel.o differ
diff --git a/lite/kernels/mlu/roi_align_compute.cc b/lite/kernels/mlu/roi_align_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..52bfdfea15b60cd190ca74854fb838a75c03f921
--- /dev/null
+++ b/lite/kernels/mlu/roi_align_compute.cc
@@ -0,0 +1,198 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
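+//
+// Host-side wrapper for the RoI Align MLU kernel: it converts the NCHW
+// float32 inputs into the NHWC float16 layout expected by the prebuilt BANG
+// kernel (mlu_kernel/roi_align_kernel.o), copies the data to the device,
+// launches the kernel, and converts the result back to NCHW float32.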
+
+#include "lite/kernels/mlu/roi_align_compute.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace mlu {
+
+void RoiAlignCompute::Run() {
+  auto& mlu_context = this->ctx_->template As<MLUContext>();
+  auto& exec_queue = mlu_context.exec_queue();
+  this->Run(exec_queue);
+}
+
+void RoiAlignCompute::Run(const cnrtQueue_t& exec_queue) {
+  auto& param = this->Param<param_t>();
+
+  auto* rois = param.ROIs;
+  auto rois_dims = rois->dims();
+  int rois_num = rois_dims[0];
+  if (rois_num == 0) {
+    return;
+  }
+
+  auto* in = param.X;
+  auto* out = param.Out;
+  float spatial_scale = param.spatial_scale;
+  int pooled_height = param.pooled_height;
+  int pooled_width = param.pooled_width;
+  int sampling_ratio = param.sampling_ratio;
+
+  half spatial_scale_half;
+  cnrtConvertFloatToHalf(&spatial_scale_half, spatial_scale);
+
+  auto in_dims = in->dims();
+  // int batch_size = in_dims[0];
+  int channels = in_dims[1];
+  int height = in_dims[2];
+  int width = in_dims[3];
+  auto out_dims = out->dims();
+
+  // map each RoI to the batch index of the image it belongs to, using the LoD
+  std::vector<int> roi_ind_vec(rois_num);
+  auto rois_lod = rois->lod().back();
+  for (int n = 0, rois_batch_size = rois_lod.size() - 1; n < rois_batch_size;
+       ++n) {
+    for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+      roi_ind_vec[i] = n;
+    }
+  }
+
+  auto* input_data = in->data<float>();
+  auto* output_data = out->mutable_data<float>();
+  auto* rois_data = rois->data<float>();
+
+  std::vector<half> input_tmp_vec(in_dims.production());
+  std::vector<half> rois_tmp_vec(rois_dims.production());
+  std::vector<half> output_tmp_vec(out_dims.production());
+
+  // convert the input from NCHW float32 to NHWC float16, and the RoIs to
+  // float16
+  std::vector<int> nchw2nhwc_dimorder{0, 2, 3, 1};
+  std::vector<int> tmp_in_dims;
+  for (int i = 0; i < in_dims.size(); i++) {
+    tmp_in_dims.emplace_back(static_cast<int>(in_dims[i]));
+  }
+  cnrtTransOrderAndCast(const_cast<float*>(input_data),
+                        CNRT_FLOAT32,
+                        input_tmp_vec.data(),
+                        CNRT_FLOAT16,
+                        NULL,
+                        tmp_in_dims.size(),
+                        tmp_in_dims.data(),
+                        nchw2nhwc_dimorder.data());
+  cnrtCastDataType(const_cast<float*>(rois_data),
+                   CNRT_FLOAT32,
+                   rois_tmp_vec.data(),
+                   CNRT_FLOAT16,
+                   rois_dims.production(),
+                   NULL);
+
+  // copy the inputs to device memory
+  void *input_mlu_data = nullptr, *rois_mlu_data = nullptr,
+       *roi_batch_id_mlu_data = nullptr, *output_mlu_data = nullptr;
+  cnrtMalloc(&input_mlu_data,
+             input_tmp_vec.size() * sizeof(input_tmp_vec.front()));
+  cnrtMemcpy(input_mlu_data,
+             input_tmp_vec.data(),
+             input_tmp_vec.size() * sizeof(input_tmp_vec.front()),
+             CNRT_MEM_TRANS_DIR_HOST2DEV);
+  cnrtMalloc(&rois_mlu_data,
+             rois_tmp_vec.size() * sizeof(rois_tmp_vec.front()));
+  cnrtMemcpy(rois_mlu_data,
+             rois_tmp_vec.data(),
+             rois_tmp_vec.size() * sizeof(rois_tmp_vec.front()),
+             CNRT_MEM_TRANS_DIR_HOST2DEV);
+  cnrtMalloc(&roi_batch_id_mlu_data,
+             roi_ind_vec.size() * sizeof(roi_ind_vec.front()));
+  cnrtMemcpy(roi_batch_id_mlu_data,
+             roi_ind_vec.data(),
+             roi_ind_vec.size() * sizeof(roi_ind_vec.front()),
+             CNRT_MEM_TRANS_DIR_HOST2DEV);
+
+  // malloc output memory on device
+  cnrtMalloc(&output_mlu_data,
+             output_tmp_vec.size() * sizeof(output_tmp_vec.front()));
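+
+  // NOTE: the arguments must be added in exactly the order declared by
+  // roi_align_kernel() in roi_align_kernel.h.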
+  // prepare kernel params
+  cnrtKernelParamsBuffer_t params;
+  cnrtGetKernelParamsBuffer(&params);
+  cnrtKernelParamsBufferAddParam(
+      params, &input_mlu_data, sizeof(input_mlu_data));
+  cnrtKernelParamsBufferAddParam(params, &rois_mlu_data, sizeof(rois_mlu_data));
+  cnrtKernelParamsBufferAddParam(
+      params, &roi_batch_id_mlu_data, sizeof(roi_batch_id_mlu_data));
+  cnrtKernelParamsBufferAddParam(
+      params, &output_mlu_data, sizeof(output_mlu_data));
+  cnrtKernelParamsBufferAddParam(params, &height, sizeof(height));
+  cnrtKernelParamsBufferAddParam(params, &width, sizeof(width));
+  cnrtKernelParamsBufferAddParam(params, &channels, sizeof(channels));
+  cnrtKernelParamsBufferAddParam(params, &pooled_height, sizeof(pooled_height));
+  cnrtKernelParamsBufferAddParam(params, &pooled_width, sizeof(pooled_width));
+  cnrtKernelParamsBufferAddParam(params, &rois_num, sizeof(rois_num));
+  cnrtKernelParamsBufferAddParam(
+      params, &spatial_scale_half, sizeof(spatial_scale_half));
+  cnrtKernelParamsBufferAddParam(
+      params, &sampling_ratio, sizeof(sampling_ratio));
+
+  cnrtDim3_t task_dims;
+  task_dims.x = 1, task_dims.y = 1, task_dims.z = 1;
+  cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_BLOCK;
+
+  // invoke the kernel and wait for the queue to drain
+  CNRT_CALL(cnrtInvokeKernel_V2(reinterpret_cast<void*>(&roi_align_kernel),
+                                task_dims,
+                                params,
+                                func_type,
+                                exec_queue));
+  CNRT_CALL(cnrtSyncQueue(exec_queue));
+
+  // copy the result back and convert it from NHWC float16 to NCHW float32
+  cnrtMemcpy(output_tmp_vec.data(),
+             output_mlu_data,
+             output_tmp_vec.size() * sizeof(output_tmp_vec.front()),
+             CNRT_MEM_TRANS_DIR_DEV2HOST);
+  std::vector<int> tmp_out_dims;
+  for (int i = 0; i < out_dims.size(); i++) {
+    // out_dims = {N, C, H, W}, tmp_out_dims = {N, H, W, C}
+    tmp_out_dims.emplace_back(out_dims[nchw2nhwc_dimorder[i]]);
+  }
+  std::vector<int> nhwc2nchw_dimorder{0, 3, 1, 2};
+  cnrtTransOrderAndCast(output_tmp_vec.data(),
+                        CNRT_FLOAT16,
+                        output_data,
+                        CNRT_FLOAT32,
+                        NULL,
+                        tmp_out_dims.size(),
+                        tmp_out_dims.data(),
+                        nhwc2nchw_dimorder.data());
+
+  // release resources
+  cnrtDestroyKernelParamsBuffer(params);
+  cnrtFree(input_mlu_data);
+  cnrtFree(rois_mlu_data);
+  cnrtFree(roi_batch_id_mlu_data);
+  cnrtFree(output_mlu_data);
+}
+
+}  // namespace mlu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
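+// X, ROIs and Out are bound to kHost tensors because this kernel performs its
+// own host<->device copies and layout/precision conversions.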
+REGISTER_LITE_KERNEL(roi_align,
+                     kMLU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::mlu::RoiAlignCompute,
+                     def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kNCHW))})
+    .BindInput("ROIs",
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kFloat))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kFloat),
+                                       DATALAYOUT(kNCHW))})
+    .Finalize();
diff --git a/lite/kernels/mlu/roi_align_compute.h b/lite/kernels/mlu/roi_align_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa571efee012d2bee291138653f079ec0028b175
--- /dev/null
+++ b/lite/kernels/mlu/roi_align_compute.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "lite/core/kernel.h"
+#include "lite/core/op_lite.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/type_system.h"
+#include "lite/kernels/mlu/bridges/utility.h"
+#include "lite/kernels/mlu/roi_align_kernel.h"
+#include "lite/operators/layout_op.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace mlu {
+
+class RoiAlignCompute
+    : public KernelLite<TARGET(kMLU), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
+ public:
+  using param_t = operators::RoiAlignParam;
+
+  void Run() override;
+  void Run(const cnrtQueue_t& exec_queue);
+
+  std::string doc() const override { return "Mlu roi align"; }
+
+  virtual ~RoiAlignCompute() = default;
+};
+
+}  // namespace mlu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/mlu/roi_align_compute_test.cc b/lite/kernels/mlu/roi_align_compute_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9cbcc74136d7d8c815e4b964f68e6b1d83377d98
--- /dev/null
+++ b/lite/kernels/mlu/roi_align_compute_test.cc
@@ -0,0 +1,132 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/mlu/roi_align_compute.h"
+
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <numeric>
+#include <vector>
+
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace mlu {
+
+TEST(roi_align_mlu, retrive_op) {
+  auto roi_align =
+      KernelRegistry::Global().Create<TARGET(kMLU),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kNCHW)>("roi_align");
+  ASSERT_FALSE(roi_align.empty());
+  ASSERT_TRUE(roi_align.front());
+}
+
+TEST(roi_align_mlu, init) {
+  RoiAlignCompute roi_align;
+  ASSERT_EQ(roi_align.precision(), PRECISION(kFloat));
+  ASSERT_EQ(roi_align.target(), TARGET(kMLU));
+}
+
+TEST(roi_align_mlu, run_test) {
+  constexpr int ROI_SIZE = 4;
+
+  // image_height * spatial_scale == featuremap_height (likewise for width)
+  constexpr int batch_size = 2, channels = 3, featuremap_height = 9,
+                featuremap_width = 16, pooled_height = 2, pooled_width = 1,
+                num_rois = 3, sampling_rate = 2;
+  constexpr float spatial_scale = 0.5;
+
+  lite::Tensor x, rois, out;
+
+  x.Resize(
+      lite::DDim({batch_size, channels, featuremap_height, featuremap_width}));
+  rois.Resize(lite::DDim({num_rois, ROI_SIZE}));
+  // the LoD uses the offset representation: batch 0 owns RoIs [0, 1),
+  // batch 1 owns RoIs [1, num_rois)
+  rois.set_lod({{0, 1, num_rois}});
+  out.Resize(lite::DDim({num_rois, channels, pooled_height, pooled_width}));
+
+  auto x_data = x.mutable_data<float>();
+  auto rois_data = rois.mutable_data<float>();
+  auto out_data = out.mutable_data<float>();
+
+  // x = {0.0, 1.0, ...}, rois = {0.25, 1.25, ...}
+  std::iota(x_data, x_data + x.dims().production(), 0.0f);
+  std::iota(rois_data, rois_data + rois.dims().production(), 0.25f);
+  RoiAlignCompute roi_align_op;
+
+  operators::RoiAlignParam param;
+  param.X = &x;
+  param.ROIs = &rois;
+  param.Out = &out;
+  param.pooled_height = pooled_height;
+  param.pooled_width = pooled_width;
+  param.spatial_scale = spatial_scale;
+  param.sampling_ratio = sampling_rate;
+
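+  // Run(exec_queue) is called directly with a raw CNRT queue below, so no
+  // KernelContext needs to be attached for this test.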
+  // std::unique_ptr<KernelContext> ctx(new KernelContext);
+  // ctx->As<MLUContext>();
+  // roi_align_op.SetContext(std::move(ctx));
+
+  CNRT_CALL(cnrtInit(0));
+  // cnrtInvokeFuncParam_t forward_param;
+  // u32_t affinity = 1;
+  // int data_param = 1;
+  // forward_param.data_parallelism = &data_param;
+  // forward_param.affinity = &affinity;
+  // forward_param.end = CNRT_PARAM_END;
+  cnrtDev_t dev_handle;
+  CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, 0));
+  CNRT_CALL(cnrtSetCurrentDevice(dev_handle));
+  cnrtQueue_t queue;
+  CNRT_CALL(cnrtCreateQueue(&queue));
+
+  roi_align_op.SetParam(param);
+  roi_align_op.Run(queue);
+
+  CNRT_CALL(cnrtDestroyQueue(queue));
+
+  std::vector<float> ref_results = {14.625,
+                                    22.625,
+                                    158.625,
+                                    166.625,
+                                    302.625,
+                                    310.625,
+
+                                    480.625,
+                                    488.625,
+                                    624.625,
+                                    632.625,
+                                    768.625,
+                                    776.625,
+
+                                    514.625,
+                                    522.625,
+                                    658.625,
+                                    666.625,
+                                    802.625,
+                                    810.625};
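+  // the MLU kernel computes in float16, so compare with a relative tolerance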
+  for (int i = 0; i < out.dims().production(); i++) {
+    EXPECT_NEAR(out_data[i], ref_results[i], (4e-3f * ref_results[i]));
+  }
+}
+
+}  // namespace mlu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_KERNEL(roi_align, kMLU, kFloat, kNCHW, def);
diff --git a/lite/kernels/mlu/roi_align_kernel.h b/lite/kernels/mlu/roi_align_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..a2298a1f89c94f51367af1a473aff2734457958a
--- /dev/null
+++ b/lite/kernels/mlu/roi_align_kernel.h
@@ -0,0 +1,78 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef LITE_KERNELS_MLU_ROI_ALIGN_KERNEL_H_
+#define LITE_KERNELS_MLU_ROI_ALIGN_KERNEL_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef uint16_t half;
+
+/**
+ * @brief RoI Align converts regions of interest of varying sizes into
+ * fixed-size feature maps using bilinear interpolation. Each proposal region
+ * is divided into pooled_height x pooled_width equally sized bins whose
+ * positions stay fixed. Inside each bin, sampling_ratio points are sampled
+ * (if sampling_ratio <= 0, the sampling grid adapts to the bin size); each
+ * sampled point is computed by bilinear interpolation, and the average of the
+ * sampled points becomes the output value of that bin.
+ *
+ * @param[in] input: 4-D tensor of shape [N, H, W, C]; N is the batch size, C
+ * the number of input channels, H the feature height and W the feature width.
+ * Data type is float16.
+ * @param[in] rois: 2-D tensor of shape [num_rois, 4] holding the RoIs (regions
+ * of interest) to be pooled, e.g. [[x1, y1, x2, y2], ...], where (x1, y1) is
+ * the top-left corner and (x2, y2) the bottom-right corner. Data type is
+ * float16.
+ * @param[in] roi_ind: 1-D tensor of shape [num_rois] with values in
+ * [0, batch); roi_ind[i] is the index of the image the i-th RoI refers to.
+ * Data type is int.
+ * @param[out] output: 4-D tensor of shape [num_rois, pooled_height,
+ * pooled_width, C]. Data type is float16.
+ * @param[in] height: the height of the input feature map
+ * @param[in] width: the width of the input feature map
+ * @param[in] channels: the number of channels of the input
+ * @param[in] pooled_height: output height after pooling
+ * @param[in] pooled_width: output width after pooling
+ * @param[in] rois_num: the number of RoIs
+ * @param[in] spatial_scale: multiplicative scale factor that maps RoI
+ * coordinates from the image scale to the feature-map scale, i.e.
+ * image_height * spatial_scale == featuremap_height (likewise for width)
+ * @param[in] sampling_ratio: the number of sampling points per bin in each
+ * direction; if <= 0, it adapts to roi_width / pooled_width (and likewise for
+ * height)
+ * @retval void
+ */
+void roi_align_kernel(half *input,
+                      half *rois,
+                      int *roi_ind,
+                      half *output,
+                      const int height,
+                      const int width,
+                      const int channels,
+                      const int pooled_height,
+                      const int pooled_width,
+                      const int rois_num,
+                      const half spatial_scale,
+                      const int sampling_ratio);
+
+#ifdef __cplusplus
+}
+#endif  // __cplusplus
+
+#endif  // LITE_KERNELS_MLU_ROI_ALIGN_KERNEL_H_
diff --git a/lite/kernels/x86/roi_align_compute.cc b/lite/kernels/x86/roi_align_compute.cc
index 26efd9160c59d0a45e53800d62e050bbfd941799..3c0614ebf4e1d888e836cc20a096a4981b280a94 100644
--- a/lite/kernels/x86/roi_align_compute.cc
+++ b/lite/kernels/x86/roi_align_compute.cc
@@ -13,9 +13,11 @@
 // limitations under the License.
 
 #include "lite/kernels/x86/roi_align_compute.h"
+
 #include
 #include
 #include
+
 #include "lite/core/op_registry.h"
 #include "lite/core/tensor.h"
 #include "lite/core/type_system.h"