diff --git a/cmake/mlu.cmake b/cmake/mlu.cmake
index b73ab16462b83e952807289d511fdb95ad74c6cd..580fbc95bd3237691bc6864af36eb4508b43dfe0 100644
--- a/cmake/mlu.cmake
+++ b/cmake/mlu.cmake
@@ -36,6 +36,12 @@ if(NOT CNRT_INC)
   message(FATAL_ERROR "Can not find cnrt.h in ${NEUWARE_HOME}/include")
 endif()
 
+find_path(CNPLUGIN_INC NAMES cnplugin.h
+          PATHS ${NEUWARE_HOME}/include NO_DEFAULT_PATH)
+if(NOT CNPLUGIN_INC)
+  message(FATAL_ERROR "Can not find cnplugin.h in ${NEUWARE_HOME}/include")
+endif()
+
 include_directories("${NEUWARE_HOME}/include")
 
 find_library(CNML_LIB_FILE NAMES cnml
@@ -59,3 +65,15 @@ else()
   add_library(cnrt_lib SHARED IMPORTED GLOBAL)
   set_property(TARGET cnrt_lib PROPERTY IMPORTED_LOCATION ${CNRT_LIB_FILE})
 endif()
+
+
+find_library(CNPLUGIN_LIB_FILE NAMES cnplugin
+             PATHS ${NEUWARE_HOME}/lib64)
+
+if(NOT CNPLUGIN_LIB_FILE)
+  message(FATAL_ERROR "Can not find CNPLUGIN Library in ${NEUWARE_HOME}/lib64")
+else()
+  message(STATUS "Found CNPLUGIN Library: ${CNPLUGIN_LIB_FILE}")
+  add_library(cnplugin_lib SHARED IMPORTED GLOBAL)
+  set_property(TARGET cnplugin_lib PROPERTY IMPORTED_LOCATION ${CNPLUGIN_LIB_FILE})
+endif()
\ No newline at end of file
diff --git a/lite/backends/mlu/CMakeLists.txt b/lite/backends/mlu/CMakeLists.txt
index 29c90b422044be4e6a7aa9f4a8da45018a41f11a..e4d997348e30c6ee50eaae1979ebc6209a2fd64b 100644
--- a/lite/backends/mlu/CMakeLists.txt
+++ b/lite/backends/mlu/CMakeLists.txt
@@ -4,4 +4,4 @@ endif()
 
 message (STATUS "Lite with mlu backend")
 
-lite_cc_library(target_wrapper_mlu SRCS target_wrapper.cc DEPS cnml_lib cnrt_lib)
+lite_cc_library(target_wrapper_mlu SRCS target_wrapper.cc DEPS cnml_lib cnrt_lib cnplugin_lib)
diff --git a/lite/kernels/mlu/bridges/CMakeLists.txt b/lite/kernels/mlu/bridges/CMakeLists.txt
index a580426b2df726724775e4829666eb34472a03fa..0d91b3f35d175b3f10c11f63bf2237f77144a5ad 100644
--- a/lite/kernels/mlu/bridges/CMakeLists.txt
+++ b/lite/kernels/mlu/bridges/CMakeLists.txt
@@ -28,6 +28,9 @@ lite_cc_library(subgraph_bridge_argmax_op_mlu SRCS argmax_op.cc DEPS ${subgraph_
 lite_cc_library(subgraph_bridge_squeeze_op_mlu SRCS squeeze_op.cc DEPS ${subgraph_bridge_deps_mlu})
 lite_cc_library(subgraph_bridge_reshape_op_mlu SRCS reshape_op.cc DEPS ${subgraph_bridge_deps_mlu})
 lite_cc_library(subgraph_bridge_flatten_op_mlu SRCS flatten_op.cc DEPS ${subgraph_bridge_deps_mlu})
+lite_cc_library(subgraph_bridge_box_coder_op_mlu SRCS box_coder_op.cc DEPS ${subgraph_bridge_deps_mlu})
+lite_cc_library(subgraph_bridge_density_prior_box_op_mlu SRCS density_prior_box_op.cc DEPS ${subgraph_bridge_deps_mlu})
+lite_cc_library(subgraph_bridge_multiclass_nms_mlu SRCS multiclass_nms.cc multiclass_nms_api.cc multiclass_nms_impl.o DEPS ${subgraph_bridge_deps_mlu})
 set(mlu_subgraph_bridges
     subgraph_bridge_registry
     subgraph_bridge_utility_mlu
@@ -52,6 +55,9 @@ set(mlu_subgraph_bridges
     subgraph_bridge_squeeze_op_mlu
     subgraph_bridge_reshape_op_mlu
     subgraph_bridge_flatten_op_mlu
+    subgraph_bridge_box_coder_op_mlu
+    subgraph_bridge_density_prior_box_op_mlu
+    subgraph_bridge_multiclass_nms_mlu
     CACHE INTERNAL "mlu_subgraph_bridges")
 
@@ -88,6 +94,9 @@ lite_cc_test(test_argmax_converter_mlu SRCS argmax_op_test.cc DEPS scope optimiz
 lite_cc_test(test_squeeze_converter_mlu SRCS squeeze_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
 lite_cc_test(test_reshape_converter_mlu SRCS reshape_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
 lite_cc_test(test_flatten_converter_mlu SRCS flatten_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+lite_cc_test(test_box_coder_mlu SRCS box_coder_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+lite_cc_test(test_density_prior_box_mlu SRCS density_prior_box_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+lite_cc_test(test_multiclass_nms_op_converter_mlu SRCS multiclass_nms_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
 if (LITE_BUILD_EXTRA)
   lite_cc_test(test_norm_converter_mlu SRCS norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
   lite_cc_test(test_lrn_converter_mlu SRCS lrn_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
diff --git a/lite/kernels/mlu/bridges/box_coder_op.cc b/lite/kernels/mlu/bridges/box_coder_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ec1617cff22b4547f9fde81e62bfc10f2c650918
--- /dev/null
+++ b/lite/kernels/mlu/bridges/box_coder_op.cc
@@ -0,0 +1,166 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+
+#include "lite/kernels/mlu/bridges/graph.h"
+#include "lite/kernels/mlu/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+inline cnmlBoxCodeType_t GetBoxCodeType(const std::string& type) {
+  if (type == "encode_center_size") {
+    return cnmlBoxCodeType_t::Encode;
+  }
+  return cnmlBoxCodeType_t::Decode;
+}
+
+int BoxCoderConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto scope = op->scope();
+  VLOG(3) << "[MLU] Converting " + op_type + "...";
+
+  auto Prior_box_name = op_info->Input("PriorBox").front();
+  auto Target_box_name = op_info->Input("TargetBox").front();
+  auto Output_box_name = op_info->Output("OutputBox").front();
+  std::vector<std::string> input_arg_names = op_info->InputArgumentNames();
+  if (std::find(input_arg_names.begin(),
+                input_arg_names.end(),
+                "PriorBoxVar") == input_arg_names.end()) {
+    LOG(FATAL) << "box coder mlu kernel expects a PriorBoxVar input"
+               << std::endl;
+  }
+  auto box_var_name = op_info->Input("PriorBoxVar").front();
+
+  auto* prior_box = scope->FindVar(Prior_box_name)->GetMutable<Tensor>();
+  auto* target_box = scope->FindVar(Target_box_name)->GetMutable<Tensor>();
+  auto* proposals = scope->FindVar(Output_box_name)->GetMutable<Tensor>();
+  auto* box_var = scope->FindVar(box_var_name)->GetMutable<Tensor>();
+
+  auto code_type_str = op_info->GetAttr<std::string>("code_type");
+  auto box_normalized = op_info->GetAttr<bool>("box_normalized");
+  int axis = -1;
+  if (op_info->HasAttr("axis")) {
+    axis = op_info->GetAttr<int>("axis");
+  } else {
+    LOG(FATAL) << "box coder mlu kernel expects an axis attribute"
+               << std::endl;
+  }
+
+  if (op_info->HasAttr("variance")) {
+    LOG(WARNING) << "box coder mlu kernel expects no variance attribute"
+                 << std::endl;
+    VLOG(6) << "variance: ";
+    auto variance_vec = op_info->GetAttr<std::vector<float>>("variance");
+    for (size_t i = 0; i < variance_vec.size(); i++) {
+      VLOG(6) << variance_vec[i];
+    }
+  }
+  cnmlBoxCodeType_t code_type = GetBoxCodeType(code_type_str);
+
+  int row = -1;
+  int len = -1;
+  int col = -1;
+  if (code_type == cnmlBoxCodeType_t::Encode) {
+    // target_box_shape = {row, len};
+    // prior_box_shape = {col, len};
+    // output_shape = {row, col, len};
+    row = target_box->dims()[0];
+    len = target_box->dims()[1];
+    col = prior_box->dims()[0];
+  } else if (code_type == cnmlBoxCodeType_t::Decode) {
+    // target_box_shape = {row, col, len};
+    // prior_box_shape = {col, len} if axis == 0, or {row, len};
+    // output_shape = {row, col, len};
+    row = target_box->dims()[0];
+    col = target_box->dims()[1];
+    len = target_box->dims()[2];
+    if (axis == 0) {
+      CHECK(prior_box->dims()[0] == col);
+    } else {
+      CHECK(prior_box->dims()[0] == row);
+    }
+  }
+
+  bool float32_precision = false;
+  if (graph->FPType() == CNML_DATA_FLOAT32) {
+    float32_precision = true;
+  }
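+
+  // The plugin op is configured up front: unlike the CPU kernel it takes the
+  // geometry (row/col/len), axis, precision flag and MLU core version
+  // explicitly rather than deriving them from tensor shapes at runtime.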
"Target_box_name: " << Target_box_name; + VLOG(6) << "Output_box_name: " << Output_box_name; + VLOG(6) << "box_var_name: " << box_var_name; + + // =================== DEBUG END ====================== + auto target_box_tensor = graph->GetNode(Target_box_name); + auto prior_box_tensor = graph->GetNode(Prior_box_name); + auto box_var_tensor = graph->GetNode(box_var_name); + auto proposals_tensor = graph->AddNode(Output_box_name, + proposals->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + graph->FPType()); + cnmlPluginBoxCoderOpParam_t param; + CNML_CALL( + cnmlCreatePluginBoxCoderOpParam(¶m, + row, + col, + len, + axis, + box_normalized, + float32_precision, + code_type, + TargetWrapperMlu::MLUCoreVersion())); + cnmlBaseOp_t box_coder_op; + cnmlTensor_t input_tensors[3]; + input_tensors[0] = target_box_tensor->mlu_tensor(); + input_tensors[1] = prior_box_tensor->mlu_tensor(); + input_tensors[2] = box_var_tensor->mlu_tensor(); + cnmlTensor_t output_tensors[1]; + output_tensors[0] = proposals_tensor->mlu_tensor(); + CNML_CALL(cnmlCreatePluginBoxCoderOp( + &box_coder_op, param, input_tensors, output_tensors)); + + // CNML_CALL(cnmlSetOperationComputingLayout(box_coder_op, CNML_NCHW)); // + // important + graph->FuseOp(box_coder_op); + cnmlDestroyPluginBoxCoderOpParam(¶m); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(box_coder, + kMLU, + paddle::lite::subgraph::mlu::BoxCoderConverter); diff --git a/lite/kernels/mlu/bridges/box_coder_op_test.cc b/lite/kernels/mlu/bridges/box_coder_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..625e3aaf6a984320ca76bd06b019d1888052770b --- /dev/null +++ b/lite/kernels/mlu/bridges/box_coder_op_test.cc @@ -0,0 +1,505 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "lite/operators/box_coder_op.h"
+#include <gtest/gtest.h>
+#include <cmath>
+#include <cstring>
+#include <fstream>
+#include <memory>
+#include <sstream>
+#include "lite/core/op_registry.h"
+#include "lite/kernels/mlu/bridges/test_helper.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+void ToFile(Tensor *tensor, std::string file_name) {
+  int count = tensor->dims().production();
+  auto data = tensor->mutable_data<float>();
+  std::ostringstream outs;
+  for (int i = 0; i < count; i++) {
+    outs << data[i] << std::endl;
+  }
+  std::ofstream of;
+  of.open(file_name, std::ios::out);
+  of << outs.str();
+  of.close();
+}
+
+inline std::string BoxCodeTypeToStr(cnmlBoxCodeType_t code_type) {
+  if (code_type == cnmlBoxCodeType_t::Encode) {
+    return "encode_center_size";
+  } else if (code_type == cnmlBoxCodeType_t::Decode) {
+    return "decode_center_size";
+  } else {
+    CHECK(false);
+  }
+}
+
+inline cnmlBoxCodeType_t GetBoxCodeType(const std::string &type) {
+  if (type == "encode_center_size") {
+    return cnmlBoxCodeType_t::Encode;
+  } else if (type == "decode_center_size") {
+    return cnmlBoxCodeType_t::Decode;
+  } else {
+    CHECK(false);
+  }
+}
+
+void EncodeCenterSize(float *target_box_data,
+                      float *prior_box_data,
+                      float *prior_box_var_data,
+                      std::vector<int64_t> target_box_shape,
+                      std::vector<int64_t> prior_box_shape,
+                      std::vector<int64_t> prior_box_var_shape,
+                      const bool normalized,
+                      const std::vector<float> variance,
+                      float *output) {
+  int64_t row = target_box_shape[0];
+  int64_t col = prior_box_shape[0];
+  int64_t len = prior_box_shape[1];
+
+  for (int64_t i = 0; i < row; ++i) {
+    for (int64_t j = 0; j < col; ++j) {
+      size_t offset = i * col * len + j * len;
+      float prior_box_width = prior_box_data[j * len + 2] -
+                              prior_box_data[j * len] + (normalized == false);
+      float prior_box_height = prior_box_data[j * len + 3] -
+                               prior_box_data[j * len + 1] +
+                               (normalized == false);
+      float prior_box_center_x = prior_box_data[j * len] + prior_box_width / 2;
+      float prior_box_center_y =
+          prior_box_data[j * len + 1] + prior_box_height / 2;
+
+      float target_box_center_x =
+          (target_box_data[i * len + 2] + target_box_data[i * len]) / 2;
+      float target_box_center_y =
+          (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2;
+      float target_box_width = target_box_data[i * len + 2] -
+                               target_box_data[i * len] + (normalized == false);
+      float target_box_height = target_box_data[i * len + 3] -
+                                target_box_data[i * len + 1] +
+                                (normalized == false);
+
+      output[offset] =
+          (target_box_center_x - prior_box_center_x) / prior_box_width;
+      output[offset + 1] =
+          (target_box_center_y - prior_box_center_y) / prior_box_height;
+      output[offset + 2] =
+          std::log(std::fabs(target_box_width / prior_box_width));
+      output[offset + 3] =
+          std::log(std::fabs(target_box_height / prior_box_height));
+    }
+  }
+
+  if (prior_box_var_data) {
+    for (int64_t i = 0; i < row; ++i) {
+      for (int64_t j = 0; j < col; ++j) {
+        for (int k = 0; k < 4; ++k) {
+          size_t offset = i * col * len + j * len;
+          int prior_var_offset = j * len;
+          output[offset + k] /= prior_box_var_data[prior_var_offset + k];
+        }
+      }
+    }
+  } else if (!(variance.empty())) {
+    for (int64_t i = 0; i < row; ++i) {
+      for (int64_t j = 0; j < col; ++j) {
+        for (int k = 0; k < 4; ++k) {
+          size_t offset = i * col * len + j * len;
+          output[offset + k] /= static_cast<float>(variance[k]);
+        }
+      }
+    }
+  }
+}
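+
+// DecodeCenterSize mirrors the CPU reference: the axis template parameter
+// selects whether PriorBox pairs with output rows or columns, and var_size
+// distinguishes per-prior variances (2), attribute variances (1) and no
+// variances at all (0).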
+template <int axis, int var_size>
+void DecodeCenterSize(float *target_box_data,
+                      float *prior_box_data,
+                      float *prior_box_var_data,
+                      std::vector<int64_t> target_box_shape,
+                      std::vector<int64_t> prior_box_shape,
+                      std::vector<int64_t> prior_box_var_shape,
+                      const bool normalized,
+                      std::vector<float> variance,
+                      float *output) {
+  int64_t row = target_box_shape[0];
+  int64_t col = target_box_shape[1];
+  int64_t len = target_box_shape[2];
+
+  for (int64_t i = 0; i < row; ++i) {
+    for (int64_t j = 0; j < col; ++j) {
+      float var_data[4] = {1., 1., 1., 1.};
+      float *var_ptr = var_data;
+      size_t offset = i * col * len + j * len;
+      int prior_box_offset = axis == 0 ? j * len : i * len;
+
+      float prior_box_width = prior_box_data[prior_box_offset + 2] -
+                              prior_box_data[prior_box_offset] +
+                              (normalized == false);
+      float prior_box_height = prior_box_data[prior_box_offset + 3] -
+                               prior_box_data[prior_box_offset + 1] +
+                               (normalized == false);
+      float prior_box_center_x =
+          prior_box_data[prior_box_offset] + prior_box_width / 2;
+      float prior_box_center_y =
+          prior_box_data[prior_box_offset + 1] + prior_box_height / 2;
+
+      float target_box_center_x = 0, target_box_center_y = 0;
+      float target_box_width = 0, target_box_height = 0;
+      int prior_var_offset = axis == 0 ? j * len : i * len;
+      if (var_size == 2) {
+        std::memcpy(
+            var_ptr, prior_box_var_data + prior_var_offset, 4 * sizeof(float));
+      } else if (var_size == 1) {
+        var_ptr = reinterpret_cast<float *>(variance.data());
+      }
+      float box_var_x = *var_ptr;
+      float box_var_y = *(var_ptr + 1);
+      float box_var_w = *(var_ptr + 2);
+      float box_var_h = *(var_ptr + 3);
+
+      target_box_center_x =
+          box_var_x * target_box_data[offset] * prior_box_width +
+          prior_box_center_x;
+      target_box_center_y =
+          box_var_y * target_box_data[offset + 1] * prior_box_height +
+          prior_box_center_y;
+      target_box_width =
+          std::exp(box_var_w * target_box_data[offset + 2]) * prior_box_width;
+      target_box_height =
+          std::exp(box_var_h * target_box_data[offset + 3]) * prior_box_height;
+
+      output[offset] = target_box_center_x - target_box_width / 2;
+      output[offset + 1] = target_box_center_y - target_box_height / 2;
+      output[offset + 2] =
+          target_box_center_x + target_box_width / 2 - (normalized == false);
+      output[offset + 3] =
+          target_box_center_y + target_box_height / 2 - (normalized == false);
+    }
+  }
+}
+
+void Compute(cnmlBoxCodeType_t code_type,
+             lite::Tensor *prior_box,
+             lite::Tensor *target_box,
+             lite::Tensor *box_var,
+             lite::Tensor *output_box,
+             std::vector<float> variance,
+             bool normalized,
+             int axis) {
+  // BoxCodeType code_type = BoxCodeType::kDecodeCenterSize;
+  // std::vector<int64_t> prior_box_shape = {512, 4};
+  // std::vector<int64_t> prior_box_var_shape = prior_box_shape;
+
+  // std::vector<int64_t> target_box_shape;
+  // std::vector<int64_t> output_shape;
+  // if (code_type == BoxCodeType::kEncodeCenterSize) {
+  //   target_box_shape = {81, 4};
+  //   output_shape = {81, 512, 4};
+  // } else {
+  //   target_box_shape = {81, 512, 4};
+  //   output_shape = {81, 512, 4};
+  // }
+
+  auto *prior_box_data = prior_box->mutable_data<float>();
+  auto *prior_box_var_data = box_var->mutable_data<float>();
+  auto *target_box_data = target_box->mutable_data<float>();
+  auto *output_data = output_box->mutable_data<float>();
+
+  auto target_box_shape = target_box->dims().Vectorize();
+  auto prior_box_shape = prior_box->dims().Vectorize();
+  auto prior_box_var_shape = box_var->dims().Vectorize();
+  if (code_type == cnmlBoxCodeType_t::Encode) {
+    EncodeCenterSize(target_box_data,
+                     prior_box_data,
+                     prior_box_var_data,
+                     target_box_shape,
+                     prior_box_shape,
+                     prior_box_var_shape,
+                     normalized,
+                     variance,
+                     output_data);
+  } else if (code_type == cnmlBoxCodeType_t::Decode) {
+    if (prior_box_var_data) {
+      LOG(INFO) << "prior_box_var_data not null" << std::endl;
+      if (axis == 0) {
+        LOG(INFO) << "use DecodeCenterSize<0, 2> axis == 0" << std::endl;
+        DecodeCenterSize<0, 2>(target_box_data,
+                               prior_box_data,
+                               prior_box_var_data,
+                               target_box_shape,
+                               prior_box_shape,
+                               prior_box_var_shape,
+                               normalized,
+                               variance,
+                               output_data);
+      } else {
+        LOG(INFO) << "use DecodeCenterSize<1, 2> axis == 1" << std::endl;
+        DecodeCenterSize<1, 2>(target_box_data,
+                               prior_box_data,
+                               prior_box_var_data,
+                               target_box_shape,
+                               prior_box_shape,
+                               prior_box_var_shape,
+                               normalized,
+                               variance,
+                               output_data);
+      }
+    } else if (!(variance.empty())) {
+      LOG(INFO) << "prior_box_var_data null" << std::endl;
+      if (axis == 0) {
+        DecodeCenterSize<0, 1>(target_box_data,
+                               prior_box_data,
+                               prior_box_var_data,
+                               target_box_shape,
+                               prior_box_shape,
+                               prior_box_var_shape,
+                               normalized,
+                               variance,
+                               output_data);
+      } else {
+        DecodeCenterSize<1, 1>(target_box_data,
+                               prior_box_data,
+                               prior_box_var_data,
+                               target_box_shape,
+                               prior_box_shape,
+                               prior_box_var_shape,
+                               normalized,
+                               variance,
+                               output_data);
+      }
+    } else {
+      if (axis == 0) {
+        DecodeCenterSize<0, 0>(target_box_data,
+                               prior_box_data,
+                               prior_box_var_data,
+                               target_box_shape,
+                               prior_box_shape,
+                               prior_box_var_shape,
+                               normalized,
+                               variance,
+                               output_data);
+      } else {
+        DecodeCenterSize<1, 0>(target_box_data,
+                               prior_box_data,
+                               prior_box_var_data,
+                               target_box_shape,
+                               prior_box_shape,
+                               prior_box_var_shape,
+                               normalized,
+                               variance,
+                               output_data);
+      }
+    }
+  }
+}
+
+void box_coder_ref(const std::shared_ptr<operators::BoxCoderOpLite> op) {
+  Scope *scope = op->scope();
+  const OpInfo *op_info = op->op_info();
+  auto prior_box =
+      scope->FindVar(op_info->Input("PriorBox").front())->GetMutable<Tensor>();
+  auto target_box =
+      scope->FindVar(op_info->Input("TargetBox").front())->GetMutable<Tensor>();
+  auto box_var = scope->FindVar(op_info->Input("PriorBoxVar").front())
+                     ->GetMutable<Tensor>();
+  auto output_box = scope->FindVar(op_info->Output("OutputBox").front())
+                        ->GetMutable<Tensor>();
+
+  auto code_type_str = op_info->GetAttr<std::string>("code_type");
+  auto box_normalized = op_info->GetAttr<bool>("box_normalized");
+  auto axis = op_info->GetAttr<int>("axis");
+  auto code_type = GetBoxCodeType(code_type_str);
+  std::vector<float> variance;
+  if (op_info->HasAttr("variance")) {
+    variance = op_info->GetAttr<std::vector<float>>("variance");
+  }
+  Compute(code_type,
+          prior_box,
+          target_box,
+          box_var,
+          output_box,
+          variance,
+          box_normalized,
+          axis);
+}
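+
+// test_box_coder runs the CPU reference on the original row-major buffers,
+// then feeds the MLU path transposed (NHWC-style) data, so the MLU output
+// has to be transposed back before the element-wise comparison.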
+void test_box_coder(int row,
+                    int col,
+                    int len,
+                    int axis,
+                    cnmlBoxCodeType_t code_type,
+                    bool box_normalized) {
+  // prepare input&output variables
+  Scope scope;
+  std::string prior_box_var_name("PriorBox");
+  std::string target_box_var_name("TargetBox");
+  std::string output_box_var_name("OutputBox");
+  std::string box_var_var_name("PriorBoxVar");
+  std::string output_ref_var_name("OutputBox_ref");
+  auto *prior_box = scope.Var(prior_box_var_name)->GetMutable<Tensor>();
+  auto *target_box = scope.Var(target_box_var_name)->GetMutable<Tensor>();
+  auto *box_var = scope.Var(box_var_var_name)->GetMutable<Tensor>();
+  auto *output_box = scope.Var(output_box_var_name)->GetMutable<Tensor>();
+  auto *output_box_ref = scope.Var(output_ref_var_name)->GetMutable<Tensor>();
+
+  if (code_type == cnmlBoxCodeType_t::Encode) {
+    // target_box_shape = {row, len};
+    // prior_box_shape = {col, len};
+    // output_shape = {row, col, len};
+    target_box->Resize({row, len});
+    prior_box->Resize({col, len});
+    box_var->Resize({col, len});
+  } else if (code_type == cnmlBoxCodeType_t::Decode) {
+    // target_box_shape = {row, col, len};
+    // prior_box_shape = {col, len} if axis == 0, or {row, len};
+    // output_shape = {row, col, len};
+    target_box->Resize({row, col, len});
+    if (axis == 0) {
+      prior_box->Resize({col, len});
+      box_var->Resize({col, len});
+    } else if (axis == 1) {
+      prior_box->Resize({row, len});
+      box_var->Resize({row, len});
+    } else {
+      LOG(FATAL) << "axis should be in {0, 1}, but got " << axis << std::endl;
+    }
+  }
+
+  // initialize input&output data with a deterministic fill; the variances use
+  // the conventional {0.1, 0.1, 0.2, 0.2} pattern
+  for (int i = 0; i < prior_box->dims().production(); i++) {
+    prior_box->mutable_data<float>()[i] = static_cast<float>((i % 8) + 1);
+  }
+  for (int i = 0; i < target_box->dims().production(); i++) {
+    target_box->mutable_data<float>()[i] = static_cast<float>((i % 8) + 1);
+  }
+  for (int i = 0; i < box_var->dims().production() / 4; i++) {
+    box_var->mutable_data<float>()[i * 4 + 0] = 0.1;
+    box_var->mutable_data<float>()[i * 4 + 1] = 0.1;
+    box_var->mutable_data<float>()[i * 4 + 2] = 0.2;
+    box_var->mutable_data<float>()[i * 4 + 3] = 0.2;
+  }
+
+  LOG(INFO) << "prior_box count : " << prior_box->dims().production();
+  LOG(INFO) << "target_box count : " << target_box->dims().production();
+  LOG(INFO) << "box_var count : " << box_var->dims().production();
+
+  // ToFile(prior_box, "prior_box.txt");
+  // ToFile(box_var, "box_var.txt");
+  // ToFile(target_box, "target_box.txt");
+
+  // initialize op desc
+  cpp::OpDesc opdesc;
+  opdesc.SetType("box_coder");
+  opdesc.SetInput("PriorBox", {prior_box_var_name});
+  opdesc.SetInput("TargetBox", {target_box_var_name});
+  opdesc.SetInput("PriorBoxVar", {box_var_var_name});
+  opdesc.SetOutput("OutputBox", {output_box_var_name});
+
+  opdesc.SetAttr("axis", axis);
+  opdesc.SetAttr("box_normalized", box_normalized);
+  opdesc.SetAttr("code_type", BoxCodeTypeToStr(code_type));
+
+  // trans inputs
+  Tensor prior_box_trans;
+  Tensor box_var_trans;
+  Tensor target_box_trans;
+  prior_box_trans.Resize(prior_box->dims());
+  box_var_trans.Resize(box_var->dims());
+  target_box_trans.Resize(target_box->dims());
+
+  auto op = CreateOp<operators::BoxCoderOpLite>(opdesc, &scope);
+  box_coder_ref(op);
+  output_box_ref->CopyDataFrom(*output_box);
+
+  // transpose(prior_box->mutable_data<float>(),
+  //           prior_box_trans.mutable_data<float>(),
+  //           {static_cast<int>(prior_box->dims()[0]),
+  //            static_cast<int>(prior_box->dims()[1]),
+  //            1,
+  //            1},
+  //           {0, 2, 3, 1});
+
+  // row col len 1 --> row len 1 col
+  transpose(target_box->mutable_data<float>(),
+            target_box_trans.mutable_data<float>(),
+            {
+                static_cast<int>(target_box->dims()[0]),
+                static_cast<int>(target_box->dims()[1]),
+                static_cast<int>(target_box->dims()[2]),
+                1,
+            },
+            {0, 2, 3, 1});
+
+  // transpose(box_var->mutable_data<float>(),
+  //           box_var_trans.mutable_data<float>(),
+  //           {static_cast<int>(box_var->dims()[0]),
+  //            static_cast<int>(box_var->dims()[1]),
+  //            1,
+  //            1},
+  //           {0, 2, 3, 1});
+
+  target_box->CopyDataFrom(target_box_trans);
+
+  LaunchOp(op,
+           {prior_box_var_name, target_box_var_name, box_var_var_name},
+           {output_box_var_name});
+
+  // compare results
+  auto *output_data = output_box->mutable_data<float>();
+  auto *output_ref_data = output_box_ref->mutable_data<float>();
+  Tensor output_trans;
+  output_trans.Resize(output_box->dims());
+  // row * len * 1 * col -> row * col * len * 1
+  transpose(output_data,
+            output_trans.mutable_data<float>(),
+            {static_cast<int>(output_box->dims()[0]),
+             static_cast<int>(output_box->dims()[2]),
+             1,
+             static_cast<int>(output_box->dims()[1])},
+            {0, 3, 1, 2});
+
+  output_data = output_trans.mutable_data<float>();
+  // ToFile(output_box, "output_mlu_before_trans.txt");
+  // ToFile(&output_trans, "output_mlu.txt");
+  // ToFile(output_box_ref, "output_cpu.txt");
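+
+  // NOTE: the tolerance is loose (1e-2) because the MLU kernel may execute
+  // in fp16, which cannot match the fp32 CPU reference exactly.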
+  for (int i = 0; i < output_box->dims().production(); i++) {
+    VLOG(6) << i;
+    EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-2);
+  }
+}
+
+TEST(MLUBridges, box_coder) {
+  int row = 1;
+  int col = 20560;
+  int len = 4;
+  int axis = 0;
+  cnmlBoxCodeType_t code_type = cnmlBoxCodeType_t::Decode;
+  bool box_normalized = true;
+  test_box_coder(row, col, len, axis, code_type, box_normalized);
+}
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+USE_SUBGRAPH_BRIDGE(box_coder, kMLU);
diff --git a/lite/kernels/mlu/bridges/density_prior_box_op.cc b/lite/kernels/mlu/bridges/density_prior_box_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0634114a04aadcd93eadf1fe604a76c51099c709
--- /dev/null
+++ b/lite/kernels/mlu/bridges/density_prior_box_op.cc
@@ -0,0 +1,231 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cmath>
+
+#include "lite/kernels/mlu/bridges/graph.h"
+#include "lite/kernels/mlu/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+void inferShape(Tensor* input,
+                Tensor* boxes,
+                Tensor* variances,
+                std::vector<float> fixed_ratios,
+                std::vector<int> densities) {
+  auto feat_height = input->dims()[2];
+  auto feat_width = input->dims()[3];
+
+  int num_priors = 0;
+  for (size_t i = 0; i < densities.size(); ++i) {
+    num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
+  }
+
+  std::vector<int64_t> boxes_shape = {feat_width, feat_height, num_priors, 4};
+  std::vector<int64_t> vars_shape = boxes_shape;
+  boxes->Resize(boxes_shape);
+  variances->Resize(vars_shape);
+}
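+
+// NOTE: boxes are resized to {W, H, num_priors, 4}, whereas the CPU op
+// produces {H, W, num_priors, 4}; this appears to match the NHWC buffer the
+// plugin writes, which the converter transposes back to NCHW below.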
+
+int DensityPriorBoxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto scope = op->scope();
+  VLOG(3) << "[MLU] Converting " + op_type + "...";
+
+  auto input_name = op_info->Input("Input").front();
+  auto image_name = op_info->Input("Image").front();
+  auto boxes_name = op_info->Output("Boxes").front();
+  auto variances_name = op_info->Output("Variances").front();
+
+  auto input_var = scope->FindVar(input_name)->GetMutable<Tensor>();
+  auto image_var = scope->FindVar(image_name)->GetMutable<Tensor>();
+  auto boxes_var = scope->FindVar(boxes_name)->GetMutable<Tensor>();
+  auto variances_var = scope->FindVar(variances_name)->GetMutable<Tensor>();
+
+  auto clip = op_info->GetAttr<bool>("clip");
+  auto fixed_sizes = op_info->GetAttr<std::vector<float>>("fixed_sizes");
+  auto fixed_ratios = op_info->GetAttr<std::vector<float>>("fixed_ratios");
+  auto variances_ = op_info->GetAttr<std::vector<float>>("variances");
+  auto densities = op_info->GetAttr<std::vector<int>>("densities");
+  auto offset = op_info->GetAttr<float>("offset");
+  auto step_w = op_info->GetAttr<float>("step_w");
+  auto step_h = op_info->GetAttr<float>("step_h");
+
+  inferShape(input_var, boxes_var, variances_var, fixed_ratios, densities);
+
+  auto input_dims = input_var->dims();
+  auto image_dims = image_var->dims();
+  auto boxes_dims = boxes_var->dims();
+  auto variances_dims = variances_var->dims();
+
+  auto feat_tensor = graph->GetNode(input_name);
+  auto image_tensor = graph->GetNode(image_name);
+
+  auto boxes_tensor_trans = graph->AddNode(boxes_name + ".trans.boxes",
+                                           boxes_dims.Vectorize(),
+                                           CNML_TENSOR,
+                                           CNML_NHWC,
+                                           graph->FPType());
+  auto variances_tensor_trans = graph->AddNode(variances_name + ".trans.vars",
+                                               variances_dims.Vectorize(),
+                                               CNML_TENSOR,
+                                               CNML_NHWC,
+                                               graph->FPType());
+
+  bool float32_precision = false;
+  if (graph->FPType() == CNML_DATA_FLOAT32) {
+    float32_precision = true;
+  }
+
+  // ==================== DEBUG ==================
+  VLOG(6) << "input_name: " << input_name;
+  VLOG(6) << "image_name: " << image_name;
+  VLOG(6) << "boxes_name: " << boxes_name;
+  VLOG(6) << "variances_name: " << variances_name;
+  VLOG(6) << "input_dims : " << input_dims;
+  VLOG(6) << "image_dims : " << image_dims;
+  VLOG(6) << "boxes_dims : " << boxes_dims;
+  VLOG(6) << "variances_dims : " << variances_dims;
+  VLOG(6) << "clip : " << clip;
+  VLOG(6) << "fixed_sizes : ";
+  for (auto tmp : fixed_sizes) {
+    VLOG(6) << tmp;
+  }
+  VLOG(6) << "fixed_ratios : ";
+  for (auto tmp : fixed_ratios) {
+    VLOG(6) << tmp;
+  }
+  VLOG(6) << "variances_ : ";
+  for (auto tmp : variances_) {
+    VLOG(6) << tmp;
+  }
+  VLOG(6) << "densities : ";
+  for (auto tmp : densities) {
+    VLOG(6) << tmp;
+  }
+  VLOG(6) << "offset : " << offset;
+  VLOG(6) << "clip : " << clip;
+
+  int cnml_boxes_shape[4];
+  CNML_CALL(
+      cnmlGetTensorShape(boxes_tensor_trans->mlu_tensor(), cnml_boxes_shape));
+  VLOG(6) << "cnml_boxes_shape";
+  for (size_t i = 0; i < 4; i++) {
+    VLOG(6) << cnml_boxes_shape[i];
+  }
+  int cnml_vars_shape[4];
+  VLOG(6) << "cnml_vars_shape";
+  CNML_CALL(cnmlGetTensorShape(variances_tensor_trans->mlu_tensor(),
+                               cnml_vars_shape));
+  for (size_t i = 0; i < 4; i++) {
+    VLOG(6) << cnml_vars_shape[i];
+  }
+
+  int feat_width = input_dims[3];
+  int feat_height = input_dims[2];
+  int image_width = image_dims[3];
+  int image_height = image_dims[2];
+  // ==================== DEBUG END ==================
+  cnmlPluginDensityPriorBoxOpParam_t op_param;
+  cnmlCreatePluginDensityPriorBoxOpParam(&op_param,
+                                         feat_width,
+                                         feat_height,
+                                         image_width,
+                                         image_height,
+                                         variances_.data(),
+                                         variances_.size(),
+                                         densities.data(),
+                                         densities.size(),
+                                         fixed_sizes.data(),
+                                         fixed_sizes.size(),
+                                         fixed_ratios.data(),
+                                         fixed_ratios.size(),
+                                         clip,
+                                         step_w,
+                                         step_h,
+                                         offset,
+                                         float32_precision,
+                                         TargetWrapperMlu::MLUCoreVersion());
+
+  cnmlTensor_t input_tensors[2];
+  input_tensors[0] = feat_tensor->mlu_tensor();
+  input_tensors[1] = image_tensor->mlu_tensor();
+  cnmlTensor_t output_tensors[2];
+  output_tensors[0] = boxes_tensor_trans->mlu_tensor();
+  output_tensors[1] = variances_tensor_trans->mlu_tensor();
+  cnmlBaseOp_t density_prior_box_op;
+  CNML_CALL(cnmlCreatePluginDensityPriorBoxOp(
+      &density_prior_box_op, op_param, input_tensors, output_tensors));
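+
+  // The plugin writes boxes/variances in NHWC order; append NdTranspose ops
+  // so the graph-level outputs carry the NCHW layout downstream ops expect.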
+
+  std::vector<int> nchw_to_nhwc_axis = {0, 2, 3, 1};
+  // ============== Boxes Trans =======================
+  auto boxes_tensor = graph->AddNode(boxes_name,
+                                     boxes_dims.Vectorize(),
+                                     CNML_TENSOR,
+                                     CNML_NCHW,
+                                     graph->FPType());
+  cnmlBaseOp_t trans_boxes_op{nullptr};
+  cnmlNdTransposeOpParam_t trans_boxes_param{nullptr};
+  CNML_CALL(cnmlCreateNdTransposeOpParam(
+      &trans_boxes_param, nchw_to_nhwc_axis.data(), nchw_to_nhwc_axis.size()));
+  CNML_CALL(cnmlCreateNdTransposeProOp(&trans_boxes_op,
+                                       boxes_tensor_trans->mlu_tensor(),
+                                       boxes_tensor->mlu_tensor(),
+                                       trans_boxes_param));
+  // ============== Boxes Trans End ===================
+
+  // ============== Vars Trans =======================
+  auto variances_tensor = graph->AddNode(variances_name,
+                                         variances_dims.Vectorize(),
+                                         CNML_TENSOR,
+                                         CNML_NCHW,
+                                         graph->FPType());
+  cnmlBaseOp_t trans_vars_op{nullptr};
+  cnmlNdTransposeOpParam_t trans_vars_param{nullptr};
+  CNML_CALL(cnmlCreateNdTransposeOpParam(
+      &trans_vars_param, nchw_to_nhwc_axis.data(), nchw_to_nhwc_axis.size()));
+  CNML_CALL(cnmlCreateNdTransposeProOp(&trans_vars_op,
+                                       variances_tensor_trans->mlu_tensor(),
+                                       variances_tensor->mlu_tensor(),
+                                       trans_vars_param));
+  // ============== Vars Trans End ===================
+
+  // cnmlSetOperationComputingLayout(density_prior_box_op, CNML_NCHW);
+  // cnmlSetTensorComputingLayoutInOperation(
+  //     density_prior_box_op, boxes_tensor->mlu_tensor(), CNML_NCHW);
+  // cnmlSetTensorComputingLayoutInOperation(
+  //     density_prior_box_op, variances_tensor->mlu_tensor(), CNML_NCHW);
+  graph->FuseOp(trans_boxes_op);
+  graph->FuseOp(density_prior_box_op);
+  graph->FuseOp(trans_vars_op);
+  // cnmlDestroyPluginDensityPriorBoxOpParam(&op_param);
+  return SUCCESS;
+}
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(density_prior_box,
+                         kMLU,
+                         paddle::lite::subgraph::mlu::DensityPriorBoxConverter);
diff --git a/lite/kernels/mlu/bridges/density_prior_box_op_test.cc b/lite/kernels/mlu/bridges/density_prior_box_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..88b9a6c53d606d022ee173c6dcee65114a181ef1
--- /dev/null
+++ b/lite/kernels/mlu/bridges/density_prior_box_op_test.cc
@@ -0,0 +1,302 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/density_prior_box_op.h"
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <cmath>
+#include <memory>
+#include "lite/core/op_registry.h"
+#include "lite/kernels/mlu/bridges/test_helper.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+void inferShape_(Tensor* input,
+                 Tensor* boxes,
+                 Tensor* variances,
+                 std::vector<float> fixed_ratios,
+                 std::vector<int> densities) {
+  auto feat_height = input->dims()[2];
+  auto feat_width = input->dims()[3];
+
+  int num_priors = 0;
+  for (size_t i = 0; i < densities.size(); ++i) {
+    num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
+  }
+
+  std::vector<int64_t> boxes_shape = {feat_width, feat_height, num_priors, 4};
+  std::vector<int64_t> vars_shape = boxes_shape;
+  boxes->Resize(boxes_shape);
+  variances->Resize(vars_shape);
+}
+
+void prior_density_box_ref(
+    const std::shared_ptr<operators::DensityPriorBoxOpLite> op) {
+  Scope* scope = op->scope();
+  const OpInfo* op_info = op->op_info();
+  auto input =
+      scope->FindVar(op_info->Input("Input").front())->GetMutable<Tensor>();
+  auto image =
+      scope->FindVar(op_info->Input("Image").front())->GetMutable<Tensor>();
+  auto boxes_tensor =
+      scope->FindVar(op_info->Output("Boxes").front())->GetMutable<Tensor>();
+  auto variances = scope->FindVar(op_info->Output("Variances").front())
+                       ->GetMutable<Tensor>();
+  auto clip = op_info->GetAttr<bool>("clip");
+  auto fixed_sizes = op_info->GetAttr<std::vector<float>>("fixed_sizes");
+  auto fixed_ratios = op_info->GetAttr<std::vector<float>>("fixed_ratios");
+  auto variances_ = op_info->GetAttr<std::vector<float>>("variances");
+  auto densities = op_info->GetAttr<std::vector<int>>("densities");
+  auto offset = op_info->GetAttr<float>("offset");
+  auto step_w = op_info->GetAttr<float>("step_w");
+  auto step_h = op_info->GetAttr<float>("step_h");
+
+  int num_priors = 0;
+  for (size_t i = 0; i < densities.size(); ++i) {
+    num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
+  }
+
+  int boxes_count = boxes_tensor->dims().production();
+
+  float* boxes = boxes_tensor->mutable_data<float>();
+  float* vars = variances->mutable_data<float>();
+
+  auto img_width = image->dims()[3];
+  auto img_height = image->dims()[2];
+
+  auto feature_width = input->dims()[3];
+  auto feature_height = input->dims()[2];
+
+  float step_width, step_height;
+  if (step_w == 0 || step_h == 0) {
+    step_width = static_cast<float>(img_width) / feature_width;
+    step_height = static_cast<float>(img_height) / feature_height;
+  } else {
+    step_width = step_w;
+    step_height = step_h;
+  }
+
+  int step_average = static_cast<int>((step_width + step_height) * 0.5);
+
+  std::vector<float> sqrt_fixed_ratios;
+  for (size_t i = 0; i < fixed_ratios.size(); i++) {
+    sqrt_fixed_ratios.push_back(sqrt(fixed_ratios[i]));
+  }
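+
+  // For every feature-map cell, a density x density sub-grid of boxes with
+  // pitch step_average / density is generated per (fixed_size, fixed_ratio)
+  // pair, anchored at the cell center.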
+  for (int h = 0; h < feature_height; ++h) {
+    for (int w = 0; w < feature_width; ++w) {
+      float center_x = (w + offset) * step_width;
+      float center_y = (h + offset) * step_height;
+      int idx = 0;
+      // Generate density prior boxes with fixed sizes.
+      for (size_t s = 0; s < fixed_sizes.size(); ++s) {
+        auto fixed_size = fixed_sizes[s];
+        int density = densities[s];
+        int shift = step_average / density;
+        // Generate density prior boxes with fixed ratios.
+        for (size_t r = 0; r < fixed_ratios.size(); ++r) {
+          float box_width_ratio = fixed_size * sqrt_fixed_ratios[r];
+          float box_height_ratio = fixed_size / sqrt_fixed_ratios[r];
+          float density_center_x = center_x - step_average / 2. + shift / 2.;
+          float density_center_y = center_y - step_average / 2. + shift / 2.;
+          for (int di = 0; di < density; ++di) {
+            for (int dj = 0; dj < density; ++dj) {
+              float center_x_temp = density_center_x + dj * shift;
+              float center_y_temp = density_center_y + di * shift;
+              boxes[h * feature_width * num_priors * 4 + w * num_priors * 4 +
+                    idx * 4 + 0] =
+                  std::max((center_x_temp - box_width_ratio / 2.) / img_width,
+                           0.);
+              boxes[h * feature_width * num_priors * 4 + w * num_priors * 4 +
+                    idx * 4 + 1] =
+                  std::max((center_y_temp - box_height_ratio / 2.) / img_height,
+                           0.);
+              boxes[h * feature_width * num_priors * 4 + w * num_priors * 4 +
+                    idx * 4 + 2] =
+                  std::min((center_x_temp + box_width_ratio / 2.) / img_width,
+                           1.);
+              boxes[h * feature_width * num_priors * 4 + w * num_priors * 4 +
+                    idx * 4 + 3] =
+                  std::min((center_y_temp + box_height_ratio / 2.) / img_height,
+                           1.);
+              idx++;
+            }
+          }
+        }
+      }
+    }
+  }
+  if (clip) {
+    std::transform(boxes, boxes + boxes_count, boxes, [](float v) -> float {
+      return std::min(std::max(v, 0.f), 1.f);
+    });
+  }
+  int box_num = feature_height * feature_width * num_priors;
+
+  for (int i = 0; i < box_num; ++i) {
+    for (size_t j = 0; j < variances_.size(); ++j) {
+      vars[i * variances_.size() + j] = variances_[j];
+    }
+  }
+}
+
+void test_prior_density_box(int feat_h,
+                            int feat_w,
+                            int img_h,
+                            int img_w,
+                            bool clip,
+                            std::vector<float> fixed_sizes,
+                            std::vector<float> fixed_ratios,
+                            std::vector<float> variances_,
+                            std::vector<int> densities,
+                            float step_w,
+                            float step_h,
+                            float offset) {
+  // prepare input&output variables
+  Scope scope;
+  std::string input_var_name("Input");
+  std::string image_var_name("Image");
+  std::string boxes_var_name("Boxes");
+  std::string variances_var_name("Variances");
+  std::string boxes_ref_var_name("Boxes_ref");
+  std::string variances_ref_var_name("Variances_ref");
+  auto* input = scope.Var(input_var_name)->GetMutable<Tensor>();
+  auto* image = scope.Var(image_var_name)->GetMutable<Tensor>();
+  auto* boxes = scope.Var(boxes_var_name)->GetMutable<Tensor>();
+  auto* variances = scope.Var(variances_var_name)->GetMutable<Tensor>();
+  auto* boxes_ref = scope.Var(boxes_ref_var_name)->GetMutable<Tensor>();
+  auto* variances_ref =
+      scope.Var(variances_ref_var_name)->GetMutable<Tensor>();
+  input->Resize({1, 1, feat_h, feat_w});
+  image->Resize({1, 1, img_h, img_w});
+
+  // initialize input&output data
+  FillTensor<float>(input);
+  FillTensor<float>(image);
+
+  // initialize op desc
+  cpp::OpDesc opdesc;
+  opdesc.SetType("density_prior_box");
+  opdesc.SetInput("Input", {input_var_name});
+  opdesc.SetInput("Image", {image_var_name});
+  opdesc.SetOutput("Boxes", {boxes_var_name});
+  opdesc.SetOutput("Variances", {variances_var_name});
+
+  opdesc.SetAttr("fixed_sizes", fixed_sizes);
+  opdesc.SetAttr("fixed_ratios", fixed_ratios);
+  opdesc.SetAttr("variances", variances_);
+  opdesc.SetAttr("densities", densities);
+  opdesc.SetAttr("offset", offset);
+  opdesc.SetAttr("clip", clip);
+  opdesc.SetAttr("step_w", step_w);
+  opdesc.SetAttr("step_h", step_h);
+
+  inferShape_(input, boxes, variances, fixed_ratios, densities);
+  inferShape_(input, boxes_ref, variances_ref, fixed_ratios, densities);
+
+  auto op = CreateOp<operators::DensityPriorBoxOpLite>(opdesc, &scope);
+  prior_density_box_ref(op);
+  boxes_ref->CopyDataFrom(*boxes);
+  variances_ref->CopyDataFrom(*variances);
+  LaunchOp(op,
+           {input_var_name, image_var_name},
+           {boxes_var_name, variances_var_name});
+
+  // ===================== Trans From NHWC to NCHW ====================
+  Tensor boxes_trans;
+  boxes_trans.Resize(boxes->dims().Vectorize());
+  transpose(boxes->mutable_data<float>(),
+            boxes_trans.mutable_data<float>(),
+            {static_cast<int>(boxes->dims()[0]),
+             static_cast<int>(boxes->dims()[2]),
+             static_cast<int>(boxes->dims()[3]),
+             static_cast<int>(boxes->dims()[1])},
+            {0, 3, 1, 2});
+  boxes->CopyDataFrom(boxes_trans);
+  Tensor vars_trans;
+  vars_trans.Resize(variances->dims().Vectorize());
+  transpose(variances->mutable_data<float>(),
+            vars_trans.mutable_data<float>(),
+            {static_cast<int>(variances->dims()[0]),
+             static_cast<int>(variances->dims()[2]),
+             static_cast<int>(variances->dims()[3]),
+             static_cast<int>(variances->dims()[1])},
+            {0, 3, 1, 2});
+  variances->CopyDataFrom(vars_trans);
+
+  // compare results
+  auto* boxes_data = boxes->mutable_data<float>();
+  auto* boxes_ref_data = boxes_ref->mutable_data<float>();
+  auto* variances_data = variances->mutable_data<float>();
+  auto* variances_ref_data = variances_ref->mutable_data<float>();
+
+  // ToFile(variances, "var_mlu.txt");
+  // ToFile(variances_ref, "var_cpu.txt");
+  // ToFile(boxes, "box_mlu.txt");
+  // ToFile(boxes_ref, "box_cpu.txt");
+  for (int i = 0; i < variances->dims().production(); i++) {
+    VLOG(6) << i;
+    EXPECT_NEAR(variances_data[i], variances_ref_data[i], 1e-5);
+  }
+
+  for (int i = 0; i < boxes->dims().production(); i++) {
+    VLOG(6) << i;
+    EXPECT_NEAR(boxes_data[i], boxes_ref_data[i], 1e-5);
+  }
+}
+
+TEST(MLUBridges, prior_density_box) {
+  // std::vector<int> input_shape = {128, 128};
+  // std::vector<int> image_shape = {256, 256};
+  // std::vector<float> fixed_sizes = {8 * 16, 16 * 16, 32 * 16};
+  // std::vector<float> fixed_sizes = {8, 16, 32};
+  // std::vector<float> fixed_ratios = {0.5, 1, 2};
+  // std::vector<int> densities = {1, 1, 1};
+
+  std::vector<int> input_shape = {16, 16};
+  std::vector<int> image_shape = {32, 32};
+  std::vector<float> fixed_sizes = {8, 16, 32};
+  std::vector<float> fixed_ratios = {0.5, 1, 2};
+  std::vector<int> densities = {1, 1, 1};
+  std::vector<float> variances = {0.1, 0.1, 0.2, 0.2};
+  bool clip = true;
+  float offset = 0.5;
+  float step_h = 0;
+  float step_w = 0;
+
+  test_prior_density_box(input_shape[1],
+                         input_shape[0],
+                         image_shape[1],
+                         image_shape[0],
+                         clip,
+                         fixed_sizes,
+                         fixed_ratios,
+                         variances,
+                         densities,
+                         step_w,
+                         step_h,
+                         offset);
+}
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+USE_SUBGRAPH_BRIDGE(density_prior_box, kMLU);
diff --git a/lite/kernels/mlu/bridges/multiclass_nms.cc b/lite/kernels/mlu/bridges/multiclass_nms.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a749d9b392e97d06ec798d3aa502ff7706eb76de
--- /dev/null
+++ b/lite/kernels/mlu/bridges/multiclass_nms.cc
@@ -0,0 +1,250 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+
+#include "lite/kernels/mlu/bridges/graph.h"
+#include "lite/kernels/mlu/bridges/multiclass_nms_api.h"
+#include "lite/kernels/mlu/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+#include "lite/operators/multiclass_nms_op.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+int MulticlassNmsConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto scope = op->scope();
+  VLOG(3) << "[MLU] Converting " + op_type + "...";
+
+  auto bboxes_name = op_info->Input("BBoxes").front();
+  auto scores_name = op_info->Input("Scores").front();
+  auto out_name = op_info->Output("Out").front();
+
+  auto* bboxes = scope->FindTensor(bboxes_name);
+  auto* scores = scope->FindTensor(scores_name);
+  auto* out = scope->FindTensor(out_name);
+  auto background_label = op_info->GetAttr<int>("background_label");
+  auto keep_top_k = op_info->GetAttr<int>("keep_top_k");
+  auto nms_top_k = op_info->GetAttr<int>("nms_top_k");
+  auto score_threshold = op_info->GetAttr<float>("score_threshold");
+  auto nms_threshold = op_info->GetAttr<float>("nms_threshold");
+  auto nms_eta = op_info->GetAttr<float>("nms_eta");
+  bool normalized = false;
+  if (op_info->HasAttr("normalized")) {
+    normalized = op_info->GetAttr<bool>("normalized");
+  }
+
+  auto bboxes_dims = bboxes->dims();
+  auto scores_dims = scores->dims();
+
+  auto batch_size = bboxes->dims()[0];
+  auto num_boxes = bboxes->dims()[1];
+  auto class_num = scores->dims()[1];
+  keep_top_k = keep_top_k == -1 ? num_boxes : keep_top_k;
+
+  // Out appears not to be shaped yet at conversion time, so resize it here;
+  // each output row is {label, score, xmin, ymin, xmax, ymax}.
+  int box_size = 4;
+  std::vector<int64_t> outs_shape = {batch_size, keep_top_k, box_size + 2};
+  const_cast<Tensor*>(out)->Resize(outs_shape);
+  auto out_dims = out->dims();
+
+  // LOG(WARNING) << "CORE NUM SHOULD BE 4!!!!" << std::endl;
+
+  int core_num = TargetWrapperMlu::MLUCoreNumber();
+
+  // compute expects {batch_size, num_boxes, box_size},
+  // while the buffer is {batch_size, box_size, num_boxes} on mlu
+  // and {batch_size, num_boxes, box_size} on cpu;
+  // the mlu data-flow and compute layouts mismatch, so treat bboxes_tensor
+  // as NCHW
+  auto bboxes_tensor = graph->GetNode(bboxes_name);
+  // compute expects {batch_size, class_num, num_boxes},
+  // while the buffer is {batch_size, num_boxes, class_num} on mlu
+  // and {batch_size, class_num, num_boxes} on cpu;
+  // the mlu data-flow and compute layouts mismatch, so treat scores_tensor
+  // as NCHW
+  auto scores_tensor = graph->GetNode(scores_name);
+  // compute expects {batch_size, keep_top_k, box_size + 2},
+  // while the buffer is {batch_size, box_size + 2, keep_top_k} on mlu
+  // and {batch_size, keep_top_k, box_size + 2} on cpu;
+  // the mlu data-flow and compute layouts mismatch, so treat out_tensor
+  // as NCHW
+  auto out_tensor = graph->AddNode(
+      out_name, out_dims.Vectorize(), CNML_TENSOR, CNML_NCHW, graph->FPType());
+
+  // trans bboxes {batch_size, num_boxes, box_size}
+  auto bboxes_trans_tensor = graph->AddNode(bboxes_name + ".trans.bboxes",
+                                            bboxes_dims.Vectorize(),
+                                            CNML_TENSOR,
+                                            CNML_NCHW,
+                                            graph->FPType(),
+                                            CNML_NCHW);
+  // trans scores {batch_size, class_num, num_boxes}
+  auto scores_trans_tensor = graph->AddNode(scores_name + ".trans.scores",
+                                            scores_dims.Vectorize(),
+                                            CNML_TENSOR,
+                                            CNML_NCHW,
+                                            graph->FPType(),
+                                            CNML_NCHW);
+  // trans out {batch_size, keep_top_k, box_size + 2}
+  auto out_trans_tensor = graph->AddNode(out_name + ".trans.out",
+                                         out_dims.Vectorize(),
+                                         CNML_TENSOR,
+                                         CNML_NCHW,
+                                         graph->FPType(),
+                                         CNML_NCHW);
+
+  std::string out_num_name = "nms_out_num";
+  auto* out_num = scope->NewTensor(out_num_name);
+  std::vector<int64_t> out_num_shape = {batch_size, 1};
+  out_num->Resize(out_num_shape);
+  auto num_outs_tensor = graph->AddNode(
+      out_num_name, out_num_shape, CNML_TENSOR, CNML_NCHW, graph->FPType());
+  bool float_precision = false;
+  if (graph->FPType() == CNML_DATA_FLOAT32) {
+    float_precision = true;
+  }
+  int64_t workspace_mem_size =
+      4 * std::min(static_cast<int>(batch_size), core_num) *
+      (14 * num_boxes + 8 * class_num * num_boxes);
+  int64_t workspace_fp_size = workspace_mem_size / 4;
+  if (!float_precision) {
+    // when running as fp16, each MLU element is half the cpu size, so
+    // workspace_fp_size should be doubled
+    workspace_fp_size = workspace_mem_size / 2;
+  }
+  std::vector<int64_t> workspace_shape = {workspace_fp_size};
+  std::string nms_workspace_name =
+      "nms_workspace";  // expect only one nms in the same model
+  auto workspace_tensor = graph->AddNode(nms_workspace_name,
+                                         workspace_shape,
+                                         CNML_CONST,
+                                         CNML_NCHW,
+                                         graph->FPType());
+  std::vector<float> workspace_cpu(workspace_shape[0]);
+  // void* work_space_ = nullptr;
+  // cnrtMalloc(&work_space_, workspace_shape[0]);
+  VLOG(6) << "workspace_shape :" << workspace_shape[0];
+  // VLOG(6) << "workspace_shape mlu ptr :"
+  //         << reinterpret_cast<int64_t>(work_space_);
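+
+  // The workspace is a CNML const tensor bound to zero-initialized host
+  // memory (BindConstRawData below); its size follows the formula above:
+  // 14 floats per box plus 8 per (class, box) pair, per active core.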
+
+  // =================== Bboxes Trans ============================
+  std::vector<int> bboxes_axis = {0, 2, 1};
+  cnmlBaseOp_t bboxes_trans_op{nullptr};
+  cnmlNdTransposeOpParam_t bboxes_trans_param{nullptr};
+  CNML_CALL(cnmlCreateNdTransposeOpParam(
+      &bboxes_trans_param, bboxes_axis.data(), bboxes_axis.size()));
+  CNML_CALL(cnmlCreateNdTransposeProOp(&bboxes_trans_op,
+                                       bboxes_tensor->mlu_tensor(),
+                                       bboxes_trans_tensor->mlu_tensor(),
+                                       bboxes_trans_param));
+  // =================== Bboxes Trans END ========================
+
+  // =================== Scores Trans ============================
+  std::vector<int> scores_axis = {0, 2, 1};
+  cnmlBaseOp_t scores_trans_op{nullptr};
+  cnmlNdTransposeOpParam_t scores_trans_param{nullptr};
+  CNML_CALL(cnmlCreateNdTransposeOpParam(
+      &scores_trans_param, scores_axis.data(), scores_axis.size()));
+  CNML_CALL(cnmlCreateNdTransposeProOp(&scores_trans_op,
+                                       scores_tensor->mlu_tensor(),
+                                       scores_trans_tensor->mlu_tensor(),
+                                       scores_trans_param));
+  // =================== Scores Trans END ========================
+  multiclass_nms_param_t params_;
+  create_multiclass_nms_param(&params_,
+                              score_threshold,
+                              nms_top_k,
+                              keep_top_k,
+                              nms_threshold,
+                              normalized,
+                              nms_eta,
+                              background_label,
+                              batch_size,
+                              class_num,
+                              num_boxes,
+                              box_size);
+
+  cnmlBaseOp_t multiclass_nms_op;
+  create_multiclass_nms_op(&multiclass_nms_op,
+                           params_,
+                           bboxes_trans_tensor->mlu_tensor(),
+                           scores_trans_tensor->mlu_tensor(),
+                           out_trans_tensor->mlu_tensor(),
+                           num_outs_tensor->mlu_tensor(),
+                           workspace_tensor->mlu_tensor(),
+                           float_precision);
+
+  graph->BindConstRawData(
+      nms_workspace_name, workspace_cpu.data(), workspace_cpu.size(), true);
+
+  // =================== Out Trans ============================
+  std::vector<int> out_axis = {0, 2, 1};
+  cnmlBaseOp_t out_trans_op{nullptr};
+  cnmlNdTransposeOpParam_t out_trans_param{nullptr};
+  CNML_CALL(cnmlCreateNdTransposeOpParam(
+      &out_trans_param, out_axis.data(), out_axis.size()));
+  CNML_CALL(cnmlCreateNdTransposeProOp(&out_trans_op,
+                                       out_trans_tensor->mlu_tensor(),
+                                       out_tensor->mlu_tensor(),
+                                       out_trans_param));
+  // =================== Out Trans END ========================
+
+  // =================== DEBUG ====================
+  VLOG(6) << "bboxes_name: " << bboxes_name;
+  VLOG(6) << "scores_name: " << scores_name;
+  VLOG(6) << "out_name: " << out_name;
+  VLOG(6) << "background_label: " << background_label;
+  VLOG(6) << "keep_top_k: " << keep_top_k;
+  VLOG(6) << "nms_top_k: " << nms_top_k;
+  VLOG(6) << "score_threshold: " << score_threshold;
+  VLOG(6) << "nms_threshold: " << nms_threshold;
+  VLOG(6) << "nms_eta: " << nms_eta;
+  VLOG(6) << "normalized: " << normalized;
+  VLOG(6) << "bboxes_dims: " << bboxes_dims;
+  VLOG(6) << "scores_dims: " << scores_dims;
+  VLOG(6) << "out_dims: " << out_dims;
+  VLOG(6) << "out_dims: " << out->dims();
+  VLOG(6) << "batch_size: " << batch_size;
+  VLOG(6) << "num_boxes : " << num_boxes;
+  VLOG(6) << "class_num: " << class_num;
+  // cnmlPrintTensor(bboxes_tensor->mlu_tensor(), CNML_TENSOR);
+  // cnmlPrintTensor(bboxes_trans_tensor->mlu_tensor(), CNML_TENSOR);
+  // cnmlPrintTensor(scores_tensor->mlu_tensor(), CNML_TENSOR);
+  // cnmlPrintTensor(scores_trans_tensor->mlu_tensor(), CNML_TENSOR);
+  // cnmlPrintTensor(out_tensor->mlu_tensor(), CNML_TENSOR);
+  // cnmlPrintTensor(out_trans_tensor->mlu_tensor(), CNML_TENSOR);
+  // cnmlPrintTensor(num_outs_tensor->mlu_tensor(), CNML_TENSOR);
+  // =================== DEBUG END ================
+  graph->FuseOp(bboxes_trans_op);
+  graph->FuseOp(scores_trans_op);
+  graph->FuseOp(multiclass_nms_op);
+  graph->FuseOp(out_trans_op);
+  return SUCCESS;
+}
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(multiclass_nms,
+                         kMLU,
+                         paddle::lite::subgraph::mlu::MulticlassNmsConverter);
diff --git a/lite/kernels/mlu/bridges/multiclass_nms_api.cc b/lite/kernels/mlu/bridges/multiclass_nms_api.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d5de834d2781e4b64750141c0311907280b3d0f8
--- /dev/null
+++ b/lite/kernels/mlu/bridges/multiclass_nms_api.cc
@@ -0,0 +1,132 @@
+// Copyright (c) 2020 smarsu. All Rights Reserved.
+
+#include "lite/kernels/mlu/bridges/multiclass_nms_api.h"
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+
+extern "C" {
+void multiclass_nms_paddle_entry(void *bboxes,
+                                 void *scores,
+                                 void *outs,
+                                 void *num_outs,
+                                 float score_threshold,
+                                 int nms_top_k,
+                                 int keep_top_k,
+                                 float nms_threshold,
+                                 bool normalized,
+                                 float nms_eta,
+                                 int background_label,
+                                 int batch_size,
+                                 int class_num,
+                                 int num_boxes,
+                                 int box_size,
+                                 void *work_space,
+                                 DataType data_type);
+}  // extern "C"
+
+void create_multiclass_nms_param(multiclass_nms_param_t *params_ptr,
+                                 float score_threshold,
+                                 int nms_top_k,
+                                 int keep_top_k,
+                                 float nms_threshold,
+                                 bool normalized,
+                                 float nms_eta,
+                                 int background_label,
+                                 int batch_size,
+                                 int class_num,
+                                 int num_boxes,
+                                 int box_size) {
+  multiclass_nms_param_t params =
+      (multiclass_nms_param_t)malloc(sizeof(struct multiclass_nms_param));
+  params->score_threshold = score_threshold;
+  params->nms_top_k = nms_top_k;
+  params->keep_top_k = keep_top_k;
+  params->nms_threshold = nms_threshold;
+  params->normalized = normalized;
+  params->nms_eta = nms_eta;
+  params->background_label = background_label;
+  params->batch_size = batch_size;
+  params->class_num = class_num;
+  params->num_boxes = num_boxes;
+  params->box_size = box_size;
+  *params_ptr = params;
+
+  return;
+}
+
+void destroy_multiclass_nms_param(multiclass_nms_param_t *params) {
+  if (*params != NULL) {
+    free(*params);
+  }
+}
+
+int create_multiclass_nms_op(cnmlBaseOp_t *op_ptr,
+                             multiclass_nms_param_t nms_param,
+                             cnmlTensor_t bboxes,
+                             cnmlTensor_t scores,
+                             cnmlTensor_t outs,
+                             cnmlTensor_t num_outs,
+                             cnmlTensor_t workspace_tensor,
+                             bool float_precision) {
+  DataType data_type = kFloat16;
+  if (float_precision) {
+    data_type = kFloat32;
+  }
+
+  if (nms_param->keep_top_k == -1) {
+    nms_param->keep_top_k = nms_param->num_boxes;
+  }
+
+  cnrtKernelParamsBuffer_t params;
+  cnrtGetKernelParamsBuffer(&params);
+  cnrtKernelParamsBufferMarkInput(params);
+  cnrtKernelParamsBufferMarkInput(params);
+  cnrtKernelParamsBufferMarkOutput(params);
+  cnrtKernelParamsBufferMarkOutput(params);
+  cnrtKernelParamsBufferAddParam(
+      params, &nms_param->score_threshold, sizeof(float));
+  cnrtKernelParamsBufferAddParam(params, &nms_param->nms_top_k, sizeof(int));
+  cnrtKernelParamsBufferAddParam(params, &nms_param->keep_top_k, sizeof(int));
+  cnrtKernelParamsBufferAddParam(
+      params, &nms_param->nms_threshold, sizeof(float));
+  cnrtKernelParamsBufferAddParam(params, &nms_param->normalized, sizeof(bool));
+  cnrtKernelParamsBufferAddParam(params, &nms_param->nms_eta, sizeof(float));
+  cnrtKernelParamsBufferAddParam(
+      params, &nms_param->background_label, sizeof(int));
+  cnrtKernelParamsBufferAddParam(params, &nms_param->batch_size, sizeof(int));
+  cnrtKernelParamsBufferAddParam(params, &nms_param->class_num, sizeof(int));
+  cnrtKernelParamsBufferAddParam(params, &nms_param->num_boxes, sizeof(int));
+  cnrtKernelParamsBufferAddParam(params, &nms_param->box_size, sizeof(int));
+  // cnrtKernelParamsBufferAddParam(
+  //     params, &nms_param->work_space, sizeof(void *));
+  cnrtKernelParamsBufferMarkStatic(params);
+  cnrtKernelParamsBufferAddParam(params, &data_type, sizeof(DataType));
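+
+  // The MarkInput/MarkOutput/AddParam sequence above is positional: it must
+  // line up exactly with multiclass_nms_paddle_entry's argument list, since
+  // the BANG kernel unpacks the buffer by position.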
+  cnmlTensor_t input_tensors[2];
+  input_tensors[0] = bboxes;
+  input_tensors[1] = scores;
+  cnmlTensor_t output_tensors[2];
+  output_tensors[0] = outs;
+  output_tensors[1] = num_outs;
+  cnmlTensor_t static_tensors[1];
+  static_tensors[0] = workspace_tensor;
+
+  cnmlCreatePluginOp(op_ptr,
+                     "multiclass_nms_paddle",
+                     reinterpret_cast<void *>(multiclass_nms_paddle_entry),
+                     params,
+                     input_tensors,
+                     2,
+                     output_tensors,
+                     2,
+                     static_tensors,
+                     1);
+
+  cnrtDestroyKernelParamsBuffer(params);
+
+  return 0;
+}
diff --git a/lite/kernels/mlu/bridges/multiclass_nms_api.h b/lite/kernels/mlu/bridges/multiclass_nms_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..25447e99f55f8c0ecbbfc73d5c8bd51727be7f99
--- /dev/null
+++ b/lite/kernels/mlu/bridges/multiclass_nms_api.h
@@ -0,0 +1,82 @@
+// Copyright (c) 2020 smarsu. All Rights Reserved.
+
+#ifndef LITE_KERNELS_MLU_BRIDGES_MULTICLASS_NMS_API_H_
+#define LITE_KERNELS_MLU_BRIDGES_MULTICLASS_NMS_API_H_
+
+// #define ALIGN_UP(a, b) (((a) + (b) - 1) / (b) * (b))
+// #define ALIGN_DN(a, b) ((a) / (b) * (b))
+// #define DIV_UP(a, b) (((a) + (b) - 1) / (b))
+// #define DIV_DN(a, b) ((a) / (b))
+
+// #define MAX(a, b) ((a) >= (b) ? (a) : (b))
+// #define MIN(a, b) ((a) <= (b) ? (a) : (b))
+// #define ABS(a) (((a) > 0) ? (a) : (-(a)))
+
+// #define INIFITE 0x7F800000
+#include <cnml.h>
+#include <cnrt.h>
+
+enum DataType {
+  kInvalid,
+  kFloat32,
+  kFloat16,
+  kUint8,
+  kInt8,
+  kInt16,
+  kInt32,
+};
+
+enum TopkSplitStrategy {
+  kAuto,
+  kSplitN,
+  kSplitC,
+};
+
+enum ColorType {
+  kGray,
+  kRGB,
+  kBGR,
+  kRGBA,
+};
+
+struct multiclass_nms_param {
+  float score_threshold;
+  int nms_top_k;
+  int keep_top_k;
+  float nms_threshold;
+  bool normalized;
+  float nms_eta;
+  int background_label;
+  int batch_size;
+  int class_num;
+  int num_boxes;
+  int box_size;
+};
+
+typedef struct multiclass_nms_param *multiclass_nms_param_t;
+
+void create_multiclass_nms_param(multiclass_nms_param_t *params_ptr,
+                                 float score_threshold,
+                                 int nms_top_k,
+                                 int keep_top_k,
+                                 float nms_threshold,
+                                 bool normalized,
+                                 float nms_eta,
+                                 int background_label,
+                                 int batch_size,
+                                 int class_num,
+                                 int num_boxes,
+                                 int box_size);
+
+void destory_multiclass_nms_param(multiclass_nms_param_t *params);
+
+int create_multiclass_nms_op(cnmlBaseOp_t *op_ptr,
+                             multiclass_nms_param_t nms_param,
+                             cnmlTensor_t bboxes,
+                             cnmlTensor_t scores,
+                             cnmlTensor_t outs,
+                             cnmlTensor_t num_outs,
+                             cnmlTensor_t workspace_tensor,
+                             bool float_precision);
+
+#endif  // LITE_KERNELS_MLU_BRIDGES_MULTICLASS_NMS_API_H_
diff --git a/lite/kernels/mlu/bridges/multiclass_nms_impl.o b/lite/kernels/mlu/bridges/multiclass_nms_impl.o
new file mode 100644
index 0000000000000000000000000000000000000000..5da75ef56a00d936d76abfd5f3caf7ae35f3594a
Binary files /dev/null and b/lite/kernels/mlu/bridges/multiclass_nms_impl.o differ
diff --git a/lite/kernels/mlu/bridges/multiclass_nms_op_test.cc b/lite/kernels/mlu/bridges/multiclass_nms_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6618252ca70b58754dd64e5f746092773c9713dc
--- /dev/null
+++ b/lite/kernels/mlu/bridges/multiclass_nms_op_test.cc
@@ -0,0 +1,604 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/multiclass_nms_op.h"
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <fstream>
+#include <map>
+#include <sstream>
+#include <utility>
+
+#include "lite/core/op_registry.h"
+#include "lite/kernels/mlu/bridges/test_helper.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+std::vector<float> gen_random_boxes(int box_num, int img_w, int img_h) {
+  std::vector<float> boxes;
+  unsigned int SEED = 1;
+
+  for (int i = 0; i < box_num; i++) {
+    float x = rand_r(&SEED) / static_cast<float>(RAND_MAX) * img_w;
+    float w = rand_r(&SEED) / static_cast<float>(RAND_MAX) * img_w;
+    float y = rand_r(&SEED) / static_cast<float>(RAND_MAX) * img_h;
+    float h = rand_r(&SEED) / static_cast<float>(RAND_MAX) * img_h;
+    float xmin = std::max(0.0f, (x - w) / 2);
+    float ymin = std::max(0.0f, (y - h) / 2);
+    float xmax = std::min(static_cast<float>(img_w), (x + w) / 2);
+    float ymax = std::min(static_cast<float>(img_h), (y + h) / 2);
+    boxes.push_back(xmin);
+    boxes.push_back(ymin);
+    boxes.push_back(xmax);
+    boxes.push_back(ymax);
+  }
+  return boxes;
+}
+
+std::vector<float> gen_random_scores(int box_num, int class_num) {
+  std::vector<float> scores;
+  unsigned int SEED = 1;
+  for (int i = 0; i < box_num; i++) {
+    for (int j = 0; j < class_num; j++) {
+      scores.push_back(rand_r(&SEED) / static_cast<float>(RAND_MAX));
+    }
+  }
+  return scores;
+}
+
+float Area(float box[4]) {
+  float xmin = box[0];
+  float ymin = box[1];
+  float xmax = box[2];
+  float ymax = box[3];
+  CHECK(xmax > xmin) << "xmax: " << xmax << " xmin: " << xmin;
+  CHECK(ymax > ymin) << "ymax: " << ymax << " ymin: " << ymin;
+  float w = xmax - xmin;
+  float h = ymax - ymin;
+  return w * h;
+}
+
+// overlap may < 0
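+// The expression below is (len1 + len2) - union_span, which equals
+// min(max1, max2) - max(min1, min2): the signed length of the intersection
+// of the two segments, negative when they are disjoint.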
+float overlap(float min1, float max1, float min2, float max2) {
+  return ((max1 - min1) + (max2 - min2)) -
+         (std::max(max2, max1) - std::min(min1, min2));
+}
+
+float IntersectionArea(float box1[4], float box2[4]) {
+  float box1_xmin = box1[0];
+  float box1_ymin = box1[1];
+  float box1_xmax = box1[2];
+  float box1_ymax = box1[3];
+
+  float box2_xmin = box2[0];
+  float box2_ymin = box2[1];
+  float box2_xmax = box2[2];
+  float box2_ymax = box2[3];
+
+  float x_overlap = overlap(box1_xmin, box1_xmax, box2_xmin, box2_xmax);
+  float y_overlap = overlap(box1_ymin, box1_ymax, box2_ymin, box2_ymax);
+  // Both 1-D overlaps must be positive for a real intersection; otherwise
+  // the product of two negative overlaps would be counted as positive area.
+  if (x_overlap <= 0.0f || y_overlap <= 0.0f) {
+    return 0.0f;
+  }
+  return x_overlap * y_overlap;
+}
+
+float IOU(float box1[4], float box2[4]) {
+  float area1 = Area(box1);
+  float area2 = Area(box2);
+  float intersection_area = IntersectionArea(box1, box2);
+  float union_area = area1 + area2 - intersection_area;
+  return intersection_area / union_area;
+}
+
+template <typename T>
+void VecToFile(const std::vector<T>& vec, std::string filename) {
+  std::ofstream f(filename, std::ios::out);
+  if (!f) {
+    LOG(FATAL) << filename << " not exist!" << std::endl;
+  }
+  for (size_t i = 0; i < vec.size(); i++) {
+    f << vec[i] << std::endl;
+  }
+  f.close();
+}
+
+template <typename T>
+void ArrayToFile(const T* data, int size, std::string filename) {
+  std::ofstream f(filename, std::ios::out);
+  if (!f) {
+    LOG(FATAL) << filename << " not exist!" << std::endl;
+  }
+  for (int i = 0; i < size; i++) {
+    f << data[i] << std::endl;
+  }
+  f.close();
+}
+
+void ToFile(Tensor* tensor, std::string file_name) {
+  int count = tensor->dims().production();
+  auto data = tensor->mutable_data<float>();
+  std::ostringstream outs;
+  for (int i = 0; i < count; i++) {
+    outs << data[i] << std::endl;
+  }
+  std::ofstream of;
+  of.open(file_name, std::ios::out);
+  of << outs.str();
+  of.close();
+}
+
+void FromFile(Tensor* tensor, std::string file_name) {
+  LOG(INFO) << " from file:" << file_name << std::endl;
+  std::ifstream f;
+  f.open(file_name, std::ios::in);
+  if (f.good()) {
+    for (int64_t i = 0; i < tensor->dims().production(); i++) {
+      f >> tensor->mutable_data<float>()[i];
+    }
+  } else {
+    LOG(FATAL) << "can not open " << file_name << " to read" << std::endl;
+  }
+  f.close();
+}
+
+template <typename dtype>
+static bool sort_score_pair_descend(const std::pair<float, dtype>& pair1,
+                                    const std::pair<float, dtype>& pair2) {
+  return pair1.first > pair2.first;
+}
+
+template <typename dtype>
+void get_max_score_index(const dtype* scores,
+                         int num,
+                         float threshold,
+                         int top_k,
+                         std::vector<std::pair<dtype, int>>* score_index_vec) {
+  // ArrayToFile(scores, 100, "cpu_score.txt");
+  //! Generate index score pairs.
+  for (int i = 0; i < num; ++i) {
+    if (scores[i] > threshold) {
+      score_index_vec->push_back(std::make_pair(scores[i], i));
+    }
+  }
+
+  //! Sort the score pair according to the scores in descending order
+  std::stable_sort(score_index_vec->begin(),
+                   score_index_vec->end(),
+                   sort_score_pair_descend<int>);
+
+  //! Keep top_k scores if needed.
+  if (top_k > -1 && top_k < static_cast<int>(score_index_vec->size())) {
+    score_index_vec->resize(top_k);
+  }
+}
+
+template <typename dtype>
+dtype bbox_size(const dtype* bbox, bool normalized = true) {
+  if (bbox[2] < bbox[0] || bbox[3] < bbox[1]) {
+    // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0.
+    return dtype(0.);
+  } else {
+    const dtype width = bbox[2] - bbox[0];
+    const dtype height = bbox[3] - bbox[1];
+
+    if (normalized) {
+      return width * height;
+    } else {
+      // If bbox is not within range [0, 1].
+      return (width + 1) * (height + 1);
+    }
+  }
+}
+
+template <typename dtype>
+dtype jaccard_overlap(const dtype* bbox1, const dtype* bbox2) {
+  if (bbox2[0] > bbox1[2] || bbox2[2] < bbox1[0] || bbox2[1] > bbox1[3] ||
+      bbox2[3] < bbox1[1]) {
+    return dtype(0.);
+  } else {
+    const dtype inter_xmin = std::max(bbox1[0], bbox2[0]);
+    const dtype inter_ymin = std::max(bbox1[1], bbox2[1]);
+    const dtype inter_xmax = std::min(bbox1[2], bbox2[2]);
+    const dtype inter_ymax = std::min(bbox1[3], bbox2[3]);
+
+    const dtype inter_width = inter_xmax - inter_xmin;
+    const dtype inter_height = inter_ymax - inter_ymin;
+    const dtype inter_size = inter_width * inter_height;
+
+    const dtype bbox1_size = bbox_size(bbox1);
+    const dtype bbox2_size = bbox_size(bbox2);
+
+    return inter_size / (bbox1_size + bbox2_size - inter_size);
+  }
+}
+
+template <typename dtype>
+void apply_nms_fast(const dtype* bboxes,
+                    const dtype* scores,
+                    int num,
+                    float score_threshold,
+                    float nms_threshold,
+                    float eta,
+                    int top_k,
+                    std::vector<int>* indices) {
+  // Get top_k scores (with corresponding indices).
+  std::vector<std::pair<dtype, int>> score_index_vec;
+  get_max_score_index(scores, num, score_threshold, top_k, &score_index_vec);
+
+  // Do nms.
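+  // Greedy NMS: walk candidates in descending score order and keep a box
+  // only if its IoU with every previously kept box stays at or below the
+  // threshold. With eta < 1 the threshold shrinks after each kept box
+  // (while it is still above 0.5), suppressing overlaps more aggressively.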
+  float adaptive_threshold = nms_threshold;
+  indices->clear();
+
+  while (score_index_vec.size() != 0) {
+    const int idx = score_index_vec.front().second;
+    bool keep = true;
+
+    for (size_t k = 0; k < indices->size(); ++k) {
+      if (keep) {
+        const int kept_idx = (*indices)[k];
+        float overlap =
+            jaccard_overlap(bboxes + idx * 4, bboxes + kept_idx * 4);
+        keep = overlap <= adaptive_threshold;
+      } else {
+        break;
+      }
+    }
+
+    if (keep) {
+      indices->push_back(idx);
+    }
+
+    score_index_vec.erase(score_index_vec.begin());
+
+    if (keep && eta < 1 && adaptive_threshold > 0.5) {
+      adaptive_threshold *= eta;
+    }
+  }
+}
+
+template <typename dtype>
+void multiclass_nms_compute_ref(const operators::MulticlassNmsParam& param,
+                                int class_num,
+                                const std::vector<int>& priors,
+                                bool share_location,
+                                std::vector<dtype>* result) {
+  int background_id = param.background_label;
+  int keep_topk = param.keep_top_k;
+  int nms_topk = param.nms_top_k;
+  float conf_thresh = param.score_threshold;
+  float nms_thresh = param.nms_threshold;
+  float nms_eta = param.nms_eta;
+  const dtype* bbox_data = param.bboxes->data<dtype>();
+  const dtype* conf_data = param.scores->data<dtype>();
+  (*result).clear();
+
+  int num_kept = 0;
+  std::vector<std::map<int, std::vector<int>>> all_indices;
+  int64_t conf_offset = 0;
+  int64_t bbox_offset = 0;
+  for (size_t i = 0; i < priors.size(); ++i) {
+    std::map<int, std::vector<int>> indices;
+    int num_det = 0;
+    int num_priors = priors[i];
+
+    int conf_idx = class_num * conf_offset;
+    int bbox_idx =
+        share_location ? bbox_offset * 4 : bbox_offset * 4 * class_num;
+
+    for (int c = 0; c < class_num; ++c) {
+      if (c == background_id) {
+        // Ignore background class
+        continue;
+      }
+
+      const dtype* cur_conf_data = conf_data + conf_idx + c * num_priors;
+      const dtype* cur_bbox_data = bbox_data + bbox_idx;
+
+      if (!share_location) {
+        cur_bbox_data += c * num_priors * 4;
+      }
+
+      apply_nms_fast(cur_bbox_data,
+                     cur_conf_data,
+                     num_priors,
+                     conf_thresh,
+                     nms_thresh,
+                     nms_eta,
+                     nms_topk,
+                     &(indices[c]));
+      num_det += indices[c].size();
+    }
+
+    if (keep_topk > -1 && num_det > keep_topk) {
+      std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
+
+      for (auto it = indices.begin(); it != indices.end(); ++it) {
+        int label = it->first;
+        const std::vector<int>& label_indices = it->second;
+
+        for (size_t j = 0; j < label_indices.size(); ++j) {
+          int idx = label_indices[j];
+          float score = conf_data[conf_idx + label * num_priors + idx];
+          score_index_pairs.push_back(
+              std::make_pair(score, std::make_pair(label, idx)));
+        }
+      }
+
+      // Keep top k results per image.
+      std::stable_sort(score_index_pairs.begin(),
+                       score_index_pairs.end(),
+                       sort_score_pair_descend<std::pair<int, int>>);
+      score_index_pairs.resize(keep_topk);
+      // Store the new indices.
+      std::map<int, std::vector<int>> new_indices;
+
+      for (size_t j = 0; j < score_index_pairs.size(); ++j) {
+        int label = score_index_pairs[j].second.first;
+        int idx = score_index_pairs[j].second.second;
+        new_indices[label].push_back(idx);
+      }
+
+      all_indices.push_back(new_indices);
+      num_kept += keep_topk;
+    } else {
+      all_indices.push_back(indices);
+      num_kept += num_det;
+    }
+    conf_offset += num_priors;
+    bbox_offset += num_priors;
+  }
+
+  if (num_kept == 0) {
+    (*result).clear();
+    (*result).resize(1);
+    (*result)[0] = -1;
+    return;
+  } else {
+    (*result).resize(num_kept * 6);
+  }
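+  // Each kept detection occupies 6 consecutive values in the result:
+  // [label, score, xmin, ymin, xmax, ymax] -- Paddle's multiclass_nms
+  // output row format. A single value of -1 above means "no boxes kept".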
+  int count = 0;
+
+  conf_offset = 0;
+  bbox_offset = 0;
+  for (size_t i = 0; i < priors.size(); ++i) {
+    int num_priors = priors[i];
+    int conf_idx = class_num * conf_offset;
+    int bbox_idx =
+        share_location ? bbox_offset * 4 : bbox_offset * 4 * class_num;
+
+    for (auto it = all_indices[i].begin(); it != all_indices[i].end(); ++it) {
+      int label = it->first;
+      std::vector<int>& indices = it->second;
+      const dtype* cur_conf_data = conf_data + conf_idx + label * num_priors;
+      const dtype* cur_bbox_data = bbox_data + bbox_idx;
+
+      if (!share_location) {
+        cur_bbox_data += label * num_priors * 4;
+      }
+
+      for (size_t j = 0; j < indices.size(); ++j) {
+        int idx = indices[j];
+        (*result)[count * 6] = label;
+        (*result)[count * 6 + 1] = cur_conf_data[idx];
+
+        for (int k = 0; k < 4; ++k) {
+          (*result)[count * 6 + 2 + k] = cur_bbox_data[idx * 4 + k];
+        }
+
+        ++count;
+      }
+    }
+    conf_offset += num_priors;
+    bbox_offset += num_priors;
+  }
+}
+
+void test_multiclass_nms(float score_threshold,
+                         int nms_top_k,
+                         int keep_top_k,
+                         float nms_threshold,
+                         bool normalized,
+                         float nms_eta,
+                         int background_label,
+                         int batch_size,
+                         int class_num,
+                         int num_boxes,
+                         int box_size,
+                         int core_num) {
+  // prepare input&output variables
+  Scope scope;
+  std::string bboxes_var_name = "BBoxes";
+  std::string scores_var_name = "Scores";
+  std::string out_var_name = "Out";
+  // Must be exactly this name; it corresponds with
+  // lite/operators/multiclass_nms_op.cc.
+  std::string out_num_var_name = "nms_out_num";
+  auto* bboxes = scope.Var(bboxes_var_name)->GetMutable<Tensor>();
+  auto* scores = scope.Var(scores_var_name)->GetMutable<Tensor>();
+  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
+  auto* out_num = scope.Var(out_num_var_name)->GetMutable<Tensor>();
+
+  std::vector<int64_t> bboxes_shape = {batch_size, num_boxes, box_size};
+  std::vector<int64_t> scores_shape = {batch_size, class_num, num_boxes};
+  std::vector<int64_t> out_num_shape = {batch_size};
+
+  bboxes->Resize(bboxes_shape);
+  scores->Resize(scores_shape);
+  out_num->Resize(out_num_shape);
+
+  std::vector<float> bboxes_vec = gen_random_boxes(num_boxes, 1024, 1024);
+  std::vector<float> scores_vec = gen_random_scores(num_boxes, class_num);
+
+  for (size_t i = 0; i < bboxes_vec.size(); i++) {
+    bboxes->mutable_data<float>()[i] = bboxes_vec[i];
+  }
+  for (size_t i = 0; i < scores_vec.size(); i++) {
+    scores->mutable_data<float>()[i] = scores_vec[i];
+  }
+
+  // initialize op desc
+  cpp::OpDesc opdesc;
+  opdesc.SetType("multiclass_nms");
+  opdesc.SetInput("BBoxes", {bboxes_var_name});
+  opdesc.SetInput("Scores", {scores_var_name});
+  opdesc.SetOutput("Out", {out_var_name});
+  opdesc.SetAttr("background_label", background_label);
+  opdesc.SetAttr("keep_top_k", keep_top_k);
+  opdesc.SetAttr("nms_top_k", nms_top_k);
+  opdesc.SetAttr("score_threshold", score_threshold);
+  opdesc.SetAttr("nms_threshold", nms_threshold);
+  opdesc.SetAttr("nms_eta", nms_eta);
+  opdesc.SetAttr("normalized", normalized);
+
+  auto op = CreateOp<operators::MulticlassNmsOpLite>(opdesc, &scope);
+  // out_ref->CopyDataFrom(*out);
+
+  operators::MulticlassNmsParam param;
+  auto bboxes_name = opdesc.Input("BBoxes").front();
+  auto scores_name = opdesc.Input("Scores").front();
+  auto out_name = opdesc.Output("Out").front();
+  std::vector<std::string> output_arg_names = opdesc.OutputArgumentNames();
+
+  param.bboxes = bboxes;
+  param.scores = scores;
+  param.out = out;
+  param.background_label = opdesc.GetAttr<int>("background_label");
+  param.keep_top_k = opdesc.GetAttr<int>("keep_top_k");
+  param.nms_top_k = opdesc.GetAttr<int>("nms_top_k");
+  param.score_threshold = opdesc.GetAttr<float>("score_threshold");
+  param.nms_threshold = opdesc.GetAttr<float>("nms_threshold");
+  param.nms_eta = opdesc.GetAttr<float>("nms_eta");
+  if (opdesc.HasAttr("normalized")) {
+    param.normalized = opdesc.GetAttr<bool>("normalized");
+  }
+  const std::vector<int>& priors = {num_boxes};  // one entry per batch image
+  std::vector<float> result;
+  multiclass_nms_compute_ref<float>(param, class_num, priors, true, &result);
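+  // The MLU subgraph works on the transposed ({0, 2, 1}) layout, so the
+  // inputs are pre-transposed before LaunchOp and the raw MLU output is
+  // transposed back before comparing against the CPU reference above.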
+  // trans
+  Tensor bboxes_trans;
+  bboxes_trans.Resize(bboxes->dims());
+  transpose(bboxes->mutable_data<float>(),
+            bboxes_trans.mutable_data<float>(),
+            {static_cast<int>(bboxes->dims()[0]),
+             static_cast<int>(bboxes->dims()[1]),
+             static_cast<int>(bboxes->dims()[2])},
+            {0, 2, 1});
+  bboxes->CopyDataFrom(bboxes_trans);
+
+  Tensor scores_trans;
+  scores_trans.Resize(scores->dims());
+  transpose(scores->mutable_data<float>(),
+            scores_trans.mutable_data<float>(),
+            {static_cast<int>(scores->dims()[0]),
+             static_cast<int>(scores->dims()[1]),
+             static_cast<int>(scores->dims()[2])},
+            {0, 2, 1});
+  scores->CopyDataFrom(scores_trans);
+
+  LaunchOp(op,
+           {bboxes_var_name, scores_var_name},
+           {out_var_name, out_num_var_name});
+
+  // ToFile(out, "nms_out_mlu_before_trans.txt");
+  // out trans
+  Tensor out_trans;
+  out_trans.Resize(out->dims());
+  transpose(out->mutable_data<float>(),
+            out_trans.mutable_data<float>(),
+            {static_cast<int>(out->dims()[0]),
+             static_cast<int>(out->dims()[2]),
+             static_cast<int>(out->dims()[1])},  // 0 2 1 on mlu
+            {0, 2, 1});
+  out->CopyDataFrom(out_trans);
+
+  // ToFile(out, "nms_out_mlu.txt");
+  // ToFile(out_num, "nms_out_num_mlu.txt");
+  // VecToFile(result, "nms_out_cpu.txt");
+
+  // auto out_data = out->mutable_data<float>();
+  int num_box = out->dims()[1];
+  int match_count = 0;
+  std::vector<int> matched_cpu_index;
+  for (int i = 0; i < num_box; i++) {
+    float mlu_box[4];
+    mlu_box[0] = out->mutable_data<float>()[i * 6 + 2];
+    mlu_box[1] = out->mutable_data<float>()[i * 6 + 3];
+    mlu_box[2] = out->mutable_data<float>()[i * 6 + 4];
+    mlu_box[3] = out->mutable_data<float>()[i * 6 + 5];
+    bool match = false;
+    for (int j = 0; j < num_box; j++) {
+      // If the j-th cpu box has already matched some mlu box, do not use it
+      // to match another mlu box.
+      if (std::find(std::begin(matched_cpu_index),
+                    std::end(matched_cpu_index),
+                    j) != std::end(matched_cpu_index)) {
+        continue;
+      }
+      float cpu_box[4];
+      cpu_box[0] = result[j * 6 + 2];
+      cpu_box[1] = result[j * 6 + 3];
+      cpu_box[2] = result[j * 6 + 4];
+      cpu_box[3] = result[j * 6 + 5];
+      if (IOU(mlu_box, cpu_box) >= 0.9) {
+        match = true;
+        matched_cpu_index.push_back(j);
+        break;
+      }
+    }
+    if (match) {
+      match_count += 1;
+    }
+  }
+  // Every MLU box must be matched (IoU >= 0.9) by a distinct CPU box.
+  EXPECT_NEAR(match_count, num_box, 0);
+}
+
+TEST(MLUBridges, multiclass_nms) {
+  int background_label = -1;
+  int keep_top_k = 100;
+  int nms_top_k = 1000;
+  float score_threshold = 0.01;
+  float nms_threshold = 0.45;
+  float nms_eta = 1.0f;
+  bool normalized = false;
+  int batch_size = 1;
+  int num_boxes = 22743;
+  int class_num = 80;
+  int core_num = 4;
+  int box_size = 4;
+
+  test_multiclass_nms(score_threshold,
+                      nms_top_k,
+                      keep_top_k,
+                      nms_threshold,
+                      normalized,
+                      nms_eta,
+                      background_label,
+                      batch_size,
+                      class_num,
+                      num_boxes,
+                      box_size,
+                      core_num);
+}
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+USE_SUBGRAPH_BRIDGE(multiclass_nms, kMLU);
diff --git a/lite/kernels/mlu/bridges/paddle_use_bridges.h b/lite/kernels/mlu/bridges/paddle_use_bridges.h
index be5c64b3b7056d0b8de1589d198db541b5a3777b..cddd4489219021f3df6606d1fdc4cc0967c60915 100644
--- a/lite/kernels/mlu/bridges/paddle_use_bridges.h
+++ b/lite/kernels/mlu/bridges/paddle_use_bridges.h
@@ -43,6 +43,9 @@ USE_SUBGRAPH_BRIDGE(flatten, kMLU);
 USE_SUBGRAPH_BRIDGE(flatten2, kMLU);
 USE_SUBGRAPH_BRIDGE(reshape, kMLU);
 USE_SUBGRAPH_BRIDGE(reshape2, kMLU);
+USE_SUBGRAPH_BRIDGE(multiclass_nms, kMLU);
+USE_SUBGRAPH_BRIDGE(density_prior_box, kMLU);
+USE_SUBGRAPH_BRIDGE(box_coder, kMLU);
 
 #ifdef LITE_BUILD_EXTRA
 USE_SUBGRAPH_BRIDGE(gather, kMLU);
 USE_SUBGRAPH_BRIDGE(lrn, kMLU)
diff --git a/lite/kernels/mlu/bridges/utility.h b/lite/kernels/mlu/bridges/utility.h
index fe886c5e4467b844e41701636545776071b67997..ba187cfa2420b713d765c7853ceeb3c43cb7e88b 100644
--- a/lite/kernels/mlu/bridges/utility.h
+++ b/lite/kernels/mlu/bridges/utility.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <cnml.h>
+#include <cnplugin.h>
 #include <cnrt.h>
 #include <memory>