diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ac227f0154feb64178d9a99b6784bfd6db40d50..50c375be4342bb88a98f313fa9dfebdef19af25d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,6 +106,7 @@ lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kerne lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF) # cv build options lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF) +lite_option(LITE_WITH_COMPUTE_API "Enable build conmpute api in lite" OFF) lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." ON) lite_option(LITE_WITH_ARM_CLANG "when arm lang is clang, its ON." OFF) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 1b0890e0dbf5e741176c293a059d809752c72a43..fd3b478c27238bbf3d5d4efa64763f0e4e61ce7f 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -129,6 +129,9 @@ if (LITE_WITH_ARM) if (LITE_WITH_CV) add_definitions("-DLITE_WITH_CV") endif() + if (LITE_WITH_COMPUTE_API) + add_definitions("-DLITE_WITH_COMPUTE_API") + endif() endif() if (LITE_WITH_TRAIN) diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index e534fdd9f9a909d7620cee40a71c6571910c4baf..b0294e9a6450dfb2c6cf98cbb7ae46cb4d3fc8bb 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -280,6 +280,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_BINARY_DIR}/lite/gen_code/paddle_code_generator" "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/compute_api.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/compute_param.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" ) if(NOT IOS) add_dependencies(publish_inference_cxx_lib paddle_code_generator) @@ -323,6 +325,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/libpaddle_light_api_shared.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/compute_api.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/compute_param.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/compute_utils.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" ) add_dependencies(tiny_publish_cxx_lib paddle_light_api_shared) add_dependencies(tiny_publish_cxx_lib bundle_light_api) @@ -380,6 +385,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mask_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mask_detection/Makefile" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_libs" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_libs/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_libs/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_compute_api" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_compute_api/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" 
"${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_compute_api/Makefile" ) add_dependencies(publish_inference_android_cxx_demos logging gflags) add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos) @@ -401,6 +408,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mask_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mask_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mask_detection/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_compute_api" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_compute_api/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_compute_api/Makefile" ) add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos) endif() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 85744f5cac4b5b6dc6cb149a0375a69c98d55dd7..d56b1b2ff719da0edede6e8218110d4c2b0de861 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -64,7 +64,11 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH else() if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux")) add_library(paddle_light_api_shared SHARED "") - target_sources(paddle_light_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc light_api_impl.cc) + if (LITE_WITH_COMPUTE_API) + target_sources(paddle_light_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc light_api_impl.cc compute_param.cc compute_api.cc compute_utils.cc) + else() + target_sources(paddle_light_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc light_api_impl.cc) + endif() set(TARGET_COMIPILE_FLAGS "-fdata-sections") if (NOT (ARM_TARGET_LANG STREQUAL "clang")) #gcc set(TARGET_COMIPILE_FLAGS "${TARGET_COMIPILE_FLAGS} -flto") @@ -308,7 +312,11 @@ lite_cc_library(paddle_api SRCS paddle_api.cc DEPS op_params tensor device_info) #----------------------------------------------------------------------------------------------------- # The final inference library for both CxxConfig and MobileConfig. if (LITE_ON_TINY_PUBLISH) - lite_cc_library(paddle_api_light SRCS light_api_impl.cc DEPS light_api paddle_api stream) + if (LITE_WITH_COMPUTE_API) + lite_cc_library(paddle_api_light SRCS light_api_impl.cc compute_param.cc compute_api.cc compute_utils.cc DEPS light_api paddle_api stream) + else() + lite_cc_library(paddle_api_light SRCS light_api_impl.cc DEPS light_api paddle_api stream) + endif() else() lite_cc_library(paddle_api_light SRCS light_api_impl.cc DEPS light_api paddle_api) endif() diff --git a/lite/api/compute_api.cc b/lite/api/compute_api.cc new file mode 100644 index 0000000000000000000000000000000000000000..0af327fed4273b4a9b3631b76826a2001230fc66 --- /dev/null +++ b/lite/api/compute_api.cc @@ -0,0 +1,118 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "compute_api.h"  // NOLINT
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+#include "lite/core/context.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_lite.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/program.h"
+#include "log_lite.h"  // NOLINT
+
+namespace paddle {
+namespace lite_api {
+
+class InstructionWrapper {
+ public:
+  InstructionWrapper(
+      std::shared_ptr<lite::OpLite>& op,                          // NOLINT
+      std::vector<std::unique_ptr<lite::KernelBase>>& kernels) {  // NOLINT
+    op_ = op;
+    for (auto& kernel : kernels) {
+      kernels_.emplace_back(std::move(kernel));
+    }
+  }
+
+  lite::OpLite* get_op() { return op_.get(); }
+
+  lite::KernelBase* get_kernel() {
+    if (kernel_idx >= static_cast<int>(kernels_.size())) {
+      LOGF("Error! kernel index >= kernel size\n");
+    }
+    return kernels_[kernel_idx].get();
+  }
+
+  void set_kernel_idx(int idx) { kernel_idx = idx; }
+
+  ~InstructionWrapper() = default;
+
+ private:
+  std::shared_ptr<lite::OpLite> op_;
+  std::vector<std::unique_ptr<lite::KernelBase>> kernels_;
+  int kernel_idx{0};
+};
+
+void ComputeEngine<TARGET(kARM)>::env_init(PowerMode power_mode, int threads) {
+  lite::DeviceInfo::Init();
+  lite::DeviceInfo::Global().SetRunMode(power_mode, threads);
+}
+
+bool ComputeEngine<TARGET(kARM)>::CreateOperator(const char* op_type,
+                                                 PrecisionType precision,
+                                                 DataLayoutType layout) {
+  auto op = lite::LiteOpRegistry::Global().Create(op_type);
+  LCHECK(op, "no Op found for %s\n", op_type);
+  LOGI("Create %s Operator Success\n", op_type);
+  lite_api::Place place(TARGET(kARM), precision, layout);
+  auto kernels = op->CreateKernels({place});
+  LCHECK_GT(kernels.size(), 0, "no kernel found for: %s\n", op_type);
+  LOGI("Create %s kernel Success\n", op_type);
+  instruction_ = new InstructionWrapper(op, kernels);
+  return true;
+}
+
+// param must set input and output
+void ComputeEngine<TARGET(kARM)>::SetParam(ParamBase* param) {
+  delete static_cast<lite::operators::ParamBase*>(param_);
+  // generate raw param
+  param_ = param->AttachRawParam();
+  auto* ins = static_cast<InstructionWrapper*>(instruction_);
+  // pick kernel
+  ins->set_kernel_idx(param->GetKernelIndex());
+  // get raw kernel and op
+  auto* kernel = ins->get_kernel();
+  LCHECK(kernel, "SetParam, pick kernel error\n");
+  auto* op = ins->get_op();
+  // set context
+  std::unique_ptr<lite::KernelContext> ctx(new lite::KernelContext);
+  kernel->SetContext(std::move(ctx));
+  op->SetParam(static_cast<lite::operators::ParamBase*>(param_));
+  op->CheckShape();
+  op->AttachKernel(kernel);
+  LOGI("SetParam Success\n");
+}
+
+void ComputeEngine<TARGET(kARM)>::Launch() {
+  auto* ins = static_cast<InstructionWrapper*>(instruction_);
+  auto* kernel = ins->get_kernel();
+  LCHECK(kernel, "Launch, pick kernel error\n");
+  auto* op = ins->get_op();
+  op->InferShapeImpl();
+  kernel->Launch();
+  LOGI("Run Success\n");
+}
+
+ComputeEngine<TARGET(kARM)>::~ComputeEngine() {
+  delete static_cast<InstructionWrapper*>(instruction_);
+  delete static_cast<lite::operators::ParamBase*>(param_);
+  instruction_ = nullptr;
+  param_ = nullptr;
+}
+
+}  // namespace lite_api
+}  // namespace paddle
diff --git a/lite/api/compute_api.h b/lite/api/compute_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..92d100d10f821a659c126b207966b7c52b13a969
--- /dev/null
+++ b/lite/api/compute_api.h
@@ -0,0 +1,58 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include "compute_param.h"  // NOLINT
+#include "paddle_place.h"   // NOLINT
+
+namespace paddle {
+namespace lite_api {
+
+// Currently ComputeEngine only supports Target = kARM
+template <TargetType Target>
+class LITE_API ComputeEngine {
+ public:
+  ComputeEngine() = default;
+  bool CreateOperator(const char* op_type,
+                      PrecisionType precision = PRECISION(kFloat),
+                      DataLayoutType layout = DATALAYOUT(kNCHW)) { return false; }
+  void SetParam(ParamBase* param) {}
+  void Launch() {}
+  ~ComputeEngine() = default;
+
+ private:
+  void* instruction_;
+  void* param_;
+};
+
+template <>
+class LITE_API ComputeEngine<TARGET(kARM)> {
+ public:
+  ComputeEngine() = default;
+  static void env_init(PowerMode power_mode, int threads);
+  bool CreateOperator(const char* op_type,
+                      PrecisionType precision = PRECISION(kFloat),
+                      DataLayoutType layout = DATALAYOUT(kNCHW));
+  void SetParam(ParamBase* param);
+  void Launch();
+  ~ComputeEngine();
+
+ private:
+  void* instruction_{nullptr};
+  void* param_{nullptr};
+};
+
+}  // namespace lite_api
+}  // namespace paddle
diff --git a/lite/api/compute_param.cc b/lite/api/compute_param.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cb7825ccc7097e58a98fe044c0cf8e943ef18e59
--- /dev/null
+++ b/lite/api/compute_param.cc
@@ -0,0 +1,127 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "compute_param.h"  // NOLINT
+#include "lite/operators/op_params.h"
+#include "log_lite.h"  // NOLINT
+
+namespace paddle {
+namespace lite_api {
+void *ActivationParam::AttachRawParam() {
+  //! necessary check
+  LCHECK(X, "ActivationParam must set input tensor: X\n");
+  LCHECK(Out, "ActivationParam must set output tensor: Out\n");
+
+  auto *raw_act_param = new lite::operators::ActivationParam();
+  // Tensor
+  raw_act_param->X = static_cast<lite::Tensor *>(X->GetRawTensor());
+  raw_act_param->Out = static_cast<lite::Tensor *>(Out->GetRawTensor());
+  raw_act_param->Prelu_alpha =
+      Prelu_alpha ?
static_cast(Prelu_alpha->GetRawTensor()) + : nullptr; + + raw_act_param->active_type = active_type; + raw_act_param->has_active = has_active; + raw_act_param->Leaky_relu_alpha = Leaky_relu_alpha; + raw_act_param->Relu_clipped_coef = Relu_clipped_coef; + raw_act_param->Prelu_mode = Prelu_mode; + raw_act_param->Swish_beta = Swish_beta; + raw_act_param->hard_sigmoid_slope = hard_sigmoid_slope; + raw_act_param->hard_sigmoid_offset = hard_sigmoid_offset; + raw_act_param->hard_swish_scale = hard_swish_scale; + raw_act_param->hard_swish_offset = hard_swish_offset; + raw_act_param->hard_swish_threshold = hard_swish_threshold; + + return raw_act_param; +} + +void *ConvParam::AttachRawParam() { + //! necessary check + LCHECK(x, "ConvParam must set input tensor: x\n"); + LCHECK(filter, "ConvParam must set filter tensor: filter\n"); + LCHECK(output, "ConvParam must set output tensor: output\n"); + if (enable_int8 && out_ptype == PRECISION(kFloat)) { + LCHECK_NE(input_scale, 0.f, "int8 conv out float, must has input scale\n"); + LCHECK(!weight_scale.empty(), + "int8 conv out float, must has weights scale\n"); + } else if (enable_int8 && out_ptype == PRECISION(kInt8)) { + LCHECK_NE(input_scale, 0.f, "int8 conv out int8, must has input scale\n"); + LCHECK_NE(output_scale, 0.f, "int8 conv out int8, must has output scale\n"); + LCHECK(!weight_scale.empty(), + "int8 conv out int8, must has weights scale\n"); + } + + auto *raw_conv_param = new lite::operators::ConvParam(); + // Tensor + raw_conv_param->x = static_cast(x->GetRawTensor()); + raw_conv_param->filter = static_cast(filter->GetRawTensor()); + raw_conv_param->output = static_cast(output->GetRawTensor()); + raw_conv_param->bias = + bias ? static_cast(bias->GetRawTensor()) : nullptr; + raw_conv_param->residualData = + residualData ? 
static_cast(residualData->GetRawTensor()) + : nullptr; + + // activation param + raw_conv_param->activation_param.active_type = activation_param.active_type; + raw_conv_param->activation_param.has_active = activation_param.has_active; + raw_conv_param->activation_param.Relu_clipped_coef = + activation_param.Relu_clipped_coef; + raw_conv_param->activation_param.Leaky_relu_alpha = + activation_param.Leaky_relu_alpha; + raw_conv_param->activation_param.Swish_beta = activation_param.Swish_beta; + raw_conv_param->activation_param.hard_sigmoid_slope = + activation_param.hard_sigmoid_slope; + raw_conv_param->activation_param.hard_sigmoid_offset = + activation_param.hard_sigmoid_offset; + raw_conv_param->activation_param.hard_swish_scale = + activation_param.hard_swish_scale; + raw_conv_param->activation_param.hard_swish_offset = + activation_param.hard_swish_offset; + raw_conv_param->activation_param.hard_swish_threshold = + activation_param.hard_swish_threshold; + + // for int8 + raw_conv_param->enable_int8 = enable_int8; + raw_conv_param->input_scale = input_scale; + raw_conv_param->weight_scale = weight_scale; + raw_conv_param->output_scale = output_scale; + raw_conv_param->bit_length = bit_length; + + raw_conv_param->strides = strides; + raw_conv_param->paddings = paddings; + raw_conv_param->groups = groups; + raw_conv_param->dilations = dilations; + raw_conv_param->fuse_residual_connection = fuse_residual_connection; + raw_conv_param->data_format = data_format; + raw_conv_param->output_size = output_size; + + return raw_conv_param; +} + +int ConvParam::GetKernelIndex() { + if (enable_int8) { + if (out_ptype == PRECISION(kFloat)) { + return 1; + } else if (out_ptype == PRECISION(kInt8)) { + return 0; + } else { + LOGF("conv only support float and int8 precision\n"); + } + } else { + return 0; + } +} +} // namespace lite_api +} // namespace paddle diff --git a/lite/api/compute_param.h b/lite/api/compute_param.h new file mode 100644 index 0000000000000000000000000000000000000000..03d26848b554f7b7d81bb6b45c27536cd89d5f63 --- /dev/null +++ b/lite/api/compute_param.h @@ -0,0 +1,89 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle_api.h"    // NOLINT
+#include "paddle_place.h"  // NOLINT
+
+namespace paddle {
+namespace lite_api {
+
+class LITE_API ParamBase {
+ public:
+  PrecisionType out_ptype{PRECISION(kFloat)};
+  virtual int GetKernelIndex() { return 0; }
+  virtual void* AttachRawParam() { return nullptr; }
+  virtual ~ParamBase() = default;
+};
+
+class LITE_API ActivationParam : public ParamBase {
+ public:
+  Tensor* X{};
+  Tensor* Out{};
+  ActivationType active_type{ActivationType::kIndentity};
+  bool has_active{false};
+  float Leaky_relu_alpha{0};   // leaky_relu param
+  float Relu_clipped_coef{6};  // relu_clipped param
+  const char* Prelu_mode{
+      "channel"};  // prelu param, can be "all", "channel" or "element"
+  Tensor* Prelu_alpha{};  // prelu param
+  float Swish_beta{0.f};  // swish param
+  // hard_sigmoid param
+  float hard_sigmoid_slope{0.2f};
+  float hard_sigmoid_offset{0.5f};
+  // hard_swish param
+  float hard_swish_threshold{6.0};
+  float hard_swish_scale{6.0};
+  float hard_swish_offset{3.0};
+
+  ActivationParam() = default;
+  virtual ~ActivationParam() = default;
+  void* AttachRawParam() override;
+};
+
+class LITE_API ConvParam : public ParamBase {
+ public:
+  Tensor* x{};
+  Tensor* filter{};
+  Tensor* bias{nullptr};
+  Tensor* residualData{nullptr};
+  Tensor* output{};
+  std::vector<int> strides{1, 1};
+  std::shared_ptr<std::vector<int>> paddings;
+  int groups{1};
+  std::shared_ptr<std::vector<int>> dilations;
+  bool fuse_residual_connection{false};
+  const char* data_format{"Anylayout"};
+  // for activation
+  ActivationParam activation_param;
+  // only used in conv_transpose.
+  std::vector<int> output_size;
+  // for int8
+  bool enable_int8{false};
+  float input_scale{1.0f};
+  std::vector<float> weight_scale{};
+  float output_scale{1.0f};
+  int bit_length{8};
+
+  ConvParam() = default;
+  virtual ~ConvParam() = default;
+  void* AttachRawParam() override;
+  int GetKernelIndex() override;
+};
+
+}  // namespace lite_api
+}  // namespace paddle
diff --git a/lite/api/compute_utils.cc b/lite/api/compute_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..02c8e77d684fb3315125d6e2abb7fdc519d0cdcc
--- /dev/null
+++ b/lite/api/compute_utils.cc
@@ -0,0 +1,176 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "compute_utils.h" // NOLINT +#include "lite/backends/arm/math/type_trans.h" +#include "lite/core/tensor.h" +#include "log_lite.h" // NOLINT +#include "paddle_place.h" // NOLINT +namespace paddle { +namespace lite_api { + +// clang-format off +void ComputeUtils::TensorFloatToInt8(Tensor& tin, Tensor& tout, float scale) { + lite::Tensor* raw_tin = static_cast(tin.GetRawTensor()); + lite::Tensor* raw_tout = static_cast(tout.GetRawTensor()); + LCHECK(raw_tin, "tensor in must have raw tensor\n"); + tout.Resize(tin.shape()); + int outer_size = 1; + int axis_size = 1; + int inner_size = raw_tin->numel(); + const float* din = raw_tin->data(); + int8_t* dout = raw_tout->mutable_data(); + paddle::lite::arm::math::fp32_to_int8( + din, dout, &scale, axis_size, outer_size, inner_size); +} + +void ComputeUtils::TensorFloatToInt8Inplace(Tensor& tin, float scale) { + lite::Tensor* raw_tin = static_cast(tin.GetRawTensor()); + LCHECK(raw_tin, "tensor in must have raw tensor\n"); + LCHECK_GT(raw_tin->numel(), 0, "tensor in shape must greater than zero\n"); + LCHECK_EQ(raw_tin->precision(), + PRECISION(kFloat), + "tensor in precision must be float\n"); + int outer_size = 1; + int axis_size = 1; + int inner_size = raw_tin->numel(); + const float* din = raw_tin->data(); + int8_t* dout = raw_tin->mutable_data(); + paddle::lite::arm::math::fp32_to_int8( + din, dout, &scale, axis_size, outer_size, inner_size); +} + +void ComputeUtils::TensorInt8ToFloat(Tensor& tin, Tensor& tout, float scale) { + lite::Tensor* raw_tin = static_cast(tin.GetRawTensor()); + lite::Tensor* raw_tout = static_cast(tout.GetRawTensor()); + LCHECK(raw_tin, "tensor in must have raw tensor\n"); + LCHECK_GT(raw_tin->numel(), 0, "tensor in shape must greater than zero\n"); + LCHECK_EQ(raw_tin->precision(), + PRECISION(kInt8), + "tensor in precision must be int8"); + tout.Resize(tin.shape()); + int outer_size = 1; + int axis_size = 1; + int inner_size = raw_tin->numel(); + const int8_t* din = raw_tin->data(); + float* dout = raw_tout->mutable_data(); + paddle::lite::arm::math::int8_to_fp32( + din, dout, &scale, axis_size, outer_size, inner_size); +} + +void ComputeUtils::TensorInt8ToFloatInplace(Tensor& tin, float scale) { + lite::Tensor* raw_tin = static_cast(tin.GetRawTensor()); + lite::Tensor tmp_out; + LCHECK(raw_tin, "tensor in must have raw tensor\n"); + LCHECK_GT(raw_tin->numel(), 0, "tensor in shape must greater than zero\n"); + LCHECK_EQ(raw_tin->precision(), + PRECISION(kInt8), + "tensor in precision must be int8"); + tmp_out.Resize(tin.shape()); + int outer_size = 1; + int axis_size = 1; + int inner_size = raw_tin->numel(); + const int8_t* din = raw_tin->data(); + float* tmp_dout = tmp_out.mutable_data(); + paddle::lite::arm::math::int8_to_fp32( + din, tmp_dout, &scale, axis_size, outer_size, inner_size); + float* dout = raw_tin->mutable_data(); + memcpy(dout, tmp_dout, raw_tin->numel() * sizeof(float)); +} + +void ComputeUtils::ConvWeightsFloatToInt8(Tensor& weightin, + Tensor& weightout, + std::vector scale) { + lite::Tensor* raw_win = static_cast(weightin.GetRawTensor()); + lite::Tensor* raw_wout = static_cast(weightout.GetRawTensor()); + LCHECK(raw_win, "weights in must have raw tensor\n"); + LCHECK_GT(raw_win->numel(), 0, "weights in shape must greater than zero\n"); + LCHECK_EQ(raw_win->precision(), + PRECISION(kFloat), + "weights in precision must be float"); + weightout.Resize(weightin.shape()); + int outer_size = 1; + int axis_size = raw_win->dims()[0]; // chout + int inner_size = + raw_win->numel() / axis_size; // 
chin / group * ksize_w * ksize_h + const float* din = raw_win->data(); + int8_t* dout = raw_wout->mutable_data(); + paddle::lite::arm::math::fp32_to_int8( + din, dout, scale.data(), axis_size, outer_size, inner_size); +} + +void ComputeUtils::ConvWeightsFloatToInt8Inplace(Tensor& weightin, + std::vector scale) { + lite::Tensor* raw_win = static_cast(weightin.GetRawTensor()); + LCHECK(raw_win, "weights in must have raw tensor\n"); + LCHECK_GT(raw_win->numel(), 0, "weights in shape must greater than zero\n"); + LCHECK_EQ(raw_win->precision(), + PRECISION(kFloat), + "weights in precision must be float"); + int outer_size = 1; + int axis_size = raw_win->dims()[0]; // chout + int inner_size = + raw_win->numel() / axis_size; // chin / group * ksize_w * ksize_h + const float* din = raw_win->data(); + int8_t* dout = raw_win->mutable_data(); + paddle::lite::arm::math::fp32_to_int8( + din, dout, scale.data(), axis_size, outer_size, inner_size); +} + +void ComputeUtils::ConvWeightsInt8ToFloat(Tensor& weightin, + Tensor& weightout, + std::vector scale) { + lite::Tensor* raw_win = static_cast(weightin.GetRawTensor()); + lite::Tensor* raw_wout = static_cast(weightout.GetRawTensor()); + LCHECK(raw_win, "weights in must have raw tensor\n"); + LCHECK_GT(raw_win->numel(), 0, "weights in shape must greater than zero\n"); + LCHECK_EQ(raw_win->precision(), + PRECISION(kInt8), + "weights in precision must be int8"); + weightout.Resize(weightin.shape()); + int outer_size = 1; + int axis_size = raw_win->dims()[0]; // chout + int inner_size = + raw_win->numel() / axis_size; // chin / group * ksize_w * ksize_h + const int8_t* din = raw_win->data(); + float* dout = raw_wout->mutable_data(); + paddle::lite::arm::math::int8_to_fp32( + din, dout, scale.data(), axis_size, outer_size, inner_size); +} + +void ComputeUtils::ConvWeightsInt8ToFloatInplace(Tensor& weightin, + std::vector scale) { + lite::Tensor* raw_win = static_cast(weightin.GetRawTensor()); + lite::Tensor tmp_out; + LCHECK(raw_win, "weights in must have raw tensor\n"); + LCHECK_GT(raw_win->numel(), 0, "weights in shape must greater than zero\n"); + LCHECK_EQ(raw_win->precision(), + PRECISION(kInt8), + "weights in precision must be int8"); + tmp_out.Resize(weightin.shape()); + int outer_size = 1; + int axis_size = raw_win->dims()[0]; // chout + int inner_size = + raw_win->numel() / axis_size; // chin / group * ksize_w * ksize_h + const int8_t* din = raw_win->data(); + float* dout_tmp = tmp_out.mutable_data(); + paddle::lite::arm::math::int8_to_fp32( + din, dout_tmp, scale.data(), axis_size, outer_size, inner_size); + float* dout = raw_win->mutable_data(); + memcpy(dout, dout_tmp, raw_win->numel() * sizeof(float)); +} +// clang-format on + +} // namespace lite_api +} // namespace paddle diff --git a/lite/api/compute_utils.h b/lite/api/compute_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..418a29929e8e441086f1eaa9ef338eb4c8623896 --- /dev/null +++ b/lite/api/compute_utils.h @@ -0,0 +1,44 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <vector>
+#include "paddle_api.h"  // NOLINT
+
+namespace paddle {
+namespace lite_api {
+
+struct LITE_API ComputeUtils {
+  static void TensorFloatToInt8(Tensor& tin,   // NOLINT
+                                Tensor& tout,  // NOLINT
+                                float scale);
+  static void TensorFloatToInt8Inplace(Tensor& tin, float scale);  // NOLINT
+  static void TensorInt8ToFloat(Tensor& tin,   // NOLINT
+                                Tensor& tout,  // NOLINT
+                                float scale);
+  static void TensorInt8ToFloatInplace(Tensor& tin, float scale);  // NOLINT
+  static void ConvWeightsFloatToInt8(Tensor& weightin,   // NOLINT
+                                     Tensor& weightout,  // NOLINT
+                                     std::vector<float> scale);
+  static void ConvWeightsFloatToInt8Inplace(Tensor& weightin,  // NOLINT
+                                            std::vector<float> scale);
+  static void ConvWeightsInt8ToFloat(Tensor& weightin,   // NOLINT
+                                     Tensor& weightout,  // NOLINT
+                                     std::vector<float> scale);
+  static void ConvWeightsInt8ToFloatInplace(Tensor& weightin,  // NOLINT
+                                            std::vector<float> scale);
+};
+
+}  // namespace lite_api
+}  // namespace paddle
diff --git a/lite/api/log_lite.h b/lite/api/log_lite.h
new file mode 100644
index 0000000000000000000000000000000000000000..8cfdd5669b0450f695bd6b8d2a8a64b981d9aba8
--- /dev/null
+++ b/lite/api/log_lite.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cstdio>   // printf
+#include <cstdlib>  // exit
+
+#define LOGI(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#define LOGE(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#define LOGF(fmt, ...)           \
+  do {                           \
+    printf(fmt, ##__VA_ARGS__);  \
+    exit(1);                     \
+  } while (0)
+
+#define LCHECK(a, fmt, ...)      \
+  do {                           \
+    if (!a) {                    \
+      LOGF(fmt, ##__VA_ARGS__);  \
+    }                            \
+  } while (0)
+
+#define LCHECK_EQ(a, b, fmt, ...) \
+  do {                            \
+    if (a != b) {                 \
+      LOGF(fmt, ##__VA_ARGS__);   \
+    }                             \
+  } while (0)
+
+#define LCHECK_NE(a, b, fmt, ...) \
+  do {                            \
+    if (a == b) {                 \
+      LOGF(fmt, ##__VA_ARGS__);   \
+    }                             \
+  } while (0)
+
+#define LCHECK_GE(a, b, fmt, ...) \
+  do {                            \
+    if (a < b) {                  \
+      LOGF(fmt, ##__VA_ARGS__);   \
+    }                             \
+  } while (0)
+
+#define LCHECK_GT(a, b, fmt, ...) \
+  do {                            \
+    if (a <= b) {                 \
+      LOGF(fmt, ##__VA_ARGS__);   \
+    }                             \
+  } while (0)
+
+#define LCHECK_LE(a, b, fmt, ...) \
+  do {                            \
+    if (a > b) {                  \
+      LOGF(fmt, ##__VA_ARGS__);   \
+    }                             \
+  } while (0)
+
+#define LCHECK_LT(a, b, fmt, ...)
\ + do { \ + if (a >= b) { \ + LOGF(fmt, ##__VA_ARGS__); \ + } \ + } while (0) diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc index bfeff4879820f132a331e9bff56a5f9c494fe775..c2dde68eee5ae232a49000bebc73ffa13c9e2b7d 100644 --- a/lite/api/paddle_api.cc +++ b/lite/api/paddle_api.cc @@ -35,6 +35,22 @@ const lite::Tensor *ctensor(void *x) { return static_cast(x); } +#ifdef LITE_WITH_COMPUTE_API +lite::Tensor *mtensor(void *x) { return static_cast(x); } +Tensor::Tensor() : raw_tensor_(new lite::Tensor()) {} + +void Tensor::ReleaseRawTensor() { + delete static_cast(raw_tensor_); + raw_tensor_ = nullptr; +} + +void Tensor::set_precision(PrecisionType ptype) { + mtensor(raw_tensor_)->set_precision(ptype); +} + +void *Tensor::GetRawTensor() { return raw_tensor_; } +#endif + void Tensor::Resize(const shape_t &shape) { tensor(raw_tensor_)->Resize(shape); } diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index d28ea8fdbf3f77a15f9ef561e03555090fddac97..b94ca90884e412dfe6f20c1fcaa9c21bda085c4b 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -36,6 +36,12 @@ struct LITE_API Tensor { explicit Tensor(void* raw); explicit Tensor(const void* raw); +#ifdef LITE_WITH_COMPUTE_API + Tensor(); + void ReleaseRawTensor(); + void set_precision(PrecisionType ptype); + void* GetRawTensor(); +#endif void Resize(const shape_t& shape); /// Readonly data. diff --git a/lite/core/op_lite.cc b/lite/core/op_lite.cc index 537636065d6aeea67fd7c8c71fb00b183720fecc..a82f8b560fba1704e5322606adb4800fb8869861 100644 --- a/lite/core/op_lite.cc +++ b/lite/core/op_lite.cc @@ -86,8 +86,8 @@ std::vector> OpLite::CreateKernels( auto pick_kernel = [&](const Place &place) { auto ks = KernelRegistry::Global().Create( op_type_, place.target, place.precision, place.layout); - VLOG(5) << "pick kernel for " << op_info()->Type() << " " - << place.DebugString() << " get " << ks.size() << " kernels"; + VLOG(5) << "pick kernel for " << op_type_ << " " << place.DebugString() + << " get " << ks.size() << " kernels"; for (auto &&it : ks) { AttachKernel(it.get()); kernels.emplace_back(std::move(it)); diff --git a/lite/core/op_lite.h b/lite/core/op_lite.h index 301065d5b6bb5c4f41b19d9a9034985ca2f74d89..3fabc6e10ef5009067b901542fc38b006fcda15f 100644 --- a/lite/core/op_lite.h +++ b/lite/core/op_lite.h @@ -68,6 +68,7 @@ class OpLite : public Registry { // Inference the outputs' shape. virtual bool InferShapeImpl() const { return true; } virtual bool InferShape(); + virtual bool SetParam(operators::ParamBase *param) { return false; } // Run this operator. virtual bool Run(); // Indicate whether the Op runs only once or not diff --git a/lite/demo/cxx/makefiles/test_compute_api/Makefile.android.armv7 b/lite/demo/cxx/makefiles/test_compute_api/Makefile.android.armv7 new file mode 100644 index 0000000000000000000000000000000000000000..2c36e6575e54cda4f3b19b0ec5b4cae2881e4f2a --- /dev/null +++ b/lite/demo/cxx/makefiles/test_compute_api/Makefile.android.armv7 @@ -0,0 +1,54 @@ +ARM_ABI = arm7 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +CXX_DEFINES += -DLITE_WITH_COMPUTE_API + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. 
# +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +#activation +test_activation: test_activation.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_activation.o -o test_activation $(CXX_LIBS) $(LDFLAGS) + +test_activation.o: activation_test.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_activation.o -c activation_test.cc + +# conv +test_conv: test_conv.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_conv.o -o test_conv $(CXX_LIBS) $(LDFLAGS) + +test_conv.o: conv_test.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_conv.o -c conv_test.cc + +# int8 conv +test_conv_int8: test_conv_int8.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_conv_int8.o -o test_conv_int8 $(CXX_LIBS) $(LDFLAGS) + +test_conv_int8.o: conv_int8_test.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_conv_int8.o -c conv_int8_test.cc + +.PHONY: clean +clean: + rm -f test_activation.o + rm -f test_activation + rm -f test_conv.o + rm -f test_conv + rm -f test_conv_int8.o + rm -f test_conv_int8 diff --git a/lite/demo/cxx/makefiles/test_compute_api/Makefile.android.armv8 b/lite/demo/cxx/makefiles/test_compute_api/Makefile.android.armv8 new file mode 100644 index 0000000000000000000000000000000000000000..9b23c20ce5effc4e1e395bd992ea63fab2de1057 --- /dev/null +++ b/lite/demo/cxx/makefiles/test_compute_api/Makefile.android.armv8 @@ -0,0 +1,54 @@ +ARM_ABI = arm8 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +CXX_DEFINES += -DLITE_WITH_COMPUTE_API + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. 
Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +#activation +test_activation: test_activation.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_activation.o -o test_activation $(CXX_LIBS) $(LDFLAGS) + +test_activation.o: activation_test.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_activation.o -c activation_test.cc + +# conv +test_conv: test_conv.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_conv.o -o test_conv $(CXX_LIBS) $(LDFLAGS) + +test_conv.o: conv_test.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_conv.o -c conv_test.cc + +# int8 conv +test_conv_int8: test_conv_int8.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_conv_int8.o -o test_conv_int8 $(CXX_LIBS) $(LDFLAGS) + +test_conv_int8.o: conv_int8_test.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_conv_int8.o -c conv_int8_test.cc + +.PHONY: clean +clean: + rm -f test_activation.o + rm -f test_activation + rm -f test_conv.o + rm -f test_conv + rm -f test_conv_int8.o + rm -f test_conv_int8 diff --git a/lite/demo/cxx/test_compute_api/activation_test.cc b/lite/demo/cxx/test_compute_api/activation_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..bb537b21700ad415aba168fec355f55268b992d2 --- /dev/null +++ b/lite/demo/cxx/test_compute_api/activation_test.cc @@ -0,0 +1,268 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "compute_api.h" // NOLINT +#include "compute_param.h" // NOLINT +#include "compute_utils.h" // NOLINT +#include "paddle_api.h" // NOLINT +#include "utils.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +void activation_naive_impl(const float* din, + float* dout, + int64_t len, + ActivationType act_type, + float leaky_relu_alpha) { + switch (act_type) { + case ActivationType::kRelu: { + for (int i = 0; i < len; i++) { + dout[i] = std::max(0.f, din[i]); + } + break; + } + case ActivationType::kRelu6: { + for (int i = 0; i < len; i++) { + dout[i] = std::max(0.f, din[i]); + dout[i] = std::min(6.f, dout[i]); + } + break; + } + case ActivationType::kLeakyRelu: { + for (int i = 0; i < len; i++) { + dout[i] = din[i] > 0.f ? din[i] : din[i] * leaky_relu_alpha; + } + break; + } + case ActivationType::kSigmoid: { + for (int i = 0; i < len; i++) { + dout[i] = 1.f / (1.f + std::exp(-din[i])); + } + break; + } + case ActivationType::kTanh: { + for (int i = 0; i < len; i++) { + dout[i] = (std::exp(din[i]) - std::exp(-din[i])) / + (std::exp(din[i]) + std::exp(-din[i])); + } + break; + } + default: + std::cerr << "the type of activation is unknow." 
<< std::endl; + assert(0); + } +} + +void activation_func(int n, + int c, + int h, + int w, + ActivationType act_type, + float leaky_relu_alpha, + int warmup, + int repeats, + bool check_result, + int threads, + PowerMode power_mode) { + Tensor input, output, output_ref; + input.Resize({n, c, h, w}); + input.set_precision(PRECISION(kFloat)); + output_ref.Resize({n, c, h, w}); + output_ref.set_precision(PRECISION(kFloat)); + fill_tensor_rand(input, -1.f, 1.f); + ComputeEngine::env_init(power_mode, threads); + ComputeEngine act; + + ActivationParam act_param; + act_param.active_type = act_type; + act_param.X = &input; + act_param.Out = &output; + act_param.Leaky_relu_alpha = leaky_relu_alpha; + std::string act_str; + if (act_type == ActivationType::kRelu) { + act_str = "relu"; + } else if (act_type == ActivationType::kRelu6) { + act_str = "relu6"; + } else if (act_type == ActivationType::kLeakyRelu) { + act_str = "leaky_relu"; + } else if (act_type == ActivationType::kSigmoid) { + act_str = "sigmoid"; + } else if (act_type == ActivationType::kTanh) { + act_str = "tanh"; + } else { + std::cerr << "act type: " << static_cast(act_type) + << "is not support now." << std::endl; + assert(0); + } + act.CreateOperator(act_str.c_str()); + act.SetParam(&act_param); + act.Launch(); + if (output.shape() != output_ref.shape()) { + std::cerr << "act op infer shape error." << std::endl; + assert(0); + } + Timer t; + for (int i = 0; i < warmup; ++i) { + act.Launch(); + } + + for (int i = 0; i < repeats; ++i) { + t.Start(); + act.Launch(); + t.Stop(); + } + auto shape = input.shape(); + std::cout << "act input shape: " << shape[0] << ", " << shape[1] << ", " + << shape[2] << ", " << shape[3] + << ", act_type: " << static_cast(act_type) + << ", warmup: " << warmup << ", repeats: " << repeats + << ", power mode: " << static_cast(power_mode) + << ", threads: " << threads << ", avg time: " << t.LapTimes().Avg() + << " ms" << std::endl; + + if (check_result) { + const float* din = input.data(); + float* dout_ref = output_ref.mutable_data(); + int64_t len = dim_production(input); + activation_naive_impl(din, dout_ref, len, act_type, leaky_relu_alpha); + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(output, output_ref, max_ratio, max_diff); + if (std::abs(max_ratio) > 1e-3f) { + if (max_diff > 5e-4f) { + std::cout << "basic result" << std::endl; + print_tensor(output_ref); + std::cout << "lite result" << std::endl; + print_tensor(output); + Tensor tdiff; + tdiff.set_precision(PRECISION(kFloat)); + tensor_diff(output_ref, output, tdiff); + std::cout << "diff result" << std::endl; + print_tensor(tdiff); + tdiff.ReleaseRawTensor(); + } + } + } + + input.ReleaseRawTensor(); + output.ReleaseRawTensor(); + output_ref.ReleaseRawTensor(); +} + +static int basic_test = 1; +static int n = 1; +static int c = 3; +static int h = 224; +static int w = 224; +static int act_type = 1; +static float leaky_relu_alpha = 2.f; +static int warmup = 0; +static int repeats = 1; +static int check_result = 1; +static int power_mode = 3; +static int threads = 1; + +int main(int argc, const char** argv) { + if (argc < 2) { + std::cout << "usage: ./" << argv[0] + << "basic_test n c h w act_type leaky_relu_alpha" + " warmup repeats check_result power_mode threads" + << std::endl; + return 0; + } + if (argc >= 2) { + basic_test = atoi(argv[1]) > 0; + } + if (argc >= 3) { + n = atoi(argv[2]); + } + if (argc >= 4) { + c = atoi(argv[3]); + } + if (argc >= 5) { + h = atoi(argv[4]); + } + if (argc >= 6) { + w = atoi(argv[5]); + } + if 
(argc >= 7) { + act_type = atoi(argv[6]); + } + if (argc >= 8) { + leaky_relu_alpha = atof(argv[7]); + } + if (argc >= 9) { + warmup = atoi(argv[8]); + } + if (argc >= 10) { + repeats = atoi(argv[9]); + } + if (argc >= 11) { + check_result = atoi(argv[10]); + } + if (argc >= 12) { + power_mode = atoi(argv[11]); + } + if (argc >= 13) { + threads = atoi(argv[12]); + } + // basic test + if (basic_test) { + std::cout << "RUN BASIC TEST BEGIN: " << std::endl; + for (auto& n : {1, 3, 4}) { + for (auto& c : {1, 3, 32}) { + for (auto& h : {5, 64, 112, 224}) { + for (auto& w : {5, 64, 112, 224}) { + for (auto& act_type : {1, 2, 4, 5, 6}) { + for (auto& threads : {1, 2, 4}) { + activation_func(n, + c, + h, + w, + static_cast(act_type), + leaky_relu_alpha, + 0, + 1, + 1, + threads, + static_cast(3)); + } + } + } + } + } + } + std::cout << "RUN BASIC TEST END: " << std::endl; + } + + // costum test + std::cout << "RUN CUSTOM TEST BEGIN: " << std::endl; + activation_func(n, + c, + h, + w, + static_cast(act_type), + leaky_relu_alpha, + warmup, + repeats, + check_result, + threads, + static_cast(power_mode)); + std::cout << "RUN CUSTOM TEST END: " << std::endl; + return 0; +} diff --git a/lite/demo/cxx/test_compute_api/conv_int8_test.cc b/lite/demo/cxx/test_compute_api/conv_int8_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..8f7243be27253b9d2e65133f1c92e5282ea9a29f --- /dev/null +++ b/lite/demo/cxx/test_compute_api/conv_int8_test.cc @@ -0,0 +1,695 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include "compute_api.h" // NOLINT +#include "compute_param.h" // NOLINT +#include "compute_utils.h" // NOLINT +#include "paddle_api.h" // NOLINT +#include "utils.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +static int basic_test = 1; +static int batch = 1; +static int in_channel = 32; +static int in_height = 112; +static int in_width = 112; +static int out_channel = 32; +static int group = 1; +static int kernel_h = 3; +static int kernel_w = 3; +static int pad_h0 = 1; +static int pad_h1 = 1; +static int pad_w0 = 1; +static int pad_w1 = 1; +static int stride_h = 1; +static int stride_w = 1; +static int dila_h = 1; +static int dila_w = 1; +static int flag_relu = 0; +static int flag_bias = 1; +static float leaky_relu_alpha = 2.f; +static int warmup = 0; +static int repeats = 1; +static int check_result = 1; +static int power_mode = 3; +static int threads = 1; + +template +static void conv_basic(const Dtype1* din, + Dtype2* dout, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const Dtype1* weights, + const Dtype2* bias, + int group, + int kernel_w, + int kernel_h, + int stride_w, + int stride_h, + int dila_w, + int dila_h, + int pad_w, + int pad_h, + bool flag_bias, + int act_type, + float six = 6.f, + float scale = 1.f) { + Dtype2 beta = 0; + auto src_data = din; + auto dst_data_ref = dout; + auto weights_data = weights; + auto with_bias = flag_bias; + auto bias_data = bias; + + int in_num = num; + int out_channels = chout; + int out_h = hout; + int out_w = wout; + + int in_channel = chin; + int in_h = hin; + int in_w = win; + int out_c_group = out_channels / group; + int in_c_group = in_channel / group; + + for (int n = 0; n < in_num; ++n) { +#pragma omp parallel for collapse(4) + for (int g = 0; g < group; ++g) { + for (int oc = 0; oc < out_c_group; ++oc) { + for (int oh = 0; oh < out_h; ++oh) { + for (int ow = 0; ow < out_w; ++ow) { + int out_idx = n * group * out_c_group * out_h * out_w + + g * out_c_group * out_h * out_w + oc * out_h * out_w + + oh * out_w + ow; + Dtype2 bias_d = with_bias ? (bias_data[g * out_c_group + oc]) : 0; + dst_data_ref[out_idx] = bias_d; // + dst_data_ref[out_idx] * beta; + for (int ic = 0; ic < in_c_group; ++ic) { + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int iw = ow * stride_w - pad_w + kw * (dila_w); + int ih = oh * stride_h - pad_h + kh * (dila_h); + if (iw < 0 || iw >= in_w) continue; + if (ih < 0 || ih >= in_h) continue; + + int iidx = n * in_channel * in_h * in_w + + g * in_c_group * in_h * in_w + ic * in_h * in_w + + ih * in_w + iw; + int widx = + g * out_c_group * in_c_group * kernel_h * kernel_w + + oc * in_c_group * kernel_h * kernel_w + + ic * kernel_h * kernel_w + kh * kernel_w + kw; + + dst_data_ref[out_idx] += src_data[iidx] * weights_data[widx]; + } + } + } + if (act_type > 0) { + // 1-relu 2-relu6 4-leakyrelu + if (act_type == 1) { + dst_data_ref[out_idx] = dst_data_ref[out_idx] > (Dtype2)0 + ? dst_data_ref[out_idx] + : (Dtype2)0; + } else if (act_type == 2) { + dst_data_ref[out_idx] = dst_data_ref[out_idx] > (Dtype2)0 + ? dst_data_ref[out_idx] + : (Dtype2)0; + dst_data_ref[out_idx] = dst_data_ref[out_idx] < (Dtype2)six + ? dst_data_ref[out_idx] + : (Dtype2)six; + } else if (act_type == 4) { + dst_data_ref[out_idx] = + dst_data_ref[out_idx] > (Dtype2)0 + ? 
dst_data_ref[out_idx] + : (Dtype2)(dst_data_ref[out_idx] * scale); + } else { + printf("this act type: %d does not support \n", act_type); + } + } + } + } + } + } + } +} + +shape_t compute_out_dim(const shape_t& dim_in, const ConvParam& param) { + shape_t dim_out = dim_in; + auto paddings = *param.paddings; + auto dilations = *param.dilations; + auto filter_shape = param.filter->shape(); + dim_out[1] = filter_shape[0]; + auto kernel_h = filter_shape[2]; + auto kernel_w = filter_shape[3]; + auto h = dim_in[2]; + auto w = dim_in[3]; + int dila_h = dilations[0]; + int dila_w = dilations[1]; + int pad_top = paddings[0]; + int pad_bottom = paddings[1]; + int pad_left = paddings[2]; + int pad_right = paddings[3]; + int stride_h = param.strides[0]; + int stride_w = param.strides[1]; + auto kernel_exten = dila_h * (kernel_h - 1) + 1; + auto hout = (h + pad_top + pad_bottom - kernel_exten) / stride_h + 1; + kernel_exten = dila_w * (kernel_w - 1) + 1; + auto wout = (w + pad_left + pad_right - kernel_exten) / stride_w + 1; + dim_out[2] = hout; + dim_out[3] = wout; + return dim_out; +} + +template +void get_conv_param(const shape_t& dim_w, + int g, + const std::vector& strides, + const std::vector& pads, + const std::vector& dila, + bool flag_bias, + bool flag_relu, + ConvParam* param) { + param->x = new Tensor; + param->x->set_precision(PRECISION(kInt8)); + param->filter = new Tensor; + param->filter->Resize(dim_w); + param->filter->set_precision(PRECISION(kInt8)); + if (flag_bias) { + param->bias = new Tensor; + param->bias->Resize({dim_w[0]}); + param->bias->set_precision(PRECISION(kFloat)); + } + param->enable_int8 = true; + param->strides = strides; + param->paddings = std::make_shared>(pads); + param->dilations = std::make_shared>(dila); + if (flag_relu) { + param->activation_param.has_active = true; + param->activation_param.active_type = ActivationType::kRelu; + } + param->groups = g; + + param->output = new Tensor; + param->output->set_precision(ptype); + param->out_ptype = ptype; +} + +void release_param(ConvParam* param) { + param->x->ReleaseRawTensor(); + param->filter->ReleaseRawTensor(); + param->output->ReleaseRawTensor(); + if (param->bias) { + param->bias->ReleaseRawTensor(); + } + delete param->x; + delete param->filter; + delete param->output; + delete param->bias; +} + +void test_conv_int8(const std::vector& input_dims, + const shape_t& weight_dim, + int group, + const std::vector& strides, + const std::vector& pads, + const std::vector& dilas, + bool flag_bias, + bool flag_relu, + const int thread_num, + const int power_mode) { + ComputeEngine::env_init(static_cast(power_mode), + thread_num); + ConvParam param_int8_out; + ConvParam param_fp32_out; + + get_conv_param(weight_dim, + group, + strides, + pads, + dilas, + flag_bias, + flag_relu, + ¶m_int8_out); + + get_conv_param(weight_dim, + group, + strides, + pads, + dilas, + flag_bias, + flag_relu, + ¶m_fp32_out); + Tensor weight_fp32; + Tensor bias_fp32; + weight_fp32.Resize(weight_dim); + fill_tensor_rand(*param_int8_out.filter, -127, 127); + param_fp32_out.filter->CopyFromCpu( + param_int8_out.filter->data()); + if (flag_bias) { + auto dim_b = param_int8_out.bias->shape(); + bias_fp32.Resize(dim_b); + fill_tensor_rand(*param_int8_out.bias, -1.f, 1.f); + param_fp32_out.bias->CopyFromCpu(param_int8_out.bias->data()); + bias_fp32.CopyFromCpu(param_int8_out.bias->data()); + } + + std::vector scale_in{1.f / 127}; + std::vector scale_out{weight_dim[1] * weight_dim[2] * weight_dim[3] / + 127.f}; + std::vector scale_w(weight_dim[0], 1.f 
/ 127); + + param_int8_out.input_scale = scale_in[0]; + param_int8_out.output_scale = scale_out[0]; + param_int8_out.weight_scale = scale_w; + + param_fp32_out.input_scale = scale_in[0]; + param_fp32_out.output_scale = scale_out[0]; + param_fp32_out.weight_scale = scale_w; + + auto wptr_fp32 = weight_fp32.mutable_data(); + auto bptr_fp32 = flag_bias ? bias_fp32.data() : nullptr; + ComputeUtils::ConvWeightsInt8ToFloat( + *param_int8_out.filter, weight_fp32, scale_w); + + ComputeEngine conv_int8_int8; + ComputeEngine conv_int8_fp32; + conv_int8_int8.CreateOperator("conv2d", PRECISION(kInt8)); + conv_int8_fp32.CreateOperator("conv2d", PRECISION(kInt8)); + + /// set param and context + for (auto& dim_in : input_dims) { + param_int8_out.x->Resize(dim_in); + auto out_tmp_dims = compute_out_dim(dim_in, param_int8_out); + if (out_tmp_dims[2] < 1 || out_tmp_dims[3] < 1) { + continue; + } + param_fp32_out.x->Resize(dim_in); + param_int8_out.output->Resize(out_tmp_dims); + param_fp32_out.output->Resize(out_tmp_dims); + break; + } + conv_int8_int8.SetParam(¶m_int8_out); + conv_int8_fp32.SetParam(¶m_fp32_out); + + for (auto& dim_in : input_dims) { + if (weight_dim[1] * group != dim_in[1]) { + "input channel must equal to weights channel\n"; + assert(0); + } + auto dim_out = compute_out_dim(dim_in, param_int8_out); + if (dim_out[2] < 1 || dim_out[3] < 1) { + continue; + } + param_fp32_out.output->ReleaseRawTensor(); + delete param_fp32_out.output; + param_fp32_out.output = new Tensor; + param_fp32_out.output->set_precision(PRECISION(kFloat)); + param_int8_out.output->ReleaseRawTensor(); + delete param_int8_out.output; + param_int8_out.output = new Tensor; + param_int8_out.output->set_precision(PRECISION(kInt8)); + + param_int8_out.x->Resize(dim_in); + param_int8_out.output->Resize(dim_out); + param_fp32_out.x->Resize(dim_in); + param_fp32_out.output->Resize(dim_out); + + Tensor tin_fp32; + tin_fp32.Resize(dim_in); + tin_fp32.set_precision(PRECISION(kFloat)); + Tensor tout_basic_fp32; + Tensor tout_basic_int8; + + fill_tensor_rand(*param_int8_out.x, -127, 127); + param_fp32_out.x->CopyFromCpu(param_int8_out.x->data()); + + ComputeUtils::TensorInt8ToFloat(*param_int8_out.x, tin_fp32, scale_in[0]); + + if (check_result) { + tout_basic_fp32.set_precision(PRECISION(kFloat)); + tout_basic_fp32.Resize(dim_out); + tout_basic_int8.set_precision(PRECISION(kInt8)); + tout_basic_int8.Resize(dim_out); + fill_tensor_const(tout_basic_fp32, 0.f); + auto dout_basic_fp32 = tout_basic_fp32.mutable_data(); + auto dout_basic_int8 = tout_basic_int8.mutable_data(); + const float* din_fp32 = tin_fp32.data(); + conv_basic(din_fp32, + dout_basic_fp32, + dim_in[0], + dim_out[1], + dim_out[2], + dim_out[3], + dim_in[1], + dim_in[2], + dim_in[3], + wptr_fp32, + bptr_fp32, + group, + weight_dim[3], + weight_dim[2], + strides[1], + strides[0], + dilas[1], + dilas[0], + pads[2], + pads[0], + flag_bias, + static_cast(flag_relu)); + ComputeUtils::TensorFloatToInt8( + tout_basic_fp32, tout_basic_int8, scale_out[0]); + } + + double gops = 2.0 * dim_production(tout_basic_int8) * dim_in[1] * + weight_dim[2] * weight_dim[3] / group; + /// warm up + for (int i = 0; i < warmup; ++i) { + conv_int8_int8.Launch(); + } + /// compute fp32 output + Timer t0; + for (int i = 0; i < repeats; ++i) { + t0.Start(); + conv_int8_fp32.Launch(); + t0.Stop(); + } + std::cout << "int8 conv, fp32 output: output shape: (" << dim_out[0] << ", " + << dim_out[1] << ", " << dim_out[2] << ", " << dim_out[3] + << "), running time, avg: " << t0.LapTimes().Avg() + << ", 
min time: " << t0.LapTimes().Min() + << ", total GOPS: " << 1e-9 * gops + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min() + << std::endl; + + /// compute int8 output + t0.Reset(); + for (int i = 0; i < repeats; ++i) { + t0.Start(); + conv_int8_int8.Launch(); + t0.Stop(); + } + std::cout << "int8 conv, int8 output: output shape: (" << dim_out[0] << ", " + << dim_out[1] << ", " << dim_out[2] << ", " << dim_out[3] + << "), running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() + << ", total GOPS: " << 1e-9 * gops + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min() + << std::endl; + + /// compare result fp32 output + if (check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host( + tout_basic_fp32, *param_fp32_out.output, max_ratio, max_diff); + std::cout << "FP32 compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio << std::endl; + if (std::abs(max_ratio) > 1e-5f) { + if (max_diff > 5e-5f) { + std::cout << "basic result\n"; + print_tensor(tout_basic_fp32); + std::cout << "lite result\n"; + print_tensor(*param_fp32_out.output); + Tensor tdiff; + tdiff.Resize(tout_basic_fp32.shape()); + tdiff.set_precision(PRECISION(kFloat)); + tensor_diff(tout_basic_fp32, *param_fp32_out.output, tdiff); + print_tensor(tdiff); + release_param(¶m_int8_out); + release_param(¶m_fp32_out); + std::cerr << "test int8 conv, fp32 out: input: (" << dim_in[0] << ", " + << dim_in[1] << ", " << dim_in[2] << ", " << dim_in[3] + << "), output: (" << dim_out[0] << ", " << dim_out[1] + << ", " << dim_out[2] << ", " << dim_out[3] + << "), weight dim: (" << weight_dim[0] << ", " + << weight_dim[1] << ", " << weight_dim[2] << ", " + << weight_dim[3] << "), pad: " << pads[0] << ", " << pads[1] + << ", " << pads[2] << ", " << pads[3] + << ", stride: " << strides[0] << ", " << strides[1] + << ", dila_: " << dilas[0] << ", " << dilas[1] + << ", group: " << group + << ", bias: " << (flag_bias ? "true" : "false") + << ", relu: " << (flag_relu ? "true" : "false") + << ", threads: " << thread_num + << ", power_mode: " << power_mode << " failed!!\n"; + exit(1); + } + } + } + /// compare result int8 output + if (check_result) { + double max_ratio = 0; + double max_diff = 0; + // ! 
int8 + tensor_cmp_host( + tout_basic_int8, *param_int8_out.output, max_ratio, max_diff); + std::cout << "int8 compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio << std::endl; + if (fabs(max_diff) > 0) { + Tensor tdiff; + tdiff.Resize(tout_basic_int8.shape()); + tdiff.set_precision(PRECISION(kInt8)); + tensor_diff(tout_basic_int8, *param_int8_out.output, tdiff); + auto ptr = tdiff.data(); + auto ptr_basic_fp32 = tout_basic_fp32.data(); + float count = 0; + bool check = true; + for (int i = 0; i < dim_production(tdiff); ++i) { + if (abs(ptr[i]) > 1) { + check = false; + std::cerr << "basic float data: " << ptr_basic_fp32[i] + << ", after scale: " << ptr_basic_fp32[i] / scale_out[0] + << std::endl; + break; + } + if (ptr[i] != 0) { + std::cerr << "basic float data: " << ptr_basic_fp32[i] + << ", after scale: " << ptr_basic_fp32[i] / scale_out[0] + << std::endl; + count += 1; + } + } + check = check && + count < std::max( + 10, static_cast(0.01 * dim_production(tdiff))); + if (!check) { + std::cout << "int8 basic result\n"; + print_tensor(tout_basic_int8); + std::cout << "int8 lite result\n"; + print_tensor(*param_int8_out.output); + std::cout << "int8 diff tensor\n"; + print_tensor(tdiff); + release_param(¶m_int8_out); + release_param(¶m_fp32_out); + std::cerr << "test int8 conv, fp32 out: input: (" << dim_in[0] << ", " + << dim_in[1] << ", " << dim_in[2] << ", " << dim_in[3] + << "), output: (" << dim_out[0] << ", " << dim_out[1] + << ", " << dim_out[2] << ", " << dim_out[3] + << "), weight dim: (" << weight_dim[0] << ", " + << weight_dim[1] << ", " << weight_dim[2] << ", " + << weight_dim[3] << "), pad: " << pads[0] << ", " << pads[1] + << ", " << pads[2] << ", " << pads[3] + << ", stride: " << strides[0] << ", " << strides[1] + << ", dila_: " << dilas[0] << ", " << dilas[1] + << ", group: " << group + << ", bias: " << (flag_bias ? "true" : "false") + << ", relu: " << (flag_relu ? "true" : "false") + << ", threads: " << thread_num + << ", power_mode: " << power_mode << " failed!!\n"; + exit(1); + } + } + } + std::cout << "test int8 conv: input: (" << dim_in[0] << ", " << dim_in[1] + << ", " << dim_in[2] << ", " << dim_in[3] << "), output: (" + << dim_out[0] << ", " << dim_out[1] << ", " << dim_out[2] << ", " + << dim_out[3] << "), weight dim: (" << weight_dim[0] << ", " + << weight_dim[1] << ", " << weight_dim[2] << ", " << weight_dim[3] + << "), pad: " << pads[0] << ", " << pads[1] << ", " << pads[2] + << ", " << pads[3] << ", stride: " << strides[0] << ", " + << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] + << ", bias: " << (flag_bias ? "true" : "false") + << ", relu: " << (flag_relu ? "true" : "false") + << ", threads: " << thread_num << ", power_mode: " << power_mode + << " successed!!\n"; + } + release_param(¶m_int8_out); + release_param(¶m_fp32_out); +} + +int main(int argc, const char** argv) { + if (argc < 2) { + std::cout << "usage: ./" << argv[0] + << "basic_test check_result batch in_channel in_height in_width " + "out_channel group kernel_h pad_h0 stride_h dila_h flag_act " + "flag_bias warmup repeats threads power_mode." 
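For the int8 output the comparison above deliberately tolerates off-by-one values: re-quantizing the float reference can round differently than the fixed-point kernel right at the scale boundary, so a case only fails when some element differs by more than one quantization step, or when more than max(10, 1% of the elements) differ at all. A standalone restatement of that rule (the helper name is illustrative, not part of the patch):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Returns true when an int8 diff tensor is acceptable under the rule used by
// the test: no element may be off by more than one quantization step, and at
// most max(10, 1% of elements) may be off by exactly one.
bool int8_diff_acceptable(const int8_t* diff, int64_t n) {
  int64_t off_by_one = 0;
  for (int64_t i = 0; i < n; ++i) {
    int d = std::abs(static_cast<int>(diff[i]));
    if (d > 1) return false;   // a real numerical error
    if (d == 1) ++off_by_one;  // rounding at the quantization boundary
  }
  int64_t limit = std::max<int64_t>(10, static_cast<int64_t>(0.01 * n));
  return off_by_one < limit;
}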
+ << std::endl; + return 0; + } + if (argc >= 2) { + basic_test = atoi(argv[1]); + } + if (argc >= 3) { + check_result = atoi(argv[2]); + } + if (argc >= 4) { + batch = atoi(argv[3]); + } + if (argc >= 5) { + in_channel = atoi(argv[4]); + } + if (argc >= 6) { + in_height = atoi(argv[5]); + } + if (argc >= 7) { + in_width = atoi(argv[6]); + } + if (argc >= 8) { + out_channel = atoi(argv[7]); + } + if (argc >= 9) { + group = atof(argv[8]); + } + if (argc >= 10) { + if (argc >= 13) { + kernel_h = atoi(argv[9]); + kernel_w = kernel_h; + pad_h0 = atoi(argv[10]); + pad_h1 = pad_h0; + pad_w0 = pad_h0; + pad_w1 = pad_h0; + stride_h = atoi(argv[11]); + stride_w = stride_h; + dila_h = atoi(argv[12]); + dila_w = dila_h; + } else { + std::cout + << "kernel_h padh0 stride_h dila_h must be set at the same time." + << std::endl; + } + } + if (argc >= 14) { + flag_relu = atoi(argv[13]); + } + if (argc >= 15) { + flag_bias = atoi(argv[14]); + } + if (argc >= 16) { + warmup = atoi(argv[15]); + } + if (argc >= 17) { + repeats = atoi(argv[16]); + } + if (argc >= 18) { + threads = atoi(argv[17]); + } + if (argc >= 19) { + power_mode = atoi(argv[18]); + } + if (argc >= 20) { + leaky_relu_alpha = atof(argv[19]); + } + // basic test + if (basic_test) { + std::cout << "RUN BASIC TEST BEGIN: " << std::endl; + for (auto& cin : {1, 3, 8}) { + for (auto& cout : {1, 5, 16}) { + for (auto& g : {1, 2}) { + for (auto& kw : {1, 2, 3}) { + for (auto& kh : {1, 2, 3}) { + for (auto& stride : {1, 2}) { + for (auto& pad_left : {0, 2}) { + for (auto& pad_right : {0, 2}) { + for (auto& pad_top : {0, 2}) { + for (auto& pad_bottom : {0, 2}) { + for (auto& dila : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_act : {0, 1, 2, 4}) { + for (auto& threads : {1, 2, 4}) { + if (cin % g != 0 || cout % g != 0) { + continue; + } + std::vector dims; + shape_t weights_dim({cout, cin / g, kh, kw}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 3, 19, 32}) { + dims.push_back(shape_t({batch, cin, h, h})); + } + } + // skip 3x3 depthwise conv + if (g == cin && cin == cout && kw == 3 && + kh == 3) { + break; + } + // skip 3x3s1 direct conv + if (g == 1 && (cin != 1 || cout != 1) && + kw == 3 && kh == 3 && stride == 1) { + break; + } + test_conv_int8( + dims, + weights_dim, + g, + {stride, stride}, + {pad_top, pad_bottom, pad_left, pad_right}, + {dila, dila}, + flag_bias, + flag_relu, + threads, + 3); + } + } + } + } + } + } + } + } + } + } + } + } + } + } + std::cout << "RUN BASIC TEST END: " << std::endl; + } + + // costum test + std::cout << "RUN CUSTOM TEST BEGIN: " << std::endl; + std::vector dims; + dims.emplace_back(shape_t({batch, in_channel, in_height, in_width})); + shape_t weights_dim({out_channel, in_channel / group, kernel_h, kernel_w}); + test_conv_int8(dims, + weights_dim, + group, + {stride_h, stride_w}, + {pad_h0, pad_h1, pad_w0, pad_w1}, + {dila_h, dila_w}, + flag_bias, + flag_relu, + threads, + 3); + std::cout << "RUN CUSTOM TEST END: " << std::endl; + return 0; +} diff --git a/lite/demo/cxx/test_compute_api/conv_test.cc b/lite/demo/cxx/test_compute_api/conv_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..44d2c9790d00b0f9a7a2f0261e414e37ad0235f6 --- /dev/null +++ b/lite/demo/cxx/test_compute_api/conv_test.cc @@ -0,0 +1,539 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "compute_api.h" // NOLINT +#include "compute_param.h" // NOLINT +#include "compute_utils.h" // NOLINT +#include "paddle_api.h" // NOLINT +#include "utils.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +static int basic_test = 1; +static int batch = 1; +static int in_channel = 32; +static int in_height = 112; +static int in_width = 112; +static int out_channel = 32; +static int group = 1; +static int kernel_h = 3; +static int kernel_w = 3; +static int pad_h0 = 1; +static int pad_h1 = 1; +static int pad_w0 = 1; +static int pad_w1 = 1; +static int stride_h = 1; +static int stride_w = 1; +static int dila_h = 1; +static int dila_w = 1; +static int flag_act = 0; +static int flag_bias = 1; +static float leaky_relu_alpha = 2.f; +static int warmup = 0; +static int repeats = 1; +static int check_result = 1; +static int power_mode = 3; +static int threads = 1; + +template +static void conv_basic(const Dtype1* din, + Dtype2* dout, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const Dtype1* weights, + const Dtype2* bias, + int group, + int kernel_w, + int kernel_h, + int stride_w, + int stride_h, + int dila_w, + int dila_h, + int pad_w, + int pad_h, + bool flag_bias, + int act_type, + float six = 6.f, + float scale = 1.f) { + Dtype2 beta = 0; + auto src_data = din; + auto dst_data_ref = dout; + auto weights_data = weights; + auto with_bias = flag_bias; + auto bias_data = bias; + + int in_num = num; + int out_channels = chout; + int out_h = hout; + int out_w = wout; + + int in_channel = chin; + int in_h = hin; + int in_w = win; + int out_c_group = out_channels / group; + int in_c_group = in_channel / group; + + for (int n = 0; n < in_num; ++n) { +#pragma omp parallel for collapse(4) + for (int g = 0; g < group; ++g) { + for (int oc = 0; oc < out_c_group; ++oc) { + for (int oh = 0; oh < out_h; ++oh) { + for (int ow = 0; ow < out_w; ++ow) { + int out_idx = n * group * out_c_group * out_h * out_w + + g * out_c_group * out_h * out_w + oc * out_h * out_w + + oh * out_w + ow; + Dtype2 bias_d = with_bias ? (bias_data[g * out_c_group + oc]) : 0; + dst_data_ref[out_idx] = bias_d; // + dst_data_ref[out_idx] * beta; + for (int ic = 0; ic < in_c_group; ++ic) { + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int iw = ow * stride_w - pad_w + kw * (dila_w); + int ih = oh * stride_h - pad_h + kh * (dila_h); + if (iw < 0 || iw >= in_w) continue; + if (ih < 0 || ih >= in_h) continue; + + int iidx = n * in_channel * in_h * in_w + + g * in_c_group * in_h * in_w + ic * in_h * in_w + + ih * in_w + iw; + int widx = + g * out_c_group * in_c_group * kernel_h * kernel_w + + oc * in_c_group * kernel_h * kernel_w + + ic * kernel_h * kernel_w + kh * kernel_w + kw; + + dst_data_ref[out_idx] += src_data[iidx] * weights_data[widx]; + } + } + } + if (act_type > 0) { + // 1-relu 2-relu6 4-leakyrelu + if (act_type == 1) { + dst_data_ref[out_idx] = dst_data_ref[out_idx] > (Dtype2)0 + ? 
dst_data_ref[out_idx] + : (Dtype2)0; + } else if (act_type == 2) { + dst_data_ref[out_idx] = dst_data_ref[out_idx] > (Dtype2)0 + ? dst_data_ref[out_idx] + : (Dtype2)0; + dst_data_ref[out_idx] = dst_data_ref[out_idx] < (Dtype2)six + ? dst_data_ref[out_idx] + : (Dtype2)six; + } else if (act_type == 4) { + dst_data_ref[out_idx] = + dst_data_ref[out_idx] > (Dtype2)0 + ? dst_data_ref[out_idx] + : (Dtype2)(dst_data_ref[out_idx] * scale); + } else { + printf("this act type: %d does not support \n", act_type); + } + } + } + } + } + } + } +} + +shape_t compute_out_dim(const shape_t& dim_in, const ConvParam& param) { + shape_t dim_out = dim_in; + auto paddings = *param.paddings; + auto dilations = *param.dilations; + auto filter_shape = param.filter->shape(); + dim_out[1] = filter_shape[0]; + auto kernel_h = filter_shape[2]; + auto kernel_w = filter_shape[3]; + auto h = dim_in[2]; + auto w = dim_in[3]; + int dila_h = dilations[0]; + int dila_w = dilations[1]; + int pad_top = paddings[0]; + int pad_bottom = paddings[1]; + int pad_left = paddings[2]; + int pad_right = paddings[3]; + int stride_h = param.strides[0]; + int stride_w = param.strides[1]; + auto kernel_exten = dila_h * (kernel_h - 1) + 1; + auto hout = (h + pad_top + pad_bottom - kernel_exten) / stride_h + 1; + kernel_exten = dila_w * (kernel_w - 1) + 1; + auto wout = (w + pad_left + pad_right - kernel_exten) / stride_w + 1; + dim_out[2] = hout; + dim_out[3] = wout; + return dim_out; +} + +void test_conv_fp32(const std::vector& input_dims, + const shape_t& weight_dim, + int group, + const std::vector& strides, + const std::vector& pads, + const std::vector& dilas, + bool flag_bias, + int flag_act, + const int thread_num, + const int power_mode, + const float leakey_relu_scale) { + ComputeEngine::env_init(static_cast(power_mode), + thread_num); + ConvParam param; + param.x = new Tensor; + param.x->set_precision(PRECISION(kFloat)); + param.filter = new Tensor; + param.filter->Resize(weight_dim); + param.filter->set_precision(PRECISION(kFloat)); + if (flag_bias) { + param.bias = new Tensor; + param.bias->Resize({weight_dim[0]}); + param.bias->set_precision(PRECISION(kFloat)); + } + param.strides = strides; + param.paddings = std::make_shared>(pads); + param.dilations = std::make_shared>(dilas); + param.groups = group; + const float six = 6.f; + if (flag_act > 0) { + ActivationParam act_param; + act_param.has_active = true; + act_param.active_type = + static_cast(flag_act); // 1-relu, 2-relu6, 4-leakyrelu + if (flag_act == 1) { + // param.fuse_relu = true; + } else if (flag_act == 2) { + act_param.Relu_clipped_coef = six; + } else if (flag_act == 4) { + act_param.Leaky_relu_alpha = leakey_relu_scale; + } + param.activation_param = act_param; + } + + param.output = new Tensor; + param.output->set_precision(PRECISION(kFloat)); + + fill_tensor_rand(*param.filter, -1.f, 1.f); + // fill_tensor_const(*param.filter, 1.f); + if (flag_bias) { + fill_tensor_rand(*param.bias, -1.f, 1.f); + // fill_tensor_const(*param.bias, 1.f); + } + auto wptr = param.filter->data(); + auto bias_ptr = flag_bias ? 
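compute_out_dim above is the standard convolution shape rule: each spatial dimension gains the padding, loses the dilated kernel extent, and is divided by the stride. With this demo's default configuration (112x112 input, 3x3 kernel, pad 1 on every side, stride 1, dilation 1) it works out to:

  kernel_exten = 1 * (3 - 1) + 1 = 3
  hout = (112 + 1 + 1 - 3) / 1 + 1 = 112   (and likewise wout = 112)

so the default case preserves the spatial size; input shapes whose hout or wout would drop below 1 are skipped by the loops that follow.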
param.bias->data() : nullptr; + + ComputeEngine conv; + conv.CreateOperator("conv2d"); + for (auto& dim_in : input_dims) { + param.x->Resize(dim_in); + shape_t out_tmp_dims = compute_out_dim(dim_in, param); + if (out_tmp_dims[2] < 1 || out_tmp_dims[3] < 1) { + continue; + } + param.output->Resize(out_tmp_dims); + break; + } + conv.SetParam(¶m); + + for (auto& dim_in : input_dims) { + if (weight_dim[1] * group != dim_in[1]) { + "input channel must equal to weights channel\n"; + exit(1); + } + shape_t dim_out = compute_out_dim(dim_in, param); + if (dim_out[2] < 1 || dim_out[3] < 1) { + continue; + } + param.x->Resize(dim_in); + param.output->Resize(dim_out); + + fill_tensor_rand(*param.x, -1.f, 1.f); + // fill_tensor_const(*param.x, 1.f); + auto din = param.x->data(); + + Tensor tout_basic; + if (check_result) { + tout_basic.set_precision(PRECISION(kFloat)); + tout_basic.Resize(dim_out); + fill_tensor_const(tout_basic, 0.f); + auto dout_basic = tout_basic.mutable_data(); + conv_basic(din, + dout_basic, + dim_in[0], + dim_out[1], + dim_out[2], + dim_out[3], + dim_in[1], + dim_in[2], + dim_in[3], + wptr, + bias_ptr, + group, + weight_dim[3], + weight_dim[2], + strides[1], + strides[0], + dilas[1], + dilas[0], + pads[2], + pads[0], + flag_bias, + flag_act, + six, + leakey_relu_scale); + } + /// warm up + for (int i = 0; i < warmup; ++i) { + conv.Launch(); + } + /// compute + Timer t0; + for (int i = 0; i < repeats; ++i) { + t0.Start(); + conv.Launch(); + t0.Stop(); + } + + double gops = 2.0 * dim_production(*param.output) * dim_in[1] * + weight_dim[2] * weight_dim[3] / param.groups; + std::cout << "conv fp32: input shape: (" << dim_in[0] << ", " << dim_in[1] + << ", " << dim_in[2] << ", " << dim_in[3] << "), output shape: (" + << dim_out[0] << ", " << dim_out[1] << ", " << dim_out[2] << ", " + << dim_out[3] << "),running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() + << ", total GOPS: " << 1e-9 * gops + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min() + << std::endl; + + if (check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tout_basic, *param.output, max_ratio, max_diff); + std::cout << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio << std::endl; + if (std::abs(max_ratio) > 1e-3f) { + if (max_diff > 5e-4f) { + std::cout << "basic result\n"; + print_tensor(tout_basic); + std::cout << "lite result\n"; + print_tensor(*param.output); + Tensor tdiff; + tdiff.Resize(tout_basic.shape()); + tdiff.set_precision(PRECISION(kFloat)); + tensor_diff(tout_basic, *param.output, tdiff); + print_tensor(tdiff); + std::cerr << "test fp32 conv: input: (" << dim_in[0] << ", " + << dim_in[1] << ", " << dim_in[2] << ", " << dim_in[3] + << "), output: (" << dim_out[0] << ", " << dim_out[1] + << ", " << dim_out[2] << ", " << dim_out[3] + << "), weight dim: (" << weight_dim[0] << ", " + << weight_dim[1] << ", " << weight_dim[2] << ", " + << weight_dim[3] << "), pad: " << pads[0] << ", " << pads[1] + << ", " << pads[2] << ", " << pads[3] + << ", stride: " << strides[0] << ", " << strides[1] + << ", dila_: " << dilas[0] << ", " << dilas[1] + << ", group: " << group + << ", bias: " << (flag_bias ? 
"true" : "false") + << ", act: " << flag_act << ", threads: " << thread_num + << ", power_mode: " << power_mode << " failed!!\n"; + exit(1); + } + } + } + std::cout << "test fp32 conv: input: (" << dim_in[0] << ", " << dim_in[1] + << ", " << dim_in[2] << ", " << dim_in[3] << "), output: (" + << dim_out[0] << ", " << dim_out[1] << ", " << dim_out[2] << ", " + << dim_out[3] << "), weight dim: (" << weight_dim[0] << ", " + << weight_dim[1] << ", " << weight_dim[2] << ", " << weight_dim[3] + << "), pad: " << pads[0] << ", " << pads[1] << ", " << pads[2] + << ", " << pads[3] << ", stride: " << strides[0] << ", " + << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] + << ", group: " << group + << ", bias: " << (flag_bias ? "true" : "false") + << ", act: " << flag_act << ", threads: " << thread_num + << ", power_mode: " << power_mode << " success!!\n"; + } + param.x->ReleaseRawTensor(); + param.filter->ReleaseRawTensor(); + param.output->ReleaseRawTensor(); + if (flag_bias) { + param.bias->ReleaseRawTensor(); + } + delete param.x; + delete param.filter; + delete param.output; + delete param.bias; +} + +int main(int argc, const char** argv) { + if (argc < 2) { + std::cout << "usage: ./" << argv[0] + << "basic_test check_result batch in_channel in_height in_width " + "out_channel group kernel_h pad_h0 stride_h dila_h flag_act " + "flag_bias warmup repeats threads power_mode." + << std::endl; + return 0; + } + if (argc >= 2) { + basic_test = atoi(argv[1]); + } + if (argc >= 3) { + check_result = atoi(argv[2]); + } + if (argc >= 4) { + batch = atoi(argv[3]); + } + if (argc >= 5) { + in_channel = atoi(argv[4]); + } + if (argc >= 6) { + in_height = atoi(argv[5]); + } + if (argc >= 7) { + in_width = atoi(argv[6]); + } + if (argc >= 8) { + out_channel = atoi(argv[7]); + } + if (argc >= 9) { + group = atof(argv[8]); + } + if (argc >= 10) { + if (argc >= 13) { + kernel_h = atoi(argv[9]); + kernel_w = kernel_h; + pad_h0 = atoi(argv[10]); + pad_h1 = pad_h0; + pad_w0 = pad_h0; + pad_w1 = pad_h0; + stride_h = atoi(argv[11]); + stride_w = stride_h; + dila_h = atoi(argv[12]); + dila_w = dila_h; + } else { + std::cout + << "kernel_h padh0 stride_h dila_h must be set at the same time." 
+ << std::endl; + } + } + if (argc >= 14) { + flag_act = atoi(argv[13]); + } + if (argc >= 15) { + flag_bias = atoi(argv[14]); + } + if (argc >= 16) { + warmup = atoi(argv[15]); + } + if (argc >= 17) { + repeats = atoi(argv[16]); + } + if (argc >= 18) { + threads = atoi(argv[17]); + } + if (argc >= 19) { + power_mode = atoi(argv[18]); + } + if (argc >= 20) { + leaky_relu_alpha = atof(argv[19]); + } + // basic test + if (basic_test) { + std::cout << "RUN BASIC TEST BEGIN: " << std::endl; + for (auto& cin : {1, 3, 8}) { + for (auto& cout : {1, 5, 16}) { + for (auto& g : {1, 2}) { + for (auto& kw : {1, 2, 3}) { + for (auto& kh : {1, 2, 3}) { + for (auto& stride : {1, 2}) { + for (auto& pad_left : {0, 2}) { + for (auto& pad_right : {0, 2}) { + for (auto& pad_top : {0, 2}) { + for (auto& pad_bottom : {0, 2}) { + for (auto& dila : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_act : {0, 1, 2, 4}) { + for (auto& threads : {1, 2, 4}) { + if (cin % g != 0 || cout % g != 0) { + continue; + } + std::vector dims; + shape_t weights_dim({cout, cin / g, kh, kw}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 3, 19, 32}) { + dims.push_back(shape_t({batch, cin, h, h})); + } + } + // skip 3x3 depthwise conv + if (g == cin && cin == cout && kw == 3 && + kh == 3) { + break; + } + // skip 3x3s1 direct conv + if (g == 1 && (cin != 1 || cout != 1) && + kw == 3 && kh == 3 && stride == 1) { + break; + } + const float leakey_relu_scale = 2.22; + test_conv_fp32( + dims, + weights_dim, + g, + {stride, stride}, + {pad_top, pad_bottom, pad_left, pad_right}, + {dila, dila}, + flag_bias, + flag_act, + threads, + 3, + leakey_relu_scale); + } + } + } + } + } + } + } + } + } + } + } + } + } + } + std::cout << "RUN BASIC TEST END: " << std::endl; + } + + // costum test + std::cout << "RUN CUSTOM TEST BEGIN: " << std::endl; + std::vector dims; + dims.emplace_back(shape_t({batch, in_channel, in_height, in_width})); + shape_t weights_dim({out_channel, in_channel / group, kernel_h, kernel_w}); + test_conv_fp32(dims, + weights_dim, + group, + {stride_h, stride_w}, + {pad_h0, pad_h1, pad_w0, pad_w1}, + {dila_h, dila_w}, + flag_bias, + flag_act, + threads, + 3, + leaky_relu_alpha); + std::cout << "RUN CUSTOM TEST END: " << std::endl; + return 0; +} diff --git a/lite/demo/cxx/test_compute_api/utils.h b/lite/demo/cxx/test_compute_api/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..7ae51fa8656f1783960bf77a024a21744114695b --- /dev/null +++ b/lite/demo/cxx/test_compute_api/utils.h @@ -0,0 +1,420 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include +#include "compute_api.h" // NOLINT +#include "paddle_api.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +template +void fill_tensor_host_const_impl(Dtype* dio, Dtype value, int64_t size) { + for (int64_t i = 0; i < size; ++i) { + dio[i] = value; + } +} + +int64_t dim_production(const Tensor& t) { + shape_t s = t.shape(); + int64_t n = 1; + for (int i = 0; i < s.size(); ++i) { + n *= s[i]; + } + return n; +} +/** + * \brief Fill the host tensor buffer with rand value. + * \param tensor The reference of input tensor. + */ +void fill_tensor_const(Tensor& tensor, float value) { // NOLINT + int64_t size = dim_production(tensor); + PrecisionType type = tensor.precision(); + switch (type) { + case PRECISION(kInt8): + fill_tensor_host_const_impl( + tensor.mutable_data(), static_cast(value), size); + break; + case PRECISION(kInt32): + fill_tensor_host_const_impl( + tensor.mutable_data(), static_cast(value), size); + break; + case PRECISION(kFloat): + fill_tensor_host_const_impl( + tensor.mutable_data(), static_cast(value), size); + break; + default: + std::cerr << "data type is unsupported now." << std::endl; + assert(0); + } +} + +template +void fill_tensor_host_rand_impl(Dtype* dio, int64_t size) { + for (int64_t i = 0; i < size; ++i) { + Dtype rand_x = static_cast(rand() % 256); // NOLINT + dio[i] = (rand_x - 128) / 128; + } +} + +template <> +void fill_tensor_host_rand_impl(signed char* dio, int64_t size) { + for (int64_t i = 0; i < size; ++i) { + dio[i] = rand() % 256 - 128; // NOLINT + } +} +template <> +void fill_tensor_host_rand_impl(unsigned char* dio, + int64_t size) { + for (int64_t i = 0; i < size; ++i) { + dio[i] = rand() % 256; // NOLINT + } +} +/** + * \brief Fill the host tensor buffer with rand value. + * \param The reference of input tensor. + */ +void fill_tensor_rand(Tensor& tensor) { // NOLINT + int64_t size = dim_production(tensor); + PrecisionType type = tensor.precision(); + switch (type) { + case PRECISION(kInt8): + fill_tensor_host_rand_impl(tensor.mutable_data(), size); + break; + case PRECISION(kInt32): + fill_tensor_host_rand_impl(tensor.mutable_data(), size); + break; + case PRECISION(kFloat): + fill_tensor_host_rand_impl(tensor.mutable_data(), size); + break; + default: + std::cerr << "data type: is unsupported now" << std::endl; + assert(0); + } +} + +template +void fill_tensor_host_rand_impl2(Dtype* dio, + Dtype vstart, + Dtype vend, + int64_t size) { + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(0, 1.f); + for (int64_t i = 0; i < size; ++i) { + Dtype random_num = static_cast(vstart + (vend - vstart) * dis(gen)); + dio[i] = random_num; + } +} + +/** + * \brief Fill the host tensor buffer with rand value from vstart to vend. + * \param tensor The reference of input tensor. 
+ */ +void fill_tensor_rand(Tensor& tensor, float vstart, float vend) { // NOLINT + int64_t size = dim_production(tensor); + PrecisionType type = tensor.precision(); + switch (type) { + case PRECISION(kInt8): + fill_tensor_host_rand_impl2(tensor.mutable_data(), + static_cast(vstart), + static_cast(vend), + size); + break; + case PRECISION(kInt32): + fill_tensor_host_rand_impl2(tensor.mutable_data(), + static_cast(vstart), + static_cast(vend), + size); + break; + case PRECISION(kFloat): + fill_tensor_host_rand_impl2( + tensor.mutable_data(), vstart, vend, size); + break; + default: + std::cerr << "data type: is unsupported now" << std::endl; + assert(0); + } +} + +template +void print_tensor_host_impl(const Dtype* din, int64_t size, int64_t width); + +template <> +void print_tensor_host_impl(const float* din, int64_t size, int64_t width) { + for (int i = 0; i < size; ++i) { + printf("%.6f ", din[i]); + if ((i + 1) % width == 0) { + printf("\n"); + } + } + printf("\n"); +} + +template <> +void print_tensor_host_impl(const int* din, int64_t size, int64_t width) { + for (int i = 0; i < size; ++i) { + printf("%d ", din[i]); + if ((i + 1) % width == 0) { + printf("\n"); + } + } + printf("\n"); +} + +template <> +void print_tensor_host_impl(const signed char* din, + int64_t size, + int64_t width) { + for (int i = 0; i < size; ++i) { + printf("%d ", din[i]); + if ((i + 1) % width == 0) { + printf("\n"); + } + } + printf("\n"); +} +/** + * \brief Print the data in host tensor. + * \param tensor The reference of input tensor. + */ +void print_tensor(const Tensor& tensor) { + printf("host tensor data size: %ld\n", dim_production(tensor)); + int64_t size = dim_production(tensor); + int64_t width = tensor.shape()[tensor.shape().size() - 1]; + PrecisionType type = tensor.precision(); + switch (type) { + case PRECISION(kInt8): + print_tensor_host_impl(tensor.data(), size, width); + break; + case PRECISION(kInt32): + print_tensor_host_impl(tensor.data(), size, width); + break; + case PRECISION(kFloat): + print_tensor_host_impl(tensor.data(), size, width); + break; + default: + std::cerr << "data type: is unsupported now" << std::endl; + assert(0); + } +} + +template +double tensor_mean_value_host_impl(const Dtype* din, int64_t size) { + double sum = 0.0; + for (int64_t i = 0; i < size; ++i) { + sum += din[i]; + } + return sum / size; +} + +double tensor_mean(const Tensor& tensor) { + int64_t size = dim_production(tensor); + PrecisionType type = tensor.precision(); + switch (type) { + case PRECISION(kInt8): + return tensor_mean_value_host_impl(tensor.data(), size); + case PRECISION(kInt32): + return tensor_mean_value_host_impl(tensor.data(), size); + case PRECISION(kFloat): + return tensor_mean_value_host_impl(tensor.data(), size); + default: + std::cerr << "data type: is unsupported now" << std::endl; + assert(0); + } + return 0.0; +} + +template +void data_diff_kernel(const dtype* src1_truth, + const dtype* src2, + int size, + double& max_ratio, // NOLINT + double& max_diff) { // NOLINT + const double eps = 1e-6f; + max_diff = fabs(src1_truth[0] - src2[0]); + max_ratio = fabs(max_diff) / (std::abs(src1_truth[0]) + eps); + for (int i = 1; i < size; ++i) { + double diff = fabs(src1_truth[i] - src2[i]); + double ratio = fabs(diff) / (std::abs(src1_truth[i]) + eps); + if (max_ratio < ratio) { + max_diff = diff; + max_ratio = ratio; + } + } +} + +void tensor_cmp_host(const Tensor& src1_basic, + const Tensor& src2, + double& max_ratio, // NOLINT + double& max_diff) { // NOLINT + max_ratio = 0.; + max_diff = 
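tensor_cmp_host reports the largest relative error and the absolute difference at that element; the 1e-6 epsilon keeps near-zero reference values from dividing by zero, and it is also why the conv tests only fail when a relative threshold and an absolute threshold are exceeded together. A self-contained illustration of the metric on plain arrays (all values are made up):

#include <cmath>
#include <cstdio>

int main() {
  const double eps = 1e-6;
  // Reference values vs. values under test; the last pair has a tiny
  // reference, so its relative error dominates even though the absolute
  // difference is only 0.001.
  float ref[4] = {1.0f, 0.5f, -2.0f, 0.001f};
  float got[4] = {1.0f, 0.501f, -2.01f, 0.0f};
  double max_ratio = 0.0;
  double max_diff = 0.0;
  for (int i = 0; i < 4; ++i) {
    double diff = std::fabs(ref[i] - got[i]);
    double ratio = diff / (std::fabs(ref[i]) + eps);
    if (ratio > max_ratio) {
      max_ratio = ratio;
      max_diff = diff;
    }
  }
  std::printf("max diff: %f, max ratio: %f\n", max_diff, max_ratio);
  return 0;
}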
0.; + int64_t size = dim_production(src1_basic); + int64_t size2 = dim_production(src2); + if (size != size2) { + std::cerr << "ERROR: tensor_cmp_host: wrong shape" << std::endl; + assert(0); + } + auto ptype1 = src1_basic.precision(); + auto ptype2 = src2.precision(); + if (ptype1 != ptype2) { + std::cerr << "ERROR: tensor_cmp_host: wrong data type" << std::endl; + assert(0); + } + if (size == 0) return; + switch (src1_basic.precision()) { + case PRECISION(kFloat): + data_diff_kernel(src1_basic.data(), + src2.data(), + size, + max_ratio, + max_diff); + return; + case PRECISION(kInt32): + data_diff_kernel( + src1_basic.data(), src2.data(), size, max_ratio, max_diff); + return; + case PRECISION(kInt8): + data_diff_kernel(src1_basic.data(), + src2.data(), + size, + max_ratio, + max_diff); + return; + default: + std::cerr << "data type: is unsupported now" << std::endl; + assert(0); + } +} + +template +void tensor_diff_kernel(const dtype* src1, + const dtype* src2, + dtype* dst, + int64_t size) { + for (int i = 0; i < size; ++i) { + dst[i] = src1[i] - src2[i]; + } +} +void tensor_diff(const Tensor& t1, const Tensor& t2, Tensor& tdiff) { // NOLINT + int64_t size1 = dim_production(t1); + int64_t size2 = dim_production(t2); + if (size1 != size2) { + std::cerr << "ERROR: tensor_diff: wrong shape" << std::endl; + assert(0); + } + auto ptype1 = t1.precision(); + auto ptype2 = t2.precision(); + if (ptype1 != ptype2) { + std::cerr << "ERROR: tensor_diff: wrong data type" << std::endl; + assert(0); + } + tdiff.Resize(t1.shape()); + switch (t1.precision()) { + case PRECISION(kFloat): + tensor_diff_kernel(t1.data(), + t2.data(), + tdiff.mutable_data(), + size1); + return; + case PRECISION(kInt32): + tensor_diff_kernel( + t1.data(), t2.data(), tdiff.mutable_data(), size1); + case PRECISION(kInt8): + tensor_diff_kernel(t1.data(), + t2.data(), + tdiff.mutable_data(), + size1); + return; + default: + std::cerr << "data type: is unsupported now" << std::endl; + assert(0); + } +} + +template +class TimeList { + public: + void Clear() { laps_t_.clear(); } + void Add(T t) { laps_t_.push_back(t); } + T Last(size_t offset = 0) const { + if (!Size(offset)) { + return 0; + } + return laps_t_.back(); + } + T Max(size_t offset = 0) const { + if (!Size(offset)) { + return 0; + } + return *std::max_element((laps_t_.begin() + offset), laps_t_.end()); + } + T Min(size_t offset = 0) const { + if (!Size(offset)) { + return 0; + } + return *std::min_element((laps_t_.begin() + offset), laps_t_.end()); + } + T Sum(size_t offset = 0) const { + if (!Size(offset)) { + return 0; + } + return std::accumulate((laps_t_.begin() + offset), laps_t_.end(), 0.0); + } + size_t Size(size_t offset = 0) const { + size_t size = (laps_t_.size() <= offset) ? 
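The offset argument threaded through these accessors lets a summary skip warm-up laps: Max, Min, Sum (and Avg just below) only consider laps recorded after the first offset entries. A usage fragment of the TimeList defined in this header (values are illustrative):

// Record four laps; the first one is a cold-start outlier.
TimeList<float> laps;
for (float ms : {9.0f, 5.2f, 5.0f, 5.1f}) {
  laps.Add(ms);
}
float min_all    = laps.Min();   // 5.0, over every lap
float sum_steady = laps.Sum(1);  // 15.3, ignoring the first lap
float avg_steady = laps.Avg(1);  // 5.1,  ignoring the first lap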
0 : (laps_t_.size() - offset); + return size; + } + T Avg(size_t offset = 0) const { + if (!Size(offset)) { + return 0; + } + return Sum(offset) / Size(offset); + } + const std::vector& Raw() const { return laps_t_; } + + private: + std::vector laps_t_; +}; + +class Timer { + public: + Timer() = default; + virtual ~Timer() = default; + + void Reset() { laps_t_.Clear(); } + void Start() { t_start_ = std::chrono::system_clock::now(); } + float Stop() { + t_stop_ = std::chrono::system_clock::now(); + auto ts = std::chrono::duration_cast(t_stop_ - + t_start_); + float elapse_ms = 1000.f * static_cast(ts.count()) * + std::chrono::microseconds::period::num / + std::chrono::microseconds::period::den; + this->laps_t_.Add(elapse_ms); + return elapse_ms; + } + float AvgLapTimeMs() const { return laps_t_.Avg(); } + const TimeList& LapTimes() const { return laps_t_; } + + protected: + TimeList laps_t_; + + private: + std::chrono::time_point t_start_, t_stop_; +}; diff --git a/lite/operators/activation_ops.cc b/lite/operators/activation_ops.cc index 01e4116c94c75df3bd5360494c57419fe57c18ef..6523e6864cc83a4dae13180bf26a5920b53c22e9 100644 --- a/lite/operators/activation_ops.cc +++ b/lite/operators/activation_ops.cc @@ -93,6 +93,11 @@ bool ActivationOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) { return true; } +bool ActivationOp::SetParam(ParamBase* param) { + param_ = *static_cast(param); + return true; +} + } // namespace operators } // namespace lite } // namespace paddle diff --git a/lite/operators/activation_ops.h b/lite/operators/activation_ops.h index 250a88de42b4004932f78b0490a844d4a8dbc6fe..2d5eff5cfa6a57ac0fcab763c08b0fb8ebb3d93a 100644 --- a/lite/operators/activation_ops.h +++ b/lite/operators/activation_ops.h @@ -31,6 +31,8 @@ class ActivationOp : public OpLite { bool InferShapeImpl() const override; + bool SetParam(ParamBase* param) override; + bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override; void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); } diff --git a/lite/operators/conv_op.cc b/lite/operators/conv_op.cc index 38c59a0290b03031e9cbe013a4a10c14c7ad1743..09b06ebc909475391f1c4dee508f7b4dfdf572b7 100644 --- a/lite/operators/conv_op.cc +++ b/lite/operators/conv_op.cc @@ -39,6 +39,11 @@ bool ConvOpLite::CheckShape() const { return true; } +bool ConvOpLite::SetParam(ParamBase* param) { + param_ = *static_cast(param); + return true; +} + inline int ConvOutputSize(int input_size, int filter_size, int dilation, diff --git a/lite/operators/conv_op.h b/lite/operators/conv_op.h index c3e375e2e44b8184e6e7e635ab2c6c1f8889f844..7ffc644f167880083615629509923540f47354ff 100644 --- a/lite/operators/conv_op.h +++ b/lite/operators/conv_op.h @@ -61,6 +61,7 @@ class ConvOpLite : public OpLite { output_dims.production() * input_dims[1] / param_.groups; } #endif + bool SetParam(ParamBase* param) override; // TODO(Superjomn) replace framework::OpDesc with a lite one. 
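The operator changes above are the hook the compute API relies on: instead of attaching parameters from an OpDesc, the engine hands the op a ready-made param struct through the new OpLite::SetParam override, and each supported op copies it into its own param_. A sketch of what the same hook would look like on a further operator; PoolOpLite is used purely as an example, this patch does not touch it, and the matching declaration would also be needed in pool_op.h.

namespace paddle {
namespace lite {
namespace operators {

bool PoolOpLite::SetParam(ParamBase* param) {
  // Copy the caller-owned parameter block; the caller keeps ownership of
  // the tensors referenced inside it.
  param_ = *static_cast<PoolParam*>(param);
  return true;
}

}  // namespace operators
}  // namespace lite
}  // namespace paddle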
bool AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) override { diff --git a/lite/tools/build.sh b/lite/tools/build.sh index 9365120772d96d31ff0af98c2cab4dea609be5ab..790811d90dbd1a787cc013b00f608e4746858825 100755 --- a/lite/tools/build.sh +++ b/lite/tools/build.sh @@ -21,6 +21,7 @@ BUILD_DIR=$(pwd) OPTMODEL_DIR="" BUILD_TAILOR=OFF BUILD_CV=OFF +BUILD_COMPUTE_API=OFF WITH_LOG=ON WITH_PROFILE=OFF BUILD_NPU=OFF @@ -130,6 +131,7 @@ function make_tiny_publish_so { -DANDROID_STL_TYPE=$android_stl \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ -DLITE_WITH_CV=$BUILD_CV \ + -DLITE_WITH_COMPUTE_API=$BUILD_COMPUTE_API \ -DLITE_BUILD_TAILOR=$BUILD_TAILOR \ -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \ -DLITE_WITH_NPU=$BUILD_NPU \