diff --git a/.gitignore b/.gitignore index 9823f8c945c1be8e717b622a993d402c49517b7c..dc0a38edcb563589ce3845803174598ca68ec396 100644 --- a/.gitignore +++ b/.gitignore @@ -63,6 +63,16 @@ test/models/ test/images/ +*.pyc + +# model +*.nb +*.svg +*.dot + +# vim intermediate files +*.swp + # Emacs intermediate files *~ diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 5a757659bb036ca99326bc40cc075f761ba6e641..f0cbedcba39258327519f45310f24792b4962b91 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -45,7 +45,7 @@ else() # we changed the source code to adapt for windows compiling # git diffs : (1) unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h ###################################################################################################### - URL https://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip + URL http://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR} DOWNLOAD_NO_PROGRESS 1 PREFIX ${EIGEN_SOURCE_DIR} diff --git a/docs/demo_guides/cuda.md b/docs/demo_guides/cuda.md index 8b3e76acef590bda19a59388017added6a0b8d52..f863fd86864194c6d022e4cf1fc75eb46725cc2c 100644 --- a/docs/demo_guides/cuda.md +++ b/docs/demo_guides/cuda.md @@ -48,7 +48,7 @@ cuda的编译结果位于 `build_cuda/inference_lite_lib` 4、 `demo` 文件夹:c++ demo. 
-如果编译打开了python选项,则会在 `build_cuda/inference_lite_lib/python/lib/` 目录下生成 `lite_core.so`。 +如果编译打开了python选项,则会在 `build_cuda/inference_lite_lib/python/lib/` 目录下生成 `lite.so`。 ## 运行 @@ -66,7 +66,7 @@ wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/kite.jpg 二: 运行 -**NOTE:**此处示例使用的是python接口。 +**NOTE:** 此处示例使用的是python接口。 ``` python #-*- coding: utf-8 -*- @@ -75,7 +75,7 @@ import sys import numpy as np import cv2 sys.path.append('build_cuda/inference_lite_lib/python/lib') -from lite_core import * +from lite import * def read_img(im_path, resize_h, resize_w): im = cv2.imread(im_path).astype('float32') diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index e2b15b187bf6dd3b77fe353f23b5d65bf56e44c7..b89a4de37aafdc17c10fe6cb58b7bda272cc69fb 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -369,6 +369,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mask_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mask_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mask_detection/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_libs" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_libs/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_libs/Makefile" ) add_dependencies(publish_inference_android_cxx_demos logging gflags) add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos) diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 0f60b13f35d51d3917425df75d3f157f8b5a87c3..506f2eab721807abcff64e16470edbc6bcd40842 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -1,4 
+1,4 @@ -if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) +if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR LITE_SHUTDOWN_LOG) lite_cc_library(place SRCS paddle_place.cc DEPS logging) else() lite_cc_library(place SRCS paddle_place.cc DEPS glog) diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index f4dcac519a0699cbcf1bdd3845d8ae90d7a289ed..5c89c24325e2aeff0f8b0ed7a5cd621f26318b8f 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -151,6 +151,11 @@ std::vector Predictor::GetInputNames() { return input_names_; } // get outputnames std::vector Predictor::GetOutputNames() { return output_names_; } +// get param names +std::vector Predictor::GetParamNames() { + return exec_scope_->AttributeVarNames(); +} + // append the names of inputs and outputs into input_names_ and output_names_ void Predictor::PrepareFeedFetch() { if (!program_) { @@ -293,6 +298,7 @@ void Predictor::Build(const cpp::ProgramDesc &desc, // `inner_places` is used to optimize passes std::vector inner_places = valid_places; for (auto &valid_place : valid_places) { + if (valid_place.target == TARGET(kOpenCL)) continue; inner_places.emplace_back( Place(TARGET(kHost), valid_place.precision, valid_place.layout)); } @@ -345,9 +351,16 @@ void Predictor::GenRuntimeProgram() { const lite::Tensor *Predictor::GetTensor(const std::string &name) const { auto *var = exec_scope_->FindVar(name); + CHECK(var) << "no variable named with " << name << " in exec_scope"; return &var->Get(); } +lite::Tensor *Predictor::GetMutableTensor(const std::string &name) { + auto *var = exec_scope_->FindVar(name); + CHECK(var) << "no variable named with " << name << " in exec_scope"; + return var->GetMutable(); +} + // get input by name lite::Tensor *Predictor::GetInputByName(const std::string &name) { auto element = std::find(input_names_.begin(), input_names_.end(), name); diff --git a/lite/api/cxx_api.h b/lite/api/cxx_api.h index 146556756af7e0b56ae38b5303e622c97dfe58af..cd542e87ed3bf4632bce141f019e974af6ef4308 100644 --- 
a/lite/api/cxx_api.h +++ b/lite/api/cxx_api.h @@ -85,6 +85,9 @@ class LITE_API Predictor { // get inputnames and get outputnames. std::vector GetInputNames(); std::vector GetOutputNames(); + // get param names + std::vector GetParamNames(); + void PrepareFeedFetch(); // Get offset-th col of fetch results. @@ -92,6 +95,9 @@ class LITE_API Predictor { std::vector GetOutputs() const; const cpp::ProgramDesc& program_desc() const; + // get a mutable tensor according to its name + lite::Tensor* GetMutableTensor(const std::string& name); + // get a const tensor according to its name const lite::Tensor* GetTensor(const std::string& name) const; const RuntimeProgram& runtime_program() const; @@ -142,9 +148,15 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor { // get inputs names and get outputs names std::vector GetInputNames() override; std::vector GetOutputNames() override; + // get param names + std::vector GetParamNames() override; + // get tensor according to tensor's name std::unique_ptr GetTensor( const std::string& name) const override; + // get a mutable tensor according to tensor's name + std::unique_ptr GetMutableTensor( + const std::string& name) override; // Get InputTebsor by name std::unique_ptr GetInputByName( diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc index 28e87dca394ba06844269746c19a892c26e0c653..18eb0b3545eeb27c6661c48b9a91dbf413757606 100644 --- a/lite/api/cxx_api_impl.cc +++ b/lite/api/cxx_api_impl.cc @@ -97,6 +97,10 @@ std::vector CxxPaddleApiImpl::GetInputNames() { return raw_predictor_.GetInputNames(); } +std::vector CxxPaddleApiImpl::GetParamNames() { + return raw_predictor_.GetParamNames(); +} + std::vector CxxPaddleApiImpl::GetOutputNames() { return raw_predictor_.GetOutputNames(); } @@ -123,6 +127,12 @@ std::unique_ptr CxxPaddleApiImpl::GetTensor( return std::unique_ptr(new lite_api::Tensor(x)); } +std::unique_ptr CxxPaddleApiImpl::GetMutableTensor( + const std::string &name) { + return std::unique_ptr( + new 
lite_api::Tensor(raw_predictor_.GetMutableTensor(name))); +} + std::unique_ptr CxxPaddleApiImpl::GetInputByName( const std::string &name) { return std::unique_ptr( diff --git a/lite/api/lite_multithread_test.cc b/lite/api/lite_multithread_test.cc index 33c0a94cf1a254e42c47aa462c5cfe12e386a87e..8da192701c9d232196c0dbbc9fd374e214821345 100644 --- a/lite/api/lite_multithread_test.cc +++ b/lite/api/lite_multithread_test.cc @@ -36,7 +36,7 @@ DEFINE_string(model_dir_0, "", "model_dir_0"); DEFINE_string(input_shape_0, "1,3,224,224", "input shapes another, separated by colon and comma"); - +DEFINE_string(target, "arm", "main target for Predictor: arm, opencl"); DEFINE_bool(use_optimize_nb, false, "optimized & naive buffer model for mobile devices"); @@ -51,9 +51,19 @@ void OutputOptModel(const std::string& load_model_dir, const std::vector>& input_shapes) { lite_api::CxxConfig config; config.set_model_dir(load_model_dir); - config.set_valid_places({ - Place{TARGET(kARM), PRECISION(kFloat)}, - }); + if (FLAGS_target == "arm") { + config.set_valid_places({ + Place{TARGET(kARM), PRECISION(kFloat)}, + }); + } else if (FLAGS_target == "opencl") { + config.set_valid_places({ + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)}, + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}, + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}, + Place{TARGET(kARM)}, // enable kARM CPU kernel when no opencl kernel + }); + } auto predictor = lite_api::CreatePaddlePredictor(config); // delete old optimized model @@ -78,7 +88,7 @@ void Run(const std::vector>& input_shapes, int tid, const int warmup_times = 5) { lite_api::MobileConfig config; - config.set_model_dir(model_dir); + config.set_model_from_file(model_dir + ".nb"); config.set_power_mode(power_mode); config.set_threads(thread_num); @@ -197,7 +207,7 @@ void RunTestType_10(const std::vector>& input_shapes, const int repeat, int 
warmup = 5) { lite_api::MobileConfig config; - config.set_model_dir(model_dir); + config.set_model_from_file(model_dir + ".nb"); config.set_power_mode(power_mode); config.set_threads(thread_num); @@ -218,13 +228,13 @@ void RunTestType_11(const std::vector>& input_shapes, const int repeat, int warmup = 5) { lite_api::MobileConfig config; - config.set_model_dir(model_dir); + config.set_model_from_file(model_dir + ".nb"); config.set_power_mode(power_mode); config.set_threads(thread_num); auto predictor = lite_api::CreatePaddlePredictor(config); - config.set_model_dir(model_dir_0); + config.set_model_from_file(model_dir_0 + ".nb"); auto predictor_0 = lite_api::CreatePaddlePredictor(config); for (int i = 0; i < 2 * repeat; i += 2) { @@ -246,7 +256,8 @@ int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); if (FLAGS_model_dir == "") { LOG(INFO) << "usage: " - << "--model_dir /path/to/your/model"; + << "--model_dir /path/to/your/model --model_dir_0 " + "/path/to/your/model0 --target `arm` or `opencl`"; exit(0); } std::string save_optimized_model_dir = ""; diff --git a/lite/api/opt.cc b/lite/api/opt.cc index a6ad7cff6f234187770eccf1501378c04201b729..a1b963ac4ebf836e29045c8810658e0b30bad2f2 100644 --- a/lite/api/opt.cc +++ b/lite/api/opt.cc @@ -55,7 +55,7 @@ DEFINE_string(model_file, "", "model file path of the combined-param model"); DEFINE_string(param_file, "", "param file path of the combined-param model"); DEFINE_string( optimize_out_type, - "protobuf", + "naive_buffer", "store type of the output optimized model. 
protobuf/naive_buffer"); DEFINE_bool(display_kernels, false, "Display kernel information"); DEFINE_bool(record_tailoring_info, @@ -207,7 +207,7 @@ void PrintOpsInfo(std::set valid_ops = {}) { } std::cout << std::setiosflags(std::ios::internal); std::cout << std::setw(maximum_optype_length) << "OP_name"; - for (int i = 0; i < targets.size(); i++) { + for (size_t i = 0; i < targets.size(); i++) { std::cout << std::setw(10) << targets[i].substr(1); } std::cout << std::endl; @@ -215,7 +215,7 @@ void PrintOpsInfo(std::set valid_ops = {}) { for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) { std::cout << std::setw(maximum_optype_length) << it->first; auto ops_valid_places = it->second; - for (int i = 0; i < targets.size(); i++) { + for (size_t i = 0; i < targets.size(); i++) { if (std::find(ops_valid_places.begin(), ops_valid_places.end(), targets[i]) != ops_valid_places.end()) { @@ -235,7 +235,7 @@ void PrintOpsInfo(std::set valid_ops = {}) { } // Print OP info. auto ops_valid_places = supported_ops.at(*op); - for (int i = 0; i < targets.size(); i++) { + for (size_t i = 0; i < targets.size(); i++) { if (std::find(ops_valid_places.begin(), ops_valid_places.end(), targets[i]) != ops_valid_places.end()) { @@ -288,11 +288,11 @@ void ParseInputCommand() { auto valid_places = paddle::lite_api::ParserValidPlaces(); // get valid_targets string std::vector target_types = {}; - for (int i = 0; i < valid_places.size(); i++) { + for (size_t i = 0; i < valid_places.size(); i++) { target_types.push_back(valid_places[i].target); } std::string targets_str = TargetToStr(target_types[0]); - for (int i = 1; i < target_types.size(); i++) { + for (size_t i = 1; i < target_types.size(); i++) { targets_str = targets_str + TargetToStr(target_types[i]); } @@ -301,7 +301,7 @@ void ParseInputCommand() { target_types.push_back(TARGET(kUnk)); std::set valid_ops; - for (int i = 0; i < target_types.size(); i++) { + for (size_t i = 0; i < target_types.size(); i++) { auto ops = 
supported_ops_target[static_cast(target_types[i])]; valid_ops.insert(ops.begin(), ops.end()); } @@ -318,7 +318,7 @@ void CheckIfModelSupported() { auto valid_unktype_ops = supported_ops_target[static_cast(TARGET(kUnk))]; valid_ops.insert( valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end()); - for (int i = 0; i < valid_places.size(); i++) { + for (size_t i = 0; i < valid_places.size(); i++) { auto target = valid_places[i].target; auto ops = supported_ops_target[static_cast(target)]; valid_ops.insert(valid_ops.end(), ops.begin(), ops.end()); @@ -340,7 +340,7 @@ void CheckIfModelSupported() { std::set unsupported_ops; std::set input_model_ops; - for (int index = 0; index < cpp_prog.BlocksSize(); index++) { + for (size_t index = 0; index < cpp_prog.BlocksSize(); index++) { auto current_block = cpp_prog.GetBlock(index); for (size_t i = 0; i < current_block->OpsSize(); ++i) { auto& op_desc = *current_block->GetOp(i); @@ -364,13 +364,13 @@ void CheckIfModelSupported() { unsupported_ops_str = unsupported_ops_str + ", " + *op_str; } std::vector targets = {}; - for (int i = 0; i < valid_places.size(); i++) { + for (size_t i = 0; i < valid_places.size(); i++) { targets.push_back(valid_places[i].target); } std::sort(targets.begin(), targets.end()); targets.erase(unique(targets.begin(), targets.end()), targets.end()); std::string targets_str = TargetToStr(targets[0]); - for (int i = 1; i < targets.size(); i++) { + for (size_t i = 1; i < targets.size(); i++) { targets_str = targets_str + "," + TargetToStr(targets[i]); } diff --git a/lite/api/opt_base.cc b/lite/api/opt_base.cc index 14c1ca4a4e9c19d2d3c27b783267682457eeddb2..5af001961af6e4064e45174f1537d0c6f05e6c07 100644 --- a/lite/api/opt_base.cc +++ b/lite/api/opt_base.cc @@ -82,27 +82,56 @@ void OptBase::SetValidPlaces(const std::string& valid_places) { "command argument 'valid_targets'"; } -void OptBase::SetOptimizeOut(const std::string& optimized_out_path) { - optimize_out_path_ = optimized_out_path; +void 
OptBase::SetLiteOut(const std::string& lite_out_name) { + lite_out_name_ = lite_out_name; } -void OptBase::RunOptimize(bool record_strip_info) { +void OptBase::RecordModelInfo(bool record_strip_info) { + record_strip_info_ = record_strip_info; +} + +void OptBase::Run() { CheckIfModelSupported(false); OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); opt_config_.set_valid_places(valid_places_); if (model_set_dir_ != "") { - RunOptimizeFromModelSet(record_strip_info); + RunOptimizeFromModelSet(record_strip_info_); } else { auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); opt_predictor->SaveOptimizedModel( - optimize_out_path_, model_type_, record_strip_info); + lite_out_name_, model_type_, record_strip_info_); auto resulted_model_name = - record_strip_info ? "information of striped model" : "optimized model"; + record_strip_info_ ? "information of striped model" : "optimized model"; std::cout << "Save the " << resulted_model_name - << " into :" << optimize_out_path_ << "successfully"; + << " into :" << lite_out_name_ << "successfully"; } } +void OptBase::RunOptimize(const std::string& model_dir_path, + const std::string& model_path, + const std::string& param_path, + const std::string& valid_places, + const std::string& optimized_out_path) { + SetModelDir(model_dir_path); + SetModelFile(model_path); + SetParamFile(param_path); + SetValidPlaces(valid_places); + SetLiteOut(optimized_out_path); + CheckIfModelSupported(false); + OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); + opt_config_.set_valid_places(valid_places_); + if (model_set_dir_ != "") { + RunOptimizeFromModelSet(record_strip_info_); + } else { + auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); + opt_predictor->SaveOptimizedModel( + lite_out_name_, model_type_, record_strip_info_); + auto resulted_model_name = + record_strip_info_ ? 
"information of striped model" : "optimized model"; + std::cout << "Save the " << resulted_model_name + << " into :" << lite_out_name_ << "successfully"; + } +} // collect ops info of modelset void CollectModelMetaInfo(const std::string& output_dir, const std::vector& models, @@ -125,7 +154,7 @@ void OptBase::SetModelSetDir(const std::string& model_set_path) { } void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { // 1. mkdir of outputed optimized model set. - lite::MkDirRecur(optimize_out_path_); + lite::MkDirRecur(lite_out_name_); auto model_dirs = lite::ListDir(model_set_dir_, true); if (model_dirs.size() == 0) { LOG(FATAL) << "[" << model_set_dir_ << "] does not contain any model"; @@ -138,7 +167,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { std::string input_model_dir = lite::Join({model_set_dir_, name}, "/"); std::string output_model_dir = - lite::Join({optimize_out_path_, name}, "/"); + lite::Join({lite_out_name_, name}, "/"); if (opt_config_.model_file() != "" && opt_config_.param_file() != "") { auto model_file_path = @@ -155,7 +184,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); opt_predictor->SaveOptimizedModel( - optimize_out_path_, model_type_, record_strip_info); + lite_out_name_, model_type_, record_strip_info); std::cout << "Optimize done. 
"; } @@ -164,46 +193,60 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { if (record_strip_info) { // Collect all models information CollectModelMetaInfo( - optimize_out_path_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME); + lite_out_name_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME); + CollectModelMetaInfo( + lite_out_name_, model_dirs, lite::TAILORD_OPS_LIST_NAME); CollectModelMetaInfo( - optimize_out_path_, model_dirs, lite::TAILORD_OPS_LIST_NAME); - CollectModelMetaInfo(optimize_out_path_, - model_dirs, - lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME); + lite_out_name_, model_dirs, lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME); CollectModelMetaInfo( - optimize_out_path_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME); + lite_out_name_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME); std::cout << "Record the information of stripped models into :" - << optimize_out_path_ << "successfully"; + << lite_out_name_ << "successfully"; } } void OptBase::PrintHelpInfo() { const std::string opt_version = lite::version(); const char help_info[] = - "At least one argument should be inputed. 
Valid arguments are listed " - "below:\n" + "------------------------------------------------------------------------" + "-----------------------------------------------------------\n" + " Valid arguments of Paddle-Lite opt are listed below:\n" + "------------------------------------------------------------------------" + "-----------------------------------------------------------\n" " Arguments of help information:\n" " `help()` Print help infomation\n" - " Arguments of model optimization:\n" + "\n" + " Arguments of model transformation:\n" " `set_model_dir(model_dir)`\n" " `set_model_file(model_file_path)`\n" " `set_param_file(param_file_path)`\n" - " `set_model_type(protobuf|naive_buffer)`\n" - " `set_optimize_out(output_optimize_model_dir)`\n" + " `set_model_type(protobuf|naive_buffer)`: naive_buffer by " + "default\n" + " `set_lite_out(output_optimize_model_dir)`\n" " `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n" - " `run_optimize(false|true)`\n" - " ` ----fasle&true refer to whether to record ops info for " - "tailoring lib, false by default`\n" - " Arguments of model checking and ops information:\n" + " `record_model_info(false|true)`: refer to whether to record ops " + "info for striping lib, false by default`\n" + " `run() : start model transformation`\n" + " eg. `opt.set_model_dir(\"./mobilenetv1\"); " + "opt.set_lite_out(\"mobilenetv1_opt\"); opt.set_valid_places(\"arm\"); " + "opt.run();`\n" + "\n" + " You can also transform model through a single input argument:\n" + " `run_optimize(model_dir, model_file_path, param_file_path, " + "model_type, valid_places, lite_out_name) `\n" + " eg. 
`opt.run_optimize(\"./mobilenetv1\", \"\", \"\", " + "\"naive_buffer\", \"arm\", \"mobilenetv1_opt\");`" + "\n" + " Arguments of checking model and printing ops information:\n" " `print_all_ops()` Display all the valid operators of " "Paddle-Lite\n" " `print_supported_ops` Display supported operators of valid " "places\n" " `check_if_model_supported()` Check if the input model is " - "supported\n"; - - std::cout << "opt version:" << opt_version << std::endl - << help_info << std::endl; + "supported\n" + "------------------------------------------------------------------------" + "-----------------------------------------------------------\n"; + std::cout << "opt version:" << opt_version << std::endl << help_info; } // 2. Print supported info of inputed ops void OptBase::PrintOpsInfo(const std::set& valid_ops) { diff --git a/lite/api/opt_base.h b/lite/api/opt_base.h index a8d6d0390ccd3f1c9b0291b1bcf6eb1ecc47a248..3c0051375d0c09d09e0e070df273c94e7a668750 100644 --- a/lite/api/opt_base.h +++ b/lite/api/opt_base.h @@ -44,16 +44,21 @@ class LITE_API OptBase { public: OptBase() = default; void SetModelSetDir(const std::string &model_set_path); - void SetModelDir(const std::string &model_path); + void SetModelDir(const std::string &model_dir_path); void SetModelFile(const std::string &model_path); void SetParamFile(const std::string ¶m_path); void SetValidPlaces(const std::string &valid_places); - void SetOptimizeOut(const std::string &optimized_out_path); + void SetLiteOut(const std::string &lite_out_name); + void RecordModelInfo(bool record_strip_info = true); // set optimized_model type void SetModelType(std::string model_type); // transform and save the optimized model - void RunOptimize(bool record_strip_info = false); - + void Run(); + void RunOptimize(const std::string &model_dir_path = "", + const std::string &model_path = "", + const std::string ¶m_path = "", + const std::string &valid_places = "", + const std::string &optimized_out_path = ""); // fuctions of 
printing info // 1. help info void PrintHelpInfo(); @@ -71,12 +76,12 @@ class LITE_API OptBase { // valid places for the optimized_model std::vector valid_places_; // filename of the optimized_model - std::string optimize_out_path_; + std::string lite_out_name_; // type of the optimized_model, kNaiveBuffer default. LiteModelType model_type_{LiteModelType::kNaiveBuffer}; // Dir path of a set of models, this should be combined with model std::string model_set_dir_; - + bool record_strip_info_{false}; void RunOptimizeFromModelSet(bool record_strip_info = false); }; diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc index daef2c66dda5188a1eec25c3d5f045f1fa705e1e..4b13ae4ed241eb1a3164a1213feec12306df89f6 100644 --- a/lite/api/paddle_api.cc +++ b/lite/api/paddle_api.cc @@ -167,6 +167,20 @@ lod_t Tensor::lod() const { return ctensor(raw_tensor_)->lod(); } void Tensor::SetLoD(const lod_t &lod) { tensor(raw_tensor_)->set_lod(lod); } +std::unique_ptr PaddlePredictor::GetMutableTensor( + const std::string &name) { + LOG(FATAL) + << "The GetMutableTensor API is only supported by CxxConfig predictor."; + return nullptr; +} + +std::vector PaddlePredictor::GetParamNames() { + std::vector null_result = {}; + LOG(FATAL) + << "The GetParamNames API is only supported by CxxConfig predictor."; + return null_result; +} + void PaddlePredictor::SaveOptimizedModel(const std::string &model_dir, LiteModelType model_type, bool record_info) { diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index 79ab98da799a99540217d55e3d40b46800f17626..b08f2f5c745f87cda2be181bdea2444b2c11313c 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -86,6 +86,8 @@ class LITE_API PaddlePredictor { virtual std::vector GetInputNames() = 0; // Get output names virtual std::vector GetOutputNames() = 0; + // Get output names + virtual std::vector GetParamNames(); // Get Input by name virtual std::unique_ptr GetInputByName(const std::string& name) = 0; @@ -93,6 +95,9 @@ class LITE_API 
PaddlePredictor { /// Get a readonly tensor, return null if no one called `name` exists. virtual std::unique_ptr GetTensor( const std::string& name) const = 0; + /// Get a mutable tensor, return null if on one called `name` exists + /// internal infereces API, not recommanded. + virtual std::unique_ptr GetMutableTensor(const std::string& name); /// Persist the optimized model to disk. This API is only supported by /// CxxConfig, and the persisted model can be reused for MobileConfig. @@ -176,7 +181,7 @@ class LITE_API CxxConfig : public ConfigBase { #endif #ifdef LITE_WITH_CUDA void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; } - int multi_stream() const { return multi_stream_; } + bool multi_stream() const { return multi_stream_; } #endif #ifdef LITE_WITH_MLU @@ -208,6 +213,8 @@ class LITE_API CxxConfig : public ConfigBase { // current thread. void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00); // XPU only, specify the target device ID for the current thread. 
+ // **DEPRECATED**, use xpu_set_device() at the very beginning of each worker + // thread void set_xpu_dev_per_thread(int dev_no = 0); }; diff --git a/lite/api/paddle_lite_factory_helper.h b/lite/api/paddle_lite_factory_helper.h index 9dc5c9e857243ecb57f785737b00929e36c5d83c..5ce6a9ac9433d720c005d84712ed181d075c61b4 100644 --- a/lite/api/paddle_lite_factory_helper.h +++ b/lite/api/paddle_lite_factory_helper.h @@ -19,7 +19,13 @@ #pragma once // some platform-independent defintion -#include "lite/utils/macros.h" + +#if defined(_WIN32) +#define UNUSED +#define __builtin_expect(EXP, C) (EXP) +#else +#define UNUSED __attribute__((unused)) +#endif #define USE_LITE_OP(op_type__) \ extern int touch_op_##op_type__(); \ diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index 82cd7f3d8da5eb4f00c9069731960a81ef9fe87d..8cb4dbf192993219347d70bb8ccb704199b45f3d 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -33,6 +33,7 @@ USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass); USE_MIR_PASS(lite_interpolate_fuse_pass); USE_MIR_PASS(lite_sequence_pool_concat_fuse_pass); USE_MIR_PASS(identity_scale_eliminate_pass); +USE_MIR_PASS(identity_dropout_eliminate_pass); USE_MIR_PASS(lite_conv_elementwise_fuse_pass); USE_MIR_PASS(lite_conv_activation_fuse_pass); USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass); @@ -51,5 +52,8 @@ USE_MIR_PASS(mlu_postprocess_pass); USE_MIR_PASS(weight_quantization_preprocess_pass); USE_MIR_PASS(apu_subgraph_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass); +USE_MIR_PASS(lite_scale_activation_fuse_pass); USE_MIR_PASS(__xpu__resnet_fuse_pass); USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); +USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass); +USE_MIR_PASS(__xpu__fc_fuse_pass); diff --git a/lite/api/python/pybind/pybind.cc b/lite/api/python/pybind/pybind.cc index 06d1c607fd761f9f6e58a4c5779e2c3cb9f4e6b3..104275e2e9cf157d7d2f7ca963a1abed2983b92e 100644 --- 
a/lite/api/python/pybind/pybind.cc +++ b/lite/api/python/pybind/pybind.cc @@ -62,8 +62,10 @@ void BindLiteOpt(py::module *m) { .def("set_model_file", &OptBase::SetModelFile) .def("set_param_file", &OptBase::SetParamFile) .def("set_valid_places", &OptBase::SetValidPlaces) - .def("set_optimize_out", &OptBase::SetOptimizeOut) + .def("set_lite_out", &OptBase::SetLiteOut) .def("set_model_type", &OptBase::SetModelType) + .def("record_model_info", &OptBase::RecordModelInfo) + .def("run", &OptBase::Run) .def("run_optimize", &OptBase::RunOptimize) .def("help", &OptBase::PrintHelpInfo) .def("print_supported_ops", &OptBase::PrintSupportedOps) diff --git a/lite/api/python/setup.py.in b/lite/api/python/setup.py.in index b04a6077f5aafecf76fed0b0dee5c56919b9302e..884266a12dc911f6e642518b169370d7aeb83cca 100644 --- a/lite/api/python/setup.py.in +++ b/lite/api/python/setup.py.in @@ -50,7 +50,7 @@ if '${WITH_MKL}' == 'ON': # link lite.so to paddlelite.libs if os.name != 'nt': COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\ - /inference_lite_lib/python/install/lite/lite.so" +/inference_lite_lib/python/install/lite/lite.so" if os.system(COMMAND) != 0: raise Exception("patch third_party libs failed, command: %s" % COMMAND) diff --git a/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc b/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc index d1992f62bbfa9e15ab4d39565f7fe3555e17b215..35d9eeaee1b69bed423cd3b489217c71575b3079 100644 --- a/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc +++ b/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc @@ -80,8 +80,10 @@ void conv_compute_6x6_3x3(const float* input, const operators::ConvParam& param, ARMContext* ctx) { auto act_param = param.activation_param; - const int pad_h = (*param.paddings)[0]; - const int pad_w = (*param.paddings)[2]; + const int pad_h0 = (*param.paddings)[0]; + const int pad_h1 = (*param.paddings)[1]; + const int pad_w0 = (*param.paddings)[2]; + const int pad_w1 = (*param.paddings)[3]; float* 
tmp_work_space = ctx->workspace_data() + ctx->llc_size() / sizeof(float); @@ -96,8 +98,8 @@ void conv_compute_6x6_3x3(const float* input, int tile_h = (hout + 5) / 6; int size_tile = tile_h * tile_w; - int w_pad = win + pad_w * 2; - int h_pad = hin + pad_h * 2; + int w_pad = win + pad_w0 + pad_w1; + int h_pad = hin + pad_h0 + pad_h1; const int zero_len = w_pad; float zero_ptr[zero_len]; // NOLINT @@ -127,10 +129,10 @@ void conv_compute_6x6_3x3(const float* input, prepack_input_nxwc4_dw(input + ni * in_n_stride, input_c4 + i * new_c_stride, i * 4, - -pad_h, - hin + pad_h, - -pad_w, - win + pad_w, + -pad_h0, + hin + pad_h1, + -pad_w0, + win + pad_w1, chin, win, hin, @@ -367,8 +369,10 @@ void conv_compute_2x2_3x3(const float* input, const operators::ConvParam& param, ARMContext* ctx) { auto act_param = param.activation_param; - const int pad_h = (*param.paddings)[0]; - const int pad_w = (*param.paddings)[2]; + const int pad_h0 = (*param.paddings)[0]; + const int pad_h1 = (*param.paddings)[1]; + const int pad_w0 = (*param.paddings)[2]; + const int pad_w1 = (*param.paddings)[3]; float* tmp_work_space = ctx->workspace_data() + ctx->llc_size() / sizeof(float); @@ -383,8 +387,8 @@ void conv_compute_2x2_3x3(const float* input, int tile_h = (hout + 1) / 2; int size_tile = tile_h * tile_w; - int w_pad = win + pad_w * 2; - int h_pad = hin + pad_h * 2; + int w_pad = win + pad_w0 + pad_w1; + int h_pad = hin + pad_h0 + pad_h1; const int zero_len = w_pad; float zero_ptr[zero_len]; // NOLINT @@ -414,10 +418,10 @@ void conv_compute_2x2_3x3(const float* input, prepack_input_nxwc4_dw(input + ni * in_n_stride, input_c4 + i * new_c_stride, i * 4, - -pad_h, - hin + pad_h, - -pad_w, - win + pad_w, + -pad_h0, + hin + pad_h1, + -pad_w0, + win + pad_w1, chin, win, hin, @@ -628,8 +632,10 @@ void conv_compute_2x2_3x3_small(const float* input, const operators::ConvParam& param, ARMContext* ctx) { auto act_param = param.activation_param; - const int pad_h = (*param.paddings)[0]; - const int 
pad_w = (*param.paddings)[2]; + const int pad_h0 = (*param.paddings)[0]; + const int pad_h1 = (*param.paddings)[1]; + const int pad_w0 = (*param.paddings)[2]; + const int pad_w1 = (*param.paddings)[3]; float* tmp_work_space = ctx->workspace_data() + ctx->llc_size() / sizeof(float); @@ -644,8 +650,8 @@ void conv_compute_2x2_3x3_small(const float* input, int tile_h = (hout + 1) / 2; int size_tile = tile_h * tile_w; - int w_pad = win + pad_w * 2; - int h_pad = hin + pad_h * 2; + int w_pad = win + pad_w0 + pad_w1; + int h_pad = hin + pad_h0 + pad_h1; const int zero_len = w_pad; float zero_ptr[zero_len]; // NOLINT @@ -676,10 +682,10 @@ void conv_compute_2x2_3x3_small(const float* input, prepack_input_nxwc4_dw(input + ni * in_n_stride, input_c4 + i * new_c_stride, i * 4, - -pad_h, - hin + pad_h, - -pad_w, - win + pad_w, + -pad_h0, + hin + pad_h1, + -pad_w0, + win + pad_w1, chin, win, hin, diff --git a/lite/backends/arm/math/lstm.cc b/lite/backends/arm/math/lstm.cc index 5a2a263bb4fa2dc7b4ec54d84c698651a058f933..cd8e012a287437ac9527ca510f927be30d825f0c 100644 --- a/lite/backends/arm/math/lstm.cc +++ b/lite/backends/arm/math/lstm.cc @@ -33,6 +33,7 @@ void add_bias_rowwise(Tensor* input, for (int w = start_w; w < w_adds; ++w) { i_data[w] += b_data[w]; } + i_data += width; } } void vector_dot( @@ -67,15 +68,8 @@ void vector_dot( for (int i = 0; i < remain; ++i) { if (!v2) { out_ptr[i] = in_ptr[i] * v1_ptr[i]; - ++out_ptr; - ++in_ptr; - ++v1_ptr; } else { out_ptr[i] = in_ptr[i] + v1_ptr[i] * v2_ptr[i]; - ++out_ptr; - ++in_ptr; - ++v1_ptr; - ++v2_ptr; } } } diff --git a/lite/backends/arm/math/packed_sgemm.cc b/lite/backends/arm/math/packed_sgemm.cc index b41afc1c29e121f905b0abc48bae98705bc0ee16..2e869f2df3a292b264dae948f13c64e05854d052 100644 --- a/lite/backends/arm/math/packed_sgemm.cc +++ b/lite/backends/arm/math/packed_sgemm.cc @@ -72,6 +72,7 @@ void pack_trans_m4(float *out, int mmax, int k0, int kmax); + void sgemm_prepacked_4x4(bool is_transB, int M, int N, @@ -154,6 
+155,20 @@ void sgemm_prepacked_4x8(bool is_transB, bool has_bias, const operators::ActivationParam act_param, ARMContext *ctx); +// for kA53 +void sgemm_prepacked_6x8_a53(bool is_transB, + int M, + int N, + int K, + const float *A_packed, + const float *B, + int ldb, + float *C, + int ldc, + const float *bias, + bool has_bias, + int is_relu, + ARMContext *ctx); #endif // __aarch64__ /** @@ -300,6 +315,44 @@ void sgemm_prepack(bool is_transB, has_bias, act_param, ctx); + } else if (ctx->arch() == kA53) { + auto act_type = act_param.active_type; + bool has_act = act_param.has_active; + bool act_flag = + (has_act == false) || + (has_act == true && act_type == lite_api::ActivationType::kRelu); + bool has_beta = fabsf(beta) > 1e-8f ? true : false; + bool a53_sgemm = act_flag && !has_beta; + if (a53_sgemm) { + sgemm_prepacked_6x8_a53(is_transB, + M, + N, + K, + A_packed, + B, + ldb, + C, + ldc, + bias, + has_bias, + static_cast(has_act), + ctx); + } else { + sgemm_prepacked_6x8(is_transB, + M, + N, + K, + A_packed, + B, + ldb, + beta, + C, + ldc, + bias, + has_bias, + act_param, + ctx); + } } else { sgemm_prepacked_6x8(is_transB, M, @@ -3983,6 +4036,472 @@ void sgemm_prepacked_6x8(bool is_transB, } } +/** + * \brief gemm with ablock = 6, bblock = 8, output 6x8, optimize for a53 arch + * @param A + * @param B + * @param C + * @param M + * @param N + * @param K + * @param threads + * @param workspace + */ +void sgemm_prepacked_6x8_a53(bool is_transB, + int M, + int N, + int K, + const float* A_packed, + const float* B, + int ldb, + float* C, + int ldc, + const float* bias, + bool has_bias, + int is_relu, + ARMContext* ctx) { + size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; + auto* workspace = ctx->workspace_data(); + int threads = ctx->threads(); + //! 
MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 + int x_block = + (l2_cache - (MBLOCK_OTH * K)) / (sizeof(float) * (K + MBLOCK_OTH)); + x_block /= NBLOCK; + x_block *= NBLOCK; + int x_num = (N + (x_block - 1)) / x_block; + x_block = (N + x_num - 1) / x_num; + x_block = (x_block + NBLOCK - 1) / NBLOCK; + x_block *= NBLOCK; + x_block = x_block < NBLOCK ? NBLOCK : x_block; + + int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; + int tail_pre = (K & (KBLOCK - 1)); + if (tail_pre == 0) { + tail_pre = KBLOCK; + } + + //! merge tail_pre and flag_act + tail_pre = (tail_pre << 2 | is_relu); + bool flag_p_remain = false; + int remain = 0; + + //! apanel is pre_compute outside gemm + for (unsigned int x0 = 0; x0 < N; x0 += x_block) { + unsigned int xmax = x0 + x_block; + if (xmax > N) { + xmax = N; + } + int bblocks = (xmax - x0 + NBLOCK - 1) / NBLOCK; + remain = xmax - x0 - (bblocks - 1) * NBLOCK; + if (remain > 0) { + flag_p_remain = true; + } + //! load bpanel + auto b_pannel = static_cast(workspace); + if (is_transB) { + loadb_trans(b_pannel, B, ldb, 0, K, x0, xmax); + } else { + loadb(b_pannel, B, ldb, 0, K, x0, xmax); + } +#pragma omp parallel for num_threads(threads) + for (unsigned int y = 0; y < M; y += MBLOCK_OTH) { + unsigned int ymax = y + MBLOCK_OTH; + if (ymax > M) { + ymax = M; + } + float* c_ptr0 = C + y * ldc + x0; + float* c_ptr1 = c_ptr0 + ldc; + float* c_ptr2 = c_ptr1 + ldc; + float* c_ptr3 = c_ptr2 + ldc; + float* c_ptr4 = c_ptr3 + ldc; + float* c_ptr5 = c_ptr4 + ldc; + + float* pout0 = c_ptr0; + float* pout1 = c_ptr1; + float* pout2 = c_ptr2; + float* pout3 = c_ptr3; + float* pout4 = c_ptr4; + float* pout5 = c_ptr5; + + float bias_local[6] = {0}; + if (has_bias) { + bias_local[0] = bias[y]; + bias_local[1] = bias[y + 1]; + bias_local[2] = bias[y + 2]; + bias_local[3] = bias[y + 3]; + bias_local[4] = bias[y + 4]; + bias_local[5] = bias[y + 5]; + } + + float cout0[NBLOCK]; + float cout1[NBLOCK]; + float cout2[NBLOCK]; + float cout3[NBLOCK]; + float 
cout4[NBLOCK]; + float cout5[NBLOCK]; + + const float* a_ptr_l = A_packed + y * K; + const float* b_ptr = b_pannel; + for (int xb = 0; xb < bblocks; xb++) { + if ((y + 5) >= ymax) { + switch ((y + 5) - ymax) { + case 4: + c_ptr1 = cout1; + case 3: + c_ptr2 = cout2; + case 2: + c_ptr3 = cout3; + case 1: + c_ptr4 = cout4; + case 0: + c_ptr5 = cout5; + default: + break; + } + } + if (flag_p_remain && (xb == bblocks - 1)) { + pout0 = c_ptr0; + pout1 = c_ptr1; + pout2 = c_ptr2; + pout3 = c_ptr3; + pout4 = c_ptr4; + pout5 = c_ptr5; + + c_ptr0 = cout0; + c_ptr1 = cout1; + c_ptr2 = cout2; + c_ptr3 = cout3; + c_ptr4 = cout4; + c_ptr5 = cout5; + } + const float* a_ptr = a_ptr_l; + int tails = tail_pre; + int k = k_pre; + + // clang-format off + asm volatile( + // sgemm 6x8 for a53 + "vld1.32 {d2-d3}, [%[bias_ptr]] \n" /* load bias0-3 to d2,d3 */ + "vdup.i32 q4, d2[0] \n" /* set out00 to bias0 */ + "vld1.32 {d0-d1}, [%[a_ptr] :64] \n" /* load a00-a30 to d0,d1 */ + "vdup.i32 q5, d2[0] \n" /* set out01 to bias0 */ + "vld1.32 {d4-d5}, [%[b_ptr] :128] \n" /* load b00-b03 to d4,d5 */ + "vdup.i32 q6, d2[1] \n" /* set out10 to bias1 */ + "ldr r0, [%[a_ptr], #0x10] \n" /* load a40 to r0 */ + "vdup.i32 q7, d2[1] \n" /* set out11 to bias1 */ + "ldr r1, [%[a_ptr], #0x14] \n" /* load a50 to r1 */ + "vdup.i32 q8, d3[0] \n" /* set out20 to bias2 */ + "vldr d6, [%[bias_ptr], #0x10] \n" /* load bias 4,5 to d6 */ + "pld [%[a_ptr], #0x40] \n" /* pre load apanel */ + "vdup.i32 q9, d3[0] \n" /* set out21 to bias2 */ + "pld [%[b_ptr], #0x40] \n" /* pre load bpanel */ + "vdup.i32 q10, d3[1] \n" /* set out30 to bias3 */ + "pld [%[a_ptr], #0x80] \n" /* pre load apanel */ + "vdup.i32 q11, d3[1] \n" /* set out31 to bias3 */ + "pld [%[b_ptr], #0x80] \n" /* pre load bpanel */ + "vdup.i32 q12, d6[0] \n" /* set out40 to bias4 */ + "vdup.i32 q13, d6[0] \n" /* set out41 to bias4 */ + "pld [%[a_ptr], #0xC0] \n" /* pre load apanel */ + "vdup.i32 q14, d6[1] \n" /* set out50 to bias5 */ + "pld [%[b_ptr], #0XC0] 
\n" /* pre load bpanel */ + "vdup.i32 q15, d6[1] \n" /* set out51 to bias5 */ + "cmp %[k], #0 \n" /* check k loop */ + "beq 6f \n" /* k==0, branch to 6 */ + "1:\n" + /* Unroll 0 */ + "vldr d6, [%[b_ptr], #0x10] \n" /* load b04, b05 to d6 */ + "vmov d2, r0, r1 \n" /* mov a40, a50 to d2 */ + "vmla.f32 q4, q2, d0[0] \n" /* out00 += a00 * b0l */ + "ldr r0, [%[b_ptr], #0x18] \n" /* load b06 to r0 */ + "vmla.f32 q6, q2, d0[1] \n" /* out10 += a10 * b0l */ + "ldr r1, [%[b_ptr], #0x1C] \n" /* load b07 to r1 */ + "vmla.f32 q8, q2, d1[0] \n" /* out20 += a20 * b0l */ + "vldr d3, [%[a_ptr], #0x18] \n" /* load a01, a11 to d3 */ + "vmov d7, r0, r1 \n" /* mov b06, b07 to d7 */ + "vmla.f32 q10, q2, d1[1] \n" /* out30 += a30 * b0l */ + "pld [%[a_ptr], #0x100] \n" /* pre load apanel */ + "vmla.f32 q12, q2, d2[0] \n" /* out40 += a40 * b0l */ + "vmla.f32 q14, q2, d2[1] \n" /* out50 += a50 * b0l */ + "vldr d4, [%[b_ptr], #0x20] \n" /* load b10, b11 to d4 */ + "vmla.f32 q5, q3, d0[0] \n" /* out01 += a00 * b0h */ + "ldr r0, [%[b_ptr], #0x28] \n" /* load b12 to r0 */ + "vmla.f32 q7, q3, d0[1] \n" /* out11 += a10 * b0h */ + "ldr r1, [%[b_ptr], #0x2C] \n" /* load b13 to r1 */ + "vmla.f32 q9, q3, d1[0] \n" /* out21 += a20 * b0h */ + "vldr d0, [%[a_ptr], #0x20] \n" /* load a21, a31 to d0 */ + "vmov d5, r0, r1 \n" /* mov b12, b13 to d5 */ + "vmla.f32 q11, q3, d1[1] \n" /* out31 += a30 * b0h */ + "ldr r0, [%[a_ptr], #0x28] \n" /* load a41 to r0 */ + "vmla.f32 q13, q3, d2[0] \n" /* out41 += a40 * b0h */ + "ldr r1, [%[a_ptr], #0x2C] \n" /* load a51 to r1 */ + "vmla.f32 q15, q3, d2[1] \n" /* out51 += a50 * b0h */ + /* Unroll 1 */ + "vldr d6, [%[b_ptr], #0x30] \n" /* load b14, b15 to d6 */ + "vmov d1, r0, r1 \n" /* mov a41, a51 to d1 */ + "vmla.f32 q4, q2, d3[0] \n" /* out00 += a01 * b1l */ + "ldr r0, [%[b_ptr], #0x38] \n" /* load b16 to r0 */ + "vmla.f32 q6, q2, d3[1] \n" /* out10 += a11 * b1l */ + "ldr r1, [%[b_ptr], #0x3C] \n" /* load b17 to r1 */ + "vmla.f32 q8, q2, d0[0] \n" /* out20 += a21 * 
b1l */ + "vldr d2, [%[a_ptr], #0x30] \n" /* load a02, a12 to d0 */ + "vmov d7, r0, r1 \n" /* mov b16, b17 to d7 */ + "vmla.f32 q10, q2, d0[1] \n" /* out30 += a31 * b1l */ + "pld [%[b_ptr], #0x100] \n" /* pre load apanel */ + "vmla.f32 q12, q2, d1[0] \n" /* out40 += a41 * b1l */ + "vmla.f32 q14, q2, d1[1] \n" /* out50 += a51 * b1l */ + "vldr d4, [%[b_ptr], #0x40] \n" /* load b20, b21 to d4 */ + "vmla.f32 q5, q3, d3[0] \n" /* out01 += a01 * b1h */ + "ldr r0, [%[b_ptr], #0x48] \n" /* load b22 to r0 */ + "vmla.f32 q7, q3, d3[1] \n" /* out11 += a11 * b1h */ + "ldr r1, [%[b_ptr], #0x4C] \n" /* load b23 to r1 */ + "vmla.f32 q9, q3, d0[0] \n" /* out21 += a21 * b1h */ + "vldr d3, [%[a_ptr], #0x38] \n" /* load a22, a32 to d3 */ + "vmov d5, r0, r1 \n" /* mov b22, b23 to d5 */ + "vmla.f32 q11, q3, d0[1] \n" /* out31 += a31 * b1h */ + "ldr r0, [%[a_ptr], #0x40] \n" /* load a42 to r0 */ + "vmla.f32 q13, q3, d1[0] \n" /* out41 += a41 * b1h */ + "ldr r1, [%[a_ptr], #0x44] \n" /* load a52 to r1 */ + "vmla.f32 q15, q3, d1[1] \n" /* out51 += a51 * b1h */ + /* Unroll 2 */ + "vldr d6, [%[b_ptr], #0x50] \n" /* load b24, b25 to d6 */ + "vmov d0, r0, r1 \n" /* mov a42, a52 to d0 */ + "vmla.f32 q4, q2, d2[0] \n" /* out00 += a02 * b2l */ + "ldr r0, [%[b_ptr], #0x58] \n" /* load b26 to r0 */ + "vmla.f32 q6, q2, d2[1] \n" /* out10 += a12 * b2l */ + "ldr r1, [%[b_ptr], #0x5C] \n" /* load b27 to r1 */ + "vmla.f32 q8, q2, d3[0] \n" /* out20 += a22 * b2l */ + "vldr d1, [%[a_ptr], #0x48] \n" /* load a03, a13 to d1 */ + "vmov d7, r0, r1 \n" /* mov b26, b27 to d7 */ + "vmla.f32 q10, q2, d3[1] \n" /* out30 += a32 * b2l */ + "pld [%[a_ptr], #0x140] \n" /* pre load apanel */ + "vmla.f32 q12, q2, d0[0] \n" /* out40 += a42 * b2l */ + "vmla.f32 q14, q2, d0[1] \n" /* out50 += a52 * b2l */ + "vldr d4, [%[b_ptr], #0x60] \n" /* load b30, b31 to d4 */ + "vmla.f32 q5, q3, d2[0] \n" /* out01 += a02 * b2h */ + "ldr r0, [%[b_ptr], #0x68] \n" /* load b32 to r0 */ + "vmla.f32 q7, q3, d2[1] \n" /* out11 += a12 * b2h 
*/ + "ldr r1, [%[b_ptr], #0x6C] \n" /* load b33 to r1 */ + "vmla.f32 q9, q3, d3[0] \n" /* out21 += a22 * b2h */ + "vldr d2, [%[a_ptr], #0x50] \n" /* load a23, a33 to d2 */ + "vmov d5, r0, r1 \n" /* mov b32, b33 to d5 */ + "vmla.f32 q11, q3, d3[1] \n" /* out31 += a32 * b2h */ + "ldr r0, [%[a_ptr], #0x58] \n" /* load a43 to r0 */ + "vmla.f32 q13, q3, d0[0] \n" /* out41 += a42 * b2h */ + "ldr r1, [%[a_ptr], #0x5C] \n" /* load a53 to r1 */ + "vmla.f32 q15, q3, d0[1] \n" /* out51 += a52 * b2h */ + "add %[a_ptr], %[a_ptr], #0x60 \n" /* aptr += 96 */ + /* Unroll 3 */ + "vldr d6, [%[b_ptr], #0x70] \n" /* load b34, b35 to d6 */ + "vmov d3, r0, r1 \n" /* mov a43, a53 to d3 */ + "vmla.f32 q4, q2, d1[0] \n" /* out00 += a03 * b3l */ + "ldr r0, [%[b_ptr], #0x78] \n" /* load b36 to r0 */ + "vmla.f32 q6, q2, d1[1] \n" /* out10 += a13 * b3l */ + "ldr r1, [%[b_ptr], #0x7C] \n" /* load b37 to r1 */ + "vmla.f32 q8, q2, d2[0] \n" /* out20 += a23 * b3l */ + "add %[b_ptr], %[b_ptr], #0x80 \n" /* bptr += 108 */ + "vldr d0, [%[a_ptr], #0x00] \n" /* load a00, a10 to d0 */ + "vmov d7, r0, r1 \n" /* mov b36, b37 to d7 */ + "vmla.f32 q10, q2, d2[1] \n" /* out30 += a33 * b3l */ + "pld [%[b_ptr], #0xC0] \n" /* pre load bpanel */ + "vmla.f32 q12, q2, d3[0] \n" /* out40 += a43 * b3l */ + "vmla.f32 q14, q2, d3[1] \n" /* out50 += a53 * b3l */ + "vldr d4, [%[b_ptr], #0x00] \n" /* load b00, b01 to d4 */ + "vmla.f32 q5, q3, d1[0] \n" /* out01 += a03 * b3h */ + "ldr r0, [%[b_ptr], #0x08] \n" /* load b02 to r0 */ + "vmla.f32 q7, q3, d1[1] \n" /* out11 += a13 * b3h */ + "ldr r1, [%[b_ptr], #0x0C] \n" /* load b03 to r1 */ + "vmla.f32 q9, q3, d2[0] \n" /* out21 += a23 * b3h */ + "subs %[k], %[k], #1 \n" /* loop k -= 1 */ + "vldr d1, [%[a_ptr], #0x08] \n" /* load a20, a30 to d1 */ + "vmov d5, r0, r1 \n" /* mov b02, b03 to d5 */ + "vmla.f32 q11, q3, d2[1] \n" /* out31 += a33 * b3h */ + "ldr r0, [%[a_ptr], #0x10] \n" /* load a40 to r0 */ + "vmla.f32 q13, q3, d3[0] \n" /* out41 += a43 * b3h */ + "ldr r1, 
[%[a_ptr], #0x14] \n" /* load a50 to r1 */ + "vmla.f32 q15, q3, d3[1] \n" /* out51 += a53 * b3h */ + "bne 1b \n" /* branch to k loop */ + "6:\n" + "sub %[tails], %[tails], #4 \n" /* tail -= 4 */ + "cmp %[tails], #4 \n" /* cmp tail with 4 */ + "blt 3f \n" /* branch to tail == 1 */ + /* Tail Unroll 0 */ + "vmov d2, r0, r1 \n" /* mov b02, b03 to d2 */ + "add %[a_ptr], %[a_ptr], #0x18 \n" /* aptr += 24 */ + "vmla.f32 q4, q2, d0[0] \n" /* out00 += a00 * b0l */ + "vld1.32 {d3}, [%[a_ptr] :64]! \n" /* load a01, a11 to d3 */ + "vmla.f32 q6, q2, d0[1] \n" /* out10 += a10 * b0l */ + "add %[b_ptr], %[b_ptr], #0x10 \n" /* bptr += 16 */ + "vmla.f32 q8, q2, d1[0] \n" /* out20 += a20 * b0l */ + "vld1.32 {d6-d7}, [%[b_ptr] :128]! \n" /* load b04-b07 to d6,d7 */ + "vmla.f32 q10, q2, d1[1] \n" /* out30 += a30 * b0l */ + "vmla.f32 q12, q2, d2[0] \n" /* out40 += a40 * b0l */ + "sub %[tails], %[tails], #4 \n" /* tail -= 4 */ + "vmla.f32 q14, q2, d2[1] \n" /* out50 += a50 * b0l */ + "vld1.32 {d4-d5}, [%[b_ptr] :128]! \n" /* load b10-b13 to d4,d5 */ + "vmla.f32 q5, q3, d0[0] \n" /* out01 += a00 * b0h */ + "vmla.f32 q7, q3, d0[1] \n" /* out11 += a10 * b0h */ + "vmla.f32 q9, q3, d1[0] \n" /* out21 += a20 * b0h */ + "vmla.f32 q11, q3, d1[1] \n" /* out31 += a30 * b0h */ + "vld1.32 {d0-d1}, [%[a_ptr] :64]! \n" /* load a21-a51 to d0,d1 */ + "cmp %[tails], #4 \n" /* cmp tail with 4 */ + "vmla.f32 q13, q3, d2[0] \n" /* out41 += a40 * b0h */ + "vmla.f32 q15, q3, d2[1] \n" /* out51 += a50 * b0h */ + "vld1.32 {d6-d7}, [%[b_ptr] :128]! 
\n" /* load b14-b17 to d6,d7 */ + "blt 4f \n" /* branch to tail == 2 */ + /* Tail Unroll 1 */ + "vmla.f32 q4, q2, d3[0] \n" /* out00 += a01 * b1l */ + "vmla.f32 q6, q2, d3[1] \n" /* out10 += a11 * b1l */ + "sub %[tails], %[tails], #4 \n" /* tail -= 4 */ + "vmla.f32 q8, q2, d0[0] \n" /* out20 += a21 * b1l */ + "vmla.f32 q10, q2, d0[1] \n" /* out30 += a31 * b1l */ + "vmla.f32 q12, q2, d1[0] \n" /* out40 += a41 * b1l */ + "vmla.f32 q14, q2, d1[1] \n" /* out50 += a51 * b1l */ + "vld1.32 {d4-d5}, [%[b_ptr] :128]! \n" /* load b20-b23 to d4,d5 */ + "vmla.f32 q5, q3, d3[0] \n" /* out01 += a01 * b1h */ + "vmla.f32 q7, q3, d3[1] \n" /* out11 += a11 * b1h */ + "cmp %[tails], #4 \n" /* cmp tail with 4 */ + "vld1.32 {d2-d3}, [%[a_ptr] :64]! \n" /* load a02-a32 to d2,d3 */ + "vmla.f32 q9, q3, d0[0] \n" /* out21 += a21 * b1h */ + "vmla.f32 q11, q3, d0[1] \n" /* out31 += a31 * b1h */ + "vmla.f32 q13, q3, d1[0] \n" /* out41 += a41 * b1h */ + "vmla.f32 q15, q3, d1[1] \n" /* out51 += a51 * b1h */ + "vld1.32 {d6-d7}, [%[b_ptr] :128]! \n" /* load b24-b27 to d6,d7 */ + "blt 5f \n" /* branch to tail == 3 */ + /* Tail Unroll 2 */ + "sub %[tails], %[tails], #4 \n" /* tail -= 4 */ + "vld1.32 {d0-d1}, [%[a_ptr] :64]! \n" /* a42a52a03a13 to d0,d1 */ + "vmla.f32 q4, q2, d2[0] \n" /* out00 += a02 * b2l */ + "vmla.f32 q6, q2, d2[1] \n" /* out10 += a12 * b2l */ + "vmla.f32 q8, q2, d3[0] \n" /* out20 += a22 * b2l */ + "vmla.f32 q10, q2, d3[1] \n" /* out30 += a32 * b2l */ + "vmla.f32 q12, q2, d0[0] \n" /* out40 += a42 * b2l */ + "vmla.f32 q14, q2, d0[1] \n" /* out50 += a52 * b2l */ + "vld1.32 {d4-d5}, [%[b_ptr] :128]! \n" /* load b30-b33 to d4,d5 */ + "vmla.f32 q5, q3, d2[0] \n" /* out01 += a02 * b2h */ + "vmla.f32 q7, q3, d2[1] \n" /* out11 += a12 * b2h */ + "vmla.f32 q9, q3, d3[0] \n" /* out21 += a22 * b2h */ + "vmla.f32 q11, q3, d3[1] \n" /* out31 += a32 * b2h */ + "vld1.32 {d2-d3}, [%[a_ptr] :64]! 
\n" /* load a23-a53 to d2,d3 */ + "vmla.f32 q13, q3, d0[0] \n" /* out41 += a42 * b2h */ + "vmla.f32 q15, q3, d0[1] \n" /* out51 += a52 * b2h */ + "vld1.32 {d6-d7}, [%[b_ptr] :128]! \n" /* load b34-b37 to d6,d7 */ + /* Tail Unroll 3 */ + "vmla.f32 q4, q2, d1[0] \n" /* out00 += a03 * b3l */ + "vmla.f32 q5, q3, d1[0] \n" /* out01 += a03 * b3h */ + "vmla.f32 q6, q2, d1[1] \n" /* out10 += a13 * b3l */ + "vmla.f32 q7, q3, d1[1] \n" /* out11 += a13 * b3h */ + "vmla.f32 q8, q2, d2[0] \n" /* out20 += a23 * b3l */ + "vmla.f32 q9, q3, d2[0] \n" /* out21 += a23 * b3h */ + "vmla.f32 q10, q2, d2[1] \n" /* out30 += a33 * b3l */ + "vmla.f32 q11, q3, d2[1] \n" /* out31 += a33 * b3h */ + "vmla.f32 q12, q2, d3[0] \n" /* out40 += a43 * b3l */ + "vmla.f32 q13, q3, d3[0] \n" /* out41 += a43 * b3h */ + "vmla.f32 q14, q2, d3[1] \n" /* out50 += a53 * b3l */ + "vmla.f32 q15, q3, d3[1] \n" /* out51 += a53 * b3h */ + "b 2f \n" /* branch to check relu */ + /* tails==1 final tail */ + "3:\n" + "vmov d2, r0, r1 \n" /* mov b02, b03 to d2 */ + "add %[b_ptr], %[b_ptr], #0x10 \n" /* bptr += 16 */ + "vmla.f32 q4, q2, d0[0] \n" /* out00 += a00 * b0l */ + "add %[a_ptr], %[a_ptr], #0x18 \n" /* aptr += 24 */ + "vmla.f32 q6, q2, d0[1] \n" /* out10 += a10 * b0l */ + "vld1.32 {d6-d7}, [%[b_ptr] :128]! 
\n" /* load b04-b07 to d6,d7 */ + "vmla.f32 q8, q2, d1[0] \n" /* out20 += a20 * b0l */ + "vmla.f32 q10, q2, d1[1] \n" /* out30 += a30 * b0l */ + "vmla.f32 q12, q2, d2[0] \n" /* out40 += a40 * b0l */ + "vmla.f32 q14, q2, d2[1] \n" /* out50 += a50 * b0l */ + "vmla.f32 q5, q3, d0[0] \n" /* out01 += a00 * b0h */ + "vmla.f32 q7, q3, d0[1] \n" /* out11 += a10 * b0h */ + "vmla.f32 q9, q3, d1[0] \n" /* out21 += a20 * b0h */ + "vmla.f32 q11, q3, d1[1] \n" /* out31 += a30 * b0h */ + "vmla.f32 q13, q3, d2[0] \n" /* out41 += a40 * b0h */ + "vmla.f32 q15, q3, d2[1] \n" /* out51 += a50 * b0h */ + "b 2f \n" /* branch to check relu */ + /* tails==2 final tail */ + "4:\n" + "vmla.f32 q4, q2, d3[0] \n" /* out00 += a01 * b1l */ + "vmla.f32 q5, q3, d3[0] \n" /* out01 += a01 * b1h */ + "vmla.f32 q6, q2, d3[1] \n" /* out10 += a11 * b1l */ + "vmla.f32 q7, q3, d3[1] \n" /* out11 += a11 * b1h */ + "vmla.f32 q8, q2, d0[0] \n" /* out20 += a21 * b1l */ + "vmla.f32 q9, q3, d0[0] \n" /* out21 += a21 * b1h */ + "vmla.f32 q10, q2, d0[1] \n" /* out30 += a31 * b1l */ + "vmla.f32 q11, q3, d0[1] \n" /* out31 += a31 * b1h */ + "vmla.f32 q12, q2, d1[0] \n" /* out40 += a41 * b1l */ + "vmla.f32 q13, q3, d1[0] \n" /* out41 += a41 * b1h */ + "vmla.f32 q14, q2, d1[1] \n" /* out50 += a51 * b1l */ + "vmla.f32 q15, q3, d1[1] \n" /* out51 += a51 * b1h */ + "b 2f \n" /* branch to check relu */ + /* tails==3 final tail */ + "5:\n" + "vmla.f32 q4, q2, d2[0] \n" /* out00 += a02 * b2l */ + "vld1.32 {d0}, [%[a_ptr] :64]! 
\n" /* load a42, a52 to d0 */ + "vmla.f32 q6, q2, d2[1] \n" /* out10 += a12 * b2l */ + "vmla.f32 q8, q2, d3[0] \n" /* out20 += a22 * b2l */ + "vmla.f32 q5, q3, d2[0] \n" /* out01 += a02 * b2h */ + "vmla.f32 q7, q3, d2[1] \n" /* out11 += a12 * b2h */ + "vmla.f32 q9, q3, d3[0] \n" /* out21 += a22 * b2h */ + "vmla.f32 q10, q2, d3[1] \n" /* out30 += a32 * b2l */ + "vmla.f32 q11, q3, d3[1] \n" /* out31 += a32 * b2h */ + "vmla.f32 q12, q2, d0[0] \n" /* out40 += a42 * b2l */ + "vmla.f32 q13, q3, d0[0] \n" /* out41 += a42 * b2h */ + "vmla.f32 q14, q2, d0[1] \n" /* out50 += a52 * b2l */ + "vmla.f32 q15, q3, d0[1] \n" /* out51 += a52 * b2h */ + /* relu */ + "2:\n" + "cmp %[tails], #1 \n" /* cmp tail is relu */ + "bne 0f \n" /* no relu branch to end */ + "vmov.i32 q0, #0 \n" /* mov 0.f to q0 */ + "vmax.f32 q4, q4, q0 \n" /* out00 relu */ + "vmax.f32 q5, q5, q0 \n" /* out01 relu */ + "vmax.f32 q6, q6, q0 \n" /* out10 relu */ + "vmax.f32 q7, q7, q0 \n" /* out11 relu */ + "vmax.f32 q8, q8, q0 \n" /* out20 relu */ + "vmax.f32 q9, q9, q0 \n" /* out21 relu */ + "vmax.f32 q10, q10, q0 \n" /* out30 relu */ + "vmax.f32 q11, q11, q0 \n" /* out31 relu */ + "vmax.f32 q12, q12, q0 \n" /* out40 relu */ + "vmax.f32 q13, q13, q0 \n" /* out41 relu */ + "vmax.f32 q14, q14, q0 \n" /* out50 relu */ + "vmax.f32 q15, q15, q0 \n" /* out51 relu */ + "0:\n" + "vst1.32 {d8-d11}, [%[c_ptr0]]! \n" /* store out0 to cptr0 */ + "vst1.32 {d12-d15}, [%[c_ptr1]]! \n" /* store out1 to cptr1 */ + "vst1.32 {d16-d19}, [%[c_ptr2]]! \n" /* store out2 to cptr2 */ + "vst1.32 {d20-d23}, [%[c_ptr3]]! \n" /* store out3 to cptr3 */ + "vst1.32 {d24-d27}, [%[c_ptr4]]! \n" /* store out4 to cptr4 */ + "vst1.32 {d28-d31}, [%[c_ptr5]]! 
\n" /* store out5 to cptr5 */ + : [a_ptr] "+r"(a_ptr), + [b_ptr] "+r"(b_ptr), + [c_ptr0] "+r"(c_ptr0), + [c_ptr1] "+r"(c_ptr1), + [c_ptr2] "+r"(c_ptr2), + [c_ptr3] "+r"(c_ptr3), + [c_ptr4] "+r"(c_ptr4), + [c_ptr5] "+r"(c_ptr5), + [k] "+r"(k), + [tails] "+r"(tails) + : [bias_ptr] "r"(bias_local) + : "r0", "r1", "q0","q1","q2","q3","q4", + "q5","q6","q7","q8","q9","q10","q11", + "q12","q13","q14","q15","cc","memory"); + // clang-format on + if (flag_p_remain && (xb == bblocks - 1)) { + for (int i = 0; i < remain; ++i) { + *pout0++ = cout0[i]; + *pout1++ = cout1[i]; + *pout2++ = cout2[i]; + *pout3++ = cout3[i]; + *pout4++ = cout4[i]; + *pout5++ = cout5[i]; + } + } + } + } + } +} + void sgemm_prepacked_4x8(bool is_transB, int M, int N, diff --git a/lite/backends/arm/math/pooling.cc b/lite/backends/arm/math/pooling.cc index 0955b09d92f64066000b03c4487f359880f1c2a5..fdcbc7394b1be9e438686f91dfa407065d24f91a 100644 --- a/lite/backends/arm/math/pooling.cc +++ b/lite/backends/arm/math/pooling.cc @@ -21,6 +21,17 @@ namespace paddle { namespace lite { namespace arm { namespace math { + +int AdaptStartIndex(int ph, int input_size, int output_size) { + return static_cast( + floor(static_cast(ph * input_size) / output_size)); +} + +int AdaptEndIndex(int ph, int input_size, int output_size) { + return static_cast( + ceil(static_cast((ph + 1) * input_size) / output_size)); +} + void pooling_basic(const float* din, float* dout, int num, @@ -88,15 +99,27 @@ void pooling_basic(const float* din, #pragma omp parallel for for (int ind_c = 0; ind_c < chin; ++ind_c) { for (int ind_h = 0; ind_h < hout; ++ind_h) { - int sh = ind_h * stride_h; - int eh = sh + kernel_h; - sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; - eh = (eh - pad_h) > hin ? hin : eh - pad_h; + int sh, eh; + if (adaptive) { + sh = AdaptStartIndex(ind_h, hin, hout); + eh = AdaptEndIndex(ind_h, hin, hout); + } else { + sh = ind_h * stride_h; + eh = sh + kernel_h; + sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; + eh = (eh - pad_h) > hin ? 
hin : eh - pad_h; + } for (int ind_w = 0; ind_w < wout; ++ind_w) { - int sw = ind_w * stride_w; - int ew = sw + kernel_w; - sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; - ew = (ew - pad_w) > win ? win : ew - pad_w; + int sw, ew; + if (adaptive) { + sw = AdaptStartIndex(ind_w, win, wout); + ew = AdaptEndIndex(ind_w, win, wout); + } else { + sw = ind_w * stride_w; + ew = sw + kernel_w; + sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; + ew = (ew - pad_w) > win ? win : ew - pad_w; + } float result = static_cast(0); int dst_ind = (ind_n * chout + ind_c) * size_channel_out + ind_h * wout + ind_w; diff --git a/lite/backends/arm/math/scale.cc b/lite/backends/arm/math/scale.cc index 5aad98c05c56f85931b7a0276d0a85b426573c4c..aab1058b9dd66522a0793fc151c54707505d1fbb 100644 --- a/lite/backends/arm/math/scale.cc +++ b/lite/backends/arm/math/scale.cc @@ -27,31 +27,467 @@ void scale( int remain = num % 16; float32x4_t vscale = vdupq_n_f32(scale); float32x4_t vbias = vdupq_n_f32(bias); + if (cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "1: \n" + "ld1 {v4.4s}, [%[din]], #16 \n" + "and v8.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v5.4s}, [%[din]], #16 \n" + "and v9.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v6.4s}, [%[din]], #16 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v7.4s}, [%[din]], #16 \n" + "and v11.16b, %[vbias].16b, %[vbias].16b \n" + + "fmla v8.4s, v4.4s, %[vscale].4s \n" + "fmla v9.4s, v5.4s, %[vscale].4s \n" + "fmla v10.4s, v6.4s, %[vscale].4s \n" + "fmla v11.4s, v7.4s, %[vscale].4s \n" + + "stp q8, q9, [%[dout]], #32 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "stp q10, q11, [%[dout]], #32 \n" + + "bne 1b \n" + "0: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), [vbias] "w"(vbias) + : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"); +#else + asm volatile( + "1: @ loop header \n" + "vld1.32 {d8-d11}, [%[din]]! 
@ load din 0 \n" + "vand.32 q8, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q9, %q[vbias], %q[vbias] @ out bias \n" + "vld1.32 {d12-d15}, [%[din]]! @ load din 0 \n" + + "vand.32 q10, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q11, %q[vbias], %q[vbias] @ out bias \n" + + "vmla.f32 q8, q4, %q[vscale] @ mla \n" + "vmla.f32 q9, q5, %q[vscale] @ mla \n" + "vmla.f32 q10, q6, %q[vscale] @ mla \n" + "vmla.f32 q11, q7, %q[vscale] @ mla \n" + + "vst1.32 {d16-d19}, [%[dout]]! @ store result, add pointer\n" + "subs %[cnt], #1 @ loop count minus 1\n" + "vst1.32 {d20-d23}, [%[dout]]! @ store result, add pointer\n" + + "bne 1b @ jump to main loop start " + "2: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), [vbias] "w"(vbias) + : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"); +#endif + } + if (remain > 0) { + for (int i = 0; i < remain; i++) { + *dout = *din * scale + bias; + dout++; + din++; + } + } +} + +template <> +void scale_relu( + const float* din, float* dout, int num, float scale, float bias) { + int cnt = num >> 4; + int remain = num % 16; + float32x4_t vscale = vdupq_n_f32(scale); + float32x4_t vbias = vdupq_n_f32(bias); + float32x4_t vzero = vdupq_n_f32(0.f); + if (cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "1: \n" + "ld1 {v4.4s}, [%[din]], #16 \n" + "and v8.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v5.4s}, [%[din]], #16 \n" + "and v9.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v6.4s}, [%[din]], #16 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b\n" + "ld1 {v7.4s}, [%[din]], #16 \n" + "and v11.16b, %[vbias].16b, %[vbias].16b\n" + + "fmla v8.4s, v4.4s, %[vscale].4s \n" + "fmla v9.4s, v5.4s, %[vscale].4s \n" + "fmla v10.4s, v6.4s, %[vscale].4s \n" + "fmla v11.4s, v7.4s, %[vscale].4s \n" + + "fmax v8.4s, v8.4s, %[vzero].4s \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + + "stp q8, q9, [%[dout]], #32 \n" + "subs %w[cnt], 
%w[cnt], #1 \n" + "stp q10, q11, [%[dout]], #32 \n" + "bne 1b \n" + "0: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), [vbias] "w"(vbias), [vzero] "w"(vzero) + : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"); +#else + asm volatile( + "1: @ loop header \n" + "vld1.32 {d8-d11}, [%[din]]! @ load din 0 \n" + "vand.32 q8, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q9, %q[vbias], %q[vbias] @ out bias \n" + "vld1.32 {d12-d15}, [%[din]]! @ load din 0 \n" + + "vand.32 q10, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q11, %q[vbias], %q[vbias] @ out bias \n" + + "vmla.f32 q8, q4, %q[vscale] @ mla \n" + "vmla.f32 q9, q5, %q[vscale] @ mla \n" + "vmla.f32 q10, q6, %q[vscale] @ mla \n" + "vmla.f32 q11, q7, %q[vscale] @ mla \n" + + "vmax.f32 q8, q8, %q[vzero] @ relu \n" + "vmax.f32 q9, q9, %q[vzero] @ relu \n" + "vmax.f32 q10, q10, %q[vzero] @ relu \n" + "vmax.f32 q11, q11, %q[vzero] @ relu \n" + + "vst1.32 {d16-d19}, [%[dout]]! @ store result, add pointer\n" + "subs %[cnt], #1 @ loop count minus 1\n" + "vst1.32 {d20-d23}, [%[dout]]! @ store result, add pointer\n" + + "bne 1b @ jump to main loop start " + "2: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), [vbias] "w"(vbias), [vzero] "w"(vzero) + : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"); +#endif + } + if (remain > 0) { + for (int i = 0; i < remain; i++) { + *dout = *din * scale + bias; + *dout = *dout > 0.f ? 
*dout : 0.f; + dout++; + din++; + } + } +} + +template <> +void scale_relu6(const float* din, + float* dout, + int num, + float scale, + float bias, + float alpha) { + int cnt = num >> 4; + int remain = num % 16; + float32x4_t vscale = vdupq_n_f32(scale); + float32x4_t vbias = vdupq_n_f32(bias); + float32x4_t vzero = vdupq_n_f32(0.f); + float32x4_t valpha = vdupq_n_f32(alpha); + if (cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "1: \n" + "ld1 {v4.4s}, [%[din]], #16 \n" + "and v8.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v5.4s}, [%[din]], #16 \n" + "and v9.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v6.4s}, [%[din]], #16 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v7.4s}, [%[din]], #16 \n" + "and v11.16b, %[vbias].16b, %[vbias].16b \n" + + "fmla v8.4s, v4.4s, %[vscale].4s \n" + "fmla v9.4s, v5.4s, %[vscale].4s \n" + "fmla v10.4s, v6.4s, %[vscale].4s \n" + "fmla v11.4s, v7.4s, %[vscale].4s \n" + + "fmax v8.4s, v8.4s, %[vzero].4s \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + + "fmin v8.4s, v8.4s, %[valpha].4s \n" + "fmin v9.4s, v9.4s, %[valpha].4s \n" + "fmin v10.4s, v10.4s, %[valpha].4s \n" + "fmin v11.4s, v11.4s, %[valpha].4s \n" + + "stp q8, q9, [%[dout]], #32 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "stp q10, q11, [%[dout]], #32 \n" + "bne 1b \n" + "0: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), + [vbias] "w"(vbias), + [vzero] "w"(vzero), + [valpha] "w"(valpha) + : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"); +#else + asm volatile( + "1: @ loop header \n" + "vld1.32 {d8-d11}, [%[din]]! @ load din 0 \n" + "vand.32 q8, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q9, %q[vbias], %q[vbias] @ out bias \n" + "vld1.32 {d12-d15}, [%[din]]! 
@ load din 0 \n" + + "vand.32 q10, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q11, %q[vbias], %q[vbias] @ out bias \n" + + "vmla.f32 q8, q4, %q[vscale] @ mla \n" + "vmla.f32 q9, q5, %q[vscale] @ mla \n" + "vmla.f32 q10, q6, %q[vscale] @ mla \n" + "vmla.f32 q11, q7, %q[vscale] @ mla \n" + + "vmax.f32 q8, q8, %q[vzero] @ relu \n" + "vmax.f32 q9, q9, %q[vzero] @ relu \n" + "vmax.f32 q10, q10, %q[vzero] @ relu \n" + "vmax.f32 q11, q11, %q[vzero] @ relu \n" + + "vmin.f32 q8, q8, %q[valpha] @ relu \n" + "vmin.f32 q9, q9, %q[valpha] @ relu \n" + "vmin.f32 q10, q10, %q[valpha] @ relu \n" + "vmin.f32 q11, q11, %q[valpha] @ relu \n" + + "vst1.32 {d16-d19}, [%[dout]]! @ store result, add pointer\n" + "subs %[cnt], #1 @ loop count minus 1\n" + "vst1.32 {d20-d23}, [%[dout]]! @ store result, add pointer\n" + + "bne 1b @ jump to main loop start " + "2: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), + [vbias] "w"(vbias), + [vzero] "w"(vzero), + [valpha] "w"(valpha) + : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"); +#endif + } + if (remain > 0) { + for (int i = 0; i < remain; i++) { + *dout = *din * scale + bias; + *dout = *dout > 0.f ? (*dout < alpha ? 
*dout : alpha) : 0.f; + dout++; + din++; + } + } +} + +template <> +void scale_leaky_relu(const float* din, + float* dout, + int num, + float scale, + float bias, + float alpha) { + int cnt = num >> 4; + int remain = num % 16; + float32x4_t vscale = vdupq_n_f32(scale); + float32x4_t vbias = vdupq_n_f32(bias); + float32x4_t vzero = vdupq_n_f32(0.f); + float32x4_t valpha = vdupq_n_f32(alpha); + if (cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "1: \n" + "ld1 {v4.4s}, [%[din]], #16 \n" + "and v8.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v5.4s}, [%[din]], #16 \n" + "and v9.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v6.4s}, [%[din]], #16 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v7.4s}, [%[din]], #16 \n" + "and v11.16b, %[vbias].16b, %[vbias].16b \n" + + "fmla v8.4s, v4.4s, %[vscale].4s \n" + "fmla v9.4s, v5.4s, %[vscale].4s \n" + "fmla v10.4s, v6.4s, %[vscale].4s \n" + "fmla v11.4s, v7.4s, %[vscale].4s \n" + + "fcmge v12.4s, v8.4s, %[vzero].4s \n" + "fmul v16.4s, v8.4s, %[valpha].4s \n" + + "fcmge v13.4s, v9.4s, %[vzero].4s \n" + "fmul v17.4s, v9.4s, %[valpha].4s \n" + + "fcmge v14.4s, v10.4s, %[vzero].4s \n" + "fmul v18.4s, v10.4s, %[valpha].4s \n" + + "fcmge v15.4s, v11.4s, %[vzero].4s \n" + "fmul v19.4s, v11.4s, %[valpha].4s \n" + + "bif v8.16b, v16.16b, v12.16b \n" /* choose*/ + "bif v9.16b, v17.16b, v13.16b \n" /* choose*/ + "bif v10.16b, v18.16b, v14.16b \n" /* choose*/ + "bif v11.16b, v19.16b, v15.16b \n" /* choose*/ + + "stp q8, q9, [%[dout]], #32 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "stp q10, q11, [%[dout]], #32 \n" + "bne 1b \n" + "0: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), + [vbias] "w"(vbias), + [vzero] "w"(vzero), + [valpha] "w"(valpha) + : "cc", + "memory", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); +#else + asm volatile( + "1: @ loop header \n" + "vld1.32 {d8-d11}, [%[din]]! 
@ load din 0 \n" + "vand.32 q8, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q9, %q[vbias], %q[vbias] @ out bias \n" + "vld1.32 {d12-d15}, [%[din]]! @ load din 0 \n" + + "vand.32 q10, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q11, %q[vbias], %q[vbias] @ out bias \n" + + "vmla.f32 q8, q4, %q[vscale] @ mla \n" + "vmla.f32 q9, q5, %q[vscale] @ mla \n" + "vmla.f32 q10, q6, %q[vscale] @ mla \n" + "vmla.f32 q11, q7, %q[vscale] @ mla \n" + + "vcge.f32 q12, q8, %q[vzero] @ relu \n" + "vmul.f32 q14, q8, %q[valpha] @ mul \n" + "vcge.f32 q13, q9, %q[vzero] @ relu \n" + "vmul.f32 q15, q9, %q[valpha] @ mul \n" + "vbif q8, q14, q12 @ choose \n" + "vbif q9, q15, q13 @ choose \n" + + "vcge.f32 q12, q10, %q[vzero] @ relu \n" + "vmul.f32 q14, q10, %q[valpha] @ mul \n" + "vcge.f32 q13, q11, %q[vzero] @ relu \n" + "vmul.f32 q15, q11, %q[valpha] @ mul \n" + + "vst1.32 {d16-d19}, [%[dout]]! @ store result, add pointer\n" + + "vbif q10, q14, q12 @ choose \n" + "vbif q11, q15, q13 @ choose \n" + "subs %[cnt], #1 @ loop count minus 1\n" + "vst1.32 {d20-d23}, [%[dout]]! @ store result, add pointer\n" + + "bne 1b @ jump to main loop start " + "2: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), + [vbias] "w"(vbias), + [vzero] "w"(vzero), + [valpha] "w"(valpha) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); +#endif + } + if (remain > 0) { + for (int i = 0; i < remain; i++) { + *dout = *din * scale + bias; + *dout = *dout > 0.f ? 
*dout : (*dout * alpha); + dout++; + din++; + } + } +} + +template <> +void scale(const int* din, int* dout, int num, int scale, int bias) { + int cnt = num >> 4; + int remain = num % 16; + int32x4_t vscale = vdupq_n_s32(scale); + int32x4_t vbias = vdupq_n_s32(bias); +#pragma omp parallel for + for (int i = 0; i < cnt; i++) { + const int* din_ptr = din + (i << 4); + int* dout_ptr = dout + (i << 4); + + int32x4_t din0 = vld1q_s32(din_ptr); + int32x4_t din1 = vld1q_s32(din_ptr + 4); + int32x4_t din2 = vld1q_s32(din_ptr + 8); + int32x4_t din3 = vld1q_s32(din_ptr + 12); + + int32x4_t vsum1 = vmlaq_s32(vbias, din0, vscale); + int32x4_t vsum2 = vmlaq_s32(vbias, din1, vscale); + int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale); + int32x4_t vsum4 = vmlaq_s32(vbias, din3, vscale); + + vst1q_s32(dout_ptr, vsum1); + vst1q_s32(dout_ptr + 4, vsum2); + vst1q_s32(dout_ptr + 8, vsum3); + vst1q_s32(dout_ptr + 12, vsum4); + } + if (remain > 0) { + const int* din_ptr = din + (cnt << 4); + int* dout_ptr = dout + (cnt << 4); + for (int i = 0; i < remain; i++) { + *dout_ptr = *din_ptr * scale + bias; + dout_ptr++; + din_ptr++; + } + } +} + +template <> +void scale_relu(const int* din, int* dout, int num, int scale, int bias) { + int cnt = num >> 4; + int remain = num % 16; + int32x4_t vscale = vdupq_n_s32(scale); + int32x4_t vbias = vdupq_n_s32(bias); + int32x4_t vzero = vdupq_n_s32(0); #pragma omp parallel for for (int i = 0; i < cnt; i++) { - const float* din_ptr = din + (i << 4); - float* dout_ptr = dout + (i << 4); + const int* din_ptr = din + (i << 4); + int* dout_ptr = dout + (i << 4); - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); + int32x4_t din0 = vld1q_s32(din_ptr); + int32x4_t din1 = vld1q_s32(din_ptr + 4); + int32x4_t din2 = vld1q_s32(din_ptr + 8); + int32x4_t din3 = vld1q_s32(din_ptr + 12); - float32x4_t vsum1 = vmlaq_f32(vbias, din0, vscale); - 
float32x4_t vsum2 = vmlaq_f32(vbias, din1, vscale); - float32x4_t vsum3 = vmlaq_f32(vbias, din2, vscale); - float32x4_t vsum4 = vmlaq_f32(vbias, din3, vscale); + int32x4_t vsum1 = vmlaq_s32(vbias, din0, vscale); + int32x4_t vsum2 = vmlaq_s32(vbias, din1, vscale); + int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale); + int32x4_t vsum4 = vmlaq_s32(vbias, din3, vscale); - vst1q_f32(dout_ptr, vsum1); - vst1q_f32(dout_ptr + 4, vsum2); - vst1q_f32(dout_ptr + 8, vsum3); - vst1q_f32(dout_ptr + 12, vsum4); + vsum1 = vmaxq_s32(vsum1, vzero); + vsum2 = vmaxq_s32(vsum2, vzero); + vsum3 = vmaxq_s32(vsum3, vzero); + vsum4 = vmaxq_s32(vsum4, vzero); + + vst1q_s32(dout_ptr, vsum1); + vst1q_s32(dout_ptr + 4, vsum2); + vst1q_s32(dout_ptr + 8, vsum3); + vst1q_s32(dout_ptr + 12, vsum4); } if (remain > 0) { - const float* din_ptr = din + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); + const int* din_ptr = din + (cnt << 4); + int* dout_ptr = dout + (cnt << 4); for (int i = 0; i < remain; i++) { *dout_ptr = *din_ptr * scale + bias; + *dout_ptr = *dout_ptr > 0 ? 
*dout_ptr : 0; dout_ptr++; din_ptr++; } @@ -59,11 +495,66 @@ void scale( } template <> -void scale(const int* din, int* dout, int num, int scale, int bias) { +void scale_relu6( + const int* din, int* dout, int num, int scale, int bias, int alpha) { + int cnt = num >> 4; + int remain = num % 16; + int32x4_t vscale = vdupq_n_s32(scale); + int32x4_t vbias = vdupq_n_s32(bias); + int32x4_t vzero = vdupq_n_s32(0); + int32x4_t valpha = vdupq_n_s32(alpha); +#pragma omp parallel for + for (int i = 0; i < cnt; i++) { + const int* din_ptr = din + (i << 4); + int* dout_ptr = dout + (i << 4); + + int32x4_t din0 = vld1q_s32(din_ptr); + int32x4_t din1 = vld1q_s32(din_ptr + 4); + int32x4_t din2 = vld1q_s32(din_ptr + 8); + int32x4_t din3 = vld1q_s32(din_ptr + 12); + + int32x4_t vsum1 = vmlaq_s32(vbias, din0, vscale); + int32x4_t vsum2 = vmlaq_s32(vbias, din1, vscale); + int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale); + int32x4_t vsum4 = vmlaq_s32(vbias, din3, vscale); + + vsum1 = vmaxq_s32(vsum1, vzero); + vsum2 = vmaxq_s32(vsum2, vzero); + vsum3 = vmaxq_s32(vsum3, vzero); + vsum4 = vmaxq_s32(vsum4, vzero); + + vsum1 = vminq_s32(vsum1, valpha); + vsum2 = vminq_s32(vsum2, valpha); + vsum3 = vminq_s32(vsum3, valpha); + vsum4 = vminq_s32(vsum4, valpha); + + vst1q_s32(dout_ptr, vsum1); + vst1q_s32(dout_ptr + 4, vsum2); + vst1q_s32(dout_ptr + 8, vsum3); + vst1q_s32(dout_ptr + 12, vsum4); + } + + if (remain > 0) { + const int* din_ptr = din + (cnt << 4); + int* dout_ptr = dout + (cnt << 4); + for (int i = 0; i < remain; i++) { + *dout_ptr = *din_ptr * scale + bias; + *dout_ptr = *dout_ptr > 0 ? (*dout_ptr > alpha ? 
alpha : *dout_ptr) : 0; + dout_ptr++; + din_ptr++; + } + } +} + +template <> +void scale_leaky_relu( + const int* din, int* dout, int num, int scale, int bias, int alpha) { int cnt = num >> 4; int remain = num % 16; int32x4_t vscale = vdupq_n_s32(scale); int32x4_t vbias = vdupq_n_s32(bias); + int32x4_t vzero = vdupq_n_s32(0); + int32x4_t valpha = vdupq_n_s32(alpha); #pragma omp parallel for for (int i = 0; i < cnt; i++) { const int* din_ptr = din + (i << 4); @@ -79,16 +570,33 @@ void scale(const int* din, int* dout, int num, int scale, int bias) { int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale); int32x4_t vsum4 = vmlaq_s32(vbias, din3, vscale); + uint32x4_t v1 = vcgeq_s32(vsum1, vzero); + uint32x4_t v2 = vcgeq_s32(vsum2, vzero); + uint32x4_t v3 = vcgeq_s32(vsum3, vzero); + uint32x4_t v4 = vcgeq_s32(vsum4, vzero); + + int32x4_t v11 = vmulq_s32(vsum1, valpha); + int32x4_t v21 = vmulq_s32(vsum1, valpha); + int32x4_t v31 = vmulq_s32(vsum1, valpha); + int32x4_t v41 = vmulq_s32(vsum1, valpha); + + vsum1 = vbslq_s32(v1, vsum1, v11); + vsum2 = vbslq_s32(v2, vsum2, v21); + vsum3 = vbslq_s32(v3, vsum3, v31); + vsum4 = vbslq_s32(v4, vsum4, v41); + vst1q_s32(dout_ptr, vsum1); vst1q_s32(dout_ptr + 4, vsum2); vst1q_s32(dout_ptr + 8, vsum3); vst1q_s32(dout_ptr + 12, vsum4); } + if (remain > 0) { const int* din_ptr = din + (cnt << 4); int* dout_ptr = dout + (cnt << 4); for (int i = 0; i < remain; i++) { *dout_ptr = *din_ptr * scale + bias; + *dout_ptr = *dout_ptr > 0 ? 
*dout_ptr : (*dout_ptr) * alpha; dout_ptr++; din_ptr++; } diff --git a/lite/backends/arm/math/scale.h b/lite/backends/arm/math/scale.h index 910bea5613997c05e9257507f8f84792e0071a53..bbdb596bc8f45c247a24f9833680c8a510c1e904 100644 --- a/lite/backends/arm/math/scale.h +++ b/lite/backends/arm/math/scale.h @@ -40,6 +40,15 @@ void scale_compute_basic(const operators::ScaleParam& param) { template void scale(const T* din, T* dout, int num, T scale, T bias); +template +void scale_relu(const T* din, T* dout, int num, T scale, T bias); + +template +void scale_relu6(const T* din, T* dout, int num, T scale, T bias, T alpha); + +template +void scale_leaky_relu(const T* din, T* dout, int num, T scale, T bias, T alpha); + template void scale(const T* din, T* dout, diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h index 41059a0d42a95bbffed4c41611b9f3b8ac60861c..06e6c7ee46d8b839873d433843f0035e3963664c 100644 --- a/lite/backends/opencl/cl_context.h +++ b/lite/backends/opencl/cl_context.h @@ -28,6 +28,7 @@ namespace lite { class CLContext { public: ~CLContext() { + GetCommandQueue().finish(); for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) { // Note(ysh329): Don't need `clReleaseKernel` kernels_[kidx].reset(); diff --git a/lite/backends/opencl/cl_functions_test.cc b/lite/backends/opencl/cl_functions_test.cc index ba32d8c803bfd832289a936fe9150ba8d14cd984..17c879269cb745481cd2b474833e71f7417e7bad 100644 --- a/lite/backends/opencl/cl_functions_test.cc +++ b/lite/backends/opencl/cl_functions_test.cc @@ -100,16 +100,18 @@ TEST(cl_test, kernel_test) { size_t width = in_image.ImageWidth(); size_t height = in_image.ImageHeight(); auto global_work_size = cl::NDRange{width, height}; - cl::Event event; status = context->GetCommandQueue().enqueueNDRangeKernel( - kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event); + kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr); CL_CHECK_FATAL(status); status = 
context->GetCommandQueue().finish(); CL_CHECK_FATAL(status); +#if 0 double start_nanos = event.getProfilingInfo(); double stop_nanos = event.getProfilingInfo(); double elapsed_micros = (stop_nanos - start_nanos) / 1000.0; LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us."; +#endif + LOG(INFO) << out_image; } diff --git a/lite/backends/opencl/cl_image_converter.cc b/lite/backends/opencl/cl_image_converter.cc index 7e6f83a4d12f82c780b8e2a8ba582d6a13d8dc07..2cfcc5dc81576973ef20fc0855131472ec2c0977 100644 --- a/lite/backends/opencl/cl_image_converter.cc +++ b/lite/backends/opencl/cl_image_converter.cc @@ -73,7 +73,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, i2 += 4; p++; } else { - image[i2] = 0.0; + image[i2] = Float2Half(0.f); i2 += 4; } } @@ -261,7 +261,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, image[index] = Float2Half(*p); p++; } else { - image[index] = 0.0; + image[index] = Float2Half(0.f); } if (index >= (width * height * 4)) { LOG(INFO) << " index out of range "; diff --git a/lite/backends/opencl/cl_kernel/cl_common.h b/lite/backends/opencl/cl_kernel/cl_common.h index 582e6a08b16ea7b5b8edd5850b1c9af04db56aad..b427eb70d6cdbb5cd495e970fb77c4790bc01723 100644 --- a/lite/backends/opencl/cl_kernel/cl_common.h +++ b/lite/backends/opencl/cl_kernel/cl_common.h @@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once ///////////////////////////////// @@ -108,7 +107,8 @@ inline CL_DTYPE4 activation_type4(CL_DTYPE4 in #endif #ifdef RELU6 - output = clamp(in, (CL_DTYPE4)0, (CL_DTYPE4)6); + in = fmax((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in); + output = fmin((CL_DTYPE4)(6.0f, 6.0f, 6.0f, 6.0f), in); #endif return output; } diff --git a/lite/backends/opencl/cl_kernel/image/activation_kernel.cl b/lite/backends/opencl/cl_kernel/image/activation_kernel.cl index 08491d5d9fd195430a4b03673c38767f7e4a5be8..a4070f747aec43f7a0ed097f9b15186cafd32476 100644 --- a/lite/backends/opencl/cl_kernel/image/activation_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/activation_kernel.cl @@ -14,36 +14,30 @@ limitations under the License. */ #include - __kernel void relu(__read_only image2d_t input, __write_only image2d_t output, __private const float threshold, __private const float scale) { + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height - const int x = get_global_id(0); // image_width - const int y = get_global_id(1); // image_height - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); in = max((CL_DTYPE4)(0.0f), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); } - __kernel void relu6(__read_only image2d_t input, __write_only image2d_t output, __private const float threshold, - __private const float scale){ - + __private const float scale) { const int x = get_global_id(0); const int y = get_global_id(1); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); in = 
max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in); @@ -51,7 +45,6 @@ __kernel void relu6(__read_only image2d_t input, WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); } - __kernel void sigmoid(__read_only image2d_t input, __write_only image2d_t output, __private const float threshold, @@ -64,70 +57,66 @@ __kernel void sigmoid(__read_only image2d_t input, CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 out; - out.x = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.x))); - out.y = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.y))); - out.z = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.z))); - out.w = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.w))); + + out.x = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.x)))); + out.y = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.y)))); + out.z = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.z)))); + out.w = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.w)))); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); } __kernel void leaky_relu(__read_only image2d_t input, - __write_only image2d_t output, - __private const float threshold, - __private const float scale) { + __write_only image2d_t output, + __private const float threshold, + __private const float scale) { const int x = get_global_id(0); const int y = get_global_id(1); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 s_val = CONVERT_TYPE_TO(scale, CL_DTYPE) * in; - if (in.x < 0.0f){ + if (in.x < 0.0f) { in.x = s_val.x; } - if (in.y < 0.0f){ + if (in.y < 0.0f) { in.y = s_val.y; } - if (in.z < 0.0f){ + if (in.z < 0.0f) { in.z = s_val.z; } - if (in.w < 0.0f){ + if (in.w < 0.0f) { in.w = s_val.w; } WRITE_IMG_TYPE(CL_DTYPE_CHAR, 
output, (int2)(x, y), in); } __kernel void tanh_act(__read_only image2d_t input, - __write_only image2d_t output, - __private const float threshold, - __private const float scale) { - - const int x = get_global_id(0); // image_width - const int y = get_global_id(1); // image_height + __write_only image2d_t output, + __private const float threshold, + __private const float scale) { + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); - CL_DTYPE4 out= (exp(in) - exp(-in))/ (exp(in) + exp(-in)); + CL_DTYPE4 out = (exp(in) - exp(-in)) / (exp(in) + exp(-in)); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); } __kernel void exp_act(__read_only image2d_t input, __write_only image2d_t output, __private const float threshold, - __private const float scale) { - - const int x = get_global_id(0); // image_width - const int y = get_global_id(1); // image_height + __private const float scale) { + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 out = exp(in); @@ -135,19 +124,16 @@ __kernel void exp_act(__read_only image2d_t input, } __kernel void swish(__read_only image2d_t input, - __write_only image2d_t output, - __private const float threshold, - __private const float scale) { - - const int x = get_global_id(0); // image_width - const int y = get_global_id(1); // image_height + __write_only image2d_t output, + __private const 
float threshold, + __private const float scale) { + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 out = in / (1 + exp(-(CL_DTYPE)scale * in)); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); } - diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl index 4b2d5ba32072e7eb31adbf347360e0bbcee7bc5b..1c808da68ddc923e12234bc4b6ac99b35bfffb0b 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl @@ -1,28 +1,29 @@ #include -__kernel void conv2d_1x1_opt(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input_image, - __read_only image2d_t filter, +__kernel void conv2d_1x1_opt( + __private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, #endif #ifdef BATCH_NORM -__read_only image2d_t new_scale, - __read_only image2d_t new_biase, + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, #endif - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c_block, - __private const int input_c_origin, - __private const int dilation, - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, - __private const int 
output_height, - __private const int old_w) { + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c_block, + __private const int input_c_origin, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height, + __private const int old_w) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); @@ -287,7 +288,7 @@ __kernel void conv2d_1x1_simple( __read_only image2d_t bias, #endif #ifdef BATCH_NORM -__read_only image2d_t new_scale, + __read_only image2d_t new_scale, __read_only image2d_t new_biase, #endif __write_only image2d_t output_image, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl index 8d7950d6b897df833ada56e2de5be7c6203de9ea..771765ea6063a08784ae824a757b28450d808f6d 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl @@ -27,402 +27,509 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, __private const int offset, __private const int input_c, __private const int dilation, - __private const int input_width,/* of one block */ - __private const int input_height,/* of one block */ + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ __private const int output_width, __private const int output_height, __private const int output_c, __private const int filter_channel, - __private const int filter_width, - __private const int filter_height, - __private const int group) { + __private const int filter_width, + __private const int filter_height, + __private const int group, + __private const int input_tensor_c - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int 
out_nh = get_global_id(2); +) { - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - if (out_c >= global_size_dim0 || - out_w >= global_size_dim1 || - out_nh >= global_size_dim2) { - return; - } + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + if (out_c >= global_size_dim0 || out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + return; + } - int2 stride_xy; - stride_xy.x = stride; - stride_xy.y = stride; + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; - int2 ouput_pos_in_one_block; - ouput_pos_in_one_block.x = out_w; - ouput_pos_in_one_block.y = out_nh; + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; - int2 in_pos_in_one_block; - in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; - in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; #ifdef BIASE_CH - CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); + CL_DTYPE4 output = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); + CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); #else - CL_DTYPE4 output = 0.0f; + CL_DTYPE4 output = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f); #endif - CL_DTYPE4 input[9]; // 3x3 region of input - if (group == 1) { - for (int i = 0; i < input_c; ++i) { // each run for 3x3 - int2 pos_in = (int2)(i * 
input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - - input[0] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y - dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); - - input[1] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x, pos_in.y - dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); - - input[2] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y - dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); - - input[3] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - input[4] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x, pos_in.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - input[5] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - 
input[6] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y + dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); - - input[7] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x, pos_in.y + dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); - - input[8] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y + dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); - - int j = 0; - int2 pos_of_weight; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - CL_DTYPE4 weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y += 3; - CL_DTYPE4 weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y += 3; - CL_DTYPE4 weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y += 3; - CL_DTYPE4 weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 1; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = 
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 2; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 3; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 4; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, 
filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 5; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 6; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - 
- j = 7; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 8; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); + CL_DTYPE4 input[9]; // 3x3 region of input + if (group == 1) { + for (int i = 0; i < input_c; ++i) { // each run for 3x3 + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, + in_pos_in_one_block.y); + + input[0] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x - dilation, pos_in.y - dilation)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y - dilation >= 
input_height) + << 15)); + + input[1] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x, pos_in.y - dilation)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + + input[2] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x + dilation, pos_in.y - dilation)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + + input[3] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + input[4] = select( + READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(pos_in.x, pos_in.y)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + input[5] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + input[6] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x - dilation, pos_in.y + dilation)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + 
in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + input[7] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x, pos_in.y + dilation)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + input[8] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x + dilation, pos_in.y + dilation)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + if (i == input_c - 1) { + int c_shr = input_tensor_c % 4; + if (c_shr == 1) { + for (int k = 0; k < 9; k++) { + input[k].y = (half)0.f; + input[k].z = (half)0.f; + input[k].w = (half)0.f; + } + } else if (c_shr == 2) { + for (int k = 0; k < 9; k++) { + input[k].z = (half)0.f; + input[k].w = (half)0.f; + } + } else if (c_shr == 3) { + for (int k = 0; k < 9; k++) { + input[k].w = (half)0.f; + } + } else if (c_shr == 0) { } - } else { // group != 1 - for (int i = 0; i < 4; i++) { - int used_input_channel_num = + } + + int j = 0; + int2 pos_of_weight; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + CL_DTYPE4 weight_x = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y += 3; + CL_DTYPE4 weight_y = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y += 3; + CL_DTYPE4 weight_z = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y += 3; + CL_DTYPE4 
weight_w = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 1; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 2; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 3; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = 
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 4; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 5; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 6; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, 
filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 7; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 8; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + } + } else { // group != 1 + for (int i = 0; i < 4; i++) { + int used_input_channel_num = (out_c * 4 + i) / (output_c / group) * filter_channel; - for (int f_c = 0; 
f_c < filter_channel; ++f_c) { - int input_c = used_input_channel_num + f_c; - int input_block = input_c / 4; - int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x, - in_pos_in_one_block.y); - input[0] = select( - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + for (int f_c = 0; f_c < filter_channel; ++f_c) { + int input_c = used_input_channel_num + f_c; + int input_block = input_c / 4; + int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x, + in_pos_in_one_block.y); + input[0] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, (int2)(pos_in.x - dilation, pos_in.y - dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || - in_pos_in_one_block.y - dilation < 0 || - in_pos_in_one_block.x - dilation >= input_width || - in_pos_in_one_block.y - dilation >= input_height) - << 15)); - input[1] = - select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + input[1] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, (int2)(pos_in.x, pos_in.y - dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || - in_pos_in_one_block.y - dilation < 0 || - in_pos_in_one_block.x >= input_width || - in_pos_in_one_block.y - dilation >= input_height) - << 15)); - input[2] = select( - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + input[2] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, (int2)(pos_in.x + dilation, pos_in.y - dilation)), - (CL_DTYPE4)(0.0f), - 
(ushort4)((in_pos_in_one_block.x + dilation < 0 || - in_pos_in_one_block.y - dilation < 0 || - in_pos_in_one_block.x + dilation >= input_width || - in_pos_in_one_block.y - dilation >= input_height) - << 15)); - input[3] = select( - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || - in_pos_in_one_block.y < 0 || - in_pos_in_one_block.x - dilation >= input_width || - in_pos_in_one_block.y >= input_height) - << 15)); - input[4] = select( - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(pos_in.x, pos_in.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || - in_pos_in_one_block.x >= input_width || - in_pos_in_one_block.y >= input_height) - << 15)); - input[5] = - select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y)), - (CL_DTYPE4)(0.0f), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + input[3] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + input[4] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x, pos_in.y)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + input[5] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + 
(CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - input[6] = select( - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + input[6] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, (int2)(pos_in.x - dilation, pos_in.y + dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || - in_pos_in_one_block.y + dilation < 0 || - in_pos_in_one_block.x - dilation >= input_width || - in_pos_in_one_block.y + dilation >= input_height) - << 15)); - input[7] = - select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + input[7] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, (int2)(pos_in.x, pos_in.y + dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || - in_pos_in_one_block.y + dilation < 0 || - in_pos_in_one_block.x >= input_width || - in_pos_in_one_block.y + dilation >= input_height) - << 15)); - input[8] = select( - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + input[8] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, (int2)(pos_in.x + dilation, pos_in.y + dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || - in_pos_in_one_block.y + dilation < 0 || - in_pos_in_one_block.x + dilation >= input_width || - in_pos_in_one_block.y + dilation >= input_height) - << 15)); - - CL_DTYPE tmp_out = 0; - for 
(int j = 0; j < 9; j++) { - int2 pos_of_weight; - pos_of_weight.x = (f_c / 4) * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3; - CL_DTYPE4 weight = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - - int f_c_offset = f_c % 4; - CL_DTYPE f_value; - if (f_c_offset == 0) { - f_value = weight.x; - } else if (f_c_offset == 1) { - f_value = weight.y; - } else if (f_c_offset == 2) { - f_value = weight.z; - } else if (f_c_offset == 3) { - f_value = weight.w; - } - - int input_c_offset = input_c % 4; - CL_DTYPE input_value; - if (input_c_offset == 0) { - input_value = input[j].x; - } else if (input_c_offset == 1) { - input_value = input[j].y; - } else if (input_c_offset == 2) { - input_value = input[j].z; - } else if (input_c_offset == 3) { - input_value = input[j].w; - } - tmp_out += f_value * input_value; + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + CL_DTYPE tmp_out = 0; + for (int j = 0; j < 9; j++) { + int2 pos_of_weight; + pos_of_weight.x = (f_c / 4) * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3; + CL_DTYPE4 weight = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + + int f_c_offset = f_c % 4; + CL_DTYPE f_value; + if (f_c_offset == 0) { + f_value = weight.x; + } else if (f_c_offset == 1) { + f_value = weight.y; + } else if (f_c_offset == 2) { + f_value = weight.z; + } else if (f_c_offset == 3) { + f_value = weight.w; } - if (i == 0) { - output.x += tmp_out; - } else if (i == 1) { - output.y += tmp_out; - } else if (i == 2) { - output.z += tmp_out; - } else if (i == 3) { - output.w += tmp_out; + int input_c_offset = input_c % 4; + CL_DTYPE input_value; + if (input_c_offset == 0) { + input_value = input[j].x; + } else if (input_c_offset == 1) { + input_value = input[j].y; + } else if (input_c_offset == 
2) { + input_value = input[j].z; + } else if (input_c_offset == 3) { + input_value = input[j].w; } + tmp_out += f_value * input_value; + } + + if (i == 0) { + output.x += tmp_out; + } else if (i == 1) { + output.y += tmp_out; + } else if (i == 2) { + output.z += tmp_out; + } else if (i == 3) { + output.w += tmp_out; } } } + } - output = activation_type4(output); + output = activation_type4(output); - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); } diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl index 6ab2b59343f09c1284ec21a7913f67c26707301c..5626fe6be7d451d4ffe22a2008affa7d82298bc3 100755 --- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl @@ -12,288 +12,375 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include -__kernel void depth_conv2d_3x3(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input, - __read_only image2d_t filter, +__kernel void depth_conv2d_3x3( + __private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int dilation, - __private const int input_c, - __private const int input_width,/* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, - __private const int output_height) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int dilation, + __private const int input_c, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height) { - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - const int batch_index = out_nh / output_height; + const int batch_index = out_nh / output_height; - const int out_nh_in_one_batch = out_nh % output_height; + const int 
out_nh_in_one_batch = out_nh % output_height; + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); - int2 stride_xy = (int2)(stride, stride); - int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); - - int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); + int2 in_pos_in_one_block = + ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); #ifdef BIASE_CH - CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); + CL_DTYPE4 output = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); + CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); #else - CL_DTYPE4 output = 0.0f; + CL_DTYPE4 output = 0.0f; #endif - const int filter_width = 3; - const int filter_height = 3; - - int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height); - - int2 pos_in_filter_block = (int2)(out_c * filter_width, batch_index * filter_height); - - int filter_x = pos_in_filter_block.x ; - int filter_y = pos_in_filter_block.y ; - - CL_DTYPE4 inputs[9]; - - inputs[0] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); - - inputs[1] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); - - inputs[2] = 
select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); - - inputs[3] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - /* - if (output_pos.x == 112 && output_pos.y == 0) { - CL_DTYPE4 input1 = inputs[3]; - float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); - printf(" input4 3 - %v4hlf \n", in); - printf(" --- %d ---\n", in_pos_in_one_block.x - 1); - } - */ - - - inputs[4] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - inputs[5] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - inputs[6] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= 
input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); - - inputs[7] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); - - inputs[8] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); - - CL_DTYPE4 filters[9]; - filters[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y)); - filters[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y)); - filters[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y)); - filters[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 1)); - filters[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 1)); - filters[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 1)); - filters[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 2)); - filters[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 2)); - filters[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 2)); - - for(int i = 0 ;i < 9 ; i++){ - output += inputs[i] * filters[i]; - } - - output = activation_type4(output); - - - /* - - if (output_pos.x == 112 && output_pos.y == 0) { - - for (int i = 0; i < 9; ++i) { - CL_DTYPE4 input1 = inputs[i]; - float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); - printf(" input4 %d - %v4hlf 
\n", i, in); - } - - float4 out = (float4)(output.x, output.y, output.z, output.w); - printf(" depth wise output output4 = %v4hlf \n", out); - printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x); - printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y); - printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x); - printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y); - } - - */ - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); - + const int filter_width = 3; + const int filter_height = 3; + + int2 pos_in_input_block = + (int2)(out_c * input_width, batch_index * input_height); + + int2 pos_in_filter_block = + (int2)(out_c * filter_width, batch_index * filter_height); + + int filter_x = pos_in_filter_block.x; + int filter_y = pos_in_filter_block.y; + + CL_DTYPE4 inputs[9]; + + inputs[0] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, + pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || + in_pos_in_one_block.y - 1 < 0 || + in_pos_in_one_block.x - 1 >= input_width || + in_pos_in_one_block.y - 1 >= input_height) + << 15)); + + inputs[1] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x, + pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y - 1 >= input_height) + << 15)); + + inputs[2] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, + pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || + in_pos_in_one_block.y - 1 < 0 || + in_pos_in_one_block.x + 1 >= input_width || + in_pos_in_one_block.y - 1 >= input_height) + << 15)); + 
+ inputs[3] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, + pos_in_input_block.y + in_pos_in_one_block.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x - 1 >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + /* + if (output_pos.x == 112 && output_pos.y == 0) { + CL_DTYPE4 input1 = inputs[3]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 3 - %v4hlf \n", in); + printf(" --- %d ---\n", in_pos_in_one_block.x - 1); + } + */ + + inputs[4] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x, + pos_in_input_block.y + in_pos_in_one_block.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + inputs[5] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, + pos_in_input_block.y + in_pos_in_one_block.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x + 1 >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + inputs[6] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, + pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || + in_pos_in_one_block.y + 1 < 0 || + in_pos_in_one_block.x - 1 >= input_width || + in_pos_in_one_block.y + 1 >= input_height) + << 15)); + + inputs[7] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x, + pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (CL_DTYPE4)(0.0f), + 
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y + 1 >= input_height) + << 15)); + + inputs[8] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, + pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || + in_pos_in_one_block.y + 1 < 0 || + in_pos_in_one_block.x + 1 >= input_width || + in_pos_in_one_block.y + 1 >= input_height) + << 15)); + + CL_DTYPE4 filters[9]; + filters[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y)); + filters[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y)); + filters[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y)); + filters[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 1)); + filters[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 1)); + filters[5] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 1)); + filters[6] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 2)); + filters[7] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 2)); + filters[8] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 2)); + + for (int i = 0; i < 9; i++) { + output += inputs[i] * filters[i]; + } + + output = activation_type4(output); + + /* + + if (output_pos.x == 112 && output_pos.y == 0) { + + for (int i = 0; i < 9; ++i) { + CL_DTYPE4 input1 = inputs[i]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 %d - %v4hlf \n", i, in); + } + + float4 out = (float4)(output.x, output.y, output.z, output.w); + printf(" depth wise output output4 = %v4hlf \n", out); + printf(" pos_in_input_block -x %d \n ", 
pos_in_input_block.x); + printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y); + printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x); + printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y); + } + + */ + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); } - - __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, - __private const int ou_w_blk, - __private const int ou_nh, - __read_only image2d_t input, - __read_only image2d_t filter, + __private const int ou_w_blk, + __private const int ou_nh, + __read_only image2d_t input, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif - __write_only image2d_t output_image, - __private const int stride, - __private const int pad, - __private const int dilation, - __private const int in_ch, - __private const int in_w,/* of one block */ - __private const int in_h, /* of one block */ - __private const int ou_w, - __private const int ou_h) { - - const int ou_ch_blk_id = get_global_id(0); - const int ou_w_blk_id = get_global_id(1); - const int ou_nh_id = get_global_id(2); - const int w_blk_size = 2; - - const int batch_id = ou_nh_id / ou_h; - int ou_col_id = ou_w_blk_id * w_blk_size; - int ou_row_id = ou_nh_id % ou_h; - int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id); - - // input pos in one block and on batch - int col_id = ou_col_id - pad; - int row_id = ou_row_id - pad; - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int in_ch, + __private const int in_w, /* of one block */ + __private const int in_h, /* of one block */ + __private const int ou_w, + __private const int ou_h) { + + const int ou_ch_blk_id = get_global_id(0); + const int ou_w_blk_id = get_global_id(1); + const int 
ou_nh_id = get_global_id(2); + const int w_blk_size = 2; + + const int batch_id = ou_nh_id / ou_h; + int ou_col_id = ou_w_blk_id * w_blk_size; + int ou_row_id = ou_nh_id % ou_h; + int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id); + + // input pos in one block and on batch + int col_id = ou_col_id - pad; + int row_id = ou_row_id - pad; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; #ifdef BIASE_CH - CL_DTYPE4 output[2]; - output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_ch_blk_id, 0)); - output[1] = output[0]; + CL_DTYPE4 output[2]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_ch_blk_id, 0)); + output[1] = output[0]; #elif defined(BIASE_ELE) - CL_DTYPE4 output[2]; - output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x, ou_nh_id)); - if (ou_col_id + 1 < ou_w) { - output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x + 1, ou_nh_id)); - } + CL_DTYPE4 output[2]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x, ou_nh_id)); + if (ou_col_id + 1 < ou_w) { + output[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x + 1, ou_nh_id)); + } #else - CL_DTYPE4 output[2] = {0.0f}; + CL_DTYPE4 output[2] = {0.0f}; #endif - CL_DTYPE4 inputs[12]; - - int filter_x = ou_ch_blk_id * 3; - int filter_y = 0; - CL_DTYPE4 filters[9]; - filters[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y)); - filters[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y)); - filters[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y)); - - int in_x = mad24(ou_ch_blk_id, in_w, col_id); - int in_y = mad24(batch_id, in_h, row_id); - - int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h); - int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w); - inputs[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y0)); - int x1 = select(in_x + 1, -1, col_id + 1 < 0 || 
col_id + 1 >= in_w); - inputs[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y0)); - int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w); - inputs[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y0)); - int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w); - inputs[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y0)); - - output[0] = mad(inputs[0], filters[0], output[0]); - output[1] = mad(inputs[1], filters[0], output[1]); - - output[0] = mad(inputs[1], filters[1], output[0]); - output[1] = mad(inputs[2], filters[1], output[1]); - - output[0] = mad(inputs[2], filters[2], output[0]); - output[1] = mad(inputs[3], filters[2], output[1]); - - - filters[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 1)); - filters[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 1)); - filters[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 1)); - - - int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h); - inputs[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y1)); - inputs[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y1)); - inputs[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y1)); - inputs[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y1)); - - - output[0] = mad(inputs[4], filters[3], output[0]); - output[1] = mad(inputs[5], filters[3], output[1]); - - output[0] = mad(inputs[5], filters[4], output[0]); - output[1] = mad(inputs[6], filters[4], output[1]); - - output[0] = mad(inputs[6], filters[5], output[0]); - output[1] = mad(inputs[7], filters[5], output[1]); - - - filters[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 2)); - filters[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 2)); - filters[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 2)); - - 
int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h); - inputs[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y2)); - inputs[9] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y2)); - inputs[10] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y2)); - inputs[11] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y2)); - - - output[0] = mad(inputs[8], filters[6], output[0]); - output[1] = mad(inputs[9], filters[6], output[1]); - - output[0] = mad(inputs[9], filters[7], output[0]); - output[1] = mad(inputs[10], filters[7], output[1]); - - output[0] = mad(inputs[10], filters[8], output[0]); - output[1] = mad(inputs[11], filters[8], output[1]); - - output[0] = activation_type4(output[0]); - output[1] = activation_type4(output[1]); - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x, ou_nh_id), output[0]); - if (ou_col_id + 1 < ou_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x + 1, ou_nh_id), output[1]); - } - + CL_DTYPE4 inputs[12]; + + int filter_x = ou_ch_blk_id * 3; + int filter_y = 0; + CL_DTYPE4 filters[9]; + filters[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y)); + filters[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y)); + filters[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y)); + + int in_x = mad24(ou_ch_blk_id, in_w, col_id); + int in_y = mad24(batch_id, in_h, row_id); + + int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h); + int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w); + inputs[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y0)); + int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w); + inputs[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y0)); + int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w); + inputs[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y0)); + int x3 = select(in_x + 
3, -1, col_id + 3 < 0 || col_id + 3 >= in_w); + inputs[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y0)); + + output[0] = mad(inputs[0], filters[0], output[0]); + output[1] = mad(inputs[1], filters[0], output[1]); + + output[0] = mad(inputs[1], filters[1], output[0]); + output[1] = mad(inputs[2], filters[1], output[1]); + + output[0] = mad(inputs[2], filters[2], output[0]); + output[1] = mad(inputs[3], filters[2], output[1]); + + filters[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 1)); + filters[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 1)); + filters[5] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 1)); + + int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h); + inputs[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y1)); + inputs[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y1)); + inputs[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y1)); + inputs[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y1)); + + output[0] = mad(inputs[4], filters[3], output[0]); + output[1] = mad(inputs[5], filters[3], output[1]); + + output[0] = mad(inputs[5], filters[4], output[0]); + output[1] = mad(inputs[6], filters[4], output[1]); + + output[0] = mad(inputs[6], filters[5], output[0]); + output[1] = mad(inputs[7], filters[5], output[1]); + + filters[6] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 2)); + filters[7] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 2)); + filters[8] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 2)); + + int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h); + inputs[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y2)); + inputs[9] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y2)); + inputs[10] = 
READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y2)); + inputs[11] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y2)); + + output[0] = mad(inputs[8], filters[6], output[0]); + output[1] = mad(inputs[9], filters[6], output[1]); + + output[0] = mad(inputs[9], filters[7], output[0]); + output[1] = mad(inputs[10], filters[7], output[1]); + + output[0] = mad(inputs[10], filters[8], output[0]); + output[1] = mad(inputs[11], filters[8], output[1]); + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + + WRITE_IMG_TYPE( + CL_DTYPE_CHAR, output_image, (int2)(ou_x, ou_nh_id), output[0]); + if (ou_col_id + 1 < ou_w) { + WRITE_IMG_TYPE( + CL_DTYPE_CHAR, output_image, (int2)(ou_x + 1, ou_nh_id), output[1]); + } } - diff --git a/lite/backends/opencl/cl_kernel/image/layout_kernel.cl b/lite/backends/opencl/cl_kernel/image/layout_kernel.cl index 6c419fe3c134614d28b3bcee3eabac5e8f7bdf6e..4c90981eb97f864b2c7ffa3b01e61b23aa4444de 100644 --- a/lite/backends/opencl/cl_kernel/image/layout_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/layout_kernel.cl @@ -18,7 +18,7 @@ limitations under the License. 
*/ //////////////////////////////////////////////////////// // buffer -> image2d //////////////////////////////////////////////////////// -__kernel void buffer_to_image2d(__global CL_DTYPE *in, +__kernel void buffer_to_image2d(__global CL_DTYPE* in, __write_only image2d_t output_image, __private const int out_H, __private const int out_W, @@ -26,7 +26,6 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, __private const int Stride0, __private const int Stride1, __private const int Stride2) { - const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -66,16 +65,25 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, #ifdef DEBUG if (out_w > 2045) { - printf("out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f %.2f\n", - out_w, - out_C - 4 * out_c, - (float)(in[input_pos0]), - (float)(in[input_pos1]), - (float)(in[input_pos2]), - (float)(in[input_pos3])); - printf("buffer2image ===> %d,%d,%d, out(%d,%d): %.2f %.2f %.2f %.2f \n", out_c, out_w, out_nh, - output_pos.x, output_pos.y, - (float)(output.x), (float)(output.y), (float)(output.z), (float)(output.w)); + printf( + "out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f " + "%.2f\n", + out_w, + out_C - 4 * out_c, + (float)(in[input_pos0]), + (float)(in[input_pos1]), + (float)(in[input_pos2]), + (float)(in[input_pos3])); + printf("buffer2image ===> %d,%d,%d, out(%d,%d): %.2f %.2f %.2f %.2f \n", + out_c, + out_w, + out_nh, + output_pos.x, + output_pos.y, + (float)(output.x), + (float)(output.y), + (float)(output.z), + (float)(output.w)); } #endif @@ -101,34 +109,42 @@ __kernel void image2d_to_buffer(__read_only image2d_t input, const int in_h = in_nh % in_height; const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; const int pos_x = mad24(in_c, in_width, in_w); - CL_COMPUTE_DTYPE4 in = 
READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)); + CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE( + CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)); #ifdef DEBUG if (in_w > 2045) { - printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n", in_c, in_w, in_nh, - pos_x, in_nh, - (float)(in.x), (float)(in.y), (float)(in.z), (float)(in.w)); + printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n", + in_c, + in_w, + in_nh, + pos_x, + in_nh, + (float)(in.x), + (float)(in.y), + (float)(in.z), + (float)(in.w)); } #endif - const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; + const int index = + in_n * size_batch + in_c * size_block + in_h * in_width + in_w; out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE); if (C - 4 * in_c >= 2) { out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE); } - if(C - 4 * in_c >= 3) { + if (C - 4 * in_c >= 3) { out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE); } - if(C - 4 * in_c >= 4) { + if (C - 4 * in_c >= 4) { out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE); } } - -#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile +#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile //////////////////////////////////////////////////////// // buffer -> image2d_nw //////////////////////////////////////////////////////// @@ -182,8 +198,7 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in, } #endif - -#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile +#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile // image2d -> buffer __kernel void image2d_to_buffer_2d(__private const int in_height, __private const int in_width, @@ -208,15 +223,14 @@ __kernel void image2d_to_buffer_2d(__private const int in_height, //////////////////////////////////////////////////////// // buffer -> image2d (divide by 255 to normalize) //////////////////////////////////////////////////////// -__kernel void buffer_to_image2d_with_pre255(__global uchar 
*in, +__kernel void buffer_to_image2d_with_pre255(__global uchar* in, __write_only image2d_t output_image, __private const int out_H, __private const int out_W, __private const int out_C, __private const int Stride0, __private const int Stride1, - __private const int Stride2){ - + __private const int Stride2) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -231,7 +245,6 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in, const int in_h = out_h; const int in_w = out_w; - int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w; int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w; int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w; @@ -243,30 +256,29 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in, CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)0.0f; output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE) / 255; - if(out_C - 4 * out_c>=2){ - output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE) / 255; + if (out_C - 4 * out_c >= 2) { + output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE) / 255; } - if(out_C - 4 * out_c>=3){ - output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE) / 255; + if (out_C - 4 * out_c >= 3) { + output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE) / 255; } - if(out_C - 4 * out_c>=4){ - output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE) / 255; + if (out_C - 4 * out_c >= 4) { + output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE) / 255; } WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output); } - //////////////////////////////////////////////////////// // image2d -> buffer (multiply by 255 to de-normalize) //////////////////////////////////////////////////////// __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input, - __private const int in_width, - __private const int in_height, - __global uchar* 
out, - __private const int size_ch, - __private const int size_block, - __private const int size_batch, - __private const int C) { + __private const int in_width, + __private const int in_height, + __global uchar* out, + __private const int size_ch, + __private const int size_block, + __private const int size_batch, + __private const int C) { const int in_c = get_global_id(0); const int in_w = get_global_id(1); const int in_nh = get_global_id(2); @@ -277,22 +289,34 @@ __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input, CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; const int pos_x = mad24(in_c, in_width, in_w); - CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)) * 255; + CL_COMPUTE_DTYPE4 in = + READ_IMG_TYPE( + CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)) * + 255; #ifdef DEBUG printf("in_c:%d, in_w:%d, in_nh:%d ===> in(%d,%d): %.2f %.2f %.2f %.2f\n", - in_c, in_w, in_nh, pos_x, in_nh, in.x, in.y, in.z, in.w); + in_c, + in_w, + in_nh, + pos_x, + in_nh, + in.x, + in.y, + in.z, + in.w); #endif - const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; + const int index = + in_n * size_batch + in_c * size_block + in_h * in_width + in_w; out[index] = convert_uchar_sat(in.x); - if(C - 4 * in_c>=2){ + if (C - 4 * in_c >= 2) { out[index + size_ch] = convert_uchar_sat(in.y); } - if(C - 4 * in_c>=3){ + if (C - 4 * in_c >= 3) { out[index + size_ch * 2] = convert_uchar_sat(in.z); } - if(C - 4 * in_c>=4){ + if (C - 4 * in_c >= 4) { out[index + size_ch * 3] = convert_uchar_sat(in.w); } } diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc index d5b2d70b09a84cb405c0e7c8f2b55f4254eb7f64..c074768a64671076c364f528f62a54bcc104c90e 100644 --- a/lite/backends/opencl/cl_runtime.cc +++ b/lite/backends/opencl/cl_runtime.cc @@ -45,6 +45,9 @@ bool CLRuntime::Init() { bool is_device_init = InitializeDevice(); is_init_success_ = 
is_platform_init && is_device_init; initialized_ = true; + + context_ = CreateContext(); + command_queue_ = CreateCommandQueue(context()); return initialized_; } @@ -55,7 +58,7 @@ cl::Platform& CLRuntime::platform() { cl::Context& CLRuntime::context() { if (context_ == nullptr) { - context_ = CreateContext(); + LOG(FATAL) << "context_ create failed. "; } return *context_; } @@ -67,7 +70,7 @@ cl::Device& CLRuntime::device() { cl::CommandQueue& CLRuntime::command_queue() { if (command_queue_ == nullptr) { - command_queue_ = CreateCommandQueue(context()); + LOG(FATAL) << "command_queue_ create failed. "; } return *command_queue_; } @@ -96,7 +99,7 @@ std::unique_ptr CLRuntime::CreateEvent( bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) { /* -I +CLRuntime::Global()->cl_path() + "/cl_kernel"*/ - std::string build_option = options + " -cl-fast-relaxed-math "; + std::string build_option = options + " -cl-fast-relaxed-math -cl-mad-enable"; VLOG(4) << "OpenCL build_option: " << build_option; status_ = program->build({*device_}, build_option.c_str()); CL_CHECK_ERROR(status_); diff --git a/lite/backends/opencl/target_wrapper.cc b/lite/backends/opencl/target_wrapper.cc index 9cf07dfc0c474b0b5c57b8355c099eba15610a91..950f2fc442bdbbbb843ea6b15f0c2eac23c2e690 100644 --- a/lite/backends/opencl/target_wrapper.cc +++ b/lite/backends/opencl/target_wrapper.cc @@ -66,7 +66,8 @@ void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, cl_int status; cl::Image2D *cl_image = new cl::Image2D(CLRuntime::Global()->context(), - CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), + CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR + : CL_MEM_ALLOC_HOST_PTR), img_format, cl_image2d_width, cl_image2d_height, @@ -89,7 +90,8 @@ void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, cl_int status; cl::Image2D *cl_image = new cl::Image2D(CLRuntime::Global()->context(), - CL_MEM_READ_WRITE | (host_ptr ? 
CL_MEM_COPY_HOST_PTR : 0), + CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR + : CL_MEM_ALLOC_HOST_PTR), img_format, cl_image2d_width, cl_image2d_height, @@ -112,7 +114,8 @@ void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, cl_int status; cl::Image2D *cl_image = new cl::Image2D(CLRuntime::Global()->context(), - CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), + CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR + : CL_MEM_ALLOC_HOST_PTR), img_format, cl_image2d_width, cl_image2d_height, @@ -192,7 +195,6 @@ void TargetWrapperCL::MemcpySync(void *dst, size_t size, IoDirection dir) { cl_int status; - cl::Event event; auto stream = CLRuntime::Global()->command_queue(); switch (dir) { case IoDirection::DtoD: @@ -202,9 +204,9 @@ void TargetWrapperCL::MemcpySync(void *dst, 0, size, nullptr, - &event); + nullptr); CL_CHECK_FATAL(status); - event.wait(); + CLRuntime::Global()->command_queue().finish(); break; case IoDirection::HtoD: status = stream.enqueueWriteBuffer(*static_cast(dst), @@ -283,7 +285,6 @@ void TargetWrapperCL::ImgcpySync(void *dst, cl::array origin = {0, 0, 0}; cl::array region = {cl_image2d_width, cl_image2d_height, 1}; cl_int status; - cl::Event event; auto stream = CLRuntime::Global()->command_queue(); switch (dir) { case IoDirection::DtoD: @@ -293,9 +294,9 @@ void TargetWrapperCL::ImgcpySync(void *dst, origin, region, nullptr, - &event); + nullptr); CL_CHECK_FATAL(status); - event.wait(); + CLRuntime::Global()->command_queue().finish(); break; case IoDirection::HtoD: status = stream.enqueueWriteImage(*static_cast(dst), diff --git a/lite/backends/x86/math/math_function.cc b/lite/backends/x86/math/math_function.cc index 05a10b5a19fbc8e80ee6dd07e67154d9cf6d1b22..cb1781db2199c1b7a12aaec80b1904f65b23b534 100644 --- a/lite/backends/x86/math/math_function.cc +++ b/lite/backends/x86/math/math_function.cc @@ -129,8 +129,7 @@ struct RowwiseAdd { T* output_data = output->template mutable_data(); for (int64_t i = 0; i < in_dims[0]; 
++i) { for (int64_t j = 0; j < size; ++j) { - output_data[i * in_dims[0] + j] = - input_data[i * in_dims[0] + j] + vector_data[j]; + output_data[i * size + j] = input_data[i * size + j] + vector_data[j]; } } } diff --git a/lite/backends/x86/math/selected_rows_functor.cc b/lite/backends/x86/math/selected_rows_functor.cc index acb377e31ccac96547fc4f0644332cfad36d66bc..fe7a46f9f04d49ea7b505b8e2ece6b4bdd0ec826 100644 --- a/lite/backends/x86/math/selected_rows_functor.cc +++ b/lite/backends/x86/math/selected_rows_functor.cc @@ -279,7 +279,7 @@ struct MergeAdd { } } if (has_value_input == nullptr) { - VLOG(3) << "no input has value! just return" << std::endl; + VLOG(3) << "no input has value! just return"; return; } auto input_width = has_value_input->value().dims()[1]; diff --git a/lite/core/context.cc b/lite/core/context.cc index be41aa6eb0cb986760f38eaa2bb5b7e017cc4edb..711c67f8b7f36edcd2d66569d964296d96e8d85c 100644 --- a/lite/core/context.cc +++ b/lite/core/context.cc @@ -19,6 +19,7 @@ namespace lite { #ifdef LITE_WITH_XPU thread_local xdnn::Context* Context::_tls_raw_ctx{nullptr}; +int Context::_workspace_l3_size_per_thread{0}; #endif } // namespace lite diff --git a/lite/core/context.h b/lite/core/context.h index bacb570a903d807945cb9e2a8b98615fcaba9384..d0c1bd93cc7b93628aedc5f549c84d19c44f4f71 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -151,14 +151,23 @@ class Context { if (_tls_raw_ctx == nullptr) { _tls_raw_ctx = xdnn::create_context(); CHECK(_tls_raw_ctx); + int r = xdnn::set_workspace_l3_size(_tls_raw_ctx, + _workspace_l3_size_per_thread); + if (r != 0) { + LOG(WARNING) << "xdnn::set_workspace_l3_size() failed, r = " << r + << ", _workspace_l3_size_per_thread = " + << _workspace_l3_size_per_thread; + } } return _tls_raw_ctx; } static void SetWorkspaceL3Size(int l3_size = 0xfffc00) { - xdnn::set_workspace_l3_size(GetRawContext(), l3_size); + _workspace_l3_size_per_thread = l3_size; } + // **DEPRECATED**, use xpu_set_device() at the very 
beginning of each worker + // thread static void SetDev(int dev_no = 0) { const char* dev_env = getenv("LITE_XPU_DEV"); if (dev_env) { @@ -173,6 +182,7 @@ class Context { private: static thread_local xdnn::Context* _tls_raw_ctx; + static int _workspace_l3_size_per_thread; }; #endif @@ -340,27 +350,17 @@ class Context { template <> class Context { std::shared_ptr cl_context_; - using WaitListType = - std::unordered_map(nullptr)), - std::shared_ptr>; - std::shared_ptr cl_wait_list_; public: CLContext* cl_context() { return cl_context_.get(); } - WaitListType* cl_wait_list() { return cl_wait_list_.get(); } void InitOnce() { // Init cl runtime. CHECK(CLRuntime::Global()->IsInitSuccess()) << "OpenCL runtime init failed"; - cl_context_ = std::make_shared(); - cl_wait_list_ = std::make_shared(); } - void CopySharedTo(OpenCLContext* ctx) { - ctx->cl_context_ = cl_context_; - ctx->cl_wait_list_ = cl_wait_list_; - } + void CopySharedTo(OpenCLContext* ctx) { ctx->cl_context_ = cl_context_; } }; #endif diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt index d036bf7988b98e64586e42683d33b4696e9ff706..b8234b18922f454c41e295209da13de024184adc 100644 --- a/lite/core/mir/CMakeLists.txt +++ b/lite/core/mir/CMakeLists.txt @@ -21,9 +21,13 @@ lite_cc_library(mir_passes fusion/elementwise_add_activation_fuse_pass.cc fusion/quant_dequant_fuse_pass.cc fusion/sequence_pool_concat_fuse_pass.cc + fusion/scale_activation_fuse_pass.cc fusion/__xpu__resnet_fuse_pass.cc fusion/__xpu__multi_encoder_fuse_pass.cc + fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc + fusion/__xpu__fc_fuse_pass.cc elimination/identity_scale_eliminate_pass.cc + elimination/identity_dropout_eliminate_pass.cc elimination/elementwise_mul_constant_eliminate_pass.cc static_kernel_pick_pass.cc variable_place_inference_pass.cc diff --git a/lite/core/mir/elimination/identity_dropout_eliminate_pass.cc b/lite/core/mir/elimination/identity_dropout_eliminate_pass.cc new file mode 100644 index 
0000000000000000000000000000000000000000..92401df875da1f500ec09b34b2786d15cea2991b --- /dev/null +++ b/lite/core/mir/elimination/identity_dropout_eliminate_pass.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/pass.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/pattern_matcher_high_api.h" + +namespace paddle { +namespace lite { +namespace mir { + +namespace { + +class Eliminator : public FuseBase { + public: + void BuildPattern() override { + // the previous op's output need updat + auto* pre_op = OpNode("preop")->assert_is_not_op_type("conditional_block"); + // TODO(Superjomn) check has only one output + auto* x = VarNode("x")->assert_is_op_input("dropout", "X"); + auto* dropout_op = OpNode("dropout", "dropout") + ->assert_op_attr("is_test", 1) + ->assert_op_attr( + "dropout_implementation", "upscale_in_train"); + auto* out = VarNode("out")->assert_is_op_output("dropout", "Out"); + auto* mask = VarNode("mask")->assert_is_op_output("dropout", "Mask"); + + *pre_op >> *x >> *dropout_op >> *out; + *dropout_op >> *mask; + + // The pre_op will be eliminated, and a new output-updated op will insert. 
+ x->AsIntermediate(); // x is pre_op's output, need to update + dropout_op->AsIntermediate(); + mask->AsIntermediate(); + } + + private: + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + auto& pre_op = matched.at("preop")->AsStmt(); + auto op_info = *pre_op.op_info(); + + op_info.UpdateAllOutputs(matched.at("x")->AsArg().name, + matched.at("out")->AsArg().name); + pre_op.ResetOp(op_info, graph->valid_places()); + + IR_NODE_LINK_TO(matched.at("preop"), matched.at("out")); + } +}; + +} // namespace + +class IdentityDropoutEliminatePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + Eliminator eliminator; + eliminator(graph.get()); + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(identity_dropout_eliminate_pass, + paddle::lite::mir::IdentityDropoutEliminatePass) + .BindTargets({TARGET(kXPU)}); diff --git a/lite/core/mir/fusion/CMakeLists.txt b/lite/core/mir/fusion/CMakeLists.txt index 04a36976c7110c64ef781af12fc86fd4853fe583..a7a4cee798c1e8ef5b9b8f8d9e8e5810554fc571 100644 --- a/lite/core/mir/fusion/CMakeLists.txt +++ b/lite/core/mir/fusion/CMakeLists.txt @@ -31,6 +31,9 @@ lite_cc_library(fuse_interpolate lite_cc_library(fuse_sequence_pool_concat SRCS sequence_pool_concat_fuser.cc DEPS pattern_matcher_high_api) +lite_cc_library(fuse_scale_activation + SRCS scale_activation_fuser.cc + DEPS pattern_matcher_high_api) set(mir_fusers fuse_fc @@ -44,6 +47,7 @@ set(mir_fusers fuse_transpose_softmax_transpose fuse_interpolate fuse_sequence_pool_concat + fuse_scale_activation CACHE INTERNAL "fusers") if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) diff --git a/lite/core/mir/fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc b/lite/core/mir/fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..1272ae4c63c2521bf738ca8623fcde2d40014dea --- /dev/null +++ 
b/lite/core/mir/fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc @@ -0,0 +1,166 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" +#include "lite/utils/string.h" + +namespace paddle { +namespace lite { +namespace mir { + +namespace fusion { + +class XPUEmbeddingWithEltwiseAddFuser : public FuseBase { + public: + explicit XPUEmbeddingWithEltwiseAddFuser(int n_embedding) + : n_embedding_(n_embedding) {} + + void BuildPattern() override { + auto* ids0 = + VarNode("ids0")->assert_is_op_input("lookup_table", "Ids")->AsInput(); + auto* table0 = + VarNode("table0")->assert_is_op_input("lookup_table", "W")->AsInput(); + auto* embedding0 = OpNode("embedding0", "lookup_table"); + auto* embedding_out0 = VarNode("embedding_out0") + ->assert_is_op_output("lookup_table", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + + auto* ids1 = + VarNode("ids1")->assert_is_op_input("lookup_table", "Ids")->AsInput(); + auto* table1 = + VarNode("table1")->assert_is_op_input("lookup_table", "W")->AsInput(); + auto* embedding1 = OpNode("embedding1", "lookup_table")->AsIntermediate(); + auto* embedding_out1 = VarNode("embedding_out1") + ->assert_is_op_output("lookup_table", "Out") + ->assert_is_op_input("elementwise_add", "Y") + ->AsIntermediate(); + + auto* ewadd01 = 
OpNode("ewadd01", "elementwise_add")->AsIntermediate(); + auto* ewadd01_out = VarNode("ewadd01_out") + ->assert_is_op_output("elementwise_add", "Out") + ->AsIntermediate(); + + embedding0->LinksFrom({ids0, table0}); + embedding0->LinksTo({embedding_out0}); + embedding1->LinksFrom({ids1, table1}); + embedding1->LinksTo({embedding_out1}); + ewadd01->LinksFrom({embedding_out0, embedding_out1}); + ewadd01->LinksTo({ewadd01_out}); + + auto* last_ewadd_out = ewadd01_out; + for (int i = 2; i < n_embedding_; ++i) { + auto ids_name = paddle::lite::string_format("ids%d", i); + auto table_name = paddle::lite::string_format("table%d", i); + auto embedding_name = paddle::lite::string_format("embedding%d", i); + auto embedding_out_name = + paddle::lite::string_format("embedding_out%d", i); + + auto* new_ids = VarNode(ids_name) + ->assert_is_op_input("lookup_table", "Ids") + ->AsInput(); + auto* new_table = VarNode(table_name) + ->assert_is_op_input("lookup_table", "W") + ->AsInput(); + auto* new_embedding = + OpNode(embedding_name, "lookup_table")->AsIntermediate(); + auto* new_embedding_out = VarNode(embedding_out_name) + ->assert_is_op_output("lookup_table", "Out") + ->assert_is_op_input("elementwise_add", "Y") + ->AsIntermediate(); + + new_embedding->LinksFrom({new_ids, new_table}); + new_embedding->LinksTo({new_embedding_out}); + + auto ewadd_name = paddle::lite::string_format("ewadd%d%d", i - 1, i); + auto ewadd_out_name = ewadd_name + "_out"; + + auto* new_ewadd = OpNode(ewadd_name, "elementwise_add")->AsIntermediate(); + auto* new_ewadd_out = VarNode(ewadd_out_name) + ->assert_is_op_output("elementwise_add", "Out") + ->AsIntermediate(); + + new_ewadd->LinksFrom({last_ewadd_out, new_embedding_out}); + new_ewadd->LinksTo({new_ewadd_out}); + last_ewadd_out = new_ewadd_out; + } + last_ewadd_out->AsOutput(); + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__embedding_with_eltwise_add"); + 
std::vector ids_names; + std::vector table_names; + for (int i = 0; i < n_embedding_; ++i) { + auto ids_name = paddle::lite::string_format("ids%d", i); + ids_names.push_back(matched.at(ids_name)->arg()->name); + auto table_name = paddle::lite::string_format("table%d", i); + table_names.push_back(matched.at(table_name)->arg()->name); + } + op_desc.SetInput("Ids", ids_names); + op_desc.SetInput("Tables", table_names); + auto output_name = paddle::lite::string_format( + "ewadd%d%d_out", n_embedding_ - 2, n_embedding_ - 1); + op_desc.SetOutput("Output", {matched.at(output_name)->arg()->name}); + op_desc.SetAttr("n_embedding", n_embedding_); + auto* embedding0_op_info = matched.at("embedding0")->stmt()->op_info(); + op_desc.SetAttr( + "padding_idx", embedding0_op_info->GetAttr("padding_idx")); + + auto* new_stmt = matched.at("embedding0")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + for (int i = 0; i < n_embedding_; ++i) { + auto ids_name = paddle::lite::string_format("ids%d", i); + auto table_name = paddle::lite::string_format("table%d", i); + DirectedLink(matched.at(ids_name), matched.at("embedding0")); + DirectedLink(matched.at(table_name), matched.at("embedding0")); + } + IR_OP_VAR_LINK(matched.at("embedding0"), matched.at(output_name)); + } + + private: + int n_embedding_; +}; + +} // namespace fusion + +class XPUEmbeddingWithEltwiseAddFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return; + for (int n_embedding : {4, 3}) { + fusion::XPUEmbeddingWithEltwiseAddFuser fuser(n_embedding); + fuser(graph.get()); + } + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + 
+REGISTER_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass, + paddle::lite::mir::XPUEmbeddingWithEltwiseAddFusePass) + .BindTargets({TARGET(kXPU)}) + .BindKernel("lookup_table"); diff --git a/lite/core/mir/fusion/__xpu__fc_fuse_pass.cc b/lite/core/mir/fusion/__xpu__fc_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..1e6b28790e1c87f2e9e80acc99f3cf517621c477 --- /dev/null +++ b/lite/core/mir/fusion/__xpu__fc_fuse_pass.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/backends/xpu/math.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/pattern_matcher_high_api.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class XPUFcFuser : public FuseBase { + public: + explicit XPUFcFuser(bool with_relu) : with_relu_(with_relu) {} + + void BuildPattern() override { + // create nodes. + auto* x = VarNode("x")->assert_is_op_input("mul", "X"); + auto* W = VarNode("W")->assert_is_op_input("mul", "Y"); + auto* b = VarNode("b")->assert_is_persistable_var(); + auto* mul = OpNode("mul", "mul"); + auto* mul_out = VarNode("mul_out"); + auto* add = OpNode("add", "elementwise_add"); + auto* Out = VarNode("Out"); + + // create topology. + std::vector mul_inputs{W, x}; + std::vector add_inputs{mul_out, b}; + mul_inputs >> *mul >> *mul_out; + + // Some op specialities. 
+ mul_out->AsIntermediate(); + mul->AsIntermediate(); + add->AsIntermediate(); + + if (with_relu_) { + auto* add_out = VarNode("add_out"); + auto* relu = OpNode("relu", "relu"); + std::vector relu_inputs{add_out}; + add_inputs >> *add >> *add_out; + relu_inputs >> *relu >> *Out; + add_out->AsIntermediate(); + relu->AsIntermediate(); + } else { + add_inputs >> *add >> *Out; + } + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + auto mul = matched.at("mul")->stmt()->op(); + auto* scope = mul->scope(); + + // convert W from float to int16, and transpose W + auto weight_name = matched.at("W")->arg()->name; + auto* weight_t = scope->FindMutableTensor(weight_name); + auto weight_dims = weight_t->dims(); + int weight_len = weight_t->numel(); + float* weight_on_host = weight_t->mutable_data(); + float max_f = + paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len); + + std::unique_ptr weight_int16(new int16_t[weight_len]); + std::unique_ptr weight_trans_int16(new int16_t[weight_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + weight_on_host, weight_int16.get(), max_f, weight_len); + paddle::lite::xpu::math::Transpose(weight_int16.get(), + weight_trans_int16.get(), + weight_dims[0], + weight_dims[1]); + memcpy( + weight_on_host, weight_trans_int16.get(), weight_len * sizeof(int16_t)); + + auto op_desc = GenOpDesc(matched, max_f, true); + auto fc_op = LiteOpRegistry::Global().Create("__xpu__fc"); + auto& valid_places = mul->valid_places(); + fc_op->Attach(op_desc, scope); + + auto* new_op_node = graph->GraphCreateInstructNode(fc_op, valid_places); + + IR_NODE_LINK_TO(matched.at("W"), new_op_node); + IR_NODE_LINK_TO(matched.at("x"), new_op_node); + IR_NODE_LINK_TO(matched.at("b"), new_op_node); + IR_NODE_LINK_TO(new_op_node, matched.at("Out")); + } + + private: + cpp::OpDesc GenOpDesc(const key2nodes_t& matched, + float w_max, + bool transpose_w) { + cpp::OpDesc op_desc = *matched.at("mul")->stmt()->op_info(); + 
op_desc.mutable_inputs()->clear(); + op_desc.mutable_outputs()->clear(); + op_desc.SetType("__xpu__fc"); + op_desc.SetInput("Input", {matched.at("x")->arg()->name}); + op_desc.SetInput("W", {matched.at("W")->arg()->name}); + op_desc.SetInput("Bias", {matched.at("b")->arg()->name}); + op_desc.SetOutput("Out", {matched.at("Out")->arg()->name}); + op_desc.SetAttr( + "in_num_col_dims", + matched.at("mul")->stmt()->op_info()->GetAttr("x_num_col_dims")); + op_desc.SetAttr("w_max", w_max); + op_desc.SetAttr("transpose_w", transpose_w); + if (with_relu_) { + op_desc.SetAttr("activation_type", std::string{"relu"}); + } + return op_desc; + } + + bool with_relu_; +}; + +} // namespace fusion + +class XPUFcFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return; + + fusion::XPUFcFuser fuser(true /* with_relu */); + fuser(graph.get()); + + fusion::XPUFcFuser fuser2(false /* with_relu */); + fuser2(graph.get()); + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(__xpu__fc_fuse_pass, paddle::lite::mir::XPUFcFusePass) + .BindTargets({TARGET(kXPU)}) + .BindKernel("fc"); diff --git a/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc index 655274070f1ffcccf39b5f3ff6aaa705c5cbbfda..a6640f107f5dd46e6570a55cf59d2ad69a2bee1a 100644 --- a/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc +++ b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc @@ -16,6 +16,7 @@ #include #include "lite/backends/xpu/math.h" #include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/type_precision_cast_pass.h" // For UpdateInputs() #include "lite/core/mir/xpu_pattern_matcher_high_api.h" #include "lite/operators/subgraph_op.h" @@ -588,8 +589,7 @@ class XPUMultiEncoderFuser { multi_encoder_stmt->SetOp(multi_encoder_op); multi_encoder_stmt->SetKernels(std::move(kernels)); - // temp remove useless cast - 
std::unordered_set to_remove2; + // remove dangling/useless cast Node* stack = nullptr; for (auto* node : graph->StmtTopologicalOrder()) { CHECK(node->IsStmt()); @@ -597,16 +597,39 @@ class XPUMultiEncoderFuser { stack = node; } } - Node* stack_out = stack->outlinks.front(); - for (Node* cast : stack_out->outlinks) { - Node* cast_out = cast->outlinks.front(); - if (cast_out->outlinks.size() == 0) { - // remove - to_remove2.insert(cast_out); - to_remove2.insert(cast); + if (stack) { + std::unordered_set to_remove2; + Node* stack_out = stack->outlinks.front(); + // avoid modification while traversing + auto stack_out_outlinks = stack_out->outlinks; + for (Node* cast : stack_out_outlinks) { + if (cast->stmt()->op_info()->Type() != "cast") { + continue; + } + + Node* cast_out = cast->outlinks.front(); + if (cast_out->outlinks.size() == 0) { + // dangling cast + to_remove2.insert(cast); + to_remove2.insert(cast_out); + VLOG(3) << "Remove dangling cast [" << cast_out->arg()->name << "]"; + } else if (cast_out->outlinks.size() == 1) { + // useless cast + to_remove2.insert(cast); + to_remove2.insert(cast_out); + VLOG(3) << "Remove useless cast [" << cast_out->arg()->name << "]"; + + auto* multi_encoder = cast_out->outlinks.front(); + DirectedLink(stack_out, multi_encoder); + UpdateInputs(multi_encoder->stmt()->op().get(), + cast_out->arg()->name, + stack_out->arg()->name); + auto update_op_info = *multi_encoder->stmt()->op_info(); + multi_encoder->stmt()->ResetOp(update_op_info, graph->valid_places()); + } } + GraphSafeRemoveNodes(graph, to_remove2); } - GraphSafeRemoveNodes(graph, to_remove2); } }; diff --git a/lite/core/mir/fusion/conv_bn_fuser.cc b/lite/core/mir/fusion/conv_bn_fuser.cc index 143a7cecce8c1c45ada9ad31e8e4bea5447fec68..6718356788d46e24752204c3586cd8447cbbfaaa 100644 --- a/lite/core/mir/fusion/conv_bn_fuser.cc +++ b/lite/core/mir/fusion/conv_bn_fuser.cc @@ -103,9 +103,12 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { 
std::string conv_weight_name = matched.at("conv_weight")->arg()->name; auto conv_weight_t = scope->FindVar(conv_weight_name)->GetMutable(); + auto groups = conv_op_desc->GetAttr("groups"); + bool depthwise = false; if (conv_type_ == "conv2d_transpose") { + depthwise = (conv_weight_t->dims()[0] == conv_weight_t->dims()[1] * groups); CHECK_EQ(static_cast(bn_scale_t->data_size()), - static_cast(conv_weight_t->dims()[1])) + static_cast(conv_weight_t->dims()[1] * groups)) << "The BN bias's size should be equal to the size of the first " << "dim size of the conv weights"; } else { @@ -159,7 +162,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { // compute new conv_weight for int8 auto weight_scale = conv_op_desc->GetAttr>("weight_scale"); - if (conv_type_ == "conv2d_transpose") { + if (conv_type_ == "conv2d_transpose" && !depthwise) { int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; @@ -199,7 +202,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { } else { // compute new conv_weight auto conv_weight_d = conv_weight_t->mutable_data(); - if (conv_type_ == "conv2d_transpose") { + if (conv_type_ == "conv2d_transpose" && !depthwise) { int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; diff --git a/lite/core/mir/fusion/scale_activation_fuse_pass.cc b/lite/core/mir/fusion/scale_activation_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..2ad1f4994f6d5183d3b5c925bb222cb95ea064e8 --- /dev/null +++ b/lite/core/mir/fusion/scale_activation_fuse_pass.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/fusion/scale_activation_fuse_pass.h" +#include +#include +#include "lite/core/mir/fusion/scale_activation_fuser.h" +#include "lite/core/mir/pass_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +void ScaleActivationFusePass::Apply(const std::unique_ptr& graph) { + for (auto act_type : {"relu", "relu6", "leaky_relu"}) { + fusion::ScaleActivationFuser fuser(act_type); + fuser(graph.get()); + } +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(lite_scale_activation_fuse_pass, + paddle::lite::mir::ScaleActivationFusePass) + .BindTargets({TARGET(kARM)}) + .BindKernel("scale"); diff --git a/lite/core/mir/fusion/scale_activation_fuse_pass.h b/lite/core/mir/fusion/scale_activation_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..2118a0b6f396ff12855009a975059c95ee6111a8 --- /dev/null +++ b/lite/core/mir/fusion/scale_activation_fuse_pass.h @@ -0,0 +1,32 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "lite/core/mir/pass.h" + +namespace paddle { +namespace lite { +namespace mir { + +class ScaleActivationFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/fusion/scale_activation_fuser.cc b/lite/core/mir/fusion/scale_activation_fuser.cc new file mode 100644 index 0000000000000000000000000000000000000000..4f18099da8bc97d9dab8f9c31fd6c23d42d67d81 --- /dev/null +++ b/lite/core/mir/fusion/scale_activation_fuser.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/fusion/scale_activation_fuser.h" +#include +#include + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +void ScaleActivationFuser::BuildPattern() { + // create input nodes. 
+ auto* x = VarNode("x")->assert_is_op_input("scale", "X")->AsInput(); + + // create op nodes + auto* scale = + OpNode("scale", "scale")->assert_is_op("scale")->AsIntermediate(); + auto* act = + OpNode("act", act_type_)->assert_is_op(act_type_)->AsIntermediate(); + + // create intermediate nodes + auto* scale_out = VarNode("scale_out") + ->assert_is_op_output("scale", "Out") + ->assert_is_op_input(act_type_, "X") + ->AsIntermediate(); + + // create output node + auto* out = + VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput(); + // create topology. + *x >> *scale >> *scale_out; + *scale_out >> *act >> *out; +} + +void ScaleActivationFuser::InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched) { + auto op_desc = GenOpDesc(matched); + auto scale_op = LiteOpRegistry::Global().Create("scale"); + auto scale = matched.at("scale")->stmt()->op(); + auto* scope = scale->scope(); + auto& valid_places = scale->valid_places(); + scale_op->Attach(op_desc, scope); + + auto* new_op_node = graph->GraphCreateInstructNode(scale_op, valid_places); + + IR_NODE_LINK_TO(matched.at("x"), new_op_node); + IR_NODE_LINK_TO(new_op_node, matched.at("output")); +} + +cpp::OpDesc ScaleActivationFuser::GenOpDesc(const key2nodes_t& matched) { + cpp::OpDesc op_desc = *matched.at("scale")->stmt()->op_info(); + op_desc.SetOutput("Out", {matched.at("output")->arg()->name}); + cpp::OpDesc act_op_desc = *matched.at("act")->stmt()->op_info(); + + op_desc.SetAttr("activation_type", act_type_); + if (act_type_ == "relu") { + op_desc.SetAttr("fuse_relu", true); + } else if (act_type_ == "relu6") { + float alpha = act_op_desc.GetAttr("threshold"); + op_desc.SetAttr("alpha", alpha); + } else if (act_type_ == "leaky_relu") { + float alpha = act_op_desc.GetAttr("alpha"); + op_desc.SetAttr("alpha", alpha); + } + return op_desc; +} + +} // namespace fusion +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/fusion/scale_activation_fuser.h 
b/lite/core/mir/fusion/scale_activation_fuser.h new file mode 100644 index 0000000000000000000000000000000000000000..9fa9b9d2b5ebc5091b41a2ca244689797c97ccb6 --- /dev/null +++ b/lite/core/mir/fusion/scale_activation_fuser.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "lite/core/mir/pattern_matcher_high_api.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class ScaleActivationFuser : public FuseBase { + public: + explicit ScaleActivationFuser(const std::string& act_type) { + act_type_ = act_type; + } + void BuildPattern() override; + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; + + private: + cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; + std::string act_type_; +}; + +} // namespace fusion +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/op_lite.cc b/lite/core/op_lite.cc index 941a9e9f88cf04ef47487237b1a3f6509dea762b..de76f404f8a129eb94e645dc731a0d09c1ee3c77 100644 --- a/lite/core/op_lite.cc +++ b/lite/core/op_lite.cc @@ -25,16 +25,16 @@ namespace lite { bool OpLite::InferShape() { // if input_tensor_ptrs and output_tensor_ptrs are overloaded in param_ // InferShapeByMemoryInternal will be applied. 
- if (param_.input_tensor_ptrs() && param_.output_tensor_ptrs()) { + if (op_param_ && op_param_->input_tensor_ptrs() && + op_param_->output_tensor_ptrs()) { return this->InferShapeWithCache(); } else { - // otherwise, InferShapeImpl is applied directly. return this->InferShapeImpl(); } } bool OpLite::InferShapeWithCache() { // 1. Get vector of current input tensors - auto *current_inputs = param_.input_tensor_ptrs(); + auto *current_inputs = op_param_->input_tensor_ptrs(); // 2. Get hash value of current inputs shape and lod size_t new_hash = 0; for (auto iter = current_inputs->begin(); iter != current_inputs->end(); @@ -59,7 +59,7 @@ bool OpLite::InferShapeWithCache() { if (new_hash == io_shape_lod_hash_ && new_hash != 0) { // if current hash value is consistent with io_shape_lod_hash_, // previous outputs shape and lod are reused. - auto *current_outputs = param_.output_tensor_ptrs(); + auto *current_outputs = op_param_->output_tensor_ptrs(); for (size_t i = 0; i < current_outputs->size(); i++) { current_outputs->at(i)->Resize(last_output_shapes[i]); current_outputs->at(i)->set_lod(last_output_lods[i]); @@ -68,10 +68,12 @@ bool OpLite::InferShapeWithCache() { // otherwise, current hash value is changed, InferShapeImpl will apply. 
io_shape_lod_hash_ = new_hash; this->InferShapeImpl(); - auto *current_outputs = param_.output_tensor_ptrs(); + auto *current_outputs = op_param_->output_tensor_ptrs(); + last_output_shapes.clear(); + last_output_lods.clear(); for (size_t i = 0; i < current_outputs->size(); i++) { - last_output_shapes[i] = current_outputs->at(i)->dims(); - last_output_lods[i] = current_outputs->at(i)->lod(); + last_output_shapes.push_back(current_outputs->at(i)->dims()); + last_output_lods.push_back(current_outputs->at(i)->lod()); } } return true; diff --git a/lite/core/op_lite.h b/lite/core/op_lite.h index 428b188c468ded790e74c9cc4f5da5c7efe2fd00..656f992b1736d88abd1ed95759b19519ec11aff7 100644 --- a/lite/core/op_lite.h +++ b/lite/core/op_lite.h @@ -77,6 +77,11 @@ class OpLite : public Registry { // Link the external execution environ to internal context. bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope); + template + inline void AttachParam(T *param) { + op_param_ = static_cast(param); + } + const OpInfo *op_info() const { return op_info_.get(); } OpInfo *mutable_op_info() { return op_info_.get(); } @@ -167,11 +172,10 @@ class OpLite : public Registry { std::vector valid_places_; Place kernel_place_{TARGET(kHost), PRECISION(kFloat)}; std::unique_ptr op_info_; - std::vector last_output_shapes{}; std::vector>> last_output_lods{}; size_t io_shape_lod_hash_{}; - mutable operators::ParamBase param_; + mutable operators::ParamBase *op_param_{nullptr}; private: // Infer Shape according to memory, if current input shapes are consistent diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h index 7d73155ac067da4bfd112661d9061c008c1ccef1..7c2df12b17bdae80586a94caa8681271cfb7d409 100644 --- a/lite/core/op_registry.h +++ b/lite/core/op_registry.h @@ -111,18 +111,23 @@ class KernelRegistry final { KernelRegistryForTarget *, // + KernelRegistryForTarget *, // KernelRegistryForTarget *, // KernelRegistryForTarget *, // + KernelRegistryForTarget *, // KernelRegistryForTarget *, 
// + KernelRegistryForTarget *, // @@ -141,9 +146,7 @@ class KernelRegistry final { KernelRegistryForTarget *, // - KernelRegistryForTarget *, // + KernelRegistryForTarget *, // diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 83df76f0230f666ec3857834e234afd921daa927..3d71b5d62e1a2d25202d34461affc78bd27f4852 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -71,12 +71,17 @@ class Optimizer { "identity_scale_eliminate_pass", // "elementwise_mul_constant_eliminate_pass", // "lite_sequence_pool_concat_fuse_pass", // + "lite_scale_activation_fuse_pass", // #if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \ (defined LITE_WITH_ARM) "lite_elementwise_add_activation_fuse_pass", // #endif "__xpu__resnet_fuse_pass", "__xpu__multi_encoder_fuse_pass", + "__xpu__embedding_with_eltwise_add_fuse_pass", + "__xpu__fc_fuse_pass", + "identity_dropout_eliminate_pass", // should be placed after + // xpu fusion "quantized_op_attributes_inference_pass", // Only for fully // quantized model, infer // the output scale and diff --git a/lite/core/profile/precision_profiler.h b/lite/core/profile/precision_profiler.h index ee581bf5e126f07fcdb1edeb9ab5b570df0c2ade..0eebf6a61016a3b399b7a7d4de26a4303f741440 100644 --- a/lite/core/profile/precision_profiler.h +++ b/lite/core/profile/precision_profiler.h @@ -22,6 +22,7 @@ #include #include #include "lite/core/program.h" +#include "lite/fluid/float16.h" #ifdef LITE_WITH_OPENCL #include "lite/backends/opencl/cl_image_converter.h" @@ -52,6 +53,24 @@ static bool write_tensorfile(const Tensor* tensor, const std::string& locate) { return true; } +static bool write_precision_summary_tofile(const std::string& string, + const std::string& log_dir = "") { + if (log_dir == "") { + LOG(INFO) << "The `log_dir` of precision summary file is not set. 
log_dir:" + << log_dir; + return false; + } + FILE* fp = fopen(log_dir.c_str(), "a"); + if (fp == nullptr) { + LOG(INFO) << "Open precision summary file:" << log_dir << "failed."; + return false; + } else { + fprintf(fp, "%s\n", string.c_str()); + } + fclose(fp); + return true; +} + class PrecisionProfiler { public: // TODO(ysh329): need to remove `explicit PrecisionProfiler` @@ -67,7 +86,7 @@ class PrecisionProfiler { using std::left; using std::fixed; STL::stringstream ss; - ss << "========================================= " + ss << "\n\n========================================= " << "Detailed Precision Profiler Summary " << "=========================================" << std::endl; ss << setw(45) << left << "operator:(kernel_info)" @@ -77,6 +96,13 @@ class PrecisionProfiler { << " " << setw(15) << left << "std_deviation" << " " << setw(15) << left << "ave_grow_rate*" << std::endl; + // write to file with path: `log_dir` + if (log_dir_ != "") { + FILE* fp = fopen(log_dir_.c_str(), "a"); + std::string header_str{ss.str()}; + fprintf(fp, "%s\n", header_str.c_str()); + fclose(fp); + } return ss.str(); } @@ -194,6 +220,7 @@ class PrecisionProfiler { } #ifdef LITE_WITH_OPENCL } else if (target_type == TARGET(kOpenCL)) { + CLRuntime::Global()->command_queue().finish(); switch (layout_type) { case DATALAYOUT(kImageDefault): { paddle::lite::CLImageConverterDefault default_convertor; @@ -360,8 +387,12 @@ class PrecisionProfiler { } } } + write_precision_summary_tofile(ss.str(), log_dir_); return ss.str(); } + + private: + std::string log_dir_{"/storage/emulated/0/precision.log"}; }; } // namespace profile diff --git a/lite/core/scope.cc b/lite/core/scope.cc index 775652e2a0d3c962c17dc796ef5f1d381411fa50..d87360a1da8215332c71739bbfa2660977f4f74c 100644 --- a/lite/core/scope.cc +++ b/lite/core/scope.cc @@ -60,6 +60,29 @@ Variable *Scope::FindLocalVar(const std::string &name) const { return nullptr; } +// AttributeVarNames will get persistive attribute names stored in parent 
scope
+std::vector<std::string> Scope::AttributeVarNames() const {
+  std::vector<std::string> resulted_keys;
+  const Scope *cur_scope = this;
+  while (cur_scope->parent()) {
+    cur_scope = cur_scope->parent();
+    auto keys = cur_scope->LocalVarNames();
+    resulted_keys.insert(resulted_keys.end(), keys.begin(), keys.end());
+  }
+  // remove feed and fetch
+  std::vector<std::string> skiped_vars = {"feed", "fetch"};
+  for (int i = 0; i < skiped_vars.size(); i++) {
+    auto iter =
+        std::find(resulted_keys.begin(), resulted_keys.end(), skiped_vars[i]);
+    while (iter != resulted_keys.end()) {
+      resulted_keys.erase(iter);
+      iter =
+          std::find(resulted_keys.begin(), resulted_keys.end(), skiped_vars[i]);
+    }
+  }
+  return resulted_keys;
+}
+
 std::vector<std::string> Scope::LocalVarNames() const {
   std::vector<std::string> keys;
   for (const auto &item : vars_) {
diff --git a/lite/core/scope.h b/lite/core/scope.h
index 2593c365224a0564caa27cf10eee1f917b90c342..aa3a8a1bfb7f4bf1cc00b548c0b0962ce8d73663 100644
--- a/lite/core/scope.h
+++ b/lite/core/scope.h
@@ -45,6 +45,8 @@ class Scope final {
 
   const Scope* parent() const { return parent_; }
 
+  // Get attribute params stored in parent scopes.
+  std::vector<std::string> AttributeVarNames() const;
   // Following the legacy scope interface.
std::vector LocalVarNames() const; diff --git a/lite/demo/cxx/cuda_demo/CMakeLists.txt b/lite/demo/cxx/cuda_demo/CMakeLists.txt index e27548b4e56ce03098c5c82b3eee49add62cc0a4..f057a1f189fdb92ff33f00d5ceacc83f7fc28c5d 100644 --- a/lite/demo/cxx/cuda_demo/CMakeLists.txt +++ b/lite/demo/cxx/cuda_demo/CMakeLists.txt @@ -1,20 +1,24 @@ -project(demo CXX C) cmake_minimum_required(VERSION 2.8) +project(demo CXX C) + +add_definitions(-DLITE_WITH_CUDA) set(TARGET demo) set(CMAKE_CXX_FLAGS "-std=c++11 -O3") -set(LITE_LIB "${PROJECT_SOURCE_DIR}/../../cxx") -set(PROTOBUF_LIB "${PROJECT_SOURCE_DIR}/../../third_party/protobuf") +set(LITE_ROOT "${PROJECT_SOURCE_DIR}/../../cxx") +set(PROTOBUF_ROOT "${PROJECT_SOURCE_DIR}/../../third_party/protobuf") -include_directories("${LITE_LIB}/include") -link_directories("${LITE_LIB}/lib") -link_directories("${PROTOBUF_LIB}/lib") +include_directories("${LITE_ROOT}/include") +link_directories("${LITE_ROOT}/lib") +link_directories("${PROTOBUF_ROOT}/lib") +# cuda lib +link_directories("/usr/local/cuda/lib64/") add_executable(${TARGET} ${TARGET}.cc) -set(DEPS ${LITE_LIB}/lib/libpaddle_full_api_shared.so) +set(DEPS ${LITE_ROOT}/lib/libpaddle_full_api_shared.so) set(DEPS ${DEPS} protobuf-lite) -set(DEPS ${DEPS} "-lrt -lpthread -ldl") +set(DEPS ${DEPS} "-lrt -lpthread -ldl -lcudart") target_link_libraries(${TARGET} ${DEPS}) diff --git a/lite/demo/cxx/makefiles/test_libs/Makefile.android.armv7 b/lite/demo/cxx/makefiles/test_libs/Makefile.android.armv7 new file mode 100644 index 0000000000000000000000000000000000000000..fe808ef7ec571bb73b2aa7c4888ba447a35ad8bd --- /dev/null +++ b/lite/demo/cxx/makefiles/test_libs/Makefile.android.armv7 @@ -0,0 +1,97 @@ +ARM_ABI = arm7 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \ + 
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include -I${THIRD_PARTY_DIR}/gflags/include + +CXX_LIBS = ${OPENCV_LIBS} ${THIRD_PARTY_DIR}/gflags/lib/libgflags.a $(SYSTEM_LIBS) + +LITE_FULL_SHAPRED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared +LITE_FULL_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a +LITE_LIGHT_SHAPRED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared +LITE_LIGHT_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a + +########## +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + +test_helper.o: test_helper.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o 
test_helper.o -c test_helper.cc + +classification_full.o: classification_full.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_full.o -c classification_full.cc + +classification_light.o: classification_light.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_light.o -c classification_light.cc + +classification_full_shared: fetch_opencv classification_full.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHAPRED_LIBS} + +classification_full_static: fetch_opencv classification_full.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS) + +classification_light_shared: fetch_opencv classification_light.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHAPRED_LIBS} + +classification_light_static: fetch_opencv classification_light.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS) + +###### +yolov3_full.o: yolov3_full.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_full.o -c yolov3_full.cc + +yolov3_light.o: yolov3_light.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_light.o -c yolov3_light.cc + +yolov3_full_shared: fetch_opencv yolov3_full.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHAPRED_LIBS} + +yolov3_full_static: fetch_opencv yolov3_full.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_static 
${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
+
+yolov3_light_shared: fetch_opencv yolov3_light.o test_helper.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHAPRED_LIBS}
+
+yolov3_light_static: fetch_opencv yolov3_light.o test_helper.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
+
+#####
+all: classification_full_shared classification_full_static classification_light_shared classification_light_static yolov3_full_shared yolov3_full_static yolov3_light_shared yolov3_light_static
+
+clean:
+	rm -f *.o
+	rm -f classification_full_shared
+	rm -f classification_full_static
+	rm -f classification_light_shared
+	rm -f classification_light_static
+	rm -f yolov3_full_shared
+	rm -f yolov3_full_static
+	rm -f yolov3_light_shared
+	rm -f yolov3_light_static
diff --git a/lite/demo/cxx/makefiles/test_libs/Makefile.android.armv8 b/lite/demo/cxx/makefiles/test_libs/Makefile.android.armv8
new file mode 100644
index 0000000000000000000000000000000000000000..f87143a92043e2c011c572bac78a9eb420bacaf1
--- /dev/null
+++ b/lite/demo/cxx/makefiles/test_libs/Makefile.android.armv8
@@ -0,0 +1,97 @@
+ARM_ABI = arm8
+export ARM_ABI
+
+include ../Makefile.def
+
+LITE_ROOT=../../../
+
+THIRD_PARTY_DIR=${LITE_ROOT}/third_party
+
+OPENCV_VERSION=opencv4.1.0
+
+OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \
+              
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include -I${THIRD_PARTY_DIR}/gflags/include + +CXX_LIBS = ${OPENCV_LIBS} ${THIRD_PARTY_DIR}/gflags/lib/libgflags.a $(SYSTEM_LIBS) + +LITE_FULL_SHAPRED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared +LITE_FULL_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a +LITE_LIGHT_SHAPRED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared +LITE_LIGHT_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a + +########## +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + +test_helper.o: test_helper.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_helper.o -c test_helper.cc + +classification_full.o: classification_full.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_full.o -c classification_full.cc + +classification_light.o: classification_light.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_light.o -c classification_light.cc + +classification_full_shared: fetch_opencv 
classification_full.o test_helper.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHAPRED_LIBS}
+
+classification_full_static: fetch_opencv classification_full.o test_helper.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
+
+classification_light_shared: fetch_opencv classification_light.o test_helper.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHAPRED_LIBS}
+
+classification_light_static: fetch_opencv classification_light.o test_helper.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
+
+######
+yolov3_full.o: yolov3_full.cc
+	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_full.o -c yolov3_full.cc
+
+yolov3_light.o: yolov3_light.cc
+	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_light.o -c yolov3_light.cc
+
+yolov3_full_shared: fetch_opencv yolov3_full.o test_helper.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHAPRED_LIBS}
+
+yolov3_full_static: fetch_opencv yolov3_full.o test_helper.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
+
+yolov3_light_shared: fetch_opencv yolov3_light.o test_helper.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHAPRED_LIBS}
+
+yolov3_light_static: fetch_opencv yolov3_light.o test_helper.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_static ${LITE_LIGHT_STATIC_LIBS} 
$(CXX_LIBS) $(LDFLAGS) + +##### +all: classification_full_shared classification_full_static classification_light_shared classification_light_static yolov3_full_shared yolov3_full_static yolov3_light_shared yolov3_light_static + +clean: + rm -f *.o + rm -f classification_full_shared + rm -r classification_full_static + rm -r classification_light_shared + rm -f classification_light_static + rm -f yolov3_full_shared + rm -f yolov3_full_static + rm -f yolov3_light_shared + rm -f yolov3_light_static diff --git a/lite/demo/cxx/test_libs/classification_full.cc b/lite/demo/cxx/test_libs/classification_full.cc new file mode 100644 index 0000000000000000000000000000000000000000..2515d6abd89b6714ff731bed28f4e8e8c5c3dd75 --- /dev/null +++ b/lite/demo/cxx/test_libs/classification_full.cc @@ -0,0 +1,185 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle_api.h" // NOLINT +#include "test_helper.h" // NOLINT + +DEFINE_string(model_dir, + "", + "the path of the model, the model and param files is under " + "model_dir."); +DEFINE_string(model_filename, + "", + "the filename of model file. 
When the model is combined formate, " + "please set model_file."); +DEFINE_string(param_filename, + "", + "the filename of param file, set param_file when the model is " + "combined formate."); +DEFINE_string(img_path, "", "the path of input image"); +DEFINE_string(img_txt_path, + "", + "the path of input image, the image is processed " + " and saved in txt file"); +DEFINE_double(out_max_value, 0.0, "The max value in output tensor"); +DEFINE_double(threshold, + 1e-3, + "If the max value diff is smaller than threshold, pass test"); +DEFINE_int32(out_max_value_index, 65, "The max value index in output tensor"); + +// Optimize model for ARM CPU. +// If the model is not combined, set model_filename and params_filename as empty +void OptModel(const std::string& load_model_dir, + const std::string& model_filename, + const std::string& params_filename, + const std::string& save_model_path) { + paddle::lite_api::CxxConfig config; + config.set_model_dir(load_model_dir); + if (!model_filename.empty() && !params_filename.empty()) { + config.set_model_file(load_model_dir + "/" + model_filename); + config.set_param_file(load_model_dir + "/" + params_filename); + } + std::vector vaild_places = { + paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)}, + paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt32)}, + paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt64)}, + }; + config.set_valid_places(vaild_places); + + auto predictor = paddle::lite_api::CreatePaddlePredictor(config); + + std::string cmd_str = "rm -rf " + save_model_path; + int ret = system(cmd_str.c_str()); + if (ret == 0) { + std::cout << "Delete old optimized model " << save_model_path << std::endl; + } + predictor->SaveOptimizedModel(save_model_path, + paddle::lite_api::LiteModelType::kNaiveBuffer); + std::cout << "Load model from " << load_model_dir << std::endl; + std::cout << "Save optimized model to " << save_model_path << std::endl; +} + +void Run(const std::string& model_path, + const std::string& 
img_path, + const std::string& img_txt_path, + const float out_max_value, + const int out_max_value_index, + const float threshold, + const int height, + const int width) { + // set config and create predictor + paddle::lite_api::MobileConfig config; + config.set_threads(3); + config.set_model_from_file(model_path); + + auto predictor = paddle::lite_api::CreatePaddlePredictor(config); + + // set input + auto input_tensor = predictor->GetInput(0); + input_tensor->Resize({1, 3, height, width}); + auto input_data = input_tensor->mutable_data(); + if (img_txt_path.size() > 0) { + std::fstream fs(img_txt_path); + if (!fs.is_open()) { + std::cerr << "Fail to open img txt file:" << img_txt_path << std::endl; + } + int num = 1 * 3 * height * width; + for (int i = 0; i < num; i++) { + fs >> input_data[i]; + } + } else { + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + if (!img.data) { + std::cerr << "Fail to open img:" << img_path << std::endl; + exit(1); + } + float means[3] = {0.485f, 0.456f, 0.406f}; + float scales[3] = {0.229f, 0.224f, 0.225f}; + process_img(img, width, height, input_data, means, scales); + } + + predictor->Run(); + + auto out_tensor = predictor->GetOutput(0); + auto* out_data = out_tensor->data(); + int64_t output_num = ShapeProduction(out_tensor->shape()); + float max_value = out_data[0]; + int max_index = 0; + for (int i = 0; i < output_num; i++) { + if (max_value < out_data[i]) { + max_value = out_data[i]; + max_index = i; + } + } + + std::cout << "max_value:" << max_value << std::endl; + std::cout << "max_index:" << max_index << std::endl; + std::cout << "max_value_ground_truth:" << out_max_value << std::endl; + std::cout << "max_index_ground_truth:" << out_max_value_index << std::endl; + if (max_index != out_max_value_index || + fabs(max_value - out_max_value) > threshold) { + std::cerr << "----------Fail Test.---------- \n\n"; + } else { + std::cout << "----------Pass Test.---------- \n\n"; + } +} + +int main(int argc, char** argv) { + // 
Check inputs + google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_model_dir.empty() || + (FLAGS_img_path.empty() && FLAGS_img_txt_path.empty())) { + std::cerr << "Input error." << std::endl; + std::cerr + << "Usage: " << argv[0] << std::endl + << "--model_dir: the path of not optimized model \n" + "--model_filename: the model filename of not optimized model \n" + "--param_filename: the param filename of not optimized model \n" + "--img_txt_path: the path of input image, the image is processed \n" + " and saved in txt file \n" + "--img_path: the path of input image \n" + "--out_max_value: The max value in output tensor \n" + "--threshold: If the max value diff is smaller than threshold,\n" + " pass test. Default 1e-3.\n" + "--out_max_value_index: The max value index in output tensor \n"; + exit(1); + } + + const int height = 224; + const int width = 224; + std::string model_dir = FLAGS_model_dir; + if (model_dir.back() == '/') { + model_dir.pop_back(); + } + std::string optimized_model_path = model_dir + "_opt2"; + OptModel(FLAGS_model_dir, + FLAGS_model_filename, + FLAGS_param_filename, + optimized_model_path); + std::string run_model_path = optimized_model_path + ".nb"; + + // Run test + Run(run_model_path, + FLAGS_img_path, + FLAGS_img_txt_path, + FLAGS_out_max_value, + FLAGS_out_max_value_index, + FLAGS_threshold, + height, + width); + return 0; +} diff --git a/lite/demo/cxx/test_libs/classification_light.cc b/lite/demo/cxx/test_libs/classification_light.cc new file mode 100644 index 0000000000000000000000000000000000000000..91d981e1fc991bef48da97847eddee9e724fe654 --- /dev/null +++ b/lite/demo/cxx/test_libs/classification_light.cc @@ -0,0 +1,129 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle_api.h" // NOLINT +#include "test_helper.h" // NOLINT + +DEFINE_string(optimized_model_path, "", "the path of optimized model"); +DEFINE_string(img_path, "", "the path of input image"); +DEFINE_string(img_txt_path, + "", + "the path of input image, the image is processed " + " and saved in txt file"); +DEFINE_double(out_max_value, 0.0, "The max value in output tensor"); +DEFINE_double(threshold, + 1e-3, + "If the max value diff is smaller than threshold, pass test"); +DEFINE_int32(out_max_value_index, -1, "The max value index in output tensor"); + +void Run(const std::string& model_path, + const std::string& img_path, + const std::string& img_txt_path, + const float out_max_value, + const int out_max_value_index, + const float threshold, + const int height, + const int width) { + // set config and create predictor + paddle::lite_api::MobileConfig config; + config.set_threads(3); + config.set_model_from_file(model_path); + + auto predictor = paddle::lite_api::CreatePaddlePredictor(config); + + // set input + auto input_tensor = predictor->GetInput(0); + input_tensor->Resize({1, 3, height, width}); + auto input_data = input_tensor->mutable_data(); + if (img_txt_path.size() > 0) { + std::fstream fs(img_txt_path); + if (!fs.is_open()) { + std::cerr << "Fail to open img txt file:" << img_txt_path << std::endl; + } + int num = 1 * 3 * height * width; + for (int i = 0; i < num; i++) { + fs >> input_data[i]; + } + } else { + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + if (!img.data) { + std::cerr << 
"Fail to open img:" << img_path << std::endl; + exit(1); + } + float means[3] = {0.485f, 0.456f, 0.406f}; + float scales[3] = {0.229f, 0.224f, 0.225f}; + process_img(img, width, height, input_data, means, scales); + } + + predictor->Run(); + + auto out_tensor = predictor->GetOutput(0); + auto* out_data = out_tensor->data(); + int64_t output_num = ShapeProduction(out_tensor->shape()); + float max_value = out_data[0]; + int max_index = 0; + for (int i = 0; i < output_num; i++) { + if (max_value < out_data[i]) { + max_value = out_data[i]; + max_index = i; + } + } + + std::cout << "max_value:" << max_value << std::endl; + std::cout << "max_index:" << max_index << std::endl; + std::cout << "max_value_ground_truth:" << out_max_value << std::endl; + std::cout << "max_index_ground_truth:" << out_max_value_index << std::endl; + if (max_index != out_max_value_index || + fabs(max_value - out_max_value) > threshold) { + std::cerr << "----------Fail Test---------- \n\n"; + } else { + std::cout << "----------Pass Test---------- \n\n"; + } +} + +int main(int argc, char** argv) { + // Check inputs + google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_optimized_model_path.empty() || + (FLAGS_img_path.empty() && FLAGS_img_txt_path.empty())) { + std::cerr << "Input error." << std::endl; + std::cerr + << "Usage: " << argv[0] << std::endl + << "--optimized_model_path: the path of optimized model \n" + "--img_txt_path: the path of input image, the image is processed \n" + " and saved in txt file \n" + "--img_path: the path of input image \n" + "--out_max_value: The max value in output tensor \n" + "--threshold: If the max value diff is smaller than threshold,\n" + " pass test. 
Default 1e-3.\n" + "--out_max_value_index: The max value index in output tensor \n"; + exit(1); + } + + const int height = 224; + const int width = 224; + // Run test + Run(FLAGS_optimized_model_path, + FLAGS_img_path, + FLAGS_img_txt_path, + FLAGS_out_max_value, + FLAGS_out_max_value_index, + FLAGS_threshold, + height, + width); + return 0; +} diff --git a/lite/demo/cxx/test_libs/prepare.sh b/lite/demo/cxx/test_libs/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..9c8baf3f1afb7c785b0fb1621910739821b370b0 --- /dev/null +++ b/lite/demo/cxx/test_libs/prepare.sh @@ -0,0 +1,30 @@ +make clean +make all -j + +gf=test_lite_lib_files +if [ -d ${gf} ];then + rm -rf ${gf} +fi +mkdir ${gf} + +mv classification_full_shared ${gf} +mv classification_full_static ${gf} +mv classification_light_shared ${gf} +mv classification_light_static ${gf} +mv yolov3_full_shared ${gf} +mv yolov3_full_static ${gf} +mv yolov3_light_shared ${gf} +mv yolov3_light_static ${gf} +cp run.sh ${gf} + +make clean + +cp -r ../../../cxx/ ${gf} +mv ${gf}/cxx ${gf}/lite + +if [ ! 
-f "test_libs_models_imgs.tgz" ];then + wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/test_libs_models_imgs.tgz +fi +tar zxvf test_libs_models_imgs.tgz +mv test_libs_models_imgs ${gf} +mv ${gf}/test_libs_models_imgs ${gf}/models_imgs diff --git a/lite/demo/cxx/test_libs/run.sh b/lite/demo/cxx/test_libs/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..ead4c0adfaff1c3b44b9494d45277e365f6ff763 --- /dev/null +++ b/lite/demo/cxx/test_libs/run.sh @@ -0,0 +1,75 @@ +export LD_LIBRARY_PATH=$PWD/lite/lib/:${LD_LIBRARY_PATH} + +# mobilenetv1 + +./classification_light_shared \ + --optimized_model_path=models_imgs/models/mobilenetv1.nb \ + --img_txt_path=models_imgs/images/classification.jpg.txt \ + --out_max_value=0.936887 \ + --out_max_value_index=65 + +./classification_light_static \ + --optimized_model_path=models_imgs/models/mobilenetv1.nb \ + --img_txt_path=models_imgs/images/classification.jpg.txt \ + --out_max_value=0.936887 \ + --out_max_value_index=65 + +./classification_full_static \ + --model_dir=models_imgs/models/mobilenetv1 \ + --img_txt_path=models_imgs/images/classification.jpg.txt \ + --out_max_value=0.936887 \ + --out_max_value_index=65 + +./classification_full_shared \ + --model_dir=models_imgs/models/mobilenetv1 \ + --img_txt_path=models_imgs/images/classification.jpg.txt \ + --out_max_value=0.936887 \ + --out_max_value_index=65 + +# mobilenetv2 + +./classification_light_shared \ + --optimized_model_path=models_imgs/models/mobilenetv2.nb \ + --img_txt_path=models_imgs/images/classification.jpg.txt \ + --out_max_value=0.868888 \ + --out_max_value_index=65 + +./classification_light_static \ + --optimized_model_path=models_imgs/models/mobilenetv2.nb \ + --img_txt_path=models_imgs/images/classification.jpg.txt \ + --out_max_value=0.868888 \ + --out_max_value_index=65 + +./classification_full_static \ + --model_dir=models_imgs/models/mobilenetv2 \ + --img_txt_path=models_imgs/images/classification.jpg.txt \ + 
--out_max_value=0.868888 \ + --out_max_value_index=65 + +./classification_full_shared \ + --model_dir=models_imgs/models/mobilenetv2 \ + --img_txt_path=models_imgs/images/classification.jpg.txt \ + --out_max_value=0.868888 \ + --out_max_value_index=65 + +# yolov3 + +./yolov3_light_shared \ + --optimized_model_path=models_imgs/models/yolov3_mobilenetv1.nb \ + --img_txt_path=models_imgs/images/yolov3.jpg.txt \ + --out_values=0,0.153605,174.494,199.729,562.075,604.014 + +./yolov3_light_static \ + --optimized_model_path=models_imgs/models/yolov3_mobilenetv1.nb \ + --img_txt_path=models_imgs/images/yolov3.jpg.txt \ + --out_values=0,0.153605,174.494,199.729,562.075,604.014 + +./yolov3_full_static \ + --model_dir=models_imgs/models/yolov3_mobilenetv1 \ + --img_txt_path=models_imgs/images/yolov3.jpg.txt \ + --out_values=0,0.153605,174.494,199.729,562.075,604.014 + +./yolov3_full_shared \ + --model_dir=models_imgs/models/yolov3_mobilenetv1 \ + --img_txt_path=models_imgs/images/yolov3.jpg.txt \ + --out_values=0,0.153605,174.494,199.729,562.075,604.014 diff --git a/lite/demo/cxx/test_libs/test_helper.cc b/lite/demo/cxx/test_libs/test_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..450579c90d66f952f32ac70353f4867cee94e007 --- /dev/null +++ b/lite/demo/cxx/test_libs/test_helper.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "test_helper.h" // NOLINT + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" + +double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} + +int64_t ShapeProduction(const std::vector& shape) { + int64_t num = 1; + for (auto i : shape) { + num *= i; + } + return num; +} + +std::vector GetIntNumsFromStr(const std::string& str) { + std::vector nums; + std::string tmp_str = str; + while (!tmp_str.empty()) { + int num = atoi(tmp_str.data()); + nums.push_back(num); + size_t next_offset = tmp_str.find(","); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return nums; +} + +std::vector GetDoubleNumsFromStr(const std::string& str) { + std::vector nums; + std::string tmp_str = str; + while (!tmp_str.empty()) { + double num = atof(tmp_str.data()); + nums.push_back(num); + size_t next_offset = tmp_str.find(","); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return nums; +} + +// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up +void neon_mean_scale( + const float* din, float* dout, int size, float* mean, float* scale) { + float32x4_t vmean0 = vdupq_n_f32(mean[0]); + float32x4_t vmean1 = vdupq_n_f32(mean[1]); + float32x4_t vmean2 = vdupq_n_f32(mean[2]); + float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]); + float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]); + float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]); + + float* dout_c0 = dout; + float* dout_c1 = dout + size; + float* dout_c2 = dout + size * 2; + + int i = 0; + for (; i < size - 3; i += 4) { + float32x4x3_t vin3 = vld3q_f32(din); + float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0); + float32x4_t vsub1 = vsubq_f32(vin3.val[1], 
vmean1); + float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2); + float32x4_t vs0 = vmulq_f32(vsub0, vscale0); + float32x4_t vs1 = vmulq_f32(vsub1, vscale1); + float32x4_t vs2 = vmulq_f32(vsub2, vscale2); + vst1q_f32(dout_c0, vs0); + vst1q_f32(dout_c1, vs1); + vst1q_f32(dout_c2, vs2); + + din += 12; + dout_c0 += 4; + dout_c1 += 4; + dout_c2 += 4; + } + for (; i < size; i++) { + *(dout_c0++) = (*(din++) - mean[0]) / scale[0]; + *(dout_c0++) = (*(din++) - mean[1]) / scale[1]; + *(dout_c0++) = (*(din++) - mean[2]) / scale[2]; + } +} + +// Process img and set it as input +void process_img(const cv::Mat& img, + int width, + int height, + float* dest_data, + float* means, + float* scales) { + cv::Mat rgb_img; + cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB); + cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f); + cv::Mat imgf; + rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f); + const float* dimg = reinterpret_cast(imgf.data); + neon_mean_scale(dimg, dest_data, width * height, means, scales); +} diff --git a/lite/demo/cxx/test_libs/test_helper.h b/lite/demo/cxx/test_libs/test_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..3ef42af571925fd556538747cd21b72e925329bc --- /dev/null +++ b/lite/demo/cxx/test_libs/test_helper.h @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include + +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" + +double GetCurrentUS(); + +int64_t ShapeProduction(const std::vector& shape); + +std::vector GetIntNumsFromStr(const std::string& str); +std::vector GetDoubleNumsFromStr(const std::string& str); + +void neon_mean_scale( + const float* din, float* dout, int size, float* mean, float* scale); + +void process_img(const cv::Mat& img, + int width, + int height, + float* dst_data, + float* means, + float* scales); diff --git a/lite/demo/cxx/test_libs/yolov3_full.cc b/lite/demo/cxx/test_libs/yolov3_full.cc new file mode 100644 index 0000000000000000000000000000000000000000..d0e69f9042f6ebf8ed68626b52889fac59f73c18 --- /dev/null +++ b/lite/demo/cxx/test_libs/yolov3_full.cc @@ -0,0 +1,182 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle_api.h" // NOLINT +#include "test_helper.h" // NOLINT + +DEFINE_string(model_dir, + "", + "the path of the model, the model and param files is under " + "model_dir."); +DEFINE_string(model_filename, + "", + "the filename of model file. 
When the model is combined formate, " + "please set model_file."); +DEFINE_string(param_filename, + "", + "the filename of param file, set param_file when the model is " + "combined formate."); +DEFINE_string(img_path, "", "the path of input image"); +DEFINE_string(img_txt_path, + "", + "the path of input image, the image is processed " + " and saved in txt file"); +DEFINE_string(out_values, + "", + "The output values, separated by colon and comma"); +DEFINE_double(threshold, + 1e-3, + "If the output value diff is smaller than threshold, pass test"); + +void OptModel(const std::string& load_model_dir, + const std::string& model_filename, + const std::string& params_filename, + const std::string& save_model_path) { + paddle::lite_api::CxxConfig config; + config.set_model_dir(load_model_dir); + if (!model_filename.empty() && !params_filename.empty()) { + config.set_model_file(load_model_dir + "/" + model_filename); + config.set_param_file(load_model_dir + "/" + params_filename); + } + std::vector vaild_places = { + paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)}, + paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt32)}, + paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt64)}, + }; + config.set_valid_places(vaild_places); + + auto predictor = paddle::lite_api::CreatePaddlePredictor(config); + + std::string cmd_str = "rm -rf " + save_model_path; + int ret = system(cmd_str.c_str()); + if (ret == 0) { + std::cout << "Delete old optimized model " << save_model_path << std::endl; + } + predictor->SaveOptimizedModel(save_model_path, + paddle::lite_api::LiteModelType::kNaiveBuffer); + std::cout << "Load model from " << load_model_dir << std::endl; + std::cout << "Save optimized model to " << save_model_path << std::endl; +} + +void Run(const std::string& model_path, + const std::string& img_path, + const std::string& img_txt_path, + const std::vector& out_values, + const float threshold, + const int height, + const int width) { + // set config and create 
predictor + paddle::lite_api::MobileConfig config; + config.set_threads(3); + config.set_model_from_file(model_path); + + auto predictor = paddle::lite_api::CreatePaddlePredictor(config); + + // set input + auto input_tensor = predictor->GetInput(0); + input_tensor->Resize({1, 3, height, width}); + auto input_data = input_tensor->mutable_data(); + if (img_txt_path.size() > 0) { + std::fstream fs(img_txt_path); + if (!fs.is_open()) { + std::cerr << "Fail to open img txt file:" << img_txt_path << std::endl; + } + int num = 1 * 3 * height * width; + for (int i = 0; i < num; i++) { + fs >> input_data[i]; + } + } else { + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + if (!img.data) { + std::cerr << "Fail to open img:" << img_path << std::endl; + exit(1); + } + float means[3] = {0.485f, 0.456f, 0.406f}; + float scales[3] = {0.229f, 0.224f, 0.225f}; + process_img(img, width, height, input_data, means, scales); + } + auto shape_tensor = predictor->GetInput(1); + shape_tensor->Resize({1, 2}); + auto* shape_data = shape_tensor->mutable_data(); + shape_data[0] = height; + shape_data[1] = width; + + predictor->Run(); + + auto out_tensor = predictor->GetOutput(0); + auto* out_data = out_tensor->data(); + int64_t output_num = ShapeProduction(out_tensor->shape()); + bool is_pass = true; + for (int i = 0; i < output_num && i < out_values.size(); i++) { + std::cout << "id:" << i << " out_data:" << out_data[i] + << " gt_data:" << out_values[i] << std::endl; + if (fabs(out_data[i] - out_values[i]) > threshold) { + is_pass = false; + } + } + if (is_pass) { + std::cout << "----------Pass test---------- \n\n"; + } else { + std::cout << "----------Fail test---------- \n\n"; + } +} + +int main(int argc, char** argv) { + // Check inputs + google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_model_dir.empty() || + (FLAGS_img_path.empty() && FLAGS_img_txt_path.empty())) { + std::cerr << "Input error." 
<< std::endl; + std::cerr + << "Usage: " << argv[0] << std::endl + << "--model_dir: the path of not optimized model \n" + "--model_filename: the model filename of not optimized model \n" + "--param_filename: the param filename of not optimized model \n" + "--img_txt_path: the path of input image, the image is processed \n" + " and saved in txt file \n" + "--img_path: the path of input image \n" + "--out_values: The output values, separated by colon and comma.\n" + "--threshold: If the out value diff is smaller than threshold,\n" + " pass test. Default 1e-3.\n"; + exit(1); + } + + const int height = 608; + const int width = 608; + std::vector out_values = GetDoubleNumsFromStr(FLAGS_out_values); + + std::string model_dir = FLAGS_model_dir; + if (model_dir.back() == '/') { + model_dir.pop_back(); + } + std::string optimized_model_path = model_dir + "_opt2"; + OptModel(FLAGS_model_dir, + FLAGS_model_filename, + FLAGS_param_filename, + optimized_model_path); + std::string run_model_path = optimized_model_path + ".nb"; + + // Run test + Run(run_model_path, + FLAGS_img_path, + FLAGS_img_txt_path, + out_values, + FLAGS_threshold, + height, + width); + return 0; +} diff --git a/lite/demo/cxx/test_libs/yolov3_light.cc b/lite/demo/cxx/test_libs/yolov3_light.cc new file mode 100644 index 0000000000000000000000000000000000000000..b31151c8fc2384ec24f2f908d156f4200db279d7 --- /dev/null +++ b/lite/demo/cxx/test_libs/yolov3_light.cc @@ -0,0 +1,128 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle_api.h" // NOLINT +#include "test_helper.h" // NOLINT + +DEFINE_string(optimized_model_path, "", "the path of the optimized model"); +DEFINE_string(img_path, "", "the path of input image"); +DEFINE_string(img_txt_path, + "", + "the path of input image, the image is processed " + " and saved in txt file"); +DEFINE_string(out_values, + "", + "The output values, separated by colon and comma"); +DEFINE_double(threshold, + 1e-3, + "If the output value diff is smaller than threshold, pass test"); + +void Run(const std::string& model_path, + const std::string& img_path, + const std::string& img_txt_path, + const std::vector& out_values, + const float threshold, + const int height, + const int width) { + // set config and create predictor + paddle::lite_api::MobileConfig config; + config.set_threads(3); + config.set_model_from_file(model_path); + + auto predictor = paddle::lite_api::CreatePaddlePredictor(config); + + // set input + auto input_tensor = predictor->GetInput(0); + input_tensor->Resize({1, 3, height, width}); + auto input_data = input_tensor->mutable_data(); + if (img_txt_path.size() > 0) { + std::fstream fs(img_txt_path); + if (!fs.is_open()) { + std::cerr << "Fail to open img txt file:" << img_txt_path << std::endl; + } + int num = 1 * 3 * height * width; + for (int i = 0; i < num; i++) { + fs >> input_data[i]; + } + } else { + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + if (!img.data) { + std::cerr << "Fail to open img:" << img_path << std::endl; + exit(1); + } + float means[3] = {0.485f, 0.456f, 0.406f}; + float scales[3] = {0.229f, 0.224f, 0.225f}; + process_img(img, width, height, input_data, means, scales); + } + auto shape_tensor = predictor->GetInput(1); + shape_tensor->Resize({1, 2}); + auto* shape_data = shape_tensor->mutable_data(); + shape_data[0] = height; + shape_data[1] = width; + + 
predictor->Run(); + + auto out_tensor = predictor->GetOutput(0); + auto* out_data = out_tensor->data(); + int64_t output_num = ShapeProduction(out_tensor->shape()); + bool is_pass = true; + for (int i = 0; i < output_num && i < out_values.size(); i++) { + std::cout << "id:" << i << " out_data:" << out_data[i] + << " gt_data:" << out_values[i] << std::endl; + if (fabs(out_data[i] - out_values[i]) > threshold) { + is_pass = false; + } + } + if (is_pass) { + std::cout << "----------Pass test---------- \n\n"; + } else { + std::cout << "----------Fail test---------- \n\n"; + } +} + +int main(int argc, char** argv) { + // Check inputs + google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_optimized_model_path.empty() || + (FLAGS_img_path.empty() && FLAGS_img_txt_path.empty())) { + std::cerr << "Input error." << std::endl; + std::cerr + << "Usage: " << argv[0] << std::endl + << "--optimized_model_path: the path of optimized model \n" + "--img_txt_path: the path of input image, the image is processed \n" + " and saved in txt file \n" + "--img_path: the path of input image \n" + "--out_values: The output values, separated by colon and comma.\n" + "--threshold: If the out value diff is smaller than threshold,\n" + " pass test. 
Default 1e-3.\n"; + exit(1); + } + + const int height = 608; + const int width = 608; + std::vector out_values = GetDoubleNumsFromStr(FLAGS_out_values); + + // Run test + Run(FLAGS_optimized_model_path, + FLAGS_img_path, + FLAGS_img_txt_path, + out_values, + FLAGS_threshold, + height, + width); + return 0; +} diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index aa3a52e8ad1223451de06e820da7e1febb43b879..9670149114d0f7cc953129b83215c0e8b7caa56a 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -56,7 +56,6 @@ add_kernel(negative_compute_arm ARM extra SRCS negative_compute.cc DEPS ${lite_k add_kernel(crop_compute_arm ARM extra SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(power_compute_arm ARM extra SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(norm_compute_arm ARM extra SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(assign_compute_arm ARM extra SRCS assign_compute.cc DEPS ${lite_kernel_deps} math_arm) ## 3. 
extra kernels add_kernel(lrn_compute_arm ARM extra SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -88,13 +87,10 @@ add_kernel(gru_compute_arm ARM extra SRCS gru_compute.cc DEPS ${lite_kernel_deps add_kernel(beam_search_decode_compute_arm ARM extra SRCS beam_search_decode_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lookup_table_compute_arm ARM extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lookup_table_dequant_compute_arm ARM extra SRCS lookup_table_dequant_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(logical_compute_arm ARM extra SRCS logical_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(write_to_array_compute_arm ARM extra SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(read_from_array_compute_arm ARM extra SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(fill_constant_compute_arm ARM basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(fill_constant_batch_size_like_compute_arm ARM basic SRCS fill_constant_batch_size_like_compute.cc DEPS ${lite_kernel_deps} math_arm) diff --git a/lite/kernels/arm/beam_search_decode_compute.cc b/lite/kernels/arm/beam_search_decode_compute.cc index e0d4ae3f13c6b8bf2364ab5d50ec45bb245377c6..bbd17d98c6ab3096039a5741dd236467ab577f27 100644 --- a/lite/kernels/arm/beam_search_decode_compute.cc +++ b/lite/kernels/arm/beam_search_decode_compute.cc @@ -114,14 
+114,14 @@ struct BeamSearchDecoder { lod.push_back(source_level_lod); lod.push_back(sentence_level_lod); - *(id_tensor->mutable_lod()) = lod; + id_tensor->set_lod(lod); id_tensor->Resize({static_cast(id_data.size())}); auto id_ptr = id_tensor->mutable_data(); TargetCopy( TARGET(kARM), id_ptr, id_data.data(), id_data.size() * sizeof(int64_t)); - *(score_tensor->mutable_lod()) = lod; + score_tensor->set_lod(lod); score_tensor->Resize({static_cast(score_data.size())}); auto score_ptr = score_tensor->mutable_data(); TargetCopy(TARGET(kARM), diff --git a/lite/kernels/arm/conv_compute.cc b/lite/kernels/arm/conv_compute.cc index fb8529af5a0fa4b92b761e1cd8780859138c2059..2a545e70691f030a3a1e3f2a9a9822f5cd8b85b9 100644 --- a/lite/kernels/arm/conv_compute.cc +++ b/lite/kernels/arm/conv_compute.cc @@ -72,7 +72,7 @@ void ConvCompute::PrepareForRun() { impl_ = new DepthwiseConv; // VLOG(3) << "invoking dw conv"; } else if (param.groups == 1 && kw == 3 && stride == 1 && ks_equal && - no_dilation && pads_all_equal) { + no_dilation) { // TODO(MyPandaShaoxiang): winograd conv support any pad impl_ = new WinogradConv; // VLOG(3) << "invoking winograd conv"; @@ -109,6 +109,8 @@ void ConvCompute::PrepareForRun() { int pw = paddings[2]; int sh = param.strides[1]; int sw = param.strides[0]; + int hin = param.x->dims()[2]; + int win = param.x->dims()[3]; bool pads_all_equal = (pads_equal && paddings[0] == paddings[2]); bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh); @@ -116,13 +118,12 @@ void ConvCompute::PrepareForRun() { bool flag_dw_3x3 = (kw == 3 && kh == 3 && (sw == 1 || sw == 2)); bool flag_dw_5x5 = pads_all_equal && (kw == 5 && (sw == 1 || sw == 2)); bool flag_dw = flag_dw_3x3 || flag_dw_5x5; - if (param.groups == ic && ic == oc && kps_equal && pads_equal && no_dilation && flag_dw) { impl_ = new DepthwiseConv; // VLOG(3) << "Run DepthwiseConv Int8"; } else if (param.groups == 1 && kw == 3 && (sw == 1 || sw == 2) && - kps_equal && no_dilation) { + ic * oc < 4 * hin * win 
&& kps_equal && no_dilation) { impl_ = new DirectConv; // VLOG(3) << "Run DirectConv Int8"; } else { @@ -154,6 +155,8 @@ void ConvCompute::PrepareForRun() { int pw = paddings[2]; int sh = param.strides[1]; int sw = param.strides[0]; + int hin = param.x->dims()[2]; + int win = param.x->dims()[3]; bool pads_all_equal = (pads_equal && paddings[0] == paddings[2]); bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh); @@ -167,7 +170,7 @@ void ConvCompute::PrepareForRun() { impl_ = new DepthwiseConv; // VLOG(3) << "Run DepthwiseConv Int8"; } else if (param.groups == 1 && kw == 3 && (sw == 1 || sw == 2) && - kps_equal && no_dilation) { + ic * oc < 4 * hin * win && kps_equal && no_dilation) { impl_ = new DirectConv; // VLOG(3) << "Run DirectConv Int8"; } else { diff --git a/lite/kernels/arm/conv_winograd.cc b/lite/kernels/arm/conv_winograd.cc index e433a3f4bb4a7aa553fbb1193ff82779d9af3242..d0880e51de1eff4763c63d2d3fa4bc74cafc859e 100644 --- a/lite/kernels/arm/conv_winograd.cc +++ b/lite/kernels/arm/conv_winograd.cc @@ -45,12 +45,14 @@ void WinogradConv::ReInitWhenNeeded() { int ow = o_dims[3]; int tile_block = 8; auto pad = *(param.paddings); - int pad_h = pad[0]; - int pad_w = pad[2]; + int pad_h0 = pad[0]; + int pad_h1 = pad[1]; + int pad_w0 = pad[2]; + int pad_w1 = pad[3]; int oc_pad = (oc + 3) / 4 * 4; int ic_pad = (ic + 3) / 4 * 4; const int new_input_size = - (ic + 3) / 4 * 4 * (ih + pad_h * 2) * (iw + pad_w * 2); + (ic + 3) / 4 * 4 * (ih + pad_h0 + pad_h1) * (iw + pad_w0 + pad_w1); const int temp_size = (tile_block * ((ic + 3) / 4 + (oc + 3) / 4) * 4 * wino_iw * wino_iw + 8 * wino_iw * wino_iw) * diff --git a/lite/kernels/arm/logical_compute.cc b/lite/kernels/arm/logical_compute.cc deleted file mode 100644 index 1e47329d8ff65f3d036fd4a8a653cfe5cdc80a3a..0000000000000000000000000000000000000000 --- a/lite/kernels/arm/logical_compute.cc +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/logical_compute.h" -#include -#include "lite/api/paddle_place.h" -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -#define LOGICAL_FUNCTOR(name, op) \ - template \ - struct _##name##Functor { \ - inline bool operator()(const T& a, const T& b) const { return a op b; } \ - }; - -LOGICAL_FUNCTOR(LogicalAnd, &&); -LOGICAL_FUNCTOR(LogicalOr, ||); - -template -struct _LogicalXorFunctor { - inline bool operator()(const T& a, const T& b) const { - return (a || b) && !(a && b); - } -}; - -template -struct _LogicalNotFunctor { - inline bool operator()(const T& a) const { return !a; } -}; - -// template -template