diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index f656e065a065ab65d461ba2901a548fcf9b4e42a..cb4ecf259883455ccc7c8529c048e76d7536da04 100755
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -76,8 +76,8 @@ build:mobile_android:
   cache:
     key: mobile_thirdparty
     paths:
-      - $MOBILE_LITE_CACHE0
-      - $MOBILE_LITE_CACHE1
+      - build.lite.android.armv8.gcc/third_party
+      - build.lite.android.armv7.gcc/third_party
       - ~/.ccache
      - $CI_PROJECT_DIR/build_mobile_ccache
   script:
@@ -96,8 +96,9 @@ build:mobile_armlinux:
   cache:
     key: mobile_thirdparty
     paths:
-      - $MOBILE_LITE_CACHE0
-      - $MOBILE_LITE_CACHE1
+      - build.lite.armlinux.armv8.gcc
+      - build.lite.armlinux.armv7.gcc
+      - build.lite.armlinux.armv7hf.gcc
       - ~/.ccache
       - $CI_PROJECT_DIR/build_mobile_ccache2
   script:
@@ -107,24 +108,13 @@ build:mobile_armlinux:
   dependencies:
     - build:server
-  cache:
-    key: mobile_thirdparty
-    paths:
-      - $MOBILE_LITE_CACHE0
-      - $MOBILE_LITE_CACHE1
-      - ~/.ccache

build:mobile_model_mobilenetv1:
   tags:
     - lite
   stage: build_mobile
   image: $MOBILE_LITE_DOCKER_IMAGE
-  cache:
-    key: mobile_thirdparty
-    paths:
-      - $MOBILE_LITE_CACHE0
-      - $MOBILE_LITE_CACHE1
-      - ~/.ccache
+
   script:
     - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model_mobilenetv1
     - ./paddle/fluid/lite/tools/build.sh build_test_arm_model_mobilenetv1
@@ -135,8 +125,7 @@ build:mobile_model_mobilenetv1:
   cache:
     key: mobile_thirdparty
     paths:
-      - $MOBILE_LITE_CACHE0
-      - $MOBILE_LITE_CACHE1
+      - build.lite.android.armv8.gcc
       - ~/.ccache
       - $CI_PROJECT_DIR/build_mobile_model_mobilenetv1

@@ -145,12 +134,7 @@ build:mobile_model_mobilenetv2:
   tags:
     - lite
   stage: build_mobile
   image: $MOBILE_LITE_DOCKER_IMAGE
-  cache:
-    key: mobile_thirdparty
-    paths:
-      - $MOBILE_LITE_CACHE0
-      - $MOBILE_LITE_CACHE1
-      - ~/.ccache
+
   script:
     - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model_mobilenetv2
     - ./paddle/fluid/lite/tools/build.sh build_test_arm_model_mobilenetv2
@@ -161,8 +145,7 @@ build:mobile_model_mobilenetv2:
   cache:
     key: mobile_thirdparty
     paths:
-      - $MOBILE_LITE_CACHE0
-      - $MOBILE_LITE_CACHE1
+      - build.lite.android.armv8.gcc
       - ~/.ccache
       - $CI_PROJECT_DIR/build_mobile_model_mobilenetv2

@@ -171,12 +154,7 @@ build:mobile_model_resnet50:
   tags:
     - lite
   stage: build_mobile
   image: $MOBILE_LITE_DOCKER_IMAGE
-  cache:
-    key: mobile_thirdparty
-    paths:
-      - $MOBILE_LITE_CACHE0
-      - $MOBILE_LITE_CACHE1
-      - ~/.ccache
+
   script:
     - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model_resnet50
     - ./paddle/fluid/lite/tools/build.sh build_test_arm_model_resnet50
@@ -187,8 +165,7 @@ build:mobile_model_resnet50:
   cache:
     key: mobile_thirdparty
     paths:
-      - $MOBILE_LITE_CACHE0
-      - $MOBILE_LITE_CACHE1
+      - build.lite.android.armv8.gcc
       - ~/.ccache
       - $CI_PROJECT_DIR/build_mobile_model_resnet50
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e9d3b03cb1fcd5c22e95591173009580c24b9e53..59f565014b59f1393243a892f81f2069edd6eb9e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -150,6 +150,7 @@ option(WITH_LITE "Enable lite framework" OFF)
 option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
 option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
 option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
+option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF)
 option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OFF)
 option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF)

@@ -181,6 +182,12 @@ if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
   include(external/eigen)  # download eigen3
   include(ccache)          # set ccache for compilation

+  # for opencl
+  if (LITE_WITH_OPENCL)
+    include(external/opencl-headers)
+    include(external/opencl-clhpp)
+  endif()
+
   include(generic)    # simplify cmake module
   include(configure)  # add paddle env configuration

diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 385a9572f58d520e6c0905261f9be721e85749a2..95ae0be6384855256644eacb09369a004f999c51 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -176,6 +176,10 @@ if (LITE_WITH_ARM)
     add_definitions("-DLITE_WITH_ARM")
 endif()

+if (LITE_WITH_OPENCL)
+    add_definitions("-DLITE_WITH_OPENCL")
+endif()
+
 if (LITE_WITH_PROFILE)
     add_definitions("-DLITE_WITH_PROFILE")
 endif()
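With the option defined in `CMakeLists.txt` and the compile definition added by `configure.cmake`, OpenCL-only code can be gated at build time. A minimal sketch of that pattern, reusing the `CLEngine` API introduced later in this diff (the `PrepareOpenCL` helper itself is hypothetical):

```cpp
#include <string>

#ifdef LITE_WITH_OPENCL
#include "paddle/fluid/lite/opencl/cl_engine.h"
#endif

// Hypothetical helper: initializes OpenCL when the build enables it,
// and degrades gracefully otherwise.
bool PrepareOpenCL(const std::string& cl_kernel_dir) {
#ifdef LITE_WITH_OPENCL
  auto* engine = paddle::lite::CLEngine::Global();
  engine->set_cl_path(cl_kernel_dir);  // directory holding the .cl files
  return engine->IsInitSuccess();
#else
  return false;  // built without -DLITE_WITH_OPENCL
#endif
}
```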
diff --git a/cmake/external/opencl-clhpp.cmake b/cmake/external/opencl-clhpp.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..ea724860d9b40ab5669975cebc6d5e1d7b662fb4
--- /dev/null
+++ b/cmake/external/opencl-clhpp.cmake
@@ -0,0 +1,36 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+SET(OPENCL_CLHPP_SRCS_DIR    ${THIRD_PARTY_PATH}/opencl-clhpp)
+SET(OPENCL_CLHPP_INSTALL_DIR ${THIRD_PARTY_PATH}/install/opencl-clhpp)
+SET(OPENCL_CLHPP_INCLUDE_DIR "${OPENCL_CLHPP_INSTALL_DIR}" CACHE PATH "opencl-clhpp include directory." FORCE)
+
+INCLUDE_DIRECTORIES(${OPENCL_CLHPP_INCLUDE_DIR})
+
+ExternalProject_Add(
+  opencl_clhpp
+  GIT_REPOSITORY    "https://github.com/KhronosGroup/OpenCL-CLHPP.git"
+  GIT_TAG           "v2.0.10"
+  PREFIX            "${OPENCL_CLHPP_SRCS_DIR}"
+  CMAKE_ARGS        -DBUILD_DOCS=OFF
+                    -DBUILD_EXAMPLES=OFF
+                    -DBUILD_TESTS=OFF
+                    -DCMAKE_INSTALL_PREFIX=${OPENCL_CLHPP_INSTALL_DIR}
+  CMAKE_CACHE_ARGS  -DCMAKE_INSTALL_PREFIX:PATH=${OPENCL_CLHPP_INSTALL_DIR}
+                    -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+)
+
+ADD_DEPENDENCIES(opencl_clhpp opencl_headers)
diff --git a/cmake/external/opencl-headers.cmake b/cmake/external/opencl-headers.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..68c9c5251cfb04df4882fdd455936832440d3cff
--- /dev/null
+++ b/cmake/external/opencl-headers.cmake
@@ -0,0 +1,33 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+SET(OPENCL_HEADERS_SRCS_DIR    ${THIRD_PARTY_PATH}/opencl-headers)
+SET(OPENCL_HEADERS_INCLUDE_DIR "${OPENCL_HEADERS_SRCS_DIR}/src/opencl_headers" CACHE PATH "opencl-headers include directory." FORCE)
+
+INCLUDE_DIRECTORIES(${OPENCL_HEADERS_INCLUDE_DIR})
+
+ExternalProject_Add(
+  opencl_headers
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  GIT_REPOSITORY    "https://github.com/KhronosGroup/OpenCL-Headers.git"
+  GIT_TAG           "c5a4bbeabb10d8ed3d1c651b93aa31737bc473dd"
+  PREFIX            ${OPENCL_HEADERS_SRCS_DIR}
+  DOWNLOAD_NAME     "OpenCL-Headers"
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND     ""
+  INSTALL_COMMAND   ""
+  TEST_COMMAND      ""
+)
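Once the headers are fetched, a quick smoke test that they (plus a loadable OpenCL driver) work is to query the platform count through the standard C API; a small standalone sketch:

```cpp
#include <CL/cl.h>
#include <cstdio>

int main() {
  cl_uint num_platforms = 0;
  // Ask only for the count; a second call with a buffer would fetch the IDs.
  cl_int err = clGetPlatformIDs(0, nullptr, &num_platforms);
  std::printf("clGetPlatformIDs -> %d, %u platform(s)\n", err, num_platforms);
  return err == CL_SUCCESS ? 0 : 1;
}
```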
diff --git a/paddle/fluid/lite/CMakeLists.txt b/paddle/fluid/lite/CMakeLists.txt
index e2a8984b459ce135a81170bcc3f293deafc61bb6..4de031077f730422399a305a3f5e031ca198c3ab 100644
--- a/paddle/fluid/lite/CMakeLists.txt
+++ b/paddle/fluid/lite/CMakeLists.txt
@@ -24,8 +24,7 @@ function(lite_download_and_uncompress INSTALL_DIR URL FILENAME)
       ${EXTERNAL_PROJECT_NAME}
       ${EXTERNAL_PROJECT_LOG_ARGS}
       PREFIX                ${INSTALL_DIR}
-      DOWNLOAD_COMMAND      wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} &&
-                                ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME}
+      DOWNLOAD_COMMAND      wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} && ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME}
       DOWNLOAD_DIR          ${INSTALL_DIR}
       DOWNLOAD_NO_PROGRESS  1
       CONFIGURE_COMMAND     ""
@@ -143,6 +142,8 @@ function(lite_cc_binary TARGET)
     HVY_DEPS ${args_HVY_DEPS}
     )
   cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS})
+  # collect targets that need to be compiled for lite
+  add_dependencies(lite_compile_deps ${TARGET})
 endfunction()

 # Add a unit-test name to file for latter offline manual test.
@@ -181,6 +182,7 @@ add_subdirectory(x86)
 add_subdirectory(arm)
 add_subdirectory(host)
 add_subdirectory(cuda)
+add_subdirectory(opencl)
 add_subdirectory(model_parser)
 add_subdirectory(utils)
 add_subdirectory(api)
diff --git a/paddle/fluid/lite/api/CMakeLists.txt b/paddle/fluid/lite/api/CMakeLists.txt
index 3cac3eeba6d4aef3d7af88979e79ee0cbf5b2efe..84da4757b1018db0fe66e708c7216cc1dc8c6d1a 100644
--- a/paddle/fluid/lite/api/CMakeLists.txt
+++ b/paddle/fluid/lite/api/CMakeLists.txt
@@ -12,7 +12,6 @@ lite_cc_library(lite_api_test_helper SRCS lite_api_test_helper.cc
     CUDA_DEPS kernels_cuda
     X86_DEPS ${x86_kernels}
     )
-lite_cc_library(cxx_api_lite SRCS cxx_api.cc DEPS lite_api_test_helper)

 set(light_api_deps
     scope_lite target_wrapper_host model_parser_lite program_lite)
@@ -21,27 +20,34 @@ if(LITE_WITH_CUDA)
     set(light_api_deps ${light_api_deps} target_wrapper_cuda)
 endif()

-lite_cc_library(light_api_lite SRCS light_api.cc
-    DEPS ${light_api_deps} ${ops_lite} ${host_kernels}
-    )
-
 message(STATUS "get ops ${ops_lite}")
 message(STATUS "get Host kernels ${host_kernels}")
 message(STATUS "get ARM kernels ${arm_kernels}")

+lite_cc_library(cxx_api_lite SRCS cxx_api.cc DEPS ${cxx_api_lite_deps} ${ops_lite} ${host_kernels} program_lite)
+
+lite_cc_library(light_api_lite SRCS light_api.cc
+    DEPS scope_lite target_wrapper_host model_parser_lite
+         ${light_api_deps} ${ops_lite} ${host_kernels} program_lite
+    CUDA_DEPS target_wrapper_cuda
+    X86_DEPS ${x86_kernels} operator
+    ARM_DEPS ${arm_kernels}
+    )
+
 include(ExternalProject)
 set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
     "A path setting inference demo download directories.")

 if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
     lite_cc_test(test_cxx_api_lite SRCS cxx_api_test.cc
-       DEPS cxx_api_lite mir_passes
+       DEPS cxx_api_lite mir_passes lite_api_test_helper
        ${ops_lite} ${host_kernels} ${x86_kernels}
        ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
             --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
     add_dependencies(test_cxx_api_lite extern_lite_download_lite_naive_model_tar_gz)
 endif()
+
 if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
     set(lite_model_test_DEPS cxx_api_lite mir_passes ${ops_lite} ${host_kernels} ${arm_kernels})
@@ -68,25 +74,18 @@ endif()

 # These tests needs CLI arguments, and is not supported in ARM CI.
 # TODO(Superjomn) support latter.
-if(NOT LITE_ON_MOBILE)
-    lite_cc_test(test_light_api SRCS light_api_test.cc
-        DEPS light_api_lite mir_passes
-        X86_DEPS ${x86_kernels}
-        ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt
-        SERIAL)
-
-    lite_cc_test(test_apis_lite SRCS apis_test.cc
-        DEPS cxx_api_lite light_api_lite ${ops_lite} mir_passes
-        X86_DEPS ${x86_kernels}
-        ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
-             --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
-endif()
+lite_cc_test(test_light_api_lite SRCS light_api_test.cc
+    DEPS light_api_lite program_lite mir_passes
+    ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt
+    SERIAL)
+
+lite_cc_test(test_apis_lite SRCS apis_test.cc
+    DEPS cxx_api_lite light_api_lite ${ops_lite}
+    X86_DEPS ${x86_kernels} operator
+    ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
+         --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)

-lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc
-    DEPS
-    cxx_api_lite
-    model_parser_lite
-    target_wrapper_host
-    mir_passes
-    ${ops_lite} ${host_kernels}
-    ARM_DEPS ${arm_kernels})
+#lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc
+    #X86_DEPS operator
+    #DEPS light_api_lite model_parser_lite target_wrapper_host mir_passes
+    #ARM_DEPS ${arm_kernels})
diff --git a/paddle/fluid/lite/api/apis_test.cc b/paddle/fluid/lite/api/apis_test.cc
index 7dd6a1193754437a32957f081b3be3fd5c1fc403..a5cf93f0e29ce4b0ec5cc733da3ac609668ae539 100644
--- a/paddle/fluid/lite/api/apis_test.cc
+++ b/paddle/fluid/lite/api/apis_test.cc
@@ -39,23 +39,41 @@ void SetConstInput(lite::Tensor* x) {
   }
 }

-bool CompareTensors(const std::string& name, const ExecutorLite& cxx_api,
+bool CompareTensors(const std::string& name, const Predictor& cxx_api,
                     const LightPredictor& light_api) {
   const auto* a = cxx_api.GetTensor(name);
   const auto* b = light_api.GetTensor(name);
   return TensorCompareWith(*a, *b);
 }

-#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
+TEST(CXXApi_LightApi, optim_model) {
+  lite::Predictor cxx_api;
+  std::vector<Place> valid_places({
+      Place{TARGET(kHost), PRECISION(kFloat)},
+      Place{TARGET(kX86), PRECISION(kFloat)},
+      Place{TARGET(kARM), PRECISION(kFloat)},  // Both work on X86 and ARM
+  });
+  // On ARM devices the preferred X86 target does not work, but the predictor
+  // can still select ARM kernels.
+  cxx_api.Build(FLAGS_model_dir, Place{TARGET(kX86), PRECISION(kFloat)},
+                valid_places);
+  cxx_api.SaveModel(FLAGS_optimized_model);
+}
+
 TEST(CXXApi_LightApi, save_and_load_model) {
-  lite::ExecutorLite cxx_api;
-  lite::LightPredictor light_api;
+  lite::Predictor cxx_api;
+  lite::LightPredictor light_api(FLAGS_optimized_model);

   // CXXAPi
   {
-    std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
-                                     Place{TARGET(kX86), PRECISION(kFloat)}});
-    cxx_api.Build(FLAGS_model_dir, Place{TARGET(kCUDA), PRECISION(kFloat)},
+    std::vector<Place> valid_places({
+        Place{TARGET(kHost), PRECISION(kFloat)},
+        Place{TARGET(kX86), PRECISION(kFloat)},
+        Place{TARGET(kARM), PRECISION(kFloat)},  // Both work on X86 and ARM
+    });
+    // On ARM devices the preferred X86 target does not work, but the
+    // predictor can still select ARM kernels.
+    cxx_api.Build(FLAGS_model_dir, Place{TARGET(kX86), PRECISION(kFloat)},
                   valid_places);

     auto* x = cxx_api.GetInput(0);
@@ -69,8 +87,6 @@ TEST(CXXApi_LightApi, save_and_load_model) {

   // LightApi
   {
-    light_api.Build(FLAGS_optimized_model);
-
     auto* x = light_api.GetInput(0);
     SetConstInput(x);

@@ -89,7 +105,6 @@ TEST(CXXApi_LightApi, save_and_load_model) {
     ASSERT_TRUE(CompareTensors(tensor_name, cxx_api, light_api));
   }
 }
-#endif  // LITE_WITH_LIGHT_WEIGHT_FRAMEWORK

 }  // namespace lite
 }  // namespace paddle
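The tests above encode the intended two-phase workflow: the heavy `Predictor` optimizes a model offline, and `LightPredictor` replays the optimized artifact. Condensed into one sketch (model paths are placeholders):

```cpp
#include <vector>
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/api/light_api.h"

void OptimizeThenDeploy() {
  namespace lite = paddle::lite;
  // Phase 1: the full predictor optimizes the model and saves the result.
  lite::Predictor cxx_api;
  std::vector<lite::Place> valid_places(
      {lite::Place{TARGET(kHost), PRECISION(kFloat)},
       lite::Place{TARGET(kX86), PRECISION(kFloat)}});
  cxx_api.Build("/path/to/model", valid_places[1], valid_places);
  cxx_api.SaveModel("/path/to/model_opt");

  // Phase 2: the light predictor loads the optimized model directly.
  lite::LightPredictor light_api("/path/to/model_opt");
  light_api.Run();
}
```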
diff --git a/paddle/fluid/lite/api/cxx_api.cc b/paddle/fluid/lite/api/cxx_api.cc
index 1ea8be2c0b588ed58c82a70f4ef9263c46d15654..16a5cc891668f604b8f1bdc459473499e8a8a551 100644
--- a/paddle/fluid/lite/api/cxx_api.cc
+++ b/paddle/fluid/lite/api/cxx_api.cc
@@ -17,19 +17,66 @@
 #include <memory>
 #include <string>
 #include <utility>
-#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
-#include "paddle/fluid/platform/port.h"
-#endif
+#include "paddle/fluid/lite/utils/io.h"

 namespace paddle {
 namespace lite {

-#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
-void ExecutorLite::SaveModel(const std::string &dir) {
-  MkDirRecursively(dir.c_str());
+void Predictor::SaveModel(const std::string &dir) {
+#ifndef LITE_WITH_ARM
+  MkDirRecur(dir);
+#else
+#endif
   program_->PersistModel(dir, program_desc_);
+  LOG(INFO) << "Save model to " << dir;
+}
+
+lite::Tensor *Predictor::GetInput(size_t offset) {
+  auto *_feed_list = program_->exec_scope()->FindVar("feed");
+  CHECK(_feed_list) << "no feed variable in exec_scope";
+  auto *feed_list = _feed_list->GetMutable<std::vector<lite::Tensor>>();
+  if (offset >= feed_list->size()) {
+    feed_list->resize(offset + 1);
+  }
+  return &feed_list->at(offset);
+}
+
+const lite::Tensor *Predictor::GetOutput(size_t offset) {
+  auto *_fetch_list = program_->exec_scope()->FindVar("fetch");
+  CHECK(_fetch_list) << "no fetch variable in exec_scope";
+  auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
+  CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
+  return &fetch_list.at(offset);
+}
+
+void Predictor::Build(const std::string &model_path, const Place &prefer_place,
+                      const std::vector<Place> &valid_places) {
+  LoadModel(model_path, scope_.get(), &program_desc_);
+  Build(program_desc_, prefer_place, valid_places);
+}
+
+const framework::proto::ProgramDesc &Predictor::program_desc() const {
+  return program_desc_;
+}
+
+void Predictor::Build(const framework::proto::ProgramDesc &desc,
+                      const Place &prefer_place,
+                      const std::vector<Place> &valid_places) {
+  program_desc_ = desc;
+  Program program(desc, scope_, valid_places);
+
+  optimizer_.KernelPickPreferPlace(prefer_place);
+  core::KernelPickFactor factor;
+  factor.ConsiderTarget();
+  factor.ConsiderPrecision();
+  optimizer_.Run(std::move(program), valid_places, factor);
+  program_ = optimizer_.GenRuntimeProgram();
+}
+
+const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
+  auto *var = program_->exec_scope()->FindVar(name);
+  return &var->Get<lite::Tensor>();
 }
-#endif

 }  // namespace lite
 }  // namespace paddle
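`GetInput`/`GetOutput` lean on a convention rather than dedicated storage: the exec scope holds `feed` and `fetch` variables, each a `std::vector<lite::Tensor>` indexed by column, so any code holding the scope can reach the same data. A sketch of reading a fetch column directly (mirrors `GetOutput` above; assumes the scope header lives at `core/scope.h` as the CMake targets suggest):

```cpp
#include <vector>
#include "paddle/fluid/lite/core/scope.h"

// Read the offset-th fetch column straight from an exec scope; assumes the
// program has already run and populated the "fetch" variable.
const paddle::lite::Tensor* FetchAt(paddle::lite::Scope* exec_scope,
                                    size_t offset) {
  auto* fetch_var = exec_scope->FindVar("fetch");
  if (!fetch_var) return nullptr;
  auto& fetch_list =
      *fetch_var->GetMutable<std::vector<paddle::lite::Tensor>>();
  return offset < fetch_list.size() ? &fetch_list.at(offset) : nullptr;
}
```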
diff --git a/paddle/fluid/lite/api/cxx_api.h b/paddle/fluid/lite/api/cxx_api.h
index ba2d784b942c04c169a19d4747352d9048fd6ff2..5434bc18eb634a7c2136a64f4afdb490db92119d 100644
--- a/paddle/fluid/lite/api/cxx_api.h
+++ b/paddle/fluid/lite/api/cxx_api.h
@@ -26,68 +26,39 @@
 namespace paddle {
 namespace lite {

-struct Config {};
-
-class ExecutorLite {
+/*
+ * Predictor for inference: given a model, it optimizes and executes it.
+ */
+class Predictor {
  public:
-  ExecutorLite() { scope_ = std::make_shared<lite::Scope>(); }
-  explicit ExecutorLite(const std::shared_ptr<lite::Scope>& root_scope) {
-    scope_ = root_scope;
-  }
+  // Create an empty predictor.
+  Predictor() { scope_ = std::make_shared<lite::Scope>(); }
+  // Create a predictor with the weight variable scope set.
+  explicit Predictor(const std::shared_ptr<lite::Scope>& root_scope)
+      : scope_(root_scope) {}

+  // Build from a model, with places set for hardware config.
   void Build(const std::string& model_path, const Place& prefer_place,
-             const std::vector<Place>& valid_places) {
-    LoadModel(model_path, scope_.get(), &program_desc_);
-    Build(program_desc_, prefer_place, valid_places);
-  }
+             const std::vector<Place>& valid_places);

   void Build(const framework::proto::ProgramDesc& desc,
-             const Place& prefer_place,
-             const std::vector<Place>& valid_places) {
-    program_desc_ = desc;
-    Program program(desc, scope_, valid_places);
-
-    optimizer_.KernelPickPreferPlace(prefer_place);
-    core::KernelPickFactor factor;
-    factor.ConsiderTarget();
-    optimizer_.Run(std::move(program), valid_places, factor);
-    program_ = optimizer_.GenRuntimeProgram();
-  }
-
-// This method is disabled in mobile, or unnecessary dependencies required.
-#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
-  void SaveModel(const std::string& dir);
-#endif
+             const Place& prefer_place, const std::vector<Place>& valid_places);

-  // Get offset-th col of feed.
-  lite::Tensor* GetInput(size_t offset) {
-    auto* _feed_list = program_->exec_scope()->FindVar("feed");
-    CHECK(_feed_list) << "no feed variable in exec_scope";
-    auto* feed_list = _feed_list->GetMutable<std::vector<lite::Tensor>>();
-    if (offset >= feed_list->size()) {
-      feed_list->resize(offset + 1);
-    }
-    return &feed_list->at(offset);
-  }
+  // Run the predictor for a single batch of data.
+  void Run() { program_->Run(); }

-  const lite::Tensor* GetOutput(size_t offset) {
-    auto* _fetch_list = program_->exec_scope()->FindVar("fetch");
-    CHECK(_fetch_list) << "no fatch variable in exec_scope";
-    auto& fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
-    CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
-    return &fetch_list.at(offset);
-  }
+  // Get offset-th col of feed inputs.
+  lite::Tensor* GetInput(size_t offset);

-  const lite::Tensor* GetTensor(const std::string& name) const {
-    auto* var = program_->exec_scope()->FindVar(name);
-    return &var->Get<lite::Tensor>();
-  }
+  // Get offset-th col of fetch results.
+  const lite::Tensor* GetOutput(size_t offset);

-  void Run() { program_->Run(); }
+  // Return the program desc for debug.
+  const framework::proto::ProgramDesc& program_desc() const;
+  const lite::Tensor* GetTensor(const std::string& name) const;

-  const framework::proto::ProgramDesc& program_desc() const {
-    return program_desc_;
-  }
+  // This method is disabled on mobile, as it requires extra dependencies.
+  void SaveModel(const std::string& dir);

  private:
   Optimizer optimizer_;
@@ -96,6 +67,7 @@ class ExecutorLite {
   std::unique_ptr<RuntimeProgram> program_;
 };

+#ifdef LITE_WITH_X86
 /*
  * An executor for training.
  *
@@ -119,13 +91,13 @@ class CXXTrainer {
       : scope_(root_scope),
         preferred_place_(preferred_place),
         valid_places_(valid_places),
-        main_program_executor_(ExecutorLite(scope_)) {}
+        main_program_executor_(Predictor(scope_)) {}

   // Build the RuntimeProgram cache for the main program. The cache will run
   // multiple times for the epoches.
   // NOTE Just support to execute the 0-th block currently.
-  ExecutorLite& BuildMainProgramExecutor(
-      const framework::proto::ProgramDesc& desc, int block_id = 0) {
+  Predictor& BuildMainProgramExecutor(const framework::proto::ProgramDesc& desc,
+                                      int block_id = 0) {
     main_program_executor_.Build(desc, preferred_place_, valid_places_);
     return main_program_executor_;
   }
@@ -133,7 +105,7 @@ class CXXTrainer {
   // Run the startup program. It just executes once, no cache needed.
   void RunStartupProgram(const framework::proto::ProgramDesc& desc,
                          int block_id = 0) {
-    ExecutorLite exe(scope_);
+    Predictor exe(scope_);
     exe.Build(desc, preferred_place_, valid_places_);
     exe.Run();
   }
@@ -145,8 +117,9 @@ class CXXTrainer {
   std::vector<Place> valid_places_;

   // The training program.
-  ExecutorLite main_program_executor_;
+  Predictor main_program_executor_;
 };
+#endif

 }  // namespace lite
 }  // namespace paddle
diff --git a/paddle/fluid/lite/api/cxx_api_bin.cc b/paddle/fluid/lite/api/cxx_api_bin.cc
index 58cf5dd785efc5de02e746e0ef1d5609a7c120a5..36f6ed45a10653aec74658a3c4794954d65dd1f5 100644
--- a/paddle/fluid/lite/api/cxx_api_bin.cc
+++ b/paddle/fluid/lite/api/cxx_api_bin.cc
@@ -34,7 +34,7 @@ void Run(const char* model_dir, int repeat, int thread_num) {
   DeviceInfo::Init();
   DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, thread_num);
 #endif
-  lite::ExecutorLite predictor;
+  lite::Predictor predictor;
   std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
                                    Place{TARGET(kARM), PRECISION(kFloat)}});
diff --git a/paddle/fluid/lite/api/cxx_api_test.cc b/paddle/fluid/lite/api/cxx_api_test.cc
index 093f8b73055fd0e9a8caed33430460b68cb8fbea..a1a028a5453a25f025bb55a4f81d4b94445480bb 100644
--- a/paddle/fluid/lite/api/cxx_api_test.cc
+++ b/paddle/fluid/lite/api/cxx_api_test.cc
@@ -42,7 +42,7 @@ TEST(CXXApi, test) {
 }

 TEST(CXXApi, save_model) {
-  lite::ExecutorLite predictor;
+  lite::Predictor predictor;
   std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
                                    Place{TARGET(kX86), PRECISION(kFloat)}});
   predictor.Build(FLAGS_model_dir, Place{TARGET(kCUDA), PRECISION(kFloat)},
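Aside from the rename, the call sequence around `Predictor` is unchanged; a minimal single-batch run condensed from `cxx_api_bin.cc` (model path and input shape are placeholders, and `DDim::production()` is assumed for the element count):

```cpp
#include <vector>
#include "paddle/fluid/lite/api/cxx_api.h"

void RunSingleBatch() {
  namespace lite = paddle::lite;
  lite::Predictor predictor;
  std::vector<lite::Place> valid_places(
      {lite::Place{TARGET(kHost), PRECISION(kFloat)},
       lite::Place{TARGET(kARM), PRECISION(kFloat)}});
  predictor.Build("/path/to/model",
                  lite::Place{TARGET(kARM), PRECISION(kFloat)}, valid_places);

  auto* input = predictor.GetInput(0);  // 0-th feed column
  input->Resize(lite::DDim(std::vector<int64_t>({1, 3, 224, 224})));
  auto* data = input->mutable_data<float>();
  for (int64_t i = 0; i < input->dims().production(); ++i) data[i] = 1.f;

  predictor.Run();
  const auto* out = predictor.GetOutput(0);  // 0-th fetch column
  (void)out;
}
```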
diff --git a/paddle/fluid/lite/api/inceptionv4_test.cc b/paddle/fluid/lite/api/inceptionv4_test.cc
index b0f0aaf3c13abe9e5fb02c8a47c29a66842008af..977aa04abc6c9d990ae17d73750fc1f2af373920 100644
--- a/paddle/fluid/lite/api/inceptionv4_test.cc
+++ b/paddle/fluid/lite/api/inceptionv4_test.cc
@@ -16,21 +16,20 @@
 #include <gtest/gtest.h>
 #include <vector>
 #include "paddle/fluid/lite/api/cxx_api.h"
+#include "paddle/fluid/lite/api/test_helper.h"
 #include "paddle/fluid/lite/core/mir/use_passes.h"
 #include "paddle/fluid/lite/core/op_registry.h"
 #include "paddle/fluid/lite/kernels/use_kernels.h"
 #include "paddle/fluid/lite/operators/use_ops.h"

-// for eval
-DEFINE_string(model_dir, "", "");
-
 namespace paddle {
 namespace lite {

 #ifdef LITE_WITH_ARM
 TEST(InceptionV4, test) {
   DeviceInfo::Init();
-  lite::ExecutorLite predictor;
+  DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
+  lite::Predictor predictor;
   std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
                                    Place{TARGET(kARM), PRECISION(kFloat)}});
@@ -44,7 +43,20 @@ TEST(InceptionV4, test) {
     data[i] = 1;
   }

-  predictor.Run();
+  for (int i = 0; i < FLAGS_warmup; ++i) {
+    predictor.Run();
+  }
+
+  auto start = GetCurrentUS();
+  for (int i = 0; i < FLAGS_repeats; ++i) {
+    predictor.Run();
+  }
+
+  LOG(INFO) << "================== Speed Report ===================";
+  LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
+            << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
+            << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
+            << " ms on average.";

   auto* out = predictor.GetOutput(0);
   std::vector<float> results({0.00078033, 0.00083865, 0.00060029, 0.00057083,
diff --git a/paddle/fluid/lite/api/light_api.cc b/paddle/fluid/lite/api/light_api.cc
index 9d3da3a5919e9cb07fbfc67dc0c7538d96775db4..6a7e20a053c8d355289f51636966f55eb429b897 100644
--- a/paddle/fluid/lite/api/light_api.cc
+++ b/paddle/fluid/lite/api/light_api.cc
@@ -13,3 +13,67 @@
 // limitations under the License.

 #include "paddle/fluid/lite/api/light_api.h"
+
+namespace paddle {
+namespace lite {
+
+void LightPredictor::Build(const std::string& model_dir) {
+  framework::proto::ProgramDesc desc;
+  LoadModel(model_dir, scope_.get(), &desc);
+  BuildRuntimeProgram(desc);
+}
+
+Tensor* LightPredictor::GetInput(size_t offset) {
+  auto* _feed_list = program_->exec_scope()->FindVar("feed");
+  CHECK(_feed_list) << "no feed variable in exec_scope";
+  auto* feed_list = _feed_list->GetMutable<std::vector<Tensor>>();
+  if (offset >= feed_list->size()) {
+    feed_list->resize(offset + 1);
+  }
+  return &feed_list->at(offset);
+}
+
+const Tensor* LightPredictor::GetOutput(size_t offset) {
+  auto* _fetch_list = program_->exec_scope()->FindVar("fetch");
+  CHECK(_fetch_list) << "no fetch variable in exec_scope";
+  auto& fetch_list = *_fetch_list->GetMutable<std::vector<Tensor>>();
+  CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
+  return &fetch_list.at(offset);
+}
+
+void LightPredictor::BuildRuntimeProgram(
+    const framework::proto::ProgramDesc& prog) {
+  std::vector<Instruction> insts;
+  // 1. Create op first
+  Program program(prog, scope_, {});
+
+  // 2. Create Instructs
+
+  // Create the kernels of the target places, and filter out the specific
+  // kernel with the target alias.
+  for (auto& op : program.ops()) {
+    auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr);
+    std::string op_type, alias;
+    Place place;
+    KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
+    auto kernels = op->CreateKernels({place});
+    // filter out a kernel
+    auto it = std::find_if(
+        kernels.begin(), kernels.end(),
+        [&](std::unique_ptr<KernelBase>& it) { return it->alias() == alias; });
+    CHECK(it != kernels.end());
+    (*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
+    insts.emplace_back(op, std::move(*it));
+  }
+  program_.reset(new RuntimeProgram(std::move(insts)));
+  CHECK(program.exec_scope());
+  program_->set_exec_scope(program.exec_scope());
+}
+
+LightPredictor::LightPredictor(const std::string& model_dir) {
+  scope_ = std::make_shared<Scope>();
+  Build(model_dir);
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/api/light_api.h b/paddle/fluid/lite/api/light_api.h
index 5085909385c94e2e81b2cfa14167e8ce886060a3..bf1d7e95a3d90c8db3090815b150926551f63113 100644
--- a/paddle/fluid/lite/api/light_api.h
+++ b/paddle/fluid/lite/api/light_api.h
@@ -32,36 +32,21 @@
 namespace paddle {
 namespace lite {

+/*
+ * The lightweight predictor, mainly for mobile. It loads an optimized model
+ * and does not depend on MIR or perform later optimization.
+ */
 class LightPredictor {
  public:
-  LightPredictor() { scope_ = std::make_shared<Scope>(); }
-
-  void Build(const std::string& model_dir) {
-    framework::proto::ProgramDesc desc;
-    LoadModel(model_dir, scope_.get(), &desc);
-    BuildRuntimeProgram(desc);
-  }
+  explicit LightPredictor(const std::string& model_dir);

   void Run() { program_->Run(); }

-  // Get offset-th col of feed.
-  Tensor* GetInput(size_t offset) {
-    auto* _feed_list = program_->exec_scope()->FindVar("feed");
-    CHECK(_feed_list) << "no feed variable in exec_scope";
-    auto* feed_list = _feed_list->GetMutable<std::vector<Tensor>>();
-    if (offset >= feed_list->size()) {
-      feed_list->resize(offset + 1);
-    }
-    return &feed_list->at(offset);
-  }
+  // Get offset-th col of feed inputs.
+  Tensor* GetInput(size_t offset);

-  const Tensor* GetOutput(size_t offset) {
-    auto* _fetch_list = program_->exec_scope()->FindVar("fetch");
-    CHECK(_fetch_list) << "no fatch variable in exec_scope";
-    auto& fetch_list = *_fetch_list->GetMutable<std::vector<Tensor>>();
-    CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
-    return &fetch_list.at(offset);
-  }
+  // Get offset-th col of fetch outputs.
+  const Tensor* GetOutput(size_t offset);

   const lite::Tensor* GetTensor(const std::string& name) const {
     auto* var = program_->exec_scope()->FindVar(name);
@@ -69,34 +54,8 @@ class LightPredictor {
   }

  private:
-  void BuildRuntimeProgram(const framework::proto::ProgramDesc& prog) {
-    std::vector<Instruction> insts;
-    // 1. Create op first
-    Program program(prog, scope_, {});
-
-    // 2. Create Instructs
-
-    // Create the kernels of the target places, and filter out the specific
-    // kernel with the target alias.
-    for (auto& op : program.ops()) {
-      auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr);
-      std::string op_type, alias;
-      Place place;
-      KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
-      auto kernels = op->CreateKernels({place});
-      // filter out a kernel
-      auto it = std::find_if(kernels.begin(), kernels.end(),
-                             [&](std::unique_ptr<KernelBase>& it) {
-                               return it->alias() == alias;
-                             });
-      CHECK(it != kernels.end());
-      (*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
-      insts.emplace_back(op, std::move(*it));
-    }
-    program_.reset(new RuntimeProgram(std::move(insts)));
-    CHECK(program.exec_scope());
-    program_->set_exec_scope(program.exec_scope());
-  }
+  void Build(const std::string& model_dir);
+  void BuildRuntimeProgram(const framework::proto::ProgramDesc& prog);

  private:
   std::shared_ptr<Scope> scope_;
diff --git a/paddle/fluid/lite/api/light_api_test.cc b/paddle/fluid/lite/api/light_api_test.cc
index faf53b8177a4d11fb33017599ecdb9dc650fbc43..d7e58fbe56cee4055c422af9a8881e664cc26605 100644
--- a/paddle/fluid/lite/api/light_api_test.cc
+++ b/paddle/fluid/lite/api/light_api_test.cc
@@ -25,8 +25,10 @@ namespace paddle {
 namespace lite {

 TEST(LightAPI, load) {
-  LightPredictor predictor;
-  predictor.Build(FLAGS_optimized_model);
+  if (FLAGS_optimized_model.empty()) {
+    FLAGS_optimized_model = "lite_naive_model";
+  }
+  LightPredictor predictor(FLAGS_optimized_model);

   auto* input_tensor = predictor.GetInput(0);
   input_tensor->Resize(DDim(std::vector<int64_t>({100, 100})));
diff --git a/paddle/fluid/lite/api/lite_api_test_helper.cc b/paddle/fluid/lite/api/lite_api_test_helper.cc
index b82541723308f4748e28c64affa6899bf2d9b727..3c0835bc49b32a336848e9b9e88ea2afa3f1c698 100644
--- a/paddle/fluid/lite/api/lite_api_test_helper.cc
+++ b/paddle/fluid/lite/api/lite_api_test_helper.cc
@@ -22,7 +22,7 @@ namespace paddle {
 namespace lite {

 const lite::Tensor* RunHvyModel() {
-  lite::ExecutorLite predictor;
+  lite::Predictor predictor;
 #ifndef LITE_WITH_CUDA
   std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
                                    Place{TARGET(kX86), PRECISION(kFloat)}});
diff --git a/paddle/fluid/lite/api/mobilenetv1_test.cc b/paddle/fluid/lite/api/mobilenetv1_test.cc
index 527b387a4260b46f8033ce7e8a1b8b5ae91a7928..9b7d6dc40b881c3145ded02c0d065e66ea0a5afc 100644
--- a/paddle/fluid/lite/api/mobilenetv1_test.cc
+++ b/paddle/fluid/lite/api/mobilenetv1_test.cc
@@ -16,21 +16,20 @@
 #include <gtest/gtest.h>
 #include <vector>
 #include "paddle/fluid/lite/api/cxx_api.h"
+#include "paddle/fluid/lite/api/test_helper.h"
 #include "paddle/fluid/lite/core/mir/use_passes.h"
 #include "paddle/fluid/lite/core/op_registry.h"
 #include "paddle/fluid/lite/kernels/use_kernels.h"
 #include "paddle/fluid/lite/operators/use_ops.h"

-// for eval
-DEFINE_string(model_dir, "", "");
-
 namespace paddle {
 namespace lite {

 #ifdef LITE_WITH_ARM
 TEST(MobileNetV1, test) {
   DeviceInfo::Init();
-  lite::ExecutorLite predictor;
+  DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
+  lite::Predictor predictor;
   std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
                                    Place{TARGET(kARM), PRECISION(kFloat)}});
@@ -44,7 +43,20 @@ TEST(MobileNetV1, test) {
     data[i] = 1;
   }

-  predictor.Run();
+  for (int i = 0; i < FLAGS_warmup; ++i) {
+    predictor.Run();
+  }
+
+  auto start = GetCurrentUS();
+  for (int i = 0; i < FLAGS_repeats; ++i) {
+    predictor.Run();
+  }
+
+  LOG(INFO) << "================== Speed Report ===================";
+  LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
+            << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
+            << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
+            << " ms on average.";

   auto* out = predictor.GetOutput(0);
   std::vector<float> results({1.91308980e-04, 5.92055148e-04, 1.12303176e-04,
diff --git a/paddle/fluid/lite/api/mobilenetv2_test.cc b/paddle/fluid/lite/api/mobilenetv2_test.cc
index 8a1ccdf4d37755559b80aba08010ec1ae6eb0578..e50ac212c10a462180786d2bb9887881957bb0f7 100644
--- a/paddle/fluid/lite/api/mobilenetv2_test.cc
+++ b/paddle/fluid/lite/api/mobilenetv2_test.cc
@@ -16,21 +16,20 @@
 #include <gtest/gtest.h>
 #include <vector>
 #include "paddle/fluid/lite/api/cxx_api.h"
+#include "paddle/fluid/lite/api/test_helper.h"
 #include "paddle/fluid/lite/core/mir/use_passes.h"
 #include "paddle/fluid/lite/core/op_registry.h"
 #include "paddle/fluid/lite/kernels/use_kernels.h"
 #include "paddle/fluid/lite/operators/use_ops.h"

-// for eval
-DEFINE_string(model_dir, "", "");
-
 namespace paddle {
 namespace lite {

 #ifdef LITE_WITH_ARM
 TEST(MobileNetV2, test) {
   DeviceInfo::Init();
-  lite::ExecutorLite predictor;
+  DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
+  lite::Predictor predictor;
   std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
                                    Place{TARGET(kARM), PRECISION(kFloat)}});
@@ -44,7 +43,20 @@ TEST(MobileNetV2, test) {
     data[i] = 1;
   }

-  predictor.Run();
+  for (int i = 0; i < FLAGS_warmup; ++i) {
+    predictor.Run();
+  }
+
+  auto start = GetCurrentUS();
+  for (int i = 0; i < FLAGS_repeats; ++i) {
+    predictor.Run();
+  }
+
+  LOG(INFO) << "================== Speed Report ===================";
+  LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
+            << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
+            << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
+            << " ms on average.";

   auto* out = predictor.GetOutput(0);
   std::vector<float> results({0.00097802, 0.00099822, 0.00103093, 0.00100121,
"paddle/fluid/lite/api/test_helper.h" #include "paddle/fluid/lite/core/mir/use_passes.h" #include "paddle/fluid/lite/core/op_registry.h" #include "paddle/fluid/lite/kernels/use_kernels.h" #include "paddle/fluid/lite/operators/use_ops.h" -// for eval -DEFINE_string(model_dir, "", ""); - namespace paddle { namespace lite { #ifdef LITE_WITH_ARM TEST(ResNet50, test) { DeviceInfo::Init(); - lite::ExecutorLite predictor; + DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads); + lite::Predictor predictor; std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, Place{TARGET(kARM), PRECISION(kFloat)}}); @@ -44,7 +43,20 @@ TEST(ResNet50, test) { data[i] = 1; } - predictor.Run(); + for (int i = 0; i < FLAGS_warmup; ++i) { + predictor.Run(); + } + + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeats; ++i) { + predictor.Run(); + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads + << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats + << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 + << " ms in average."; auto* out = predictor.GetOutput(0); std::vector results({2.41399175e-04, 4.13724629e-04, 2.64324830e-04, diff --git a/paddle/fluid/lite/kernels/arm/use_kernels.h b/paddle/fluid/lite/api/test_helper.h similarity index 59% rename from paddle/fluid/lite/kernels/arm/use_kernels.h rename to paddle/fluid/lite/api/test_helper.h index 1a6583f3f570e688080b1bb1a96217c25ca4bcc9..4d184eeb169c4f1c7f1de968e373137c4e9ffcc6 100644 --- a/paddle/fluid/lite/kernels/arm/use_kernels.h +++ b/paddle/fluid/lite/api/test_helper.h @@ -13,13 +13,24 @@ // limitations under the License. #pragma once -#include "paddle/fluid/lite/core/op_registry.h" -USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(pool, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(feed, kARM, kAny, kAny, def); -USE_LITE_KERNEL(fetch, kARM, kAny, kAny, def); +#include +#include + +// for eval +DEFINE_string(model_dir, "", "model dir"); +DEFINE_int32(warmup, 0, "warmup times"); +DEFINE_int32(repeats, 1, "repeats times"); +DEFINE_int32(threads, 1, "threads num"); + +namespace paddle { +namespace lite { + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/arm/math/CMakeLists.txt b/paddle/fluid/lite/arm/math/CMakeLists.txt index dd439bbf0f6e23b721c1f61fb5e39d821b79fb26..32f367f703e6cdf1484a2bf2e53edcf38f879357 100644 --- a/paddle/fluid/lite/arm/math/CMakeLists.txt +++ b/paddle/fluid/lite/arm/math/CMakeLists.txt @@ -35,6 +35,8 @@ cc_library(math_arm SRCS split.cc activation.cc dropout.cc + gemm_prepacked_int8.cc + gemv_arm_int8.cc DEPS ${lite_kernel_deps} eigen3 framework_proto_lite) # TODO(TJ): fix me do not deps proto diff --git a/paddle/fluid/lite/core/CMakeLists.txt b/paddle/fluid/lite/core/CMakeLists.txt index 1e95668cddc722e32ea784fe2331380ea3a3940e..f6d48c2bea52040a924561812fb092df412a0c15 100644 --- a/paddle/fluid/lite/core/CMakeLists.txt +++ b/paddle/fluid/lite/core/CMakeLists.txt @@ -25,7 +25,7 @@ cc_library(op_registry_lite SRCS op_registry.cc DEPS framework_proto_lite) cc_library(scope_lite SRCS scope.cc DEPS 
diff --git a/paddle/fluid/lite/arm/math/CMakeLists.txt b/paddle/fluid/lite/arm/math/CMakeLists.txt
index dd439bbf0f6e23b721c1f61fb5e39d821b79fb26..32f367f703e6cdf1484a2bf2e53edcf38f879357 100644
--- a/paddle/fluid/lite/arm/math/CMakeLists.txt
+++ b/paddle/fluid/lite/arm/math/CMakeLists.txt
@@ -35,6 +35,8 @@ cc_library(math_arm SRCS
             split.cc
             activation.cc
             dropout.cc
+            gemm_prepacked_int8.cc
+            gemv_arm_int8.cc
             DEPS ${lite_kernel_deps} eigen3 framework_proto_lite)
 # TODO(TJ): fix me do not deps proto
diff --git a/paddle/fluid/lite/core/CMakeLists.txt b/paddle/fluid/lite/core/CMakeLists.txt
index 1e95668cddc722e32ea784fe2331380ea3a3940e..f6d48c2bea52040a924561812fb092df412a0c15 100644
--- a/paddle/fluid/lite/core/CMakeLists.txt
+++ b/paddle/fluid/lite/core/CMakeLists.txt
@@ -25,7 +25,7 @@ cc_library(op_registry_lite SRCS op_registry.cc DEPS framework_proto_lite)
 cc_library(scope_lite SRCS scope.cc DEPS ${tensor_lite})
 cc_library(cpu_info_lite SRCS cpu_info.cc)
 lite_cc_library(context_lite SRCS context.cc DEPS ${tensor_lite} any_lite cpu_info_lite eigen3)
-cc_library(op_lite SRCS op_lite.cc DEPS scope_lite op_registry_lite target_wrapper_lite
+cc_library(op_lite SRCS op_lite.cc DEPS scope_lite op_registry_lite target_wrapper_lite kernel_lite
     cpp_op_desc_lite ${tensor_lite})
 cc_library(types_lite SRCS types.cc)
 cc_library(type_system SRCS type_system.cc DEPS ${tensor_lite} target_wrapper_lite)
diff --git a/paddle/fluid/lite/core/kernel.cc b/paddle/fluid/lite/core/kernel.cc
index 44b00f53d018ffe9431c7b481fb1bc1a6e1f7cdc..0dae1394290c34cddcf8b2f22868fa326f1974fd 100644
--- a/paddle/fluid/lite/core/kernel.cc
+++ b/paddle/fluid/lite/core/kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/lite/core/kernel.h"
+#include <cstdlib>

 namespace paddle {
 namespace lite {
@@ -49,6 +50,36 @@ std::string KernelBase::GenParamTypeKey() const {
   return ss.str();
 }

+void KernelBase::ParseKernelType(const std::string &kernel_type,
+                                 std::string *op_type, std::string *alias,
+                                 Place *place) {
+  std::stringstream ss(kernel_type);
+  std::getline(ss, *op_type, '/');
+  std::getline(ss, *alias, '/');
+  std::string target, precision, layout;
+  std::getline(ss, target, '/');
+  std::getline(ss, precision, '/');
+  std::getline(ss, layout, '/');
+
+  place->target = static_cast<TargetType>(std::atoi(target.c_str()));
+  place->precision = static_cast<PrecisionType>(std::atoi(precision.c_str()));
+  place->layout = static_cast<DataLayoutType>(std::atoi(layout.c_str()));
+}
+
+std::string KernelBase::SerializeKernelType(const std::string &op_type,
+                                            const std::string &alias,
+                                            const Place &place) {
+  std::stringstream ss;
+  ss << op_type << "/";
+  ss << alias << "/";
+  // We serialize the place value not the string representation here for
+  // easier deserialization.
+  ss << static_cast<int>(place.target) << "/";
+  ss << static_cast<int>(place.precision) << "/";
+  ss << static_cast<int>(place.layout);
+  return ss.str();
+}
+
 bool ParamTypeRegistry::KeyCmp::operator()(
     const ParamTypeRegistry::key_t &a,
     const ParamTypeRegistry::key_t &b) const {
diff --git a/paddle/fluid/lite/core/kernel.h b/paddle/fluid/lite/core/kernel.h
index d7b296eec12a27281b84701e1daa7ca09829fc47..0ef46b65870b11077dcda2cd1833b3eb67a562fa 100644
--- a/paddle/fluid/lite/core/kernel.h
+++ b/paddle/fluid/lite/core/kernel.h
@@ -118,33 +118,11 @@ class KernelBase {

   static std::string SerializeKernelType(const std::string& op_type,
                                          const std::string& alias,
-                                         const Place& place) {
-    std::stringstream ss;
-    ss << op_type << "/";
-    ss << alias << "/";
-    // We serialize the place value not the string representation here for
-    // easier deserialization.
-    ss << static_cast<int>(place.target) << "/";
-    ss << static_cast<int>(place.precision) << "/";
-    ss << static_cast<int>(place.layout);
-    return ss.str();
-  }
+                                         const Place& place);

   static void ParseKernelType(const std::string& kernel_type,
                               std::string* op_type, std::string* alias,
-                              Place* place) {
-    std::stringstream ss(kernel_type);
-    std::getline(ss, *op_type, '/');
-    std::getline(ss, *alias, '/');
-    std::string target, precision, layout;
-    std::getline(ss, target, '/');
-    std::getline(ss, precision, '/');
-    std::getline(ss, layout, '/');
-
-    place->target = static_cast<TargetType>(std::stoi(target));
-    place->precision = static_cast<PrecisionType>(std::stoi(precision));
-    place->layout = static_cast<DataLayoutType>(std::stoi(layout));
-  }
+                              Place* place);

   virtual ~KernelBase() = default;
   void Torch() {}
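The serialized kernel type is `op_type/alias/target/precision/layout`, with the three place fields stored as their integer enum values. A round-trip sketch (assumes the `DATALAYOUT` helper macro from `target_wrapper.h` and a brace-initializable `Place`):

```cpp
#include <string>
#include "paddle/fluid/lite/core/kernel.h"

void KernelTypeRoundTrip() {
  using paddle::lite::KernelBase;
  using paddle::lite::Place;

  Place place{TARGET(kARM), PRECISION(kInt8), DATALAYOUT(kNCHW)};
  // Produces "conv2d/def/<t>/<p>/<l>", where <t>, <p>, <l> are the
  // integer values of the target, precision, and layout enums.
  std::string type = KernelBase::SerializeKernelType("conv2d", "def", place);

  std::string op_type, alias;
  Place parsed;
  KernelBase::ParseKernelType(type, &op_type, &alias, &parsed);
  // op_type == "conv2d", alias == "def", parsed matches `place`.
}
```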
 *
 * When in int8 mode, the pattern like "fake_quant + quantized_op +
 * fake_dequant"
diff --git a/paddle/fluid/lite/core/mir/pattern_matcher_high_api.cc b/paddle/fluid/lite/core/mir/pattern_matcher_high_api.cc
index 9f0b2e1f3225d708f0e71c255bad2eec71628f76..322ddb29064de5eb8771f50508d20ba9ba7f053c 100644
--- a/paddle/fluid/lite/core/mir/pattern_matcher_high_api.cc
+++ b/paddle/fluid/lite/core/mir/pattern_matcher_high_api.cc
@@ -41,7 +41,7 @@ void FuseBase::DeleteInterNodes(SSAGraph *graph) {
     }
   }

-  LOG(INFO) << "keys: " << key2nodes_.size();
+  VLOG(4) << "keys: " << key2nodes_.size();
   std::unordered_set<const Node *> nodes2rm;
   for (auto &matched : key2nodes_) {
     for (const auto &key : keys) {
diff --git a/paddle/fluid/lite/core/op_registry.h b/paddle/fluid/lite/core/op_registry.h
index 1052419ecda8bcad8d919c0d8f8e2ab3f969440f..fc4cd25fa56eec295c522857a67e17315ed49ba8 100644
--- a/paddle/fluid/lite/core/op_registry.h
+++ b/paddle/fluid/lite/core/op_registry.h
@@ -80,6 +80,8 @@ class KernelRegistry final {
                 KernelRegistryForTarget<TARGET(kARM), PRECISION(kFloat),
                                         DATALAYOUT(kNCHW)> *,  //
                 KernelRegistryForTarget<TARGET(kARM), PRECISION(kAny),
                                         DATALAYOUT(kAny)> *,  //
+                KernelRegistryForTarget<TARGET(kARM), PRECISION(kInt8),
+                                        DATALAYOUT(kNCHW)> *  //
                 >;
diff --git a/paddle/fluid/lite/core/optimizer.h b/paddle/fluid/lite/core/optimizer.h
index ea65329b668c89405ca43f55121f2ca1790539c0..c42699ff10a6e9e926693c46b38f3cd6343a4dd0 100644
--- a/paddle/fluid/lite/core/optimizer.h
+++ b/paddle/fluid/lite/core/optimizer.h
@@ -58,7 +58,6 @@ class Optimizer {
 #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
           "lite_elementwise_add_activation_fuse_pass",  //
 #endif
-          "lite_fc_fuse_pass",              //
           "static_kernel_pick_pass",        //
           "variable_place_inference_pass",  //
           "argument_type_display_pass",     //
diff --git a/paddle/fluid/lite/core/target_wrapper.h b/paddle/fluid/lite/core/target_wrapper.h
index c4a870ab83f0c61fc4a5116f8c3dd379e8ead9db..66fbc652203dc4045aeae5eca87df856e76febbe 100644
--- a/paddle/fluid/lite/core/target_wrapper.h
+++ b/paddle/fluid/lite/core/target_wrapper.h
@@ -38,6 +38,7 @@ enum class PrecisionType : int {
   kUnk = 0,
   kFloat,
   kInt8,
+  kInt32,
   kAny,  // any precision
   NUM,   // number of fields.
 };
@@ -48,6 +49,19 @@ enum class DataLayoutType : int {
   NUM,  // number of fields.
 };

+static size_t PrecisionTypeLength(PrecisionType type) {
+  switch (type) {
+    case PrecisionType::kFloat:
+      return 4;
+    case PrecisionType::kInt8:
+      return 1;
+    case PrecisionType::kInt32:
+      return 4;
+    default:
+      return 4;
+  }
+}
+
 // Some helper macro to get a specific TargetType.
 #define TARGET(item__) paddle::lite::TargetType::item__
 // Some helper macro to get a specific PrecisionType.
@@ -87,7 +101,7 @@ static const std::string& TargetRepr(TargetType target) {

 static const std::string& PrecisionRepr(PrecisionType precision) {
   static const std::string precision2string[] = {"kUnk", "kFloat", "kInt8",
-                                                 "kAny"};
+                                                 "kInt32", "kAny"};
   auto x = static_cast<int>(precision);
   CHECK_LT(x, static_cast<int>(PRECISION(NUM)));
   return precision2string[x];
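`PrecisionTypeLength` supplies the per-element byte width that a buffer allocator needs alongside the element count; a small sketch (`BufferBytes` is a hypothetical helper):

```cpp
#include <cstddef>
#include "paddle/fluid/lite/core/target_wrapper.h"

// Hypothetical: bytes required for `numel` elements of a given precision.
// An int8 conv weight needs numel * 1 bytes; float and int32 need numel * 4.
size_t BufferBytes(size_t numel, paddle::lite::PrecisionType precision) {
  return numel * paddle::lite::PrecisionTypeLength(precision);
}
```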
diff --git a/paddle/fluid/lite/kernels/arm/CMakeLists.txt b/paddle/fluid/lite/kernels/arm/CMakeLists.txt
index 337fd846cbddac2fe53da1faf79b0479a215a576..21d3aa564acae69ecf3d50267fe916e6fc5432c6 100644
--- a/paddle/fluid/lite/kernels/arm/CMakeLists.txt
+++ b/paddle/fluid/lite/kernels/arm/CMakeLists.txt
@@ -51,5 +51,3 @@ set(arm_kernels
     )

 set(arm_kernels "${arm_kernels}" CACHE INTERNAL "arm kernels")
-
-
diff --git a/paddle/fluid/lite/kernels/arm/conv_compute.cc b/paddle/fluid/lite/kernels/arm/conv_compute.cc
index 5e9ddb6271684120c8cab68e6e10bade3a3ab015..af8f8e1242a32f58727ad1658b7db2cefbc1b653 100644
--- a/paddle/fluid/lite/kernels/arm/conv_compute.cc
+++ b/paddle/fluid/lite/kernels/arm/conv_compute.cc
@@ -92,6 +92,9 @@ void ConvCompute::Run() {
   // }
 }

+void ConvComputeInt8::PrepareForRun() {}
+void ConvComputeInt8::Run() {}
+
 }  // namespace arm
 }  // namespace kernels
 }  // namespace lite
@@ -112,3 +115,23 @@ REGISTER_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW,
     .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kARM))})
     .Finalize();
+
+REGISTER_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW,
+                     paddle::lite::kernels::arm::ConvComputeInt8, def)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
+    .BindInput("Filter",
+               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
+    .BindOutput("Output",
+                {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW,
+                     paddle::lite::kernels::arm::ConvComputeInt8, def)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
+    .BindInput("Filter",
+               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
+    .BindOutput("Output",
+                {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
+    .Finalize();
diff --git a/paddle/fluid/lite/kernels/arm/conv_compute.h b/paddle/fluid/lite/kernels/arm/conv_compute.h
index 21fabf8c3e8f7983a891265135c39b96aaf42e8d..e5d5721a3b30256bd14a165400723cc4563cd942 100644
--- a/paddle/fluid/lite/kernels/arm/conv_compute.h
+++ b/paddle/fluid/lite/kernels/arm/conv_compute.h
@@ -41,6 +41,25 @@ class ConvCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
       nullptr};
 };

+class ConvComputeInt8 : public KernelLite<TARGET(kARM), PRECISION(kInt8)> {
+ public:
+  using param_t = operators::ConvParam;
+
+  void PrepareForRun() override;
+
+  void Run() override;
+
+  ~ConvComputeInt8() {
+    if (impl_ != nullptr) {
+      delete impl_;
+    }
+  }
+
+ private:
+  lite::arm::math::ImplBase<TARGET(kARM), PRECISION(kInt8), param_t>* impl_{
+      nullptr};
+};
+
 }  // namespace arm
 }  // namespace kernels
 }  // namespace lite
diff --git a/paddle/fluid/lite/kernels/use_kernels.h b/paddle/fluid/lite/kernels/use_kernels.h
index d44069e14e0d6bcaf73c09d41e107d970d8acecb..09395abab523accd0bc4f95c75d0b9b23f1e8999 100644
--- a/paddle/fluid/lite/kernels/use_kernels.h
+++ b/paddle/fluid/lite/kernels/use_kernels.h
@@ -12,14 +12,33 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#pragma once
 /*
  * ATTENTION this header file can only include in .cc file.
  */
+#pragma once
+#include "paddle/fluid/lite/core/op_registry.h"
+
 USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
 USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);

+#ifdef LITE_WITH_ARM
+USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(split, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(transpose, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(transpose2, kARM, kFloat, kNCHW, def);
+#endif
+
 #ifdef LITE_WITH_X86
 USE_LITE_KERNEL(relu, kX86, kFloat, kNCHW, def);
 USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def);
@@ -36,21 +55,6 @@ USE_LITE_KERNEL(depthwise_conv2d, kX86, kFloat, kNCHW, def);
 USE_LITE_KERNEL(pool2d, kX86, kFloat, kNCHW, def);
 #endif

-#ifdef LITE_WITH_ARM
-USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def);
-#endif
-
 #ifdef LITE_WITH_CUDA
 USE_LITE_KERNEL(mul, kCUDA, kFloat, kNCHW, def);
 USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device);
diff --git a/paddle/fluid/lite/kernels/x86/CMakeLists.txt b/paddle/fluid/lite/kernels/x86/CMakeLists.txt
index f66818b2e9dacd8e8aae3506a2f4f12b1b117cdb..fb3ea29260480738297d5416aab2d346412b3490 100644
--- a/paddle/fluid/lite/kernels/x86/CMakeLists.txt
+++ b/paddle/fluid/lite/kernels/x86/CMakeLists.txt
@@ -44,10 +44,9 @@ set(x86_kernels
     softmax_compute_x86
     dropout_compute_x86
     concat_compute_x86
-    conv_compute_x86 
-    pool_compute_x86 
-    batch_norm_compute_x86 
+    conv_compute_x86
+    pool_compute_x86
+    batch_norm_compute_x86
     )

 set(x86_kernels "${x86_kernels}" CACHE INTERNAL "x86 kernels")
-
diff --git a/paddle/fluid/lite/opencl/CMakeLists.txt b/paddle/fluid/lite/opencl/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b0d20bef8bc08ef02fb54c1742aae6828bde2ecf
--- /dev/null
+++ b/paddle/fluid/lite/opencl/CMakeLists.txt
@@ -0,0 +1,19 @@
+if (NOT LITE_WITH_OPENCL)
+  return()
+endif()
+
+if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
+  cc_library(cl_wrapper SRCS cl_wrapper.cc)
+  cc_library(cl_tool SRCS cl_tool.cc)
+  target_compile_options(cl_tool BEFORE PUBLIC -Wno-ignored-qualifiers)
+  cc_library(cl_half SRCS cl_half.cc)
+  target_compile_options(cl_half BEFORE PUBLIC -fno-strict-aliasing)
+  cc_library(cl_engine SRCS cl_engine.cc DEPS cl_tool)
+  cc_library(cl_context SRCS cl_context.cc DEPS cl_engine)
+  cc_library(cl_helper SRCS cl_helper.cc DEPS cl_context)
+  cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS cl_half lite_tensor)
+  cc_library(cl_image SRCS cl_image.cc DEPS cl_half lite_tensor cl_image_converter cl_engine)
+  cc_library(cl_caller SRCS cl_caller.cc DEPS cl_helper cl_image)
+  lite_cc_test(test_cl_runtime SRCS cl_test.cc DEPS cl_helper cl_image cl_caller cl_wrapper)
+  add_dependencies(cl_tool opencl_clhpp)
+endif()
diff --git a/paddle/fluid/lite/opencl/cl2_header.h b/paddle/fluid/lite/opencl/cl2_header.h
new file mode 100644
index 0000000000000000000000000000000000000000..254782d6296df1f78e64637854a59267d94ad9b1
--- /dev/null
+++ b/paddle/fluid/lite/opencl/cl2_header.h
@@ -0,0 +1,21 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#define CL_TARGET_OPENCL_VERSION 200
+#define CL_HPP_TARGET_OPENCL_VERSION 200
+#define CL_HPP_MINIMUM_OPENCL_VERSION 110
+
+#include <CL/cl2.hpp>
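`cl2.hpp` reads the version macros at include time, which is why OpenCL users in lite should go through `cl2_header.h` rather than including it directly. A minimal sketch listing the available platforms via the C++ bindings:

```cpp
#include <iostream>
#include <vector>
#include "paddle/fluid/lite/opencl/cl2_header.h"

int main() {
  std::vector<cl::Platform> platforms;
  cl::Platform::get(&platforms);  // standard OpenCL C++ binding call
  for (auto& platform : platforms) {
    std::cout << platform.getInfo<CL_PLATFORM_NAME>() << "\n";
  }
  return platforms.empty() ? 1 : 0;
}
```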
diff --git a/paddle/fluid/lite/opencl/cl_caller.cc b/paddle/fluid/lite/opencl/cl_caller.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fbb970dea2dee5290f6b5c9f8b9c5b410bd6c38d
--- /dev/null
+++ b/paddle/fluid/lite/opencl/cl_caller.cc
@@ -0,0 +1,88 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/lite/opencl/cl_caller.h"
+#include <string>
+#include "paddle/fluid/lite/core/compatible_tensor.h"
+#include "paddle/fluid/lite/opencl/cl_context.h"
+#include "paddle/fluid/lite/opencl/cl_engine.h"
+#include "paddle/fluid/lite/opencl/cl_helper.h"
+#include "paddle/fluid/lite/opencl/cl_image.h"
+#include "paddle/fluid/lite/opencl/cl_tool.h"
+
+namespace paddle {
+namespace lite {
+
+static void CopyImageData(const CLImage& cl_image, float* out) {
+  int width = cl_image.image_dims()[0];
+  int height = cl_image.image_dims()[1];
+
+  half_t* image_data = new half_t[height * width * 4];
+  cl::Image* image = cl_image.cl_image();
+  const std::array<size_t, 3> origin{0, 0, 0};
+  const std::array<size_t, 3> region{static_cast<size_t>(width),
+                                     static_cast<size_t>(height), 1};
+  cl_int err = CLEngine::Global()->command_queue().enqueueReadImage(
+      *image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr);
+  CL_CHECK_ERRORS(err);
+
+  auto* converter = cl_image.image_converter();
+  converter->ImageToNCHW(image_data, out, cl_image.image_dims(),
+                         cl_image.tensor_dims());
+
+  delete[] image_data;
+}
+
+bool InitOpenCLEngine(std::string cl_path) {
+  auto* engine = CLEngine::Global();
+  engine->set_cl_path(cl_path);
+  return engine->IsInitSuccess();
+}
+
+void elementwise_add(CLContext* context, float* in, const DDim& in_dim,
+                     float* bias, const DDim& bias_dim, float* out,
+                     const DDim& out_dim) {
+  CLHelper helper(context);
+  helper.AddKernel("elementwise_add", "elementwise_add_kernel.cl");
+  auto kernel = helper.KernelAt(0);
+  CLImage in_image;
+  in_image.set_tensor_data(in, in_dim);
+  in_image.InitNormalCLImage(helper.OpenCLContext());
+  VLOG(3) << " --- Input image: " << in_image << " --- ";
+  CLImage bias_image;
+  bias_image.set_tensor_data(bias, bias_dim);
+  bias_image.InitNormalCLImage(helper.OpenCLContext());
+  VLOG(3) << " --- Bias image: " << bias_image << " --- ";
+  CLImage out_image;
+  out_image.InitEmptyImage(helper.OpenCLContext(), out_dim);
+  cl_int status;
+  status = kernel.setArg(0, *in_image.cl_image());
+  CL_CHECK_ERRORS(status);
+  status = kernel.setArg(1, *bias_image.cl_image());
+  CL_CHECK_ERRORS(status);
+  status = kernel.setArg(2, *out_image.cl_image());
+  CL_CHECK_ERRORS(status);
+  size_t width = in_image.ImageWidth();
+  size_t height = in_image.ImageHeight();
+  auto global_work_size = cl::NDRange{width, height};
+  status = helper.OpenCLCommandQueue().enqueueNDRangeKernel(
+      kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr,
+      nullptr);
+  CL_CHECK_ERRORS(status);
+
+  VLOG(3) << " --- Out image: " << out_image << " --- ";
+
+  CopyImageData(out_image, out);
+}
+
+}  // namespace lite
+}  // namespace paddle
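`elementwise_add` is a host-callable wrapper: it uploads both operands as CL images, enqueues the kernel over the image grid, and copies the result back to host memory. A usage sketch with plain host buffers (the shape is a placeholder; assumes `InitOpenCLEngine` was already called with the `.cl` kernel directory):

```cpp
#include <vector>
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/opencl/cl_caller.h"
#include "paddle/fluid/lite/opencl/cl_context.h"

void AddOnGpu() {
  using paddle::lite::DDim;
  paddle::lite::CLContext context;

  const DDim dim(std::vector<int64_t>({1, 16, 8, 8}));
  std::vector<float> in(dim.production(), 1.f);
  std::vector<float> bias(dim.production(), 2.f);
  std::vector<float> out(dim.production());

  // Every element of `out` should come back as 3.f.
  paddle::lite::elementwise_add(&context, in.data(), dim, bias.data(), dim,
                                out.data(), dim);
}
```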
diff --git a/paddle/fluid/lite/opencl/cl_caller.h b/paddle/fluid/lite/opencl/cl_caller.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed9b879fae2ffaece1a8e28b729b578ff19fdb44
--- /dev/null
+++ b/paddle/fluid/lite/opencl/cl_caller.h
@@ -0,0 +1,30 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/lite/core/compatible_tensor.h"
+#include "paddle/fluid/lite/opencl/cl_context.h"
+
+namespace paddle {
+namespace lite {
+
+bool InitOpenCLEngine(std::string cl_path);
+void elementwise_add(CLContext* context, float* in, const DDim& in_dim,
+                     float* bias, const DDim& bias_dim, float* out,
+                     const DDim& out_dim);
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/opencl/cl_context.cc b/paddle/fluid/lite/opencl/cl_context.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d8265d17d8e0f457232ea04c2bfabc583c04f469
--- /dev/null
+++ b/paddle/fluid/lite/opencl/cl_context.cc
@@ -0,0 +1,73 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+#include "paddle/fluid/lite/opencl/cl_context.h"
+#include "paddle/fluid/lite/opencl/cl_engine.h"
+#include "paddle/fluid/lite/opencl/cl_tool.h"
+
+namespace paddle {
+namespace lite {
+
+cl::CommandQueue &CLContext::GetCommandQueue() {
+  return CLEngine::Global()->command_queue();
+}
+
+cl::Context &CLContext::GetContext() { return CLEngine::Global()->context(); }
+
+cl::Program &CLContext::GetProgram(const std::string &file_name,
+                                   const std::string &options) {
+  std::string program_key = file_name;
+  if (!options.empty()) {
+    program_key += options;
+  }
+  auto it = programs_.find(program_key);
+  if (it != programs_.end()) {
+    VLOG(3) << " --- program -> " << program_key << " has been built --- ";
+    return *(it->second);
+  }
+
+  auto program = CLEngine::Global()->CreateProgram(
+      GetContext(), CLEngine::Global()->cl_path() + "/cl_kernel/" + file_name);
+
+  VLOG(3) << " --- begin build program -> " << program_key << " --- ";
+  CLEngine::Global()->BuildProgram(program.get(), options);
+  VLOG(3) << " --- end build program -> " << program_key << " --- ";
+
+  programs_[program_key] = std::move(program);
+
+  return *(programs_[program_key]);
+}
+
+std::unique_ptr<cl::Kernel> CLContext::GetKernel(
+    const std::string &kernel_name, const std::string &file_name,
+    const std::string &options) {
+  cl_int status{CL_SUCCESS};
+  VLOG(3) << " --- to get program " << file_name << " --- ";
+  auto program = GetProgram(file_name, options);
+  VLOG(3) << " --- end get program --- ";
+  VLOG(3) << " --- to create kernel: " << kernel_name << " --- ";
+  std::unique_ptr<cl::Kernel> kernel(
+      new cl::Kernel(program, kernel_name.c_str(), &status));
+  CL_CHECK_ERRORS(status);
+  VLOG(3) << " --- end create kernel --- ";
+  return std::move(kernel);
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/opencl/cl_context.h b/paddle/fluid/lite/opencl/cl_context.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba434ae73f4edcedf55e94abc4f084667b36dfbe
--- /dev/null
+++ b/paddle/fluid/lite/opencl/cl_context.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/lite/opencl/cl2_header.h"
+
+namespace paddle {
+namespace lite {
+
+class CLContext {
+ public:
+  cl::CommandQueue &GetCommandQueue();
+
+  cl::Context &GetContext();
+
+  cl::Program &GetProgram(const std::string &file_name,
+                          const std::string &options);
+
+  std::unique_ptr<cl::Kernel> GetKernel(const std::string &kernel_name,
+                                        const std::string &file_name,
+                                        const std::string &options);
+
+ private:
+  std::unordered_map<std::string, std::unique_ptr<cl::Program>> programs_;
+};
+
+}  // namespace lite
+}  // namespace paddle
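`GetProgram` keys its cache on the kernel file name concatenated with the build options, so repeated `GetKernel` calls recompile nothing. A sketch of the resulting behaviour, reusing the kernel and file names from `cl_caller.cc`:

    paddle::lite::CLContext ctx;
    // First call: reads cl_kernel/elementwise_add_kernel.cl, builds the
    // program, and stores it under the key "elementwise_add_kernel.cl".
    auto k1 = ctx.GetKernel("elementwise_add", "elementwise_add_kernel.cl", "");
    // Second call: cache hit in programs_; only a fresh cl::Kernel is made.
    auto k2 = ctx.GetKernel("elementwise_add", "elementwise_add_kernel.cl", "");
    // Different build options yield a different key and a separate build
    // ("-DRELU" is a hypothetical option, not one defined by this patch):
    auto k3 = ctx.GetKernel("elementwise_add", "elementwise_add_kernel.cl",
                            "-DRELU");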
diff --git a/paddle/fluid/lite/opencl/cl_engine.cc b/paddle/fluid/lite/opencl/cl_engine.cc
new file mode 100644
index 0000000000000000000000000000000000000000..be82ba23cbb47f9597deec89c42714ec2d27025e
--- /dev/null
+++ b/paddle/fluid/lite/opencl/cl_engine.cc
@@ -0,0 +1,171 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/lite/opencl/cl_engine.h"
+#include <fstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace paddle {
+namespace lite {
+
+CLEngine* CLEngine::Global() {
+  static CLEngine cl_engine_;
+  cl_engine_.Init();
+  return &cl_engine_;
+}
+
+CLEngine::~CLEngine() {
+  if (command_queue_ != nullptr) {
+    command_queue_->finish();
+  }
+  // For controlling the destruction order:
+  command_queue_.reset();
+  context_.reset();
+  device_.reset();
+  platform_.reset();
+}
+
+bool CLEngine::Init() {
+  if (initialized_) {
+    return true;
+  }
+  bool is_platform_init = InitializePlatform();
+  bool is_device_init = InitializeDevice();
+  is_init_success_ = is_platform_init && is_device_init;
+  initialized_ = true;
+  return initialized_;
+}
+
+cl::Platform& CLEngine::platform() {
+  CHECK(platform_ != nullptr) << "platform_ is not initialized!";
+  return *platform_;
+}
+
+cl::Context& CLEngine::context() {
+  if (context_ == nullptr) {
+    context_ = CreateContext();
+  }
+  return *context_;
+}
+
+cl::Device& CLEngine::device() {
+  CHECK(device_ != nullptr) << "device_ is not initialized!";
+  return *device_;
+}
+
+cl::CommandQueue& CLEngine::command_queue() {
+  if (command_queue_ == nullptr) {
+    command_queue_ = CreateCommandQueue(context());
+  }
+  return *command_queue_;
+}
+
+std::unique_ptr<cl::Program> CLEngine::CreateProgram(
+    const cl::Context& context, std::string file_name) {
+  std::ifstream file{file_name, std::ios::binary | std::ios::ate};
+  CHECK(file.is_open()) << "Can't open file from " << file_name;
+  auto size = file.tellg();
+  CHECK(size > 0) << "Empty kernel file: " << file_name;
+  std::string content(size, '\0');
+  file.seekg(0);
+  file.read(&content[0], size);
+  cl::Program::Sources sources;
+  sources.push_back(content);
+  auto prog = std::unique_ptr<cl::Program>(
+      new cl::Program(context, sources, &status_));
+  LOG(INFO) << "OpenCL kernel file name: " << file_name;
+  LOG(INFO) << "Program source size: " << content.size();
+  CL_CHECK_ERRORS(status_);
+  return std::move(prog);
+}
+
+std::unique_ptr<cl::UserEvent> CLEngine::CreateEvent(
+    const cl::Context& context) {
+  auto event =
+      std::unique_ptr<cl::UserEvent>(new cl::UserEvent(context, &status_));
+  CL_CHECK_ERRORS(status_);
+  return std::move(event);
+}
+
+bool CLEngine::BuildProgram(cl::Program* program, const std::string& options) {
+  std::string build_option = options + " -cl-fast-relaxed-math -I " +
+                             CLEngine::Global()->cl_path() + "/cl_kernel";
+  status_ = program->build({*device_}, build_option.c_str());
+  CL_CHECK_ERRORS(status_);
+
+  if (status_ != CL_SUCCESS) {
+    if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device()) ==
+        CL_BUILD_ERROR) {
+      std::string log = program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(device());
+      LOG(INFO) << "Program build error: " << log;
+    }
+    return false;
+  }
+
+  return true;
+}
+
+bool CLEngine::InitializePlatform() {
+  std::vector<cl::Platform> all_platforms;
+  status_ = cl::Platform::get(&all_platforms);
+  CL_CHECK_ERRORS(status_);
+  if (all_platforms.empty()) {
+    LOG(ERROR) << "No OpenCL platform found!";
+    return false;
+  }
+  platform_ = std::make_shared<cl::Platform>();
+  *platform_ = all_platforms[0];
+  return true;
+}
+
+bool CLEngine::InitializeDevice() {
+  std::vector<cl::Device> all_devices;
+  status_ = platform_->getDevices(CL_DEVICE_TYPE_GPU, &all_devices);
+  CL_CHECK_ERRORS(status_);
+  if (all_devices.empty()) {
+    LOG(ERROR) << "No OpenCL GPU device found!";
+    return false;
+  }
+  device_ = std::make_shared<cl::Device>();
+  *device_ = all_devices[0];
+
+  auto device_name = device_->getInfo<CL_DEVICE_NAME>();
+  LOG(INFO) << "Using device: " << device_name;
+  auto image_support = device_->getInfo<CL_DEVICE_IMAGE_SUPPORT>();
+  if (image_support) {
image processing."; + } else { + LOG(ERROR) << "The chosen device doesn't support image processing!"; + return false; + } + auto ext_data = device_->getInfo(); + LOG(INFO) << "The extensions supported by this device: " << ext_data; + if (ext_data.find("cl_khr_fp16") != std::string::npos) { + LOG(INFO) << "The chosen device supports the half data type."; + } else { + LOG(ERROR) << "The chosen device doesn't support the half data type!"; + return false; + } + auto max_units = device_->getInfo(); + LOG(INFO) << "The chosen device has " << max_units << " compute units."; + auto local_mem = device_->getInfo(); + LOG(INFO) << "The local memory size of the chosen device is " + << static_cast(local_mem) / 1024 << " KB."; + return true; +} + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/opencl/cl_engine.h b/paddle/fluid/lite/opencl/cl_engine.h new file mode 100644 index 0000000000000000000000000000000000000000..d513110cead2b31e4e6f72b803f3dbe03de15da1 --- /dev/null +++ b/paddle/fluid/lite/opencl/cl_engine.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/lite/opencl/cl2_header.h" +#include "paddle/fluid/lite/opencl/cl_tool.h" + +namespace paddle { +namespace lite { + +class CLEngine { + public: + static CLEngine* Global(); + + bool Init(); + + cl::Platform& platform(); + + cl::Context& context(); + + cl::Device& device(); + + cl::CommandQueue& command_queue(); + + std::unique_ptr CreateProgram(const cl::Context& context, + std::string file_name); + + std::unique_ptr CreateEvent(const cl::Context& context); + + bool BuildProgram(cl::Program* program, const std::string& options = ""); + + bool IsInitSuccess() { return is_init_success_; } + + std::string cl_path() { return cl_path_; } + + void set_cl_path(std::string cl_path) { cl_path_ = cl_path; } + + private: + CLEngine() = default; + + ~CLEngine(); + + bool InitializePlatform(); + + bool InitializeDevice(); + + std::shared_ptr CreateContext() { + auto context = std::make_shared( + std::vector{device()}, nullptr, nullptr, nullptr, &status_); + CL_CHECK_ERRORS(status_); + return context; + } + + std::shared_ptr CreateCommandQueue( + const cl::Context& context) { + auto queue = + std::make_shared(context, device(), 0, &status_); + CL_CHECK_ERRORS(status_); + return queue; + } + + std::string cl_path_; + + std::shared_ptr platform_{nullptr}; + + std::shared_ptr context_{nullptr}; + + std::shared_ptr device_{nullptr}; + + std::shared_ptr command_queue_{nullptr}; + + cl_int status_{CL_SUCCESS}; + + bool initialized_{false}; + + bool is_init_success_{false}; +}; + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/opencl/cl_half.cc b/paddle/fluid/lite/opencl/cl_half.cc new file mode 100644 index 0000000000000000000000000000000000000000..bbed7c0b8b9bb1d73e121991cb50b37331bc6018 --- /dev/null +++ b/paddle/fluid/lite/opencl/cl_half.cc @@ -0,0 +1,518 @@ +/* 
Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf + +#include "paddle/fluid/lite/opencl/cl_half.h" + +namespace paddle { +namespace lite { + +static const uint32_t mantissatable[2048] = { + 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34a00000, + 0x34c00000, 0x34e00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, + 0x35400000, 0x35500000, 0x35600000, 0x35700000, 0x35800000, 0x35880000, + 0x35900000, 0x35980000, 0x35a00000, 0x35a80000, 0x35b00000, 0x35b80000, + 0x35c00000, 0x35c80000, 0x35d00000, 0x35d80000, 0x35e00000, 0x35e80000, + 0x35f00000, 0x35f80000, 0x36000000, 0x36040000, 0x36080000, 0x360c0000, + 0x36100000, 0x36140000, 0x36180000, 0x361c0000, 0x36200000, 0x36240000, + 0x36280000, 0x362c0000, 0x36300000, 0x36340000, 0x36380000, 0x363c0000, + 0x36400000, 0x36440000, 0x36480000, 0x364c0000, 0x36500000, 0x36540000, + 0x36580000, 0x365c0000, 0x36600000, 0x36640000, 0x36680000, 0x366c0000, + 0x36700000, 0x36740000, 0x36780000, 0x367c0000, 0x36800000, 0x36820000, + 0x36840000, 0x36860000, 0x36880000, 0x368a0000, 0x368c0000, 0x368e0000, + 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369a0000, + 0x369c0000, 0x369e0000, 0x36a00000, 0x36a20000, 0x36a40000, 0x36a60000, + 0x36a80000, 0x36aa0000, 0x36ac0000, 0x36ae0000, 0x36b00000, 0x36b20000, + 0x36b40000, 0x36b60000, 0x36b80000, 0x36ba0000, 0x36bc0000, 0x36be0000, + 0x36c00000, 0x36c20000, 0x36c40000, 0x36c60000, 0x36c80000, 0x36ca0000, + 0x36cc0000, 0x36ce0000, 0x36d00000, 0x36d20000, 0x36d40000, 0x36d60000, + 0x36d80000, 0x36da0000, 0x36dc0000, 0x36de0000, 0x36e00000, 0x36e20000, + 0x36e40000, 0x36e60000, 0x36e80000, 0x36ea0000, 0x36ec0000, 0x36ee0000, + 0x36f00000, 0x36f20000, 0x36f40000, 0x36f60000, 0x36f80000, 0x36fa0000, + 0x36fc0000, 0x36fe0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000, + 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, + 0x370a0000, 0x370b0000, 0x370c0000, 0x370d0000, 0x370e0000, 0x370f0000, + 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, + 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371a0000, 0x371b0000, + 0x371c0000, 0x371d0000, 0x371e0000, 0x371f0000, 0x37200000, 0x37210000, + 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, + 0x37280000, 0x37290000, 0x372a0000, 0x372b0000, 0x372c0000, 0x372d0000, + 0x372e0000, 0x372f0000, 0x37300000, 0x37310000, 0x37320000, 0x37330000, + 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, + 0x373a0000, 0x373b0000, 0x373c0000, 0x373d0000, 0x373e0000, 0x373f0000, + 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, + 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374a0000, 0x374b0000, + 0x374c0000, 0x374d0000, 0x374e0000, 0x374f0000, 0x37500000, 0x37510000, + 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, + 0x37580000, 0x37590000, 0x375a0000, 0x375b0000, 0x375c0000, 0x375d0000, + 0x375e0000, 
0x375f0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000, + 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, + 0x376a0000, 0x376b0000, 0x376c0000, 0x376d0000, 0x376e0000, 0x376f0000, + 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, + 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377a0000, 0x377b0000, + 0x377c0000, 0x377d0000, 0x377e0000, 0x377f0000, 0x37800000, 0x37808000, + 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, + 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, + 0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000, + 0x378a0000, 0x378a8000, 0x378b0000, 0x378b8000, 0x378c0000, 0x378c8000, + 0x378d0000, 0x378d8000, 0x378e0000, 0x378e8000, 0x378f0000, 0x378f8000, + 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, + 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, + 0x37960000, 0x37968000, 0x37970000, 0x37978000, 0x37980000, 0x37988000, + 0x37990000, 0x37998000, 0x379a0000, 0x379a8000, 0x379b0000, 0x379b8000, + 0x379c0000, 0x379c8000, 0x379d0000, 0x379d8000, 0x379e0000, 0x379e8000, + 0x379f0000, 0x379f8000, 0x37a00000, 0x37a08000, 0x37a10000, 0x37a18000, + 0x37a20000, 0x37a28000, 0x37a30000, 0x37a38000, 0x37a40000, 0x37a48000, + 0x37a50000, 0x37a58000, 0x37a60000, 0x37a68000, 0x37a70000, 0x37a78000, + 0x37a80000, 0x37a88000, 0x37a90000, 0x37a98000, 0x37aa0000, 0x37aa8000, + 0x37ab0000, 0x37ab8000, 0x37ac0000, 0x37ac8000, 0x37ad0000, 0x37ad8000, + 0x37ae0000, 0x37ae8000, 0x37af0000, 0x37af8000, 0x37b00000, 0x37b08000, + 0x37b10000, 0x37b18000, 0x37b20000, 0x37b28000, 0x37b30000, 0x37b38000, + 0x37b40000, 0x37b48000, 0x37b50000, 0x37b58000, 0x37b60000, 0x37b68000, + 0x37b70000, 0x37b78000, 0x37b80000, 0x37b88000, 0x37b90000, 0x37b98000, + 0x37ba0000, 0x37ba8000, 0x37bb0000, 0x37bb8000, 0x37bc0000, 0x37bc8000, + 0x37bd0000, 0x37bd8000, 0x37be0000, 0x37be8000, 0x37bf0000, 0x37bf8000, + 0x37c00000, 0x37c08000, 0x37c10000, 0x37c18000, 0x37c20000, 0x37c28000, + 0x37c30000, 0x37c38000, 0x37c40000, 0x37c48000, 0x37c50000, 0x37c58000, + 0x37c60000, 0x37c68000, 0x37c70000, 0x37c78000, 0x37c80000, 0x37c88000, + 0x37c90000, 0x37c98000, 0x37ca0000, 0x37ca8000, 0x37cb0000, 0x37cb8000, + 0x37cc0000, 0x37cc8000, 0x37cd0000, 0x37cd8000, 0x37ce0000, 0x37ce8000, + 0x37cf0000, 0x37cf8000, 0x37d00000, 0x37d08000, 0x37d10000, 0x37d18000, + 0x37d20000, 0x37d28000, 0x37d30000, 0x37d38000, 0x37d40000, 0x37d48000, + 0x37d50000, 0x37d58000, 0x37d60000, 0x37d68000, 0x37d70000, 0x37d78000, + 0x37d80000, 0x37d88000, 0x37d90000, 0x37d98000, 0x37da0000, 0x37da8000, + 0x37db0000, 0x37db8000, 0x37dc0000, 0x37dc8000, 0x37dd0000, 0x37dd8000, + 0x37de0000, 0x37de8000, 0x37df0000, 0x37df8000, 0x37e00000, 0x37e08000, + 0x37e10000, 0x37e18000, 0x37e20000, 0x37e28000, 0x37e30000, 0x37e38000, + 0x37e40000, 0x37e48000, 0x37e50000, 0x37e58000, 0x37e60000, 0x37e68000, + 0x37e70000, 0x37e78000, 0x37e80000, 0x37e88000, 0x37e90000, 0x37e98000, + 0x37ea0000, 0x37ea8000, 0x37eb0000, 0x37eb8000, 0x37ec0000, 0x37ec8000, + 0x37ed0000, 0x37ed8000, 0x37ee0000, 0x37ee8000, 0x37ef0000, 0x37ef8000, + 0x37f00000, 0x37f08000, 0x37f10000, 0x37f18000, 0x37f20000, 0x37f28000, + 0x37f30000, 0x37f38000, 0x37f40000, 0x37f48000, 0x37f50000, 0x37f58000, + 0x37f60000, 0x37f68000, 0x37f70000, 0x37f78000, 0x37f80000, 0x37f88000, + 0x37f90000, 0x37f98000, 0x37fa0000, 0x37fa8000, 0x37fb0000, 0x37fb8000, + 0x37fc0000, 0x37fc8000, 0x37fd0000, 0x37fd8000, 0x37fe0000, 0x37fe8000, + 0x37ff0000, 
0x37ff8000, 0x38000000, 0x38004000, 0x38008000, 0x3800c000, + 0x38010000, 0x38014000, 0x38018000, 0x3801c000, 0x38020000, 0x38024000, + 0x38028000, 0x3802c000, 0x38030000, 0x38034000, 0x38038000, 0x3803c000, + 0x38040000, 0x38044000, 0x38048000, 0x3804c000, 0x38050000, 0x38054000, + 0x38058000, 0x3805c000, 0x38060000, 0x38064000, 0x38068000, 0x3806c000, + 0x38070000, 0x38074000, 0x38078000, 0x3807c000, 0x38080000, 0x38084000, + 0x38088000, 0x3808c000, 0x38090000, 0x38094000, 0x38098000, 0x3809c000, + 0x380a0000, 0x380a4000, 0x380a8000, 0x380ac000, 0x380b0000, 0x380b4000, + 0x380b8000, 0x380bc000, 0x380c0000, 0x380c4000, 0x380c8000, 0x380cc000, + 0x380d0000, 0x380d4000, 0x380d8000, 0x380dc000, 0x380e0000, 0x380e4000, + 0x380e8000, 0x380ec000, 0x380f0000, 0x380f4000, 0x380f8000, 0x380fc000, + 0x38100000, 0x38104000, 0x38108000, 0x3810c000, 0x38110000, 0x38114000, + 0x38118000, 0x3811c000, 0x38120000, 0x38124000, 0x38128000, 0x3812c000, + 0x38130000, 0x38134000, 0x38138000, 0x3813c000, 0x38140000, 0x38144000, + 0x38148000, 0x3814c000, 0x38150000, 0x38154000, 0x38158000, 0x3815c000, + 0x38160000, 0x38164000, 0x38168000, 0x3816c000, 0x38170000, 0x38174000, + 0x38178000, 0x3817c000, 0x38180000, 0x38184000, 0x38188000, 0x3818c000, + 0x38190000, 0x38194000, 0x38198000, 0x3819c000, 0x381a0000, 0x381a4000, + 0x381a8000, 0x381ac000, 0x381b0000, 0x381b4000, 0x381b8000, 0x381bc000, + 0x381c0000, 0x381c4000, 0x381c8000, 0x381cc000, 0x381d0000, 0x381d4000, + 0x381d8000, 0x381dc000, 0x381e0000, 0x381e4000, 0x381e8000, 0x381ec000, + 0x381f0000, 0x381f4000, 0x381f8000, 0x381fc000, 0x38200000, 0x38204000, + 0x38208000, 0x3820c000, 0x38210000, 0x38214000, 0x38218000, 0x3821c000, + 0x38220000, 0x38224000, 0x38228000, 0x3822c000, 0x38230000, 0x38234000, + 0x38238000, 0x3823c000, 0x38240000, 0x38244000, 0x38248000, 0x3824c000, + 0x38250000, 0x38254000, 0x38258000, 0x3825c000, 0x38260000, 0x38264000, + 0x38268000, 0x3826c000, 0x38270000, 0x38274000, 0x38278000, 0x3827c000, + 0x38280000, 0x38284000, 0x38288000, 0x3828c000, 0x38290000, 0x38294000, + 0x38298000, 0x3829c000, 0x382a0000, 0x382a4000, 0x382a8000, 0x382ac000, + 0x382b0000, 0x382b4000, 0x382b8000, 0x382bc000, 0x382c0000, 0x382c4000, + 0x382c8000, 0x382cc000, 0x382d0000, 0x382d4000, 0x382d8000, 0x382dc000, + 0x382e0000, 0x382e4000, 0x382e8000, 0x382ec000, 0x382f0000, 0x382f4000, + 0x382f8000, 0x382fc000, 0x38300000, 0x38304000, 0x38308000, 0x3830c000, + 0x38310000, 0x38314000, 0x38318000, 0x3831c000, 0x38320000, 0x38324000, + 0x38328000, 0x3832c000, 0x38330000, 0x38334000, 0x38338000, 0x3833c000, + 0x38340000, 0x38344000, 0x38348000, 0x3834c000, 0x38350000, 0x38354000, + 0x38358000, 0x3835c000, 0x38360000, 0x38364000, 0x38368000, 0x3836c000, + 0x38370000, 0x38374000, 0x38378000, 0x3837c000, 0x38380000, 0x38384000, + 0x38388000, 0x3838c000, 0x38390000, 0x38394000, 0x38398000, 0x3839c000, + 0x383a0000, 0x383a4000, 0x383a8000, 0x383ac000, 0x383b0000, 0x383b4000, + 0x383b8000, 0x383bc000, 0x383c0000, 0x383c4000, 0x383c8000, 0x383cc000, + 0x383d0000, 0x383d4000, 0x383d8000, 0x383dc000, 0x383e0000, 0x383e4000, + 0x383e8000, 0x383ec000, 0x383f0000, 0x383f4000, 0x383f8000, 0x383fc000, + 0x38400000, 0x38404000, 0x38408000, 0x3840c000, 0x38410000, 0x38414000, + 0x38418000, 0x3841c000, 0x38420000, 0x38424000, 0x38428000, 0x3842c000, + 0x38430000, 0x38434000, 0x38438000, 0x3843c000, 0x38440000, 0x38444000, + 0x38448000, 0x3844c000, 0x38450000, 0x38454000, 0x38458000, 0x3845c000, + 0x38460000, 0x38464000, 0x38468000, 0x3846c000, 0x38470000, 0x38474000, + 0x38478000, 
0x3847c000, 0x38480000, 0x38484000, 0x38488000, 0x3848c000, + 0x38490000, 0x38494000, 0x38498000, 0x3849c000, 0x384a0000, 0x384a4000, + 0x384a8000, 0x384ac000, 0x384b0000, 0x384b4000, 0x384b8000, 0x384bc000, + 0x384c0000, 0x384c4000, 0x384c8000, 0x384cc000, 0x384d0000, 0x384d4000, + 0x384d8000, 0x384dc000, 0x384e0000, 0x384e4000, 0x384e8000, 0x384ec000, + 0x384f0000, 0x384f4000, 0x384f8000, 0x384fc000, 0x38500000, 0x38504000, + 0x38508000, 0x3850c000, 0x38510000, 0x38514000, 0x38518000, 0x3851c000, + 0x38520000, 0x38524000, 0x38528000, 0x3852c000, 0x38530000, 0x38534000, + 0x38538000, 0x3853c000, 0x38540000, 0x38544000, 0x38548000, 0x3854c000, + 0x38550000, 0x38554000, 0x38558000, 0x3855c000, 0x38560000, 0x38564000, + 0x38568000, 0x3856c000, 0x38570000, 0x38574000, 0x38578000, 0x3857c000, + 0x38580000, 0x38584000, 0x38588000, 0x3858c000, 0x38590000, 0x38594000, + 0x38598000, 0x3859c000, 0x385a0000, 0x385a4000, 0x385a8000, 0x385ac000, + 0x385b0000, 0x385b4000, 0x385b8000, 0x385bc000, 0x385c0000, 0x385c4000, + 0x385c8000, 0x385cc000, 0x385d0000, 0x385d4000, 0x385d8000, 0x385dc000, + 0x385e0000, 0x385e4000, 0x385e8000, 0x385ec000, 0x385f0000, 0x385f4000, + 0x385f8000, 0x385fc000, 0x38600000, 0x38604000, 0x38608000, 0x3860c000, + 0x38610000, 0x38614000, 0x38618000, 0x3861c000, 0x38620000, 0x38624000, + 0x38628000, 0x3862c000, 0x38630000, 0x38634000, 0x38638000, 0x3863c000, + 0x38640000, 0x38644000, 0x38648000, 0x3864c000, 0x38650000, 0x38654000, + 0x38658000, 0x3865c000, 0x38660000, 0x38664000, 0x38668000, 0x3866c000, + 0x38670000, 0x38674000, 0x38678000, 0x3867c000, 0x38680000, 0x38684000, + 0x38688000, 0x3868c000, 0x38690000, 0x38694000, 0x38698000, 0x3869c000, + 0x386a0000, 0x386a4000, 0x386a8000, 0x386ac000, 0x386b0000, 0x386b4000, + 0x386b8000, 0x386bc000, 0x386c0000, 0x386c4000, 0x386c8000, 0x386cc000, + 0x386d0000, 0x386d4000, 0x386d8000, 0x386dc000, 0x386e0000, 0x386e4000, + 0x386e8000, 0x386ec000, 0x386f0000, 0x386f4000, 0x386f8000, 0x386fc000, + 0x38700000, 0x38704000, 0x38708000, 0x3870c000, 0x38710000, 0x38714000, + 0x38718000, 0x3871c000, 0x38720000, 0x38724000, 0x38728000, 0x3872c000, + 0x38730000, 0x38734000, 0x38738000, 0x3873c000, 0x38740000, 0x38744000, + 0x38748000, 0x3874c000, 0x38750000, 0x38754000, 0x38758000, 0x3875c000, + 0x38760000, 0x38764000, 0x38768000, 0x3876c000, 0x38770000, 0x38774000, + 0x38778000, 0x3877c000, 0x38780000, 0x38784000, 0x38788000, 0x3878c000, + 0x38790000, 0x38794000, 0x38798000, 0x3879c000, 0x387a0000, 0x387a4000, + 0x387a8000, 0x387ac000, 0x387b0000, 0x387b4000, 0x387b8000, 0x387bc000, + 0x387c0000, 0x387c4000, 0x387c8000, 0x387cc000, 0x387d0000, 0x387d4000, + 0x387d8000, 0x387dc000, 0x387e0000, 0x387e4000, 0x387e8000, 0x387ec000, + 0x387f0000, 0x387f4000, 0x387f8000, 0x387fc000, 0x38000000, 0x38002000, + 0x38004000, 0x38006000, 0x38008000, 0x3800a000, 0x3800c000, 0x3800e000, + 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801a000, + 0x3801c000, 0x3801e000, 0x38020000, 0x38022000, 0x38024000, 0x38026000, + 0x38028000, 0x3802a000, 0x3802c000, 0x3802e000, 0x38030000, 0x38032000, + 0x38034000, 0x38036000, 0x38038000, 0x3803a000, 0x3803c000, 0x3803e000, + 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804a000, + 0x3804c000, 0x3804e000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, + 0x38058000, 0x3805a000, 0x3805c000, 0x3805e000, 0x38060000, 0x38062000, + 0x38064000, 0x38066000, 0x38068000, 0x3806a000, 0x3806c000, 0x3806e000, + 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807a000, + 0x3807c000, 
0x3807e000, 0x38080000, 0x38082000, 0x38084000, 0x38086000, + 0x38088000, 0x3808a000, 0x3808c000, 0x3808e000, 0x38090000, 0x38092000, + 0x38094000, 0x38096000, 0x38098000, 0x3809a000, 0x3809c000, 0x3809e000, + 0x380a0000, 0x380a2000, 0x380a4000, 0x380a6000, 0x380a8000, 0x380aa000, + 0x380ac000, 0x380ae000, 0x380b0000, 0x380b2000, 0x380b4000, 0x380b6000, + 0x380b8000, 0x380ba000, 0x380bc000, 0x380be000, 0x380c0000, 0x380c2000, + 0x380c4000, 0x380c6000, 0x380c8000, 0x380ca000, 0x380cc000, 0x380ce000, + 0x380d0000, 0x380d2000, 0x380d4000, 0x380d6000, 0x380d8000, 0x380da000, + 0x380dc000, 0x380de000, 0x380e0000, 0x380e2000, 0x380e4000, 0x380e6000, + 0x380e8000, 0x380ea000, 0x380ec000, 0x380ee000, 0x380f0000, 0x380f2000, + 0x380f4000, 0x380f6000, 0x380f8000, 0x380fa000, 0x380fc000, 0x380fe000, + 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810a000, + 0x3810c000, 0x3810e000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, + 0x38118000, 0x3811a000, 0x3811c000, 0x3811e000, 0x38120000, 0x38122000, + 0x38124000, 0x38126000, 0x38128000, 0x3812a000, 0x3812c000, 0x3812e000, + 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813a000, + 0x3813c000, 0x3813e000, 0x38140000, 0x38142000, 0x38144000, 0x38146000, + 0x38148000, 0x3814a000, 0x3814c000, 0x3814e000, 0x38150000, 0x38152000, + 0x38154000, 0x38156000, 0x38158000, 0x3815a000, 0x3815c000, 0x3815e000, + 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816a000, + 0x3816c000, 0x3816e000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, + 0x38178000, 0x3817a000, 0x3817c000, 0x3817e000, 0x38180000, 0x38182000, + 0x38184000, 0x38186000, 0x38188000, 0x3818a000, 0x3818c000, 0x3818e000, + 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819a000, + 0x3819c000, 0x3819e000, 0x381a0000, 0x381a2000, 0x381a4000, 0x381a6000, + 0x381a8000, 0x381aa000, 0x381ac000, 0x381ae000, 0x381b0000, 0x381b2000, + 0x381b4000, 0x381b6000, 0x381b8000, 0x381ba000, 0x381bc000, 0x381be000, + 0x381c0000, 0x381c2000, 0x381c4000, 0x381c6000, 0x381c8000, 0x381ca000, + 0x381cc000, 0x381ce000, 0x381d0000, 0x381d2000, 0x381d4000, 0x381d6000, + 0x381d8000, 0x381da000, 0x381dc000, 0x381de000, 0x381e0000, 0x381e2000, + 0x381e4000, 0x381e6000, 0x381e8000, 0x381ea000, 0x381ec000, 0x381ee000, + 0x381f0000, 0x381f2000, 0x381f4000, 0x381f6000, 0x381f8000, 0x381fa000, + 0x381fc000, 0x381fe000, 0x38200000, 0x38202000, 0x38204000, 0x38206000, + 0x38208000, 0x3820a000, 0x3820c000, 0x3820e000, 0x38210000, 0x38212000, + 0x38214000, 0x38216000, 0x38218000, 0x3821a000, 0x3821c000, 0x3821e000, + 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822a000, + 0x3822c000, 0x3822e000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, + 0x38238000, 0x3823a000, 0x3823c000, 0x3823e000, 0x38240000, 0x38242000, + 0x38244000, 0x38246000, 0x38248000, 0x3824a000, 0x3824c000, 0x3824e000, + 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825a000, + 0x3825c000, 0x3825e000, 0x38260000, 0x38262000, 0x38264000, 0x38266000, + 0x38268000, 0x3826a000, 0x3826c000, 0x3826e000, 0x38270000, 0x38272000, + 0x38274000, 0x38276000, 0x38278000, 0x3827a000, 0x3827c000, 0x3827e000, + 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828a000, + 0x3828c000, 0x3828e000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, + 0x38298000, 0x3829a000, 0x3829c000, 0x3829e000, 0x382a0000, 0x382a2000, + 0x382a4000, 0x382a6000, 0x382a8000, 0x382aa000, 0x382ac000, 0x382ae000, + 0x382b0000, 0x382b2000, 0x382b4000, 0x382b6000, 0x382b8000, 0x382ba000, + 0x382bc000, 
0x382be000, 0x382c0000, 0x382c2000, 0x382c4000, 0x382c6000, + 0x382c8000, 0x382ca000, 0x382cc000, 0x382ce000, 0x382d0000, 0x382d2000, + 0x382d4000, 0x382d6000, 0x382d8000, 0x382da000, 0x382dc000, 0x382de000, + 0x382e0000, 0x382e2000, 0x382e4000, 0x382e6000, 0x382e8000, 0x382ea000, + 0x382ec000, 0x382ee000, 0x382f0000, 0x382f2000, 0x382f4000, 0x382f6000, + 0x382f8000, 0x382fa000, 0x382fc000, 0x382fe000, 0x38300000, 0x38302000, + 0x38304000, 0x38306000, 0x38308000, 0x3830a000, 0x3830c000, 0x3830e000, + 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831a000, + 0x3831c000, 0x3831e000, 0x38320000, 0x38322000, 0x38324000, 0x38326000, + 0x38328000, 0x3832a000, 0x3832c000, 0x3832e000, 0x38330000, 0x38332000, + 0x38334000, 0x38336000, 0x38338000, 0x3833a000, 0x3833c000, 0x3833e000, + 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834a000, + 0x3834c000, 0x3834e000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, + 0x38358000, 0x3835a000, 0x3835c000, 0x3835e000, 0x38360000, 0x38362000, + 0x38364000, 0x38366000, 0x38368000, 0x3836a000, 0x3836c000, 0x3836e000, + 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837a000, + 0x3837c000, 0x3837e000, 0x38380000, 0x38382000, 0x38384000, 0x38386000, + 0x38388000, 0x3838a000, 0x3838c000, 0x3838e000, 0x38390000, 0x38392000, + 0x38394000, 0x38396000, 0x38398000, 0x3839a000, 0x3839c000, 0x3839e000, + 0x383a0000, 0x383a2000, 0x383a4000, 0x383a6000, 0x383a8000, 0x383aa000, + 0x383ac000, 0x383ae000, 0x383b0000, 0x383b2000, 0x383b4000, 0x383b6000, + 0x383b8000, 0x383ba000, 0x383bc000, 0x383be000, 0x383c0000, 0x383c2000, + 0x383c4000, 0x383c6000, 0x383c8000, 0x383ca000, 0x383cc000, 0x383ce000, + 0x383d0000, 0x383d2000, 0x383d4000, 0x383d6000, 0x383d8000, 0x383da000, + 0x383dc000, 0x383de000, 0x383e0000, 0x383e2000, 0x383e4000, 0x383e6000, + 0x383e8000, 0x383ea000, 0x383ec000, 0x383ee000, 0x383f0000, 0x383f2000, + 0x383f4000, 0x383f6000, 0x383f8000, 0x383fa000, 0x383fc000, 0x383fe000, + 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840a000, + 0x3840c000, 0x3840e000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, + 0x38418000, 0x3841a000, 0x3841c000, 0x3841e000, 0x38420000, 0x38422000, + 0x38424000, 0x38426000, 0x38428000, 0x3842a000, 0x3842c000, 0x3842e000, + 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843a000, + 0x3843c000, 0x3843e000, 0x38440000, 0x38442000, 0x38444000, 0x38446000, + 0x38448000, 0x3844a000, 0x3844c000, 0x3844e000, 0x38450000, 0x38452000, + 0x38454000, 0x38456000, 0x38458000, 0x3845a000, 0x3845c000, 0x3845e000, + 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846a000, + 0x3846c000, 0x3846e000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, + 0x38478000, 0x3847a000, 0x3847c000, 0x3847e000, 0x38480000, 0x38482000, + 0x38484000, 0x38486000, 0x38488000, 0x3848a000, 0x3848c000, 0x3848e000, + 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849a000, + 0x3849c000, 0x3849e000, 0x384a0000, 0x384a2000, 0x384a4000, 0x384a6000, + 0x384a8000, 0x384aa000, 0x384ac000, 0x384ae000, 0x384b0000, 0x384b2000, + 0x384b4000, 0x384b6000, 0x384b8000, 0x384ba000, 0x384bc000, 0x384be000, + 0x384c0000, 0x384c2000, 0x384c4000, 0x384c6000, 0x384c8000, 0x384ca000, + 0x384cc000, 0x384ce000, 0x384d0000, 0x384d2000, 0x384d4000, 0x384d6000, + 0x384d8000, 0x384da000, 0x384dc000, 0x384de000, 0x384e0000, 0x384e2000, + 0x384e4000, 0x384e6000, 0x384e8000, 0x384ea000, 0x384ec000, 0x384ee000, + 0x384f0000, 0x384f2000, 0x384f4000, 0x384f6000, 0x384f8000, 0x384fa000, + 0x384fc000, 
0x384fe000, 0x38500000, 0x38502000, 0x38504000, 0x38506000, + 0x38508000, 0x3850a000, 0x3850c000, 0x3850e000, 0x38510000, 0x38512000, + 0x38514000, 0x38516000, 0x38518000, 0x3851a000, 0x3851c000, 0x3851e000, + 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852a000, + 0x3852c000, 0x3852e000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, + 0x38538000, 0x3853a000, 0x3853c000, 0x3853e000, 0x38540000, 0x38542000, + 0x38544000, 0x38546000, 0x38548000, 0x3854a000, 0x3854c000, 0x3854e000, + 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855a000, + 0x3855c000, 0x3855e000, 0x38560000, 0x38562000, 0x38564000, 0x38566000, + 0x38568000, 0x3856a000, 0x3856c000, 0x3856e000, 0x38570000, 0x38572000, + 0x38574000, 0x38576000, 0x38578000, 0x3857a000, 0x3857c000, 0x3857e000, + 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858a000, + 0x3858c000, 0x3858e000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, + 0x38598000, 0x3859a000, 0x3859c000, 0x3859e000, 0x385a0000, 0x385a2000, + 0x385a4000, 0x385a6000, 0x385a8000, 0x385aa000, 0x385ac000, 0x385ae000, + 0x385b0000, 0x385b2000, 0x385b4000, 0x385b6000, 0x385b8000, 0x385ba000, + 0x385bc000, 0x385be000, 0x385c0000, 0x385c2000, 0x385c4000, 0x385c6000, + 0x385c8000, 0x385ca000, 0x385cc000, 0x385ce000, 0x385d0000, 0x385d2000, + 0x385d4000, 0x385d6000, 0x385d8000, 0x385da000, 0x385dc000, 0x385de000, + 0x385e0000, 0x385e2000, 0x385e4000, 0x385e6000, 0x385e8000, 0x385ea000, + 0x385ec000, 0x385ee000, 0x385f0000, 0x385f2000, 0x385f4000, 0x385f6000, + 0x385f8000, 0x385fa000, 0x385fc000, 0x385fe000, 0x38600000, 0x38602000, + 0x38604000, 0x38606000, 0x38608000, 0x3860a000, 0x3860c000, 0x3860e000, + 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861a000, + 0x3861c000, 0x3861e000, 0x38620000, 0x38622000, 0x38624000, 0x38626000, + 0x38628000, 0x3862a000, 0x3862c000, 0x3862e000, 0x38630000, 0x38632000, + 0x38634000, 0x38636000, 0x38638000, 0x3863a000, 0x3863c000, 0x3863e000, + 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864a000, + 0x3864c000, 0x3864e000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, + 0x38658000, 0x3865a000, 0x3865c000, 0x3865e000, 0x38660000, 0x38662000, + 0x38664000, 0x38666000, 0x38668000, 0x3866a000, 0x3866c000, 0x3866e000, + 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867a000, + 0x3867c000, 0x3867e000, 0x38680000, 0x38682000, 0x38684000, 0x38686000, + 0x38688000, 0x3868a000, 0x3868c000, 0x3868e000, 0x38690000, 0x38692000, + 0x38694000, 0x38696000, 0x38698000, 0x3869a000, 0x3869c000, 0x3869e000, + 0x386a0000, 0x386a2000, 0x386a4000, 0x386a6000, 0x386a8000, 0x386aa000, + 0x386ac000, 0x386ae000, 0x386b0000, 0x386b2000, 0x386b4000, 0x386b6000, + 0x386b8000, 0x386ba000, 0x386bc000, 0x386be000, 0x386c0000, 0x386c2000, + 0x386c4000, 0x386c6000, 0x386c8000, 0x386ca000, 0x386cc000, 0x386ce000, + 0x386d0000, 0x386d2000, 0x386d4000, 0x386d6000, 0x386d8000, 0x386da000, + 0x386dc000, 0x386de000, 0x386e0000, 0x386e2000, 0x386e4000, 0x386e6000, + 0x386e8000, 0x386ea000, 0x386ec000, 0x386ee000, 0x386f0000, 0x386f2000, + 0x386f4000, 0x386f6000, 0x386f8000, 0x386fa000, 0x386fc000, 0x386fe000, + 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870a000, + 0x3870c000, 0x3870e000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, + 0x38718000, 0x3871a000, 0x3871c000, 0x3871e000, 0x38720000, 0x38722000, + 0x38724000, 0x38726000, 0x38728000, 0x3872a000, 0x3872c000, 0x3872e000, + 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873a000, + 0x3873c000, 
0x3873e000, 0x38740000, 0x38742000, 0x38744000, 0x38746000, + 0x38748000, 0x3874a000, 0x3874c000, 0x3874e000, 0x38750000, 0x38752000, + 0x38754000, 0x38756000, 0x38758000, 0x3875a000, 0x3875c000, 0x3875e000, + 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876a000, + 0x3876c000, 0x3876e000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, + 0x38778000, 0x3877a000, 0x3877c000, 0x3877e000, 0x38780000, 0x38782000, + 0x38784000, 0x38786000, 0x38788000, 0x3878a000, 0x3878c000, 0x3878e000, + 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879a000, + 0x3879c000, 0x3879e000, 0x387a0000, 0x387a2000, 0x387a4000, 0x387a6000, + 0x387a8000, 0x387aa000, 0x387ac000, 0x387ae000, 0x387b0000, 0x387b2000, + 0x387b4000, 0x387b6000, 0x387b8000, 0x387ba000, 0x387bc000, 0x387be000, + 0x387c0000, 0x387c2000, 0x387c4000, 0x387c6000, 0x387c8000, 0x387ca000, + 0x387cc000, 0x387ce000, 0x387d0000, 0x387d2000, 0x387d4000, 0x387d6000, + 0x387d8000, 0x387da000, 0x387dc000, 0x387de000, 0x387e0000, 0x387e2000, + 0x387e4000, 0x387e6000, 0x387e8000, 0x387ea000, 0x387ec000, 0x387ee000, + 0x387f0000, 0x387f2000, 0x387f4000, 0x387f6000, 0x387f8000, 0x387fa000, + 0x387fc000, 0x387fe000}; + +static const uint16_t offsettable[64] = { + 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400}; + +static const uint32_t exponenttable[64] = { + 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, + 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, + 0x06000000, 0x06800000, 0x07000000, 0x07800000, 0x08000000, 0x08800000, + 0x09000000, 0x09800000, 0x0a000000, 0x0a800000, 0x0b000000, 0x0b800000, + 0x0c000000, 0x0c800000, 0x0d000000, 0x0d800000, 0x0e000000, 0x0e800000, + 0x0f000000, 0x47800000, 0x80000000, 0x80800000, 0x81000000, 0x81800000, + 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, + 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, + 0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8a000000, 0x8a800000, + 0x8b000000, 0x8b800000, 0x8c000000, 0x8c800000, 0x8d000000, 0x8d800000, + 0x8e000000, 0x8e800000, 0x8f000000, 0xc7800000}; + +static const uint16_t basetable[512] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, + 0x0020, 
0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0c00, 0x1000, + 0x1400, 0x1800, 0x1c00, 0x2000, 0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, + 0x3800, 0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00, 0x5000, 0x5400, 0x5800, + 0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00, 0x7000, 0x7400, 0x7800, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, + 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, + 0x8400, 0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00, 0xa000, 0xa400, + 0xa800, 0xac00, 0xb000, 0xb400, 0xb800, 0xbc00, 0xc000, 0xc400, 0xc800, + 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00, + 0xf000, 0xf400, 0xf800, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00}; + +static const uint8_t shifttable[512] = { + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 
0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, + 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, + 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d};
+
+half_t Float2Half(float f) {
+  uint32_t v = *reinterpret_cast<uint32_t*>(&f);
+  return basetable[(v >> 23) & 0x1ff] +
+         ((v & 0x007fffff) >> shifttable[(v >> 23) & 0x1ff]);
+}
+
+float Half2Float(half_t h) {
+  uint32_t v = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] +
+               exponenttable[h >> 10];
+  return *reinterpret_cast<float*>(&v);
+}
+
+void FloatArray2HalfArray(float *f_array, half_t *h_array, int count) {
+  for (int i = 0; i < count; ++i) {
+    h_array[i] = Float2Half(f_array[i]);
+  }
+}
+
+void HalfArray2FloatArray(half_t *h_array, float *f_array, int count) {
+  for (int i = 0; i < count; ++i) {
+    f_array[i] = Half2Float(h_array[i]);
+  }
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/opencl/cl_half.h b/paddle/fluid/lite/opencl/cl_half.h
new file mode 100644
index 0000000000000000000000000000000000000000..0dcf325db2bc13b8fff68f1e777d4680d937abce
--- /dev/null
+++ b/paddle/fluid/lite/opencl/cl_half.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cstdint>
+
+namespace paddle {
+namespace lite {
+
+typedef uint16_t half_t;
+
+half_t Float2Half(float f);
+
+float Half2Float(half_t h);
+
+void FloatArray2HalfArray(float *f_array, half_t *h_array, int count);
+
+void HalfArray2FloatArray(half_t *h_array, float *f_array, int count);
+
+}  // namespace lite
+}  // namespace paddle
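The tables implement the fast half-float method from the paper cited at the top of `cl_half.cc`: each direction is two table lookups and a shift, with no branching. A quick sanity sketch using standard IEEE 754 binary16 encodings (illustrative only; no such test ships in this patch):

    #include <cassert>
    #include "paddle/fluid/lite/opencl/cl_half.h"

    void HalfRoundTripDemo() {
      using paddle::lite::Float2Half;
      using paddle::lite::Half2Float;
      assert(Float2Half(1.0f) == 0x3c00);   // binary16 encoding of 1.0
      assert(Float2Half(-2.0f) == 0xc000);  // sign bit + biased exponent 16
      // 0.5f is exactly representable in half, so the round trip is lossless:
      assert(Half2Float(Float2Half(0.5f)) == 0.5f);
    }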
diff --git a/paddle/fluid/lite/opencl/cl_helper.cc b/paddle/fluid/lite/opencl/cl_helper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..116828c153da8a3e94d1d6020137d4ff3cee95ef
--- /dev/null
+++ b/paddle/fluid/lite/opencl/cl_helper.cc
@@ -0,0 +1,90 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/lite/opencl/cl_helper.h"
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace paddle {
+namespace lite {
+
+void CLHelper::set_context(CLContext *context) { context_ = context; }
+
+void CLHelper::AddKernel(const std::string &kernel_name,
+                         const std::string &file_name,
+                         const std::string &options) {
+  CHECK(context_ != nullptr) << "Please use set_context first!";
+  VLOG(3) << " --- begin to add kernel ---";
+  auto kernel = context_->GetKernel(kernel_name, file_name, options);
+  kernels.emplace_back(std::move(kernel));
+  VLOG(3) << " --- end to add kernel --- ";
+}
+
+cl::Kernel &CLHelper::KernelAt(const int index) {
+  VLOG(3) << " --- kernel count: " << kernels.size() << " --- ";
+  CHECK(static_cast<size_t>(index) < kernels.size())
+      << "The index must be less than the size of kernels.";
+  CHECK(kernels[index] != nullptr)
+      << "The target kernel pointer cannot be null.";
+  return *(kernels[index]);
+}
+
+cl::CommandQueue &CLHelper::OpenCLCommandQueue() {
+  CHECK(context_ != nullptr) << "Please use set_context first!";
+  return context_->GetCommandQueue();
+}
+
+cl::Context &CLHelper::OpenCLContext() {
+  CHECK(context_ != nullptr) << "Please use set_context first!";
+  return context_->GetContext();
+}
+
+cl::NDRange CLHelper::DefaultWorkSize(const CLImage &image) {
+  // n c h w
+  auto image_dim = image.tensor_dims();
+  if (image_dim.size() == 4) {
+    auto n = image_dim[0];
+    auto h = image_dim[2];
+    auto w = image_dim[3];
+    auto image_width = image.ImageWidth();
+    auto work_size_0 = image_width / w;
+    auto work_size_1 = w;
+    auto work_size_2 = n * h;
+    return cl::NDRange{static_cast<size_t>(work_size_0),
+                       static_cast<size_t>(work_size_1),
+                       static_cast<size_t>(work_size_2)};
+  } else if (image_dim.size() == 2) {
+    return cl::NDRange{static_cast<size_t>(1),
+                       static_cast<size_t>(image.ImageWidth()),
+                       static_cast<size_t>(image.ImageHeight())};
+  } else if (image_dim.size() == 1) {
+    return cl::NDRange{static_cast<size_t>(1),
+                       static_cast<size_t>(image.ImageWidth()),
+                       static_cast<size_t>(1)};
+  } else if (image_dim.size() == 3) {
+    auto c = image_dim[0];
+    auto h = image_dim[1];
+    auto w = image_dim[2];
+    return cl::NDRange{static_cast<size_t>((c + 3) / 4),
+                       static_cast<size_t>(w), static_cast<size_t>(h)};
+  } else {
+    LOG(FATAL) << "Unsupported dimension, needs to be implemented!";
+    return cl::NDRange{};
+  }
+}
+
+}  // namespace lite
+}  // namespace paddle
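For the 4-D case the global size is derived from the packed image, not the raw tensor: one work-item per channel-block column, per x position, per `N * H` row. A worked sketch, assuming the normal converter's layout where the image width equals `W * ceil(C/4)` (`helper` and `image` are hypothetical, pre-built objects):

    // tensor dims {N=1, C=32, H=8, W=8}  ->  image width = 8 * (32 / 4) = 64
    //   work_size_0 = image_width / W = 64 / 8 = 8   (channel blocks)
    //   work_size_1 = W = 8
    //   work_size_2 = N * H = 1 * 8 = 8
    auto global_work_size = helper.DefaultWorkSize(image);  // {8, 8, 8}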
diff --git a/paddle/fluid/lite/opencl/cl_helper.h b/paddle/fluid/lite/opencl/cl_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..f6f89fb6fdac15f05cc61f61ead6ba8fadee4b74
--- /dev/null
+++ b/paddle/fluid/lite/opencl/cl_helper.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/lite/opencl/cl2_header.h"
+#include "paddle/fluid/lite/opencl/cl_context.h"
+#include "paddle/fluid/lite/opencl/cl_image.h"
+
+namespace paddle {
+namespace lite {
+
+class CLHelper {
+ public:
+  CLHelper() = default;
+
+  explicit CLHelper(CLContext *context) : context_(context) {}
+
+  void set_context(CLContext *context);
+
+  void AddKernel(const std::string &kernel_name, const std::string &file_name,
+                 const std::string &options = "");
+
+  cl::Kernel &KernelAt(const int index);
+
+  cl::CommandQueue &OpenCLCommandQueue();
+
+  cl::Context &OpenCLContext();
+
+  cl::NDRange DefaultWorkSize(const CLImage &image);
+
+ private:
+  CLContext *context_{nullptr};
+  std::vector<std::unique_ptr<cl::Kernel>> kernels;
+};
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/opencl/cl_image.cc b/paddle/fluid/lite/opencl/cl_image.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2c551d6cd3342b23c34427fe3532374617619b3d
--- /dev/null
+++ b/paddle/fluid/lite/opencl/cl_image.cc
@@ -0,0 +1,164 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/lite/opencl/cl_image.h"
+#include <array>
+#include <cstring>
+#include "paddle/fluid/lite/opencl/cl_engine.h"
+#include "paddle/fluid/lite/opencl/cl_half.h"
+#include "paddle/fluid/lite/opencl/cl_tool.h"
+
+namespace paddle {
+namespace lite {
+
+std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) {
+  int width = cl_image.image_dims_[0];
+  int height = cl_image.image_dims_[1];
+
+  half_t* image_data = new half_t[height * width * 4];
+  cl::Image* image = cl_image.cl_image();
+  const std::array<size_t, 3> origin{0, 0, 0};
+  const std::array<size_t, 3> region{static_cast<size_t>(width),
+                                     static_cast<size_t>(height), 1};
+  cl_int err = CLEngine::Global()->command_queue().enqueueReadImage(
+      *image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr);
+  CL_CHECK_ERRORS(err);
+
+  float* tensor_data = new float[cl_image.numel()];
+  auto* converter = cl_image.image_converter();
+  converter->ImageToNCHW(image_data, tensor_data, cl_image.image_dims_,
+                         cl_image.tensor_dims_);
+  int stride = cl_image.numel() / 20;
+  stride = stride > 0 ? stride : 1;
+
+  os << " dims: " << cl_image.tensor_dims_ << "\n";
+  for (int i = 0; i < cl_image.numel(); i += stride) {
+    os << tensor_data[i] << " ";
+  }
+
+  delete[] tensor_data;
+  delete[] image_data;
+
+  return os;
+}
+
+void CLImage::set_tensor_data(float* tensor_data, const DDim& dim) {
+#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
+  auto numel = dim.product();
+#else
+  auto numel = dim.production();
+#endif
+  tensor_data_.reset(new float[numel]);
+  memcpy(tensor_data_.get(), tensor_data, numel * sizeof(float));
+  tensor_dims_ = dim;
+}
+
+void CLImage::InitCLImage(const cl::Context& context) {
+  CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";
+  image_converter_.reset(new CLImageConverterFolder);
+  InitCLImage(context, image_converter_.get());
+}
+
+void CLImage::InitNormalCLImage(const cl::Context& context) {
+  CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";
+  image_converter_.reset(new CLImageConverterNormal);
+  InitCLImage(context, image_converter_.get());
+}
+
+void CLImage::InitNImage(const cl::Context& context) {
+  CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";
+  CHECK(tensor_dims_.size() == 4) << " Tensor dim is not 4.";
+  image_converter_.reset(new CLImageConverterNWBlock);
+  InitCLImage(context, image_converter_.get());
+}
+
+void CLImage::InitDWImage(const cl::Context& context) {
+  CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";
+  CHECK(tensor_dims_.size() == 4) << " Tensor dim is not 4.";
+  image_converter_.reset(new CLImageConverterDWBlock);
+  InitCLImage(context, image_converter_.get());
+}
+
+void CLImage::InitEmptyImage(const cl::Context& context, const DDim& dim) {
+  CHECK(tensor_data_ == nullptr)
+      << " Empty image tensor data shouldn't have value";
+
+  tensor_dims_ = dim;
+  image_converter_.reset(new CLImageConverterNormal);
+
+  VLOG(3) << " to get image dims ";
+  image_dims_ = image_converter_->InitImageDimInfoWith(tensor_dims_);
+  VLOG(3) << " end get image dims " << image_dims_;
+
+  InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
+
+  cl_event_ = CLEngine::Global()->CreateEvent(context);
+  initialized_ = true;
+  VLOG(3) << " end init cl image ";
+}
+
+void CLImage::InitEmptyWithImageDim(const cl::Context& context,
+                                    const DDim& image_dims) {
+  VLOG(3) << " to get image dims ";
+  image_dims_ = image_dims;
+  VLOG(3) << " end get image dims " << image_dims_;
+
+  InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
+
+  cl_event_ = CLEngine::Global()->CreateEvent(context);
+  initialized_ = true;
+  VLOG(3) << " end init cl image";
+}
+
+void CLImage::InitCLImage(const cl::Context& context,
+                          CLImageConverterBase* converter) {
+  CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";
+
+  VLOG(3) << " begin init cl image ";
+  image_dims_ = converter->InitImageDimInfoWith(tensor_dims_);
+
+#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
+  half_t* image_data = new half_t[image_dims_.product() * 4];
+#else
+  half_t* image_data = new half_t[image_dims_.production() * 4];
+#endif
+
+  VLOG(3) << " convert to image ";
+  converter->NCHWToImage(tensor_data_.get(), image_data, tensor_dims_);
+  VLOG(3) << " end convert to image ";
+
+  InitCLImage(context, image_dims_[0], image_dims_[1], image_data);
+
+  delete[] image_data;
+  tensor_data_ = nullptr;
+  cl_event_ = CLEngine::Global()->CreateEvent(context);
+  initialized_ = true;
+  VLOG(3) << " end init cl image ";
+}
+void CLImage::InitCLImage(const cl::Context& context, int width, int height,
+                          void* data) {
+  cl::ImageFormat img_format(CL_RGBA, CL_HALF_FLOAT);
+  cl_int err;
+  cl_image_.reset(new cl::Image2D(
+      context, CL_MEM_READ_WRITE | (data ? CL_MEM_COPY_HOST_PTR : 0),
+      img_format, width, height, 0, data, &err));
+  CL_CHECK_ERRORS(err);
+  CHECK(err == CL_SUCCESS) << " Create image 2d error.";
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/opencl/cl_image.h b/paddle/fluid/lite/opencl/cl_image.h
new file mode 100644
index 0000000000000000000000000000000000000000..627e503168e4ed12ea30137f2c3155bd07f9e062
--- /dev/null
+++ b/paddle/fluid/lite/opencl/cl_image.h
@@ -0,0 +1,118 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <iostream>
+#include <memory>
+#include <vector>
+#include "paddle/fluid/lite/core/compatible_tensor.h"
+#include "paddle/fluid/lite/opencl/cl2_header.h"
+#include "paddle/fluid/lite/opencl/cl_image_converter.h"
+
+namespace paddle {
+namespace lite {
+
+class CLImage {
+  // For debug
+  friend std::ostream& operator<<(std::ostream& os, const CLImage& image);
+
+ public:
+  CLImage() = default;
+  /*
+   * Does not take ownership of tensor_data; the contents are memcpy'd.
+   * */
+  void set_tensor_data(float* tensor_data, const DDim& dim);
+
+  bool IsInit() { return initialized_; }
+  /*
+   * Need to call set_tensor_data first.
+   * Uses the folder layout when the tensor has one or two dims.
+   * */
+  void InitCLImage(const cl::Context& context);
+
+  void InitNormalCLImage(const cl::Context& context);
+
+  void InitNImage(const cl::Context& context);
+
+  void InitDWImage(const cl::Context& context);
+
+  void InitEmptyImage(const cl::Context& context, const DDim& dim);
+
+  void InitEmptyWithImageDim(const cl::Context& context,
+                             const DDim& image_dims);
+
+  cl::Image* cl_image() const { return cl_image_.get(); }
+
+  const DDim& image_dims() const { return image_dims_; }
+
+  inline size_t ImageWidth() const { return image_dims_[0]; }
+
+  inline size_t ImageHeight() const { return image_dims_[1]; }
+
+  const DDim& tensor_dims() const { return tensor_dims_; }
+
+  /*
+   * Resize original tensor dim.
+   * */
+  inline CLImage& Resize(const DDim& dims) {
+    tensor_dims_ = dims;
+    return *this;
+  }
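+
+  /*
+   * Lifetime note: after any of the Init*Image calls above, the host copy
+   * held in tensor_data_ is released and only the cl::Image2D remains, so
+   * data<T>() below CHECKs that the image has not been initialized yet.
+   * */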
+  template <typename T>
+  T* data() const {
+    CHECK(!initialized_) << "CL image has been initialized; the tensor data "
+                            "was deleted and can no longer be used!";
+    return reinterpret_cast<T*>(tensor_data_.get());
+  }
+
+  /*
+   * Numel of tensor dim
+   * */
+  inline int64_t numel() const {
+#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
+    return tensor_dims_.product();
+#else
+    return tensor_dims_.production();
+#endif
+  }
+
+  /*
+   * Original tensor dim
+   * */
+
+  cl::UserEvent& cl_event() const { return *cl_event_; }
+
+  CLImageConverterBase* image_converter() const {
+    return image_converter_.get();
+  }
+
+ private:
+  void InitCLImage(const cl::Context& context, CLImageConverterBase* converter);
+
+  void InitCLImage(const cl::Context& context, int width, int height,
+                   void* data);
+
+  bool initialized_ = false;
+  std::unique_ptr<cl::Image2D> cl_image_{nullptr};
+  std::unique_ptr<cl::UserEvent> cl_event_{nullptr};
+  DDim tensor_dims_;
+  DDim image_dims_;
+  std::unique_ptr<float[]> tensor_data_{nullptr};
+  std::unique_ptr<CLImageConverterBase> image_converter_{nullptr};
+};
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/opencl/cl_image_converter.cc b/paddle/fluid/lite/opencl/cl_image_converter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4408625e8fef1c884ca94b8323803719bee9e6be
--- /dev/null
+++ b/paddle/fluid/lite/opencl/cl_image_converter.cc
@@ -0,0 +1,450 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/lite/opencl/cl_image_converter.h" +#include +#include + +namespace paddle { +namespace lite { + +DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (size_t j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + size_t width = W * ((C + 3) / 4); + size_t height = H * N; + return DDim( + std::vector({static_cast(width), + static_cast(height)})); +} + +void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image, + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (size_t j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + + DDim in_image_dim = InitImageDimInfoWith(tensor_dim); + + VLOG(3) << " tensor dim: " << tensor_dim; + VLOG(3) << " image dim: " << in_image_dim; + + size_t width = in_image_dim[0]; + size_t w_block = width / W; + + float *p = nchw; + size_t i0 = 0; + for (size_t n = 0; n < N; n++) { + for (size_t c = 0; c < w_block * 4; c++) { + size_t i1 = i0 + (c / 4) * W; + for (size_t h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (size_t w = 0; w < W; w++) { + if (c < C) { + // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + + // (c % 4); + image[i2] = Float2Half(*p); + i2 += 4; + p++; + } else { + image[i2] = 0.0; + i2 += 4; + } + } + i1 += width; + } + } + i0 += width * H; + } +} + +void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (size_t j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + + size_t width = image_dim[0]; + float *p = tensor; + + size_t i0 = 0; + for (size_t n = 0; n < N; n++) { + for (size_t c = 0; c < C; c++) { + size_t i1 = i0 + (c / 4) * W; + for (size_t h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (size_t w = 0; w < W; w++) { + *p = Half2Float(image[i2]); + i2 += 4; + p++; + } + i1 += width; + } + } + i0 += width * H; + } +} + +DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) { + if (tensor_dim.size() <= 2) { + size_t tdim[2] = {1, 1}; + if (tensor_dim.size() == 1) { + tdim[1] = tensor_dim[0]; + } else { + tdim[0] = tensor_dim[0]; + tdim[1] = tensor_dim[1]; + } + size_t width = (tdim[1] + 3) / 4; + size_t height = tdim[0]; + + width_of_one_block_ = width; + height_of_one_block_ = height; + c_block_ = 1; + + return DDim( + std::vector({static_cast(width), + static_cast(height)})); + + } else { + size_t new_dims[] = {1, 1, 1, 1}; + for (size_t j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + size_t width = W * ((C + 3) / 4); + size_t height = H * N; + + width_of_one_block_ = W; + height_of_one_block_ = H; + c_block_ = width / W; + + return DDim( + std::vector({static_cast(width), + static_cast(height)})); + } +} + +void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image, + const DDim &tensor_dim) { + CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) + << " Tensor dim is not 
support!"; + + if (tensor_dim.size() > 2) { + CLImageConverterDefault default_converter; + default_converter.NCHWToImage(tensor, image, tensor_dim); + + } else { + size_t tdim[2] = {1, 1}; + if (tensor_dim.size() == 1) { + tdim[1] = tensor_dim[0]; + } else { + tdim[0] = tensor_dim[0]; + tdim[1] = tensor_dim[1]; + } + + DDim image_dim = InitImageDimInfoWith(tensor_dim); + size_t width = image_dim[0]; + + for (size_t h = 0; h < tdim[0]; h++) { + for (size_t w = 0; w < tdim[1]; w++) { + image[(h * width + w / 4) * 4 + (w % 4)] = + Float2Half(tensor[h * tdim[1] + w]); + } + } + } +} + +void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + if (tensor_dim.size() > 2) { + CLImageConverterDefault default_converter; + default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim); + + } else { + size_t width = image_dim[0]; + size_t H = 1, W = 1; + + if (tensor_dim.size() == 2) { + H = tensor_dim[0]; + W = tensor_dim[1]; + } else if (tensor_dim.size() == 1) { + W = tensor_dim[0]; + } + + float *p = tensor; + + for (size_t h = 0; h < H; h++) { + for (size_t w = 0; w < W; w++) { + p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]); + } + } + } +} + +DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { + CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; + size_t N, C, H, W; + N = tensor_dim[0]; + C = tensor_dim[1]; + H = tensor_dim[2]; + W = tensor_dim[3]; + size_t width = W * ((N + 3) / 4); + size_t height = C * H; + return DDim( + std::vector({static_cast(width), + static_cast(height)})); +} + +void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image, + const DDim &tensor_dim) { + CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; + auto image_dim = InitImageDimInfoWith(tensor_dim); + float *p = tensor; + size_t N = tensor_dim[0]; + size_t C = tensor_dim[1]; + size_t H = tensor_dim[2]; + size_t W = tensor_dim[3]; + size_t width = image_dim[0]; + size_t height = image_dim[1]; + size_t block = image_dim[0] / tensor_dim[3]; + + for (size_t n = 0; n < block * 4; n++) { + for (size_t c = 0; c < C; c++) { + for (size_t h = 0; h < H; ++h) { + for (size_t w = 0; w < W; ++w) { + size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + + w * 4 + n % 4; + if (n < N) { + image[index] = Float2Half(*p); + p++; + } else { + image[index] = 0.0; + } + if (index >= (width * height * 4)) { + LOG(INFO) << " index out of range "; + } + } + } + } + } + VLOG(3) << " init done"; +} + +void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; + float *p = tensor; + size_t N = tensor_dim[0]; + size_t C = tensor_dim[1]; + size_t H = tensor_dim[2]; + size_t W = tensor_dim[3]; + size_t width = image_dim[0]; + size_t height = image_dim[1]; + + for (size_t n = 0; n < N; n++) { + for (size_t c = 0; c < C; c++) { + for (size_t h = 0; h < H; ++h) { + for (size_t w = 0; w < W; ++w) { + size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + + w * 4 + n % 4; + *p = Half2Float(image[index]); + p++; + if (index >= (width * height * 4)) { + LOG(INFO) << " index out of range "; + } + } + } + } + } + VLOG(3) << " init done"; +} + +DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { + CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; + size_t N, C, H, W; + N = tensor_dim[0]; + C = tensor_dim[1]; + H = 
tensor_dim[2]; + W = tensor_dim[3]; + size_t width = W * ((N + 3) / 4); + size_t height = C * H; + return DDim( + std::vector({static_cast(width), + static_cast(height)})); +} + +void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image, + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (size_t j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + + size_t N, C, H, W; + N = new_dims[1]; + C = new_dims[0]; + H = new_dims[2]; + W = new_dims[3]; + + DDim in_image_dim = InitImageDimInfoWith(tensor_dim); + + VLOG(3) << " tensor dim: " << tensor_dim; + VLOG(3) << " image dim: " << in_image_dim; + + size_t width = in_image_dim[0]; + size_t w_block = width / W; + + float *p = tensor; + size_t i0 = 0; + for (size_t n = 0; n < N; n++) { + for (size_t c = 0; c < w_block * 4; c++) { + size_t i1 = i0 + (c / 4) * W; + for (size_t h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (size_t w = 0; w < W; w++) { + if (c < C) { + // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + + // (c % 4); + image[i2] = Float2Half(*p); + i2 += 4; + p++; + } else { + image[i2] = 0.0; + i2 += 4; + } + } + i1 += width; + } + } + i0 += width * H; + } +} + +void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; + float *p = tensor; + size_t N = tensor_dim[1]; + size_t C = tensor_dim[0]; + size_t H = tensor_dim[2]; + size_t W = tensor_dim[3]; + size_t width = image_dim[0]; + + size_t i0 = 0; + for (size_t n = 0; n < N; n++) { + for (size_t c = 0; c < C; c++) { + size_t i1 = i0 + (c / 4) * W; + for (size_t h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (size_t w = 0; w < W; w++) { + *p = Half2Float(image[i2]); + i2 += 4; + p++; + } + i1 += width; + } + } + i0 += width * H; + } +} + +DDim CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (size_t j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + size_t width = W * ((C + 3) / 4); + size_t height = H * N; + + width_of_one_block_ = W; + height_of_one_block_ = H; + c_block_ = width / W; + + return DDim( + std::vector({static_cast(width), + static_cast(height)})); +} + +void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image, + const DDim &tensor_dim) { + CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) + << " Tensor dim is not support!"; + + CLImageConverterDefault default_converter; + default_converter.NCHWToImage(tensor, image, tensor_dim); +} + +void CLImageConverterNormal::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + CLImageConverterDefault default_converter; + default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim); +} + +DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith( + const DDim &tensor_dim) { + CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; + size_t N, C; + N = tensor_dim[0]; + C = tensor_dim[1]; + size_t width = (C + 3) / 4; + size_t height = N * 16; // N * (wino_blk_size + 2) * (wino_blk_size + 2) + return DDim( + std::vector({static_cast(width), + static_cast(height)})); +} + +void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, half_t *image, + const DDim &tensor_dim) {} + +void 
CLImageConverterWinoTransWeight::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) {} + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/opencl/cl_image_converter.h b/paddle/fluid/lite/opencl/cl_image_converter.h new file mode 100644 index 0000000000000000000000000000000000000000..9dceca4503049e97d1bb1f24b3ee5417d7571966 --- /dev/null +++ b/paddle/fluid/lite/opencl/cl_image_converter.h @@ -0,0 +1,115 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/lite/core/compatible_tensor.h" +#include "paddle/fluid/lite/opencl/cl_half.h" + +namespace paddle { +namespace lite { + +class CLImageConverterBase { + public: + virtual ~CLImageConverterBase() {} + + virtual void NCHWToImage(float *nchw, half_t *image, + const DDim &tensor_dim) = 0; + + virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim, + const DDim &tensor_dim) = 0; + virtual DDim InitImageDimInfoWith(const DDim &tensor_dim) = 0; +}; + +class CLImageConverterDefault : public CLImageConverterBase { + public: + DDim InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim); +}; + +class CLImageConverterFolder : public CLImageConverterBase { + public: + DDim InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim); + + /* + * width of original tensor + * */ + inline size_t WidthOfOneBlock() const { return width_of_one_block_; } + + /* + * height of original tensor + * */ + inline size_t HeightOfOneBlock() const { return height_of_one_block_; } + + int GetCBlock() const { return c_block_; } + + private: + int c_block_; + int width_of_one_block_; + int height_of_one_block_; +}; + +class CLImageConverterNormal : public CLImageConverterBase { + public: + DDim InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim); + + /* + * width of original tensor + * */ + inline size_t WidthOfOneBlock() const { return width_of_one_block_; } + + /* + * height of original tensor + * */ + inline size_t HeightOfOneBlock() const { return height_of_one_block_; } + + int GetCBlock() const { return c_block_; } + + private: + int c_block_; + int width_of_one_block_; + int height_of_one_block_; +}; + +class CLImageConverterNWBlock : public CLImageConverterBase { + DDim InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim); +}; +class CLImageConverterDWBlock : 
public CLImageConverterBase { + DDim InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim); +}; + +class CLImageConverterWinoTransWeight : public CLImageConverterBase { + public: + DDim InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim); +}; + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/opencl/cl_kernel/cl_common.h b/paddle/fluid/lite/opencl/cl_kernel/cl_common.h new file mode 100644 index 0000000000000000000000000000000000000000..31ca6d7f65c66050e1a6c7c16ec840f1a93a0463 --- /dev/null +++ b/paddle/fluid/lite/opencl/cl_kernel/cl_common.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +inline half4 activation(half4 in +#ifdef PRELU + , + half4 prelu_alpha +#endif + ) { + half4 output; +#ifdef PRELU + output = select(prelu_alpha * in, in, in >= (half4)0.0); +#endif + +#ifdef RELU + output = fmax(in, (half4)(0.0f)); +#endif + return output; +} diff --git a/paddle/fluid/lite/opencl/cl_kernel/elementwise_add_kernel.cl b/paddle/fluid/lite/opencl/cl_kernel/elementwise_add_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..f304764868959ce028a8448c4d311db878cc1f6e --- /dev/null +++ b/paddle/fluid/lite/opencl/cl_kernel/elementwise_add_kernel.cl @@ -0,0 +1,27 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +__kernel void elementwise_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + half4 in = read_imageh(input, sampler, coords); + half4 biase = read_imageh(bias, sampler, coords); + half4 output = in + biase; + write_imageh(outputImage,coords,output); + } diff --git a/paddle/fluid/lite/opencl/cl_kernel/pool_kernel.cl b/paddle/fluid/lite/opencl/cl_kernel/pool_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..a6a4da690fa921d281786fcddebf7362d3c52119 --- /dev/null +++ b/paddle/fluid/lite/opencl/cl_kernel/pool_kernel.cl @@ -0,0 +1,91 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#define MIN_VALUE -FLT_MAX + +__kernel void pool_max( + __private const int in_height, __private const int in_width, + __private const int out_height, __private const int out_width, + __private const int pad_top, __private const int pad_left, + __private const int stride_h, __private const int stride_w, + __private const int ksize_h, __private const int ksize_w, + __read_only image2d_t input, __write_only image2d_t output) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int start_h = out_h * stride_h - pad_top; + int end_h = min(start_h + ksize_h, in_height); + start_h = max(start_h,0); + + int start_w = out_w * stride_w - pad_left; + int end_w = min(start_w + ksize_w, in_width); + start_w = max(start_w,0); + + const int pos_in_x = out_c * in_width; + const int pos_in_y = out_n * in_height; + half4 max_value = (half4)(MIN_VALUE); + for (int y = start_h; y < end_h; ++y) { + for (int x = start_w; x < end_w; ++x) { + half4 tmp = read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); + max_value = max(max_value, tmp); + } + } + + const int pos_out_x = mad24(out_c, out_width, out_w); + write_imageh(output, (int2)(pos_out_x, out_nh), max_value); +} + +__kernel void pool_avg( + __private const int in_height, __private const int in_width, + __private const int out_height, __private const int out_width, + __private const int pad_top, __private const int pad_left, + __private const int stride_h, __private const int stride_w, + __private const int ksize_h, __private const int ksize_w, + __read_only image2d_t input, __write_only image2d_t output) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + const 
sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  int start_h = out_h * stride_h - pad_top;
+  int end_h = min(start_h + ksize_h, in_height);
+  start_h = max(start_h, 0);
+
+  int start_w = out_w * stride_w - pad_left;
+  int end_w = min(start_w + ksize_w, in_width);
+  start_w = max(start_w, 0);
+
+  const int pos_in_x = out_c * in_width;
+  const int pos_in_y = out_n * in_height;
+  half4 sum = (half4)(0.0f);
+  int num = 0;
+  for (int y = start_h; y < end_h; ++y) {
+    for (int x = start_w; x < end_w; ++x) {
+      sum += read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
+      num++;
+    }
+  }
+  half4 avg = sum / num;
+  const int pos_out_x = mad24(out_c, out_width, out_w);
+  write_imageh(output, (int2)(pos_out_x, out_nh), avg);
+}
diff --git a/paddle/fluid/lite/opencl/cl_test.cc b/paddle/fluid/lite/opencl/cl_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4a4ac965c1191d9fe4407635911d8feef9bf726a
--- /dev/null
+++ b/paddle/fluid/lite/opencl/cl_test.cc
@@ -0,0 +1,154 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <vector>
+#include "paddle/fluid/lite/core/compatible_tensor.h"
+#include "paddle/fluid/lite/opencl/cl_caller.h"
+#include "paddle/fluid/lite/opencl/cl_context.h"
+#include "paddle/fluid/lite/opencl/cl_engine.h"
+#include "paddle/fluid/lite/opencl/cl_helper.h"
+#include "paddle/fluid/lite/opencl/cl_image.h"
+
+DEFINE_string(cl_path, "/data/local/tmp/opencl", "The OpenCL kernels path.");
+
+namespace paddle {
+namespace lite {
+
+TEST(cl_test, engine_test) {
+  auto* engine = CLEngine::Global();
+  CHECK(engine->IsInitSuccess());
+  engine->set_cl_path(FLAGS_cl_path);
+  engine->platform();
+  engine->device();
+  engine->command_queue();
+  auto& context = engine->context();
+  auto program = engine->CreateProgram(
+      context, engine->cl_path() + "/cl_kernel/" +
+                   "elementwise_add_kernel.cl");
+  auto event = engine->CreateEvent(context);
+  CHECK(engine->BuildProgram(program.get()));
+}
+
+TEST(cl_test, context_test) {
+  auto* engine = CLEngine::Global();
+  CHECK(engine->IsInitSuccess());
+  engine->set_cl_path(FLAGS_cl_path);
+  CLContext context;
+  context.GetKernel("pool_max", "pool_kernel.cl", "");
+  context.GetKernel("elementwise_add", "elementwise_add_kernel.cl", "");
+  context.GetKernel("elementwise_add", "elementwise_add_kernel.cl", "");
+}
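+
+// kernel_test below is an end-to-end smoke test: it uploads two 1024x512
+// constant images, runs the cached elementwise_add kernel over a
+// width x height NDRange, and reads the result back via operator<< on
+// CLImage. The cl::Event timing assumes the command queue was created with
+// CL_QUEUE_PROFILING_ENABLE; without it, getProfilingInfo fails with
+// CL_PROFILING_INFO_NOT_AVAILABLE.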
+TEST(cl_test, kernel_test) {
+  auto* engine = CLEngine::Global();
+  CHECK(engine->IsInitSuccess());
+  engine->set_cl_path(FLAGS_cl_path);
+  std::unique_ptr<CLContext> context(new CLContext);
+  // std::unique_ptr<CLHelper> helper(new CLHelper(context.get()));
+  std::unique_ptr<CLHelper> helper(new CLHelper);
+  helper->set_context(context.get());
+  helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
+  helper->AddKernel("pool_max", "pool_kernel.cl");
+  helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
+  auto kernel = helper->KernelAt(2);
+
+  std::unique_ptr<float[]> in_data(new float[1024 * 512]);
+  for (int i = 0; i < 1024 * 512; i++) {
+    in_data[i] = 1.f;
+  }
+  const DDim in_dim = DDim(std::vector<int64_t>{1024, 512});
+  CLImage in_image;
+  in_image.set_tensor_data(in_data.get(), in_dim);
+  in_image.InitNormalCLImage(helper->OpenCLContext());
+  LOG(INFO) << in_image;
+
+  std::unique_ptr<float[]> bias_data(new float[1024 * 512]);
+  for (int i = 0; i < 1024 * 512; i++) {
+    bias_data[i] = 2.f;
+  }
+  const DDim bias_dim = DDim(std::vector<int64_t>{1024, 512});
+  CLImage bias_image;
+  bias_image.set_tensor_data(bias_data.get(), bias_dim);
+  bias_image.InitNormalCLImage(helper->OpenCLContext());
+  LOG(INFO) << bias_image;
+
+  CLImage out_image;
+  const DDim out_dim = DDim(std::vector<int64_t>{1024, 512});
+  out_image.InitEmptyImage(helper->OpenCLContext(), out_dim);
+  LOG(INFO) << out_image;
+
+  cl_int status;
+  status = kernel.setArg(0, *in_image.cl_image());
+  CL_CHECK_ERRORS(status);
+  status = kernel.setArg(1, *bias_image.cl_image());
+  CL_CHECK_ERRORS(status);
+  status = kernel.setArg(2, *out_image.cl_image());
+  CL_CHECK_ERRORS(status);
+
+  // auto global_work_size = helper->DefaultWorkSize(out_image);
+  size_t width = in_image.ImageWidth();
+  size_t height = in_image.ImageHeight();
+  auto global_work_size = cl::NDRange{width, height};
+  cl::Event event;
+  status = helper->OpenCLCommandQueue().enqueueNDRangeKernel(
+      kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event);
+  CL_CHECK_ERRORS(status);
+
+  double start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+  double stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+  double elapsed_micros = (stop_nanos - start_nanos) / 1000.0;
+  LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us.";
+  LOG(INFO) << out_image;
+}
+
+TEST(cl_test, elementwise_add_test) {
+  std::default_random_engine engine;
+  std::uniform_real_distribution<float> dist(-5, 5);
+
+  const DDim in_dim = DDim(std::vector<int64_t>{1024, 512});
+  std::unique_ptr<float[]> in_data(new float[1024 * 512]);
+  for (int i = 0; i < 1024 * 512; i++) {
+    in_data[i] = dist(engine);
+  }
+
+  const DDim bias_dim = DDim(std::vector<int64_t>{1024, 512});
+  std::unique_ptr<float[]> bias_data(new float[1024 * 512]);
+  for (int i = 0; i < 1024 * 512; i++) {
+    bias_data[i] = dist(engine);
+  }
+
+  const DDim out_dim = DDim(std::vector<int64_t>{1024, 512});
+  std::unique_ptr<float[]> out(new float[1024 * 512]);
+
+  bool status = InitOpenCLEngine(FLAGS_cl_path);
+  CHECK(status) << "Failed to initialize OpenCL engine.";
+  CLContext context;
+
+  elementwise_add(&context, in_data.get(), in_dim, bias_data.get(), bias_dim,
+                  out.get(), out_dim);
+
+  int stride = 1024 * 512 / 20;
+  for (int i = 0; i < 1024 * 512; i += stride) {
+    std::cout << out[i] << " ";
+  }
+
+  std::cout << std::endl;
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/opencl/cl_tool.cc b/paddle/fluid/lite/opencl/cl_tool.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d09642ff5535bd80e3d9db259b63435371c17971
--- /dev/null
+++ b/paddle/fluid/lite/opencl/cl_tool.cc
@@ -0,0 +1,84 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/lite/opencl/cl_tool.h" + +namespace paddle { +namespace lite { + +const char *opencl_error_to_str(cl_int error) { +#define CASE_CL_CONSTANT(NAME) \ + case NAME: \ + return #NAME; + // Suppose that no combinations are possible. + switch (error) { + CASE_CL_CONSTANT(CL_SUCCESS) + CASE_CL_CONSTANT(CL_DEVICE_NOT_FOUND) + CASE_CL_CONSTANT(CL_DEVICE_NOT_AVAILABLE) + CASE_CL_CONSTANT(CL_COMPILER_NOT_AVAILABLE) + CASE_CL_CONSTANT(CL_MEM_OBJECT_ALLOCATION_FAILURE) + CASE_CL_CONSTANT(CL_OUT_OF_RESOURCES) + CASE_CL_CONSTANT(CL_OUT_OF_HOST_MEMORY) + CASE_CL_CONSTANT(CL_PROFILING_INFO_NOT_AVAILABLE) + CASE_CL_CONSTANT(CL_MEM_COPY_OVERLAP) + CASE_CL_CONSTANT(CL_IMAGE_FORMAT_MISMATCH) + CASE_CL_CONSTANT(CL_IMAGE_FORMAT_NOT_SUPPORTED) + CASE_CL_CONSTANT(CL_BUILD_PROGRAM_FAILURE) + CASE_CL_CONSTANT(CL_MAP_FAILURE) + CASE_CL_CONSTANT(CL_MISALIGNED_SUB_BUFFER_OFFSET) + CASE_CL_CONSTANT(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) + CASE_CL_CONSTANT(CL_INVALID_VALUE) + CASE_CL_CONSTANT(CL_INVALID_DEVICE_TYPE) + CASE_CL_CONSTANT(CL_INVALID_PLATFORM) + CASE_CL_CONSTANT(CL_INVALID_DEVICE) + CASE_CL_CONSTANT(CL_INVALID_CONTEXT) + CASE_CL_CONSTANT(CL_INVALID_QUEUE_PROPERTIES) + CASE_CL_CONSTANT(CL_INVALID_COMMAND_QUEUE) + CASE_CL_CONSTANT(CL_INVALID_HOST_PTR) + CASE_CL_CONSTANT(CL_INVALID_MEM_OBJECT) + CASE_CL_CONSTANT(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR) + CASE_CL_CONSTANT(CL_INVALID_IMAGE_SIZE) + CASE_CL_CONSTANT(CL_INVALID_SAMPLER) + CASE_CL_CONSTANT(CL_INVALID_BINARY) + CASE_CL_CONSTANT(CL_INVALID_BUILD_OPTIONS) + CASE_CL_CONSTANT(CL_INVALID_PROGRAM) + CASE_CL_CONSTANT(CL_INVALID_PROGRAM_EXECUTABLE) + CASE_CL_CONSTANT(CL_INVALID_KERNEL_NAME) + CASE_CL_CONSTANT(CL_INVALID_KERNEL_DEFINITION) + CASE_CL_CONSTANT(CL_INVALID_KERNEL) + CASE_CL_CONSTANT(CL_INVALID_ARG_INDEX) + CASE_CL_CONSTANT(CL_INVALID_ARG_VALUE) + CASE_CL_CONSTANT(CL_INVALID_ARG_SIZE) + CASE_CL_CONSTANT(CL_INVALID_KERNEL_ARGS) + CASE_CL_CONSTANT(CL_INVALID_WORK_DIMENSION) + CASE_CL_CONSTANT(CL_INVALID_WORK_GROUP_SIZE) + CASE_CL_CONSTANT(CL_INVALID_WORK_ITEM_SIZE) + CASE_CL_CONSTANT(CL_INVALID_GLOBAL_OFFSET) + CASE_CL_CONSTANT(CL_INVALID_EVENT_WAIT_LIST) + CASE_CL_CONSTANT(CL_INVALID_EVENT) + CASE_CL_CONSTANT(CL_INVALID_OPERATION) + CASE_CL_CONSTANT(CL_INVALID_GL_OBJECT) + CASE_CL_CONSTANT(CL_INVALID_BUFFER_SIZE) + CASE_CL_CONSTANT(CL_INVALID_MIP_LEVEL) + CASE_CL_CONSTANT(CL_INVALID_GLOBAL_WORK_SIZE) + CASE_CL_CONSTANT(CL_INVALID_PROPERTY) + + default: + return "UNKNOWN ERROR CODE"; + } +#undef CASE_CL_CONSTANT +} + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/opencl/cl_tool.h b/paddle/fluid/lite/opencl/cl_tool.h new file mode 100644 index 0000000000000000000000000000000000000000..3fdc6287cadf88fbe25b7ac5c63f8850283ff9a4 --- /dev/null +++ b/paddle/fluid/lite/opencl/cl_tool.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/lite/opencl/cl2_header.h" + +namespace paddle { +namespace lite { + +const char* opencl_error_to_str(cl_int error); + +#define CL_CHECK_ERRORS(ERR) \ + if (ERR != CL_SUCCESS) { \ + printf( \ + "OpenCL error with code %s happened in file %s at line %d. " \ + "Exiting.\n", \ + opencl_error_to_str(ERR), __FILE__, __LINE__); \ + } +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/opencl/cl_wrapper.cc b/paddle/fluid/lite/opencl/cl_wrapper.cc new file mode 100644 index 0000000000000000000000000000000000000000..52c68bdc969c311864e79f9351f26d84d0613f5e --- /dev/null +++ b/paddle/fluid/lite/opencl/cl_wrapper.cc @@ -0,0 +1,962 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "paddle/fluid/lite/opencl/cl2_header.h" + +/** + * Wrapper of OpenCL 2.0, based on file opencl20/CL/cl.h + */ + +#if CL_HPP_TARGET_OPENCL_VERSION < 200 +#define CL_API_SUFFIX__VERSION_2_0 +#endif + +namespace paddle { +namespace lite { + +class OpenCLLibrary final { + private: + OpenCLLibrary(); + OpenCLLibrary(const OpenCLLibrary &) = delete; + OpenCLLibrary &operator=(const OpenCLLibrary &) = delete; + + bool Load(); + void *LoadFromPath(const std::string &path); + + public: + static OpenCLLibrary *Get(); + + using clGetPlatformIDsFunc = cl_int (*)(cl_uint, cl_platform_id *, cl_uint *); + using clGetPlatformInfoFunc = cl_int (*)(cl_platform_id, cl_platform_info, + size_t, void *, size_t *); + using clBuildProgramFunc = cl_int (*)(cl_program, cl_uint, + const cl_device_id *, const char *, + void (*pfn_notify)(cl_program, void *), + void *); + using clEnqueueNDRangeKernelFunc = cl_int (*)(cl_command_queue, cl_kernel, + cl_uint, const size_t *, + const size_t *, const size_t *, + cl_uint, const cl_event *, + cl_event *); + using clSetKernelArgFunc = cl_int (*)(cl_kernel, cl_uint, size_t, + const void *); + using clRetainMemObjectFunc = cl_int (*)(cl_mem); + using clReleaseMemObjectFunc = cl_int (*)(cl_mem); + using clEnqueueUnmapMemObjectFunc = cl_int (*)(cl_command_queue, cl_mem, + void *, cl_uint, + const cl_event *, cl_event *); + using clRetainCommandQueueFunc = cl_int (*)(cl_command_queue command_queue); + using clCreateContextFunc = cl_context (*)( + const cl_context_properties *, cl_uint, const cl_device_id *, + void(CL_CALLBACK *)( // NOLINT(readability/casting) + const char *, const void *, size_t, void *), + void *, cl_int *); + using clCreateContextFromTypeFunc = + cl_context (*)(const cl_context_properties *, cl_device_type, + void(CL_CALLBACK *)( // NOLINT(readability/casting) + const char *, const void *, size_t, void *), + void *, cl_int *); + using clReleaseContextFunc = cl_int (*)(cl_context); + using clWaitForEventsFunc = cl_int (*)(cl_uint, const cl_event *); + using clReleaseEventFunc = cl_int (*)(cl_event); + using clEnqueueWriteBufferFunc = cl_int (*)(cl_command_queue, cl_mem, cl_bool, + size_t, size_t, const void *, + cl_uint, const cl_event 
*, + cl_event *); + using clEnqueueReadBufferFunc = cl_int (*)(cl_command_queue, cl_mem, cl_bool, + size_t, size_t, void *, cl_uint, + const cl_event *, cl_event *); + using clEnqueueReadImageFunc = cl_int (*)(cl_command_queue, cl_mem, cl_bool, + const size_t *, const size_t *, + size_t, size_t, void *, cl_uint, + const cl_event *, cl_event *); + using clGetProgramBuildInfoFunc = cl_int (*)(cl_program, cl_device_id, + cl_program_build_info, size_t, + void *, size_t *); + using clRetainProgramFunc = cl_int (*)(cl_program program); + using clEnqueueMapBufferFunc = void *(*)(cl_command_queue, cl_mem, cl_bool, + cl_map_flags, size_t, size_t, + cl_uint, const cl_event *, + cl_event *, cl_int *); + using clEnqueueMapImageFunc = void *(*)(cl_command_queue, cl_mem, cl_bool, + cl_map_flags, const size_t *, + const size_t *, size_t *, size_t *, + cl_uint, const cl_event *, cl_event *, + cl_int *); + using clCreateCommandQueueFunc = cl_command_queue(CL_API_CALL *)( // NOLINT + cl_context, cl_device_id, cl_command_queue_properties, cl_int *); + using clCreateCommandQueueWithPropertiesFunc = cl_command_queue (*)( + cl_context, cl_device_id, const cl_queue_properties *, cl_int *); + using clReleaseCommandQueueFunc = cl_int (*)(cl_command_queue); + using clCreateProgramWithBinaryFunc = cl_program (*)(cl_context, cl_uint, + const cl_device_id *, + const size_t *, + const unsigned char **, + cl_int *, cl_int *); + using clRetainContextFunc = cl_int (*)(cl_context context); + using clGetContextInfoFunc = cl_int (*)(cl_context, cl_context_info, size_t, + void *, size_t *); + using clReleaseProgramFunc = cl_int (*)(cl_program program); + using clFlushFunc = cl_int (*)(cl_command_queue command_queue); + using clFinishFunc = cl_int (*)(cl_command_queue command_queue); + using clGetProgramInfoFunc = cl_int (*)(cl_program, cl_program_info, size_t, + void *, size_t *); + using clCreateKernelFunc = cl_kernel (*)(cl_program, const char *, cl_int *); + using clRetainKernelFunc = cl_int (*)(cl_kernel kernel); + using clCreateBufferFunc = cl_mem (*)(cl_context, cl_mem_flags, size_t, + void *, cl_int *); + using clCreateImage2DFunc = cl_mem(CL_API_CALL *)(cl_context, // NOLINT + cl_mem_flags, + const cl_image_format *, + size_t, size_t, size_t, + void *, cl_int *); + using clCreateImageFunc = cl_mem (*)(cl_context, cl_mem_flags, + const cl_image_format *, + const cl_image_desc *, void *, cl_int *); + using clCreateUserEventFunc = cl_event (*)(cl_context, cl_int *); + using clCreateProgramWithSourceFunc = cl_program (*)(cl_context, cl_uint, + const char **, + const size_t *, + cl_int *); + using clReleaseKernelFunc = cl_int (*)(cl_kernel kernel); + using clGetDeviceInfoFunc = cl_int (*)(cl_device_id, cl_device_info, size_t, + void *, size_t *); + using clGetDeviceIDsFunc = cl_int (*)(cl_platform_id, cl_device_type, cl_uint, + cl_device_id *, cl_uint *); + using clRetainDeviceFunc = cl_int (*)(cl_device_id); + using clReleaseDeviceFunc = cl_int (*)(cl_device_id); + using clRetainEventFunc = cl_int (*)(cl_event); + using clGetKernelWorkGroupInfoFunc = cl_int (*)(cl_kernel, cl_device_id, + cl_kernel_work_group_info, + size_t, void *, size_t *); + using clGetEventInfoFunc = cl_int (*)(cl_event event, + cl_event_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret); + using clGetEventProfilingInfoFunc = cl_int (*)(cl_event event, + cl_profiling_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret); + using clGetImageInfoFunc = cl_int 
(*)(cl_mem, cl_image_info, size_t, void *,
+                                      size_t *);
+
+#define PADDLE_CL_DEFINE_FUNC_PTR(func) func##Func func = nullptr
+
+  PADDLE_CL_DEFINE_FUNC_PTR(clGetPlatformIDs);
+  PADDLE_CL_DEFINE_FUNC_PTR(clGetPlatformInfo);
+  PADDLE_CL_DEFINE_FUNC_PTR(clBuildProgram);
+  PADDLE_CL_DEFINE_FUNC_PTR(clEnqueueNDRangeKernel);
+  PADDLE_CL_DEFINE_FUNC_PTR(clSetKernelArg);
+  PADDLE_CL_DEFINE_FUNC_PTR(clReleaseKernel);
+  PADDLE_CL_DEFINE_FUNC_PTR(clCreateProgramWithSource);
+  PADDLE_CL_DEFINE_FUNC_PTR(clCreateBuffer);
+  PADDLE_CL_DEFINE_FUNC_PTR(clCreateImage);
+  PADDLE_CL_DEFINE_FUNC_PTR(clCreateImage2D);
+  PADDLE_CL_DEFINE_FUNC_PTR(clCreateUserEvent);
+  PADDLE_CL_DEFINE_FUNC_PTR(clRetainKernel);
+  PADDLE_CL_DEFINE_FUNC_PTR(clCreateKernel);
+  PADDLE_CL_DEFINE_FUNC_PTR(clGetProgramInfo);
+  PADDLE_CL_DEFINE_FUNC_PTR(clFlush);
+  PADDLE_CL_DEFINE_FUNC_PTR(clFinish);
+  PADDLE_CL_DEFINE_FUNC_PTR(clReleaseProgram);
+  PADDLE_CL_DEFINE_FUNC_PTR(clRetainContext);
+  PADDLE_CL_DEFINE_FUNC_PTR(clGetContextInfo);
+  PADDLE_CL_DEFINE_FUNC_PTR(clCreateProgramWithBinary);
+  PADDLE_CL_DEFINE_FUNC_PTR(clCreateCommandQueue);
+  PADDLE_CL_DEFINE_FUNC_PTR(clCreateCommandQueueWithProperties);
+  PADDLE_CL_DEFINE_FUNC_PTR(clReleaseCommandQueue);
+  PADDLE_CL_DEFINE_FUNC_PTR(clEnqueueMapBuffer);
+  PADDLE_CL_DEFINE_FUNC_PTR(clEnqueueMapImage);
+  PADDLE_CL_DEFINE_FUNC_PTR(clRetainProgram);
+  PADDLE_CL_DEFINE_FUNC_PTR(clGetProgramBuildInfo);
+  PADDLE_CL_DEFINE_FUNC_PTR(clEnqueueReadBuffer);
+  PADDLE_CL_DEFINE_FUNC_PTR(clEnqueueReadImage);
+  PADDLE_CL_DEFINE_FUNC_PTR(clEnqueueWriteBuffer);
+  PADDLE_CL_DEFINE_FUNC_PTR(clWaitForEvents);
+  PADDLE_CL_DEFINE_FUNC_PTR(clReleaseEvent);
+  PADDLE_CL_DEFINE_FUNC_PTR(clCreateContext);
+  PADDLE_CL_DEFINE_FUNC_PTR(clCreateContextFromType);
+  PADDLE_CL_DEFINE_FUNC_PTR(clReleaseContext);
+  PADDLE_CL_DEFINE_FUNC_PTR(clRetainCommandQueue);
+  PADDLE_CL_DEFINE_FUNC_PTR(clEnqueueUnmapMemObject);
+  PADDLE_CL_DEFINE_FUNC_PTR(clRetainMemObject);
+  PADDLE_CL_DEFINE_FUNC_PTR(clReleaseMemObject);
+  PADDLE_CL_DEFINE_FUNC_PTR(clGetDeviceInfo);
+  PADDLE_CL_DEFINE_FUNC_PTR(clGetDeviceIDs);
+  PADDLE_CL_DEFINE_FUNC_PTR(clRetainDevice);
+  PADDLE_CL_DEFINE_FUNC_PTR(clReleaseDevice);
+  PADDLE_CL_DEFINE_FUNC_PTR(clRetainEvent);
+  PADDLE_CL_DEFINE_FUNC_PTR(clGetKernelWorkGroupInfo);
+  PADDLE_CL_DEFINE_FUNC_PTR(clGetEventInfo);
+  PADDLE_CL_DEFINE_FUNC_PTR(clGetEventProfilingInfo);
+  PADDLE_CL_DEFINE_FUNC_PTR(clGetImageInfo);
+
+#undef PADDLE_CL_DEFINE_FUNC_PTR
+
+ private:
+  void *handle_ = nullptr;
+};
+
+OpenCLLibrary *OpenCLLibrary::Get() {
+  static OpenCLLibrary library;
+  return &library;
+}
+
+OpenCLLibrary::OpenCLLibrary() {
+  this->Load();
+  // Do not call dlclose, which may unload all OpenCL symbols.
+  // If the OpenCL library were closed here, the static OpenCLLibrary
+  // destructor could fail. Without dlclose, the library is simply closed
+  // when the program exits. Besides, the library is not loaded repeatedly
+  // even if dlopen is called many times.
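+  // Note: Get() hands out a function-local static instance, so C++11
+  // guarantees this constructor (and hence the dlopen) runs exactly once,
+  // even with concurrent first callers.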
+} + +bool OpenCLLibrary::Load() { + if (handle_ != nullptr) { + return true; + } + + // Add customized OpenCL search path here + const std::vector paths = { + "libOpenCL.so", +#if defined(__aarch64__) + // Qualcomm Adreno with Android + "/system/vendor/lib64/libOpenCL.so", + "/system/lib64/libOpenCL.so", + // Mali with Android + "/system/vendor/lib64/egl/libGLES_mali.so", + "/system/lib64/egl/libGLES_mali.so", + // Typical Linux board + "/usr/lib/aarch64-linux-gnu/libOpenCL.so", +#else + // Qualcomm Adreno with Android + "/system/vendor/lib/libOpenCL.so", + "/system/lib/libOpenCL.so", + // Mali with Android + "/system/vendor/lib/egl/libGLES_mali.so", + "/system/lib/egl/libGLES_mali.so", + // Typical Linux board + "/usr/lib/arm-linux-gnueabihf/libOpenCL.so", +#endif + }; + + for (const auto &path : paths) { + VLOG(3) << "Loading OpenCL from " << path; + void *handle = LoadFromPath(path); + if (handle != nullptr) { + handle_ = handle; + break; + } + } + + if (handle_ == nullptr) { + LOG(ERROR) + << "Failed to load OpenCL library, " + "please make sure there exists OpenCL library on your device, " + "and your APP have right to access the library."; + return false; + } + + return true; +} + +void *OpenCLLibrary::LoadFromPath(const std::string &path) { + void *handle = dlopen(path.c_str(), RTLD_LAZY | RTLD_LOCAL); + + if (handle == nullptr) { + VLOG(3) << "Failed to load OpenCL library from path " << path + << " error code: " << dlerror(); + return nullptr; + } + +#define PADDLE_CL_ASSIGN_FROM_DLSYM(func) \ + do { \ + void *ptr = dlsym(handle, #func); \ + if (ptr == nullptr) { \ + VLOG(1) << "Failed to load " << #func << " from " << path; \ + continue; \ + } \ + func = reinterpret_cast(ptr); \ + VLOG(3) << "Loaded " << #func << " from " << path; \ + } while (false) + + PADDLE_CL_ASSIGN_FROM_DLSYM(clGetPlatformIDs); + PADDLE_CL_ASSIGN_FROM_DLSYM(clGetPlatformInfo); + PADDLE_CL_ASSIGN_FROM_DLSYM(clBuildProgram); + PADDLE_CL_ASSIGN_FROM_DLSYM(clEnqueueNDRangeKernel); + PADDLE_CL_ASSIGN_FROM_DLSYM(clSetKernelArg); + PADDLE_CL_ASSIGN_FROM_DLSYM(clReleaseKernel); + PADDLE_CL_ASSIGN_FROM_DLSYM(clCreateProgramWithSource); + PADDLE_CL_ASSIGN_FROM_DLSYM(clCreateBuffer); + PADDLE_CL_ASSIGN_FROM_DLSYM(clCreateImage); + PADDLE_CL_ASSIGN_FROM_DLSYM(clCreateImage2D); + PADDLE_CL_ASSIGN_FROM_DLSYM(clCreateUserEvent); + PADDLE_CL_ASSIGN_FROM_DLSYM(clRetainKernel); + PADDLE_CL_ASSIGN_FROM_DLSYM(clCreateKernel); + PADDLE_CL_ASSIGN_FROM_DLSYM(clGetProgramInfo); + PADDLE_CL_ASSIGN_FROM_DLSYM(clFlush); + PADDLE_CL_ASSIGN_FROM_DLSYM(clFinish); + PADDLE_CL_ASSIGN_FROM_DLSYM(clReleaseProgram); + PADDLE_CL_ASSIGN_FROM_DLSYM(clRetainContext); + PADDLE_CL_ASSIGN_FROM_DLSYM(clGetContextInfo); + PADDLE_CL_ASSIGN_FROM_DLSYM(clCreateProgramWithBinary); + PADDLE_CL_ASSIGN_FROM_DLSYM(clCreateCommandQueue); + PADDLE_CL_ASSIGN_FROM_DLSYM(clCreateCommandQueueWithProperties); + PADDLE_CL_ASSIGN_FROM_DLSYM(clReleaseCommandQueue); + PADDLE_CL_ASSIGN_FROM_DLSYM(clEnqueueMapBuffer); + PADDLE_CL_ASSIGN_FROM_DLSYM(clEnqueueMapImage); + PADDLE_CL_ASSIGN_FROM_DLSYM(clRetainProgram); + PADDLE_CL_ASSIGN_FROM_DLSYM(clGetProgramBuildInfo); + PADDLE_CL_ASSIGN_FROM_DLSYM(clEnqueueReadBuffer); + PADDLE_CL_ASSIGN_FROM_DLSYM(clEnqueueReadImage); + PADDLE_CL_ASSIGN_FROM_DLSYM(clEnqueueWriteBuffer); + PADDLE_CL_ASSIGN_FROM_DLSYM(clWaitForEvents); + PADDLE_CL_ASSIGN_FROM_DLSYM(clReleaseEvent); + PADDLE_CL_ASSIGN_FROM_DLSYM(clCreateContext); + PADDLE_CL_ASSIGN_FROM_DLSYM(clCreateContextFromType); + PADDLE_CL_ASSIGN_FROM_DLSYM(clReleaseContext); + 
PADDLE_CL_ASSIGN_FROM_DLSYM(clRetainCommandQueue); + PADDLE_CL_ASSIGN_FROM_DLSYM(clEnqueueUnmapMemObject); + PADDLE_CL_ASSIGN_FROM_DLSYM(clRetainMemObject); + PADDLE_CL_ASSIGN_FROM_DLSYM(clReleaseMemObject); + PADDLE_CL_ASSIGN_FROM_DLSYM(clGetDeviceInfo); + PADDLE_CL_ASSIGN_FROM_DLSYM(clGetDeviceIDs); + PADDLE_CL_ASSIGN_FROM_DLSYM(clRetainDevice); + PADDLE_CL_ASSIGN_FROM_DLSYM(clReleaseDevice); + PADDLE_CL_ASSIGN_FROM_DLSYM(clRetainEvent); + PADDLE_CL_ASSIGN_FROM_DLSYM(clGetKernelWorkGroupInfo); + PADDLE_CL_ASSIGN_FROM_DLSYM(clGetEventInfo); + PADDLE_CL_ASSIGN_FROM_DLSYM(clGetEventProfilingInfo); + PADDLE_CL_ASSIGN_FROM_DLSYM(clGetImageInfo); + +#undef PADDLE_CL_ASSIGN_FROM_DLSYM + + return handle; +} + +} // namespace lite +} // namespace paddle + +CL_API_ENTRY cl_event clCreateUserEvent(cl_context context, cl_int *errcode_ret) + CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clCreateUserEvent; + if (func != nullptr) { + return func(context, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } +} + +// Platform APIs +CL_API_ENTRY cl_int +clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, + cl_uint *num_platforms) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clGetPlatformIDs; + if (func != nullptr) { + return func(num_entries, platforms, num_platforms); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int +clGetPlatformInfo(cl_platform_id platform, cl_platform_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clGetPlatformInfo; + if (func != nullptr) { + return func(platform, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } +} + +// Device APIs +CL_API_ENTRY cl_int clGetDeviceIDs( + cl_platform_id platform, cl_device_type device_type, cl_uint num_entries, + cl_device_id *devices, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clGetDeviceIDs; + if (func != nullptr) { + return func(platform, device_type, num_entries, devices, num_devices); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int +clGetDeviceInfo(cl_device_id device, cl_device_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clGetDeviceInfo; + if (func != nullptr) { + return func(device, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clRetainDevice(cl_device_id device) + CL_API_SUFFIX__VERSION_1_2 { + auto func = paddle::lite::OpenCLLibrary::Get()->clRetainDevice; + if (func != nullptr) { + return func(device); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clReleaseDevice(cl_device_id device) + CL_API_SUFFIX__VERSION_1_2 { + auto func = paddle::lite::OpenCLLibrary::Get()->clReleaseDevice; + if (func != nullptr) { + return func(device); + } else { + return CL_INVALID_PLATFORM; + } +} + +// Context APIs +CL_API_ENTRY cl_context clCreateContext( + const cl_context_properties *properties, cl_uint num_devices, + const cl_device_id *devices, + void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), + void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { + 
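// Note: every exported wrapper in this file follows the same pattern as
+  // below: forward to the dlsym-resolved pointer when the vendor library
+  // provides the symbol, otherwise fail softly with CL_INVALID_PLATFORM (or
+  // a null handle) instead of crashing. +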
auto func = paddle::lite::OpenCLLibrary::Get()->clCreateContext; + if (func != nullptr) { + return func(properties, num_devices, devices, pfn_notify, user_data, + errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } +} + +CL_API_ENTRY cl_context clCreateContextFromType( + const cl_context_properties *properties, cl_device_type device_type, + void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), + void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clCreateContextFromType; + if (func != nullptr) { + return func(properties, device_type, pfn_notify, user_data, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } +} + +CL_API_ENTRY cl_int clRetainContext(cl_context context) + CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clRetainContext; + if (func != nullptr) { + return func(context); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clReleaseContext(cl_context context) + CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clReleaseContext; + if (func != nullptr) { + return func(context); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int +clGetContextInfo(cl_context context, cl_context_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clGetContextInfo; + if (func != nullptr) { + return func(context, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } +} + +// Program Object APIs +CL_API_ENTRY cl_program clCreateProgramWithSource( + cl_context context, cl_uint count, const char **strings, + const size_t *lengths, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clCreateProgramWithSource; + if (func != nullptr) { + return func(context, count, strings, lengths, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } +} + +CL_API_ENTRY cl_program clCreateProgramWithBinary( + cl_context context, cl_uint num_devices, const cl_device_id *device_list, + const size_t *lengths, const unsigned char **binaries, + cl_int *binary_status, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clCreateProgramWithBinary; + if (func != nullptr) { + return func(context, num_devices, device_list, lengths, binaries, + binary_status, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } +} + +CL_API_ENTRY cl_int +clGetProgramInfo(cl_program program, cl_program_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clGetProgramInfo; + if (func != nullptr) { + return func(program, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clGetProgramBuildInfo( + cl_program program, cl_device_id device, cl_program_build_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clGetProgramBuildInfo; + if (func != 
nullptr) { + return func(program, device, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clRetainProgram(cl_program program) + CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clRetainProgram; + if (func != nullptr) { + return func(program); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clReleaseProgram(cl_program program) + CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clReleaseProgram; + if (func != nullptr) { + return func(program); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clBuildProgram( + cl_program program, cl_uint num_devices, const cl_device_id *device_list, + const char *options, + void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), + void *user_data) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clBuildProgram; + if (func != nullptr) { + return func(program, num_devices, device_list, options, pfn_notify, + user_data); + } else { + return CL_INVALID_PLATFORM; + } +} + +// Kernel Object APIs +CL_API_ENTRY cl_kernel +clCreateKernel(cl_program program, const char *kernel_name, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clCreateKernel; + if (func != nullptr) { + return func(program, kernel_name, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } +} + +CL_API_ENTRY cl_int clRetainKernel(cl_kernel kernel) + CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clRetainKernel; + if (func != nullptr) { + return func(kernel); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clReleaseKernel(cl_kernel kernel) + CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clReleaseKernel; + if (func != nullptr) { + return func(kernel); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clSetKernelArg(cl_kernel kernel, cl_uint arg_index, + size_t arg_size, const void *arg_value) + CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clSetKernelArg; + if (func != nullptr) { + return func(kernel, arg_index, arg_size, arg_value); + } else { + return CL_INVALID_PLATFORM; + } +} + +// Memory Object APIs +CL_API_ENTRY cl_mem +clCreateBuffer(cl_context context, cl_mem_flags flags, size_t size, + void *host_ptr, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clCreateBuffer; + if (func != nullptr) { + return func(context, flags, size, host_ptr, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } +} + +CL_API_ENTRY cl_mem clCreateImage( + cl_context context, cl_mem_flags flags, const cl_image_format *image_format, + const cl_image_desc *image_desc, void *host_ptr, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2 { + auto func = paddle::lite::OpenCLLibrary::Get()->clCreateImage; + if (func != nullptr) { + return func(context, flags, image_format, image_desc, host_ptr, + errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } +} + +CL_API_ENTRY cl_int clRetainMemObject(cl_mem memobj) + CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clRetainMemObject; + if (func != nullptr) { + return func(memobj); + } else { + return 
+CL_API_ENTRY cl_int clRetainMemObject(cl_mem memobj)
+    CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clRetainMemObject;
+  if (func != nullptr) {
+    return func(memobj);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+CL_API_ENTRY cl_int clReleaseMemObject(cl_mem memobj)
+    CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clReleaseMemObject;
+  if (func != nullptr) {
+    return func(memobj);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+CL_API_ENTRY cl_int clGetImageInfo(cl_mem image, cl_image_info param_name,
+                                   size_t param_value_size, void *param_value,
+                                   size_t *param_value_size_ret)
+    CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clGetImageInfo;
+  if (func != nullptr) {
+    return func(image, param_name, param_value_size, param_value,
+                param_value_size_ret);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+// Command Queue APIs
+CL_API_ENTRY cl_command_queue clCreateCommandQueueWithProperties(
+    cl_context context, cl_device_id device,
+    const cl_queue_properties *properties,
+    cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_0 {
+  auto func =
+      paddle::lite::OpenCLLibrary::Get()->clCreateCommandQueueWithProperties;
+  if (func != nullptr) {
+    return func(context, device, properties, errcode_ret);
+  } else {
+    // Fix MediaTek MT6771 OpenCL driver breakage by falling back to the
+    // deprecated OpenCL 1.2 entry point.
+    VLOG(3) << "Fallback to clCreateCommandQueue";
+    // `properties` may legally be NULL, which requests a default queue, so
+    // guard before dereferencing it.
+    cl_command_queue_properties props = 0;
+    if (properties != nullptr) {
+      if (properties[0] != CL_QUEUE_PROPERTIES) {
+        LOG(FATAL) << "Unsupported queue properties; check the calling code";
+        if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM;
+        return nullptr;
+      }
+      // When called through OpenCL-CLHPP, the caller passes
+      // CL_QUEUE_PROPERTIES followed by the property bitfield in
+      // properties[1].
+      props = static_cast<cl_command_queue_properties>(properties[1]);
+    }
+#pragma GCC diagnostic push  // disable the warning for both clang and gcc
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+    return clCreateCommandQueue(context, device, props, errcode_ret);
+#pragma GCC diagnostic pop
+  }
+}
+
+CL_API_ENTRY cl_int clRetainCommandQueue(cl_command_queue command_queue)
+    CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clRetainCommandQueue;
+  if (func != nullptr) {
+    return func(command_queue);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+CL_API_ENTRY cl_int clReleaseCommandQueue(cl_command_queue command_queue)
+    CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clReleaseCommandQueue;
+  if (func != nullptr) {
+    return func(command_queue);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+// Enqueued Commands APIs
+CL_API_ENTRY cl_int clEnqueueReadBuffer(
+    cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read,
+    size_t offset, size_t size, void *ptr, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list,
+    cl_event *event) CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clEnqueueReadBuffer;
+  if (func != nullptr) {
+    return func(command_queue, buffer, blocking_read, offset, size, ptr,
+                num_events_in_wait_list, event_wait_list, event);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+CL_API_ENTRY cl_int clEnqueueReadImage(
+    cl_command_queue command_queue, cl_mem image, cl_bool blocking_read,
+    const size_t *origin, const size_t *region, size_t row_pitch,
+    size_t slice_pitch, void *ptr, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list,
+    cl_event *event) CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clEnqueueReadImage;
+  if (func != nullptr) {
+    return func(command_queue, image, blocking_read, origin, region, row_pitch,
+                slice_pitch, ptr, num_events_in_wait_list, event_wait_list,
+                event);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+CL_API_ENTRY cl_int clEnqueueWriteBuffer(
+    cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write,
+    size_t offset, size_t size, const void *ptr,
+    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+    cl_event *event) CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clEnqueueWriteBuffer;
+  if (func != nullptr) {
+    return func(command_queue, buffer, blocking_write, offset, size, ptr,
+                num_events_in_wait_list, event_wait_list, event);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+CL_API_ENTRY void *clEnqueueMapBuffer(
+    cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map,
+    cl_map_flags map_flags, size_t offset, size_t size,
+    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+    cl_event *event, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clEnqueueMapBuffer;
+  if (func != nullptr) {
+    return func(command_queue, buffer, blocking_map, map_flags, offset, size,
+                num_events_in_wait_list, event_wait_list, event, errcode_ret);
+  } else {
+    if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM;
+    return nullptr;
+  }
+}
+
+CL_API_ENTRY void *clEnqueueMapImage(
+    cl_command_queue command_queue, cl_mem image, cl_bool blocking_map,
+    cl_map_flags map_flags, const size_t *origin, const size_t *region,
+    size_t *image_row_pitch, size_t *image_slice_pitch,
+    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+    cl_event *event, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clEnqueueMapImage;
+  if (func != nullptr) {
+    return func(command_queue, image, blocking_map, map_flags, origin, region,
+                image_row_pitch, image_slice_pitch, num_events_in_wait_list,
+                event_wait_list, event, errcode_ret);
+  } else {
+    if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM;
+    return nullptr;
+  }
+}
+
+CL_API_ENTRY cl_int clEnqueueUnmapMemObject(
+    cl_command_queue command_queue, cl_mem memobj, void *mapped_ptr,
+    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+    cl_event *event) CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clEnqueueUnmapMemObject;
+  if (func != nullptr) {
+    return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list,
+                event_wait_list, event);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+CL_API_ENTRY cl_int clGetKernelWorkGroupInfo(
+    cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name,
+    size_t param_value_size, void *param_value,
+    size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clGetKernelWorkGroupInfo;
+  if (func != nullptr) {
+    return func(kernel, device, param_name, param_value_size, param_value,
+                param_value_size_ret);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+CL_API_ENTRY cl_int clEnqueueNDRangeKernel(
+    cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim,
+    const size_t *global_work_offset, const size_t *global_work_size,
+    const size_t *local_work_size, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list,
+    cl_event *event) CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clEnqueueNDRangeKernel;
+  if (func != nullptr) {
+    return func(command_queue, kernel, work_dim, global_work_offset,
+                global_work_size, local_work_size, num_events_in_wait_list,
+                event_wait_list, event);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+// Event Object APIs
+CL_API_ENTRY cl_int clWaitForEvents(
+    cl_uint num_events, const cl_event *event_list) CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clWaitForEvents;
+  if (func != nullptr) {
+    return func(num_events, event_list);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+CL_API_ENTRY cl_int clRetainEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clRetainEvent;
+  if (func != nullptr) {
+    return func(event);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+CL_API_ENTRY cl_int clReleaseEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clReleaseEvent;
+  if (func != nullptr) {
+    return func(event);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+// Event API
+CL_API_ENTRY cl_int clGetEventInfo(cl_event event, cl_event_info param_name,
+                                   size_t param_value_size, void *param_value,
+                                   size_t *param_value_size_ret)
+    CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clGetEventInfo;
+  if (func != nullptr) {
+    return func(event, param_name, param_value_size, param_value,
+                param_value_size_ret);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+// Profiling APIs
+CL_API_ENTRY cl_int clGetEventProfilingInfo(
+    cl_event event, cl_profiling_info param_name, size_t param_value_size,
+    void *param_value,
+    size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clGetEventProfilingInfo;
+  if (func != nullptr) {
+    return func(event, param_name, param_value_size, param_value,
+                param_value_size_ret);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+// Flush and Finish APIs
+CL_API_ENTRY cl_int clFlush(cl_command_queue command_queue)
+    CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clFlush;
+  if (func != nullptr) {
+    return func(command_queue);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+CL_API_ENTRY cl_int clFinish(cl_command_queue command_queue)
+    CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clFinish;
+  if (func != nullptr) {
+    return func(command_queue);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+// Deprecated OpenCL 1.1 APIs
+CL_API_ENTRY /* CL_EXT_PREFIX__VERSION_1_1_DEPRECATED */ cl_mem clCreateImage2D(
+    cl_context context, cl_mem_flags flags, const cl_image_format *image_format,
+    size_t image_width, size_t image_height, size_t image_row_pitch,
+    void *host_ptr,
+    cl_int *errcode_ret) /* CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED */ {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clCreateImage2D;
+  if (func != nullptr) {
+    return func(context, flags, image_format, image_width, image_height,
+                image_row_pitch, host_ptr, errcode_ret);
+  } else {
+    if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM;
+    return nullptr;
+  }
+}
+
+// APIs deprecated in OpenCL 2.0
+CL_API_ENTRY /*CL_EXT_PREFIX__VERSION_1_2_DEPRECATED*/ cl_command_queue
+clCreateCommandQueue(cl_context context, cl_device_id device,
+                     cl_command_queue_properties properties,
+                     cl_int *errcode_ret)
+/* CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED */ {  // NOLINT
+  auto func = paddle::lite::OpenCLLibrary::Get()->clCreateCommandQueue;
+  if (func != nullptr) {
+    return func(context, device, properties, errcode_ret);
+  } else {
+    if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM;
+    return nullptr;
+  }
+}
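Every stub above follows the same trampoline pattern: look up the real entry point on paddle::lite::OpenCLLibrary, forward the call when the symbol was resolved, and otherwise report CL_INVALID_PLATFORM (written through errcode_ret for the clCreate* APIs that return handles). OpenCLLibrary itself is not shown in this hunk; it is presumably a singleton that dlopen()s the vendor's OpenCL library once and dlsym()s each symbol. A minimal sketch of such a loader, with a hypothetical class name, library path, and a single representative member:

// Sketch only: the kind of dlopen/dlsym loader the stubs above depend on.
// OpenCLLoaderSketch and the soname below are hypothetical; the actual
// paddle::lite::OpenCLLibrary is defined elsewhere in this patch.
#include <dlfcn.h>
#include <CL/cl.h>

class OpenCLLoaderSketch {
 public:
  static OpenCLLoaderSketch* Get() {
    static OpenCLLoaderSketch instance;  // resolve symbols once per process
    return &instance;
  }

  // Remains nullptr when the vendor library or the symbol is missing, which
  // is exactly the condition every stub above checks before forwarding.
  using clFinishFunc = cl_int (*)(cl_command_queue);
  clFinishFunc clFinish{nullptr};

 private:
  OpenCLLoaderSketch() {
    // Common soname on Android; production loaders probe several vendor paths.
    handle_ = dlopen("libOpenCL.so", RTLD_NOW | RTLD_LOCAL);
    if (handle_ == nullptr) return;
    clFinish = reinterpret_cast<clFinishFunc>(dlsym(handle_, "clFinish"));
  }

  void* handle_{nullptr};
};

Routing every call through trampolines like this removes the link-time dependency on libOpenCL.so, so the same binary can still start on devices that ship no OpenCL driver at all.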
diff --git a/paddle/fluid/lite/operators/use_ops.h b/paddle/fluid/lite/operators/use_ops.h
index 933b3c849a390c335bd914c476c61636c607aa41..316e08ad4784849865b3d7722dfb7d1935d51247 100644
--- a/paddle/fluid/lite/operators/use_ops.h
+++ b/paddle/fluid/lite/operators/use_ops.h
@@ -13,9 +13,10 @@
 // limitations under the License.
 
 #pragma once
-/*
- * ATTENTION this header file can only include in .cc file.
- */
+
+// ATTENTION: this header can only be included from a .cc file.
+
+#include "paddle/fluid/lite/core/op_registry.h"
 
 USE_LITE_OP(mul);
 USE_LITE_OP(fc);
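The ATTENTION note exists because USE_LITE_OP-style macros typically expand to a reference to a symbol emitted by the op's registration, which forces the linker to keep statically registered operators; expanding that reference from a header would duplicate the definition in every translation unit that includes it. A generic sketch of this touch-symbol pattern (macro and symbol names are hypothetical, not Paddle's actual op_registry.h):

// Generic "touch symbol" registration sketch. Both halves live in one file
// here for self-containment; in practice they sit in different .cc files.
#include <iostream>

// Registration side (normally in the op's own .cc, e.g. mul_op.cc): define a
// function the linker must keep once something references it.
#define REGISTER_DEMO_OP(op__) \
  int demo_touch_op_##op__() { return 0; }

// Consumer side (normally in the user's .cc): reference that function and
// store the result in a TU-local int. Expanding this in a header would
// create one such definition per includer, hence the .cc-only rule.
#define USE_DEMO_OP(op__)            \
  extern int demo_touch_op_##op__(); \
  static int demo_use_op_##op__ = demo_touch_op_##op__()

REGISTER_DEMO_OP(mul)
USE_DEMO_OP(mul);

int main() {
  std::cout << "mul kept by the linker: " << demo_use_op_mul << "\n";
  return 0;
}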
diff --git a/paddle/fluid/lite/tools/build.sh b/paddle/fluid/lite/tools/build.sh
index 4436d91cdfd782ac6cbed9768c85a7bf01bead71..fe956a0384554ea2d2d065c5bd231cbd6d646ecb 100755
--- a/paddle/fluid/lite/tools/build.sh
+++ b/paddle/fluid/lite/tools/build.sh
@@ -25,6 +25,23 @@ function cmake_x86 {
     cmake .. -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DLITE_WITH_X86=ON ${common_flags}
 }
 
+function cmake_opencl {
+    # $1: ARM_TARGET_OS, one of "android", "armlinux"
+    # $2: ARM_TARGET_ARCH_ABI, one of "arm64-v8a", "armeabi-v7a", "armeabi-v7a-hf"
+    cmake .. \
+        -DLITE_WITH_OPENCL=ON \
+        -DWITH_GPU=OFF \
+        -DWITH_MKL=OFF \
+        -DWITH_LITE=ON \
+        -DLITE_WITH_CUDA=OFF \
+        -DLITE_WITH_X86=OFF \
+        -DLITE_WITH_ARM=ON \
+        -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
+        -DWITH_TESTING=ON \
+        -DARM_TARGET_OS=$1 -DARM_TARGET_ARCH_ABI=$2
+}
+
+
 # This method is only called in CI.
 function cmake_x86_for_CI {
     prepare_for_codegen # fake an empty __generated_code__.cc to pass cmake.
@@ -85,8 +102,8 @@ function build_test_server {
 
 # test_arm_android
 function test_arm_android {
-    test_name=$1
-    port=$2
+    local test_name=$1
+    local port=$2
     if [[ "${test_name}x" == "x" ]]; then
         echo "test_name can not be empty"
         exit 1
@@ -99,12 +116,18 @@ function test_arm_android {
     echo "test name: ${test_name}"
     adb_work_dir="/data/local/tmp"
-    skip_list=("test_model_parser_lite" "test_mobilenetv1_lite" "test_mobilenetv2_lite" "test_resnet50_lite" "test_inceptionv4_lite")
+    skip_list=("test_model_parser_lite" "test_mobilenetv1_lite" "test_mobilenetv2_lite" "test_resnet50_lite" "test_inceptionv4_lite" "test_light_api_lite" "test_apis_lite")
     for skip_name in ${skip_list[@]} ; do
         [[ $skip_name =~ (^|[[:space:]])$test_name($|[[:space:]]) ]] && echo "skip $test_name" && return
     done
 
-    testpath=$(find ./paddle/fluid -name ${test_name})
+    local testpath=$(find ./paddle/fluid -name ${test_name})
+
+    # if [[ "$test_name" == "test_light_api" ]]; then
+    #     local model_path=$(find . -name "lite_naive_model")
+    #     arm_push_necessary_file $port $model_path $adb_work_dir
+    # fi
+
     adb -s emulator-${port} push ${testpath} ${adb_work_dir}
     adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}"
     adb -s emulator-${port} shell "./${adb_work_dir}/${test_name}"
@@ -204,6 +227,7 @@ function test_arm {
     abi=$2
     lang=$3
    port=$4
+
     if [[ ${os} == "armlinux" ]]; then
         # TODO(hongming): enable test armlinux on armv8, armv7 and armv7hf
         echo "Skip test arm linux yet. armlinux must in another docker"
@@ -214,13 +238,14 @@ function test_arm {
         echo "android do not need armv7hf"
         return 0
     fi
-
+    # TODO(yuanshuai): enable armv7 on android
     if [[ ${abi} == "armv7" ]]; then
         echo "skip android v7 test yet"
         return 0
     fi
+
     echo "test file: ${TESTS_FILE}"
     for _test in $(cat $TESTS_FILE); do
         test_arm_android $_test $port
@@ -235,13 +260,21 @@ function prepare_emulator {
     adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
     # start android armv8 and armv7 emulators first
     echo n | avdmanager create avd -f -n paddle-armv8 -k "system-images;android-24;google_apis;arm64-v8a"
-    echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv8 -noaudio -no-window -gpu off -verbose -port ${port_armv8} &
+    echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv8 -noaudio -no-window -gpu off -port ${port_armv8} &
     sleep 1m
     echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a"
-    echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -verbose -port ${port_armv7} &
+    echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -port ${port_armv7} &
     sleep 1m
 }
 
+function arm_push_necessary_file {
+    local port=$1
+    local testpath=$2
+    local adb_work_dir=$3
+
+    adb -s emulator-${port} push ${testpath} ${adb_work_dir}
+}
+
 # We split the arm unittests into several sub-tasks to parallelize and reduce the overall CI time.
 # sub-task1
@@ -286,20 +319,22 @@ function build_test_arm_subtask_armlinux {
     prepare_emulator $port_armv8 $port_armv7
 
+    cur=$PWD
+
     # job 5
-    build_arm "armlinux" "armv8"
-    test_arm "armlinux" "armv8"
-    cd -
+    build_arm "armlinux" "armv8" "gcc" $port_armv8
+    test_arm "armlinux" "armv8" "gcc" $port_armv8
+    cd $cur
     # job 6
-    build_arm "armlinux" "armv7"
-    test_arm "armlinux" "armv7"
-    cd -
+    build_arm "armlinux" "armv7" "gcc" $port_armv8
+    test_arm "armlinux" "armv7" "gcc" $port_armv8
+    cd $cur
     # job 7
-    build_arm "armlinux" "armv7hf"
-    test_arm "armlinux" "armv7hf"
-    cd -
+    build_arm "armlinux" "armv7hf" "gcc" $port_armv8
+    test_arm "armlinux" "armv7hf" "gcc" $port_armv8
+    cd $cur
 
     adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
     echo "Done"
@@ -333,6 +368,22 @@ function build_test_arm_subtask_model {
     echo "Done"
 }
 
+
+# This test loads a model, optimizes it, and checks the prediction results of both the cxx and light APIs.
+function test_arm_predict_apis {
+    local port=$1
+    local workspace=$2
+    local naive_model_path=$3
+    local api_test_path=$(find . -name "test_apis_lite")
+    # the model is pushed to ./lite_naive_model
+    adb -s emulator-${port} push ${naive_model_path} ${workspace}
+    adb -s emulator-${port} push $api_test_path ${workspace}
+
+    # test cxx_api first to store the optimized model.
+    adb -s emulator-${port} shell ./test_apis_lite --model_dir ./lite_naive_model --optimized_model ./lite_naive_model_opt
+}
+
+
 # Build the code and run lite arm tests. This is executed in the CI system.
 function build_test_arm {
     ########################################################################
@@ -404,6 +455,10 @@ function main {
             cmake_x86
             shift
             ;;
+        cmake_opencl)
+            cmake_opencl $ARM_OS $ARM_ABI
+            shift
+            ;;
         cmake_cuda)
             cmake_cuda
             shift
diff --git a/paddle/fluid/lite/utils/io.h b/paddle/fluid/lite/utils/io.h
index 4dba6f984292235d3f947477b09152bc37c2adb9..86161a4b1ab7139795d777cb6a8f266835bcd680 100644
--- a/paddle/fluid/lite/utils/io.h
+++ b/paddle/fluid/lite/utils/io.h
@@ -18,11 +18,13 @@
+#include <sys/stat.h>  // mkdir, used by MkDirRecur on ARM
 #include <fstream>
 #include <string>
 #include "paddle/fluid/lite/utils/cp_logging.h"
+#include "paddle/fluid/lite/utils/string.h"
 
 namespace paddle {
 namespace lite {
 
-static bool IsFileExists(const std::string &path) {
+static bool IsFileExists(const std::string& path) {
   std::ifstream file(path);
   bool res = file.is_open();
   if (res) {
@@ -31,5 +33,15 @@ static bool IsFileExists(const std::string &path) {
   return res;
 }
 
+// ARM mobile cannot use system("mkdir -p"), so call mkdir directly there.
+static void MkDirRecur(const std::string& path) {
+#ifndef LITE_WITH_ARM
+  CHECK_EQ(system(string_format("mkdir -p %s", path.c_str()).c_str()), 0)
+      << "Can't mkdir " << path;
+#else   // On ARM
+  CHECK_NE(mkdir(path.c_str(), S_IRWXU), -1) << "Can't mkdir " << path;
+#endif
+}
+
 }  // namespace lite
 }  // namespace paddle
diff --git a/paddle/fluid/lite/utils/string.h b/paddle/fluid/lite/utils/string.h
index 31b131276bfa220f85a9a7606d504b6d330425b2..5e918bf5f841b3f8d18ccf9ff94721534ec6a698 100644
--- a/paddle/fluid/lite/utils/string.h
+++ b/paddle/fluid/lite/utils/string.h
@@ -74,5 +74,15 @@ static std::string Repr(const std::vector<std::string>& v) {
   return "{" + Join(tmp, ",") + "}";
 }
 
+static std::vector<std::string> Split(const std::string& s, char delim) {
+  std::stringstream ss(s);
+  std::string line;
+  std::vector<std::string> res;
+  while (std::getline(ss, line, delim)) {
+    res.push_back(line);
+  }
+  return res;
+}
+
 }  // namespace lite
 }  // namespace paddle
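Note that on ARM the #else branch of MkDirRecur calls mkdir(2) exactly once, so it creates only the final path component and its CHECK_NE aborts when the directory already exists; the Split helper added above is the piece a fully recursive variant would need. A self-contained sketch of that variant, duplicating Split for illustration and using the hypothetical name RecurMkdirSketch (not part of this patch):

// Sketch: Split's behavior, plus a recursive mkdir built on top of it.
#include <sys/stat.h>
#include <cerrno>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Same behavior as the Split added to paddle/fluid/lite/utils/string.h.
std::vector<std::string> Split(const std::string& s, char delim) {
  std::stringstream ss(s);
  std::string line;
  std::vector<std::string> res;
  while (std::getline(ss, line, delim)) res.push_back(line);
  return res;
}

// Hypothetical recursive variant: create each component in turn, treating
// EEXIST as success so already-present directories do not abort the run.
void RecurMkdirSketch(const std::string& path) {
  std::string accum = (!path.empty() && path.front() == '/') ? "/" : "";
  for (const auto& comp : Split(path, '/')) {
    if (comp.empty()) continue;  // skips the leading "" from absolute paths
    accum += comp + "/";
    if (mkdir(accum.c_str(), S_IRWXU) == -1 && errno != EEXIST) {
      std::cerr << "can't mkdir " << accum << "\n";
      return;
    }
  }
}

int main() {
  for (const auto& part : Split("a/b/c", '/')) {
    std::cout << part << "\n";  // prints "a", "b", "c" on separate lines
  }
  RecurMkdirSketch("a/b/c");  // creates ./a, ./a/b, ./a/b/c
  return 0;
}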