Commit a95dcea5 authored by hong19860320

Merge branch 'incubate/lite' of http://10.87.145.36/inference/paddlelite into hongming/fix_pooling

......@@ -76,8 +76,8 @@ build:mobile_android:
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- build.lite.android.armv8.gcc/third_party
- build.lite.android.armv7.gcc/third_party
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_ccache
script:
......@@ -96,8 +96,9 @@ build:mobile_armlinux:
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- build.lite.armlinux.armv8.gcc
- build.lite.armlinux.armv7.gcc
- build.lite.armlinux.armv7hf.gcc
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_ccache2
script:
......@@ -107,24 +108,13 @@ build:mobile_armlinux:
dependencies:
- build:server
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
build:mobile_model_mobilenetv1:
tags:
- lite
stage: build_mobile
image: $MOBILE_LITE_DOCKER_IMAGE
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
script:
- export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model_mobilenetv1
- ./paddle/fluid/lite/tools/build.sh build_test_arm_model_mobilenetv1
......@@ -135,8 +125,7 @@ build:mobile_model_mobilenetv1:
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- build.lite.android.armv8.gcc
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_model_mobilenetv1
......@@ -145,12 +134,7 @@ build:mobile_model_mobilenetv2:
- lite
stage: build_mobile
image: $MOBILE_LITE_DOCKER_IMAGE
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
script:
- export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model_mobilenetv2
- ./paddle/fluid/lite/tools/build.sh build_test_arm_model_mobilenetv2
......@@ -161,8 +145,7 @@ build:mobile_model_mobilenetv2:
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- build.lite.android.armv8.gcc
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_model_mobilenetv2
......@@ -171,12 +154,7 @@ build:mobile_model_resnet50:
- lite
stage: build_mobile
image: $MOBILE_LITE_DOCKER_IMAGE
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
script:
- export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model_resnet50
- ./paddle/fluid/lite/tools/build.sh build_test_arm_model_resnet50
......@@ -187,8 +165,7 @@ build:mobile_model_resnet50:
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- build.lite.android.armv8.gcc
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_model_resnet50
......
......@@ -150,6 +150,7 @@ option(WITH_LITE "Enable lite framework" OFF)
option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF)
option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OFF)
option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF)
......@@ -181,6 +182,12 @@ if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
include(external/eigen) # download eigen3
include(ccache) # set ccache for compilation
# for opencl
if (LITE_WITH_OPENCL)
include(external/opencl-headers)
include(external/opencl-clhpp)
endif()
include(generic) # simplify cmake module
include(configure) # add paddle env configuration
......
......@@ -176,6 +176,10 @@ if (LITE_WITH_ARM)
add_definitions("-DLITE_WITH_ARM")
endif()
if (LITE_WITH_OPENCL)
add_definitions("-DLITE_WITH_OPENCL")
endif()
if (LITE_WITH_PROFILE)
add_definitions("-DLITE_WITH_PROFILE")
endif()
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
INCLUDE(ExternalProject)

# Download/install locations for the Khronos OpenCL C++ bindings (CL/cl2.hpp).
SET(OPENCL_CLHPP_SRCS_DIR ${THIRD_PARTY_PATH}/opencl-clhpp)
SET(OPENCL_CLHPP_INSTALL_DIR ${THIRD_PARTY_PATH}/install/opencl-clhpp)
SET(OPENCL_CLHPP_INCLUDE_DIR "${OPENCL_CLHPP_INSTALL_DIR}" CACHE PATH "opencl-clhpp include directory." FORCE)

INCLUDE_DIRECTORIES(${OPENCL_CLHPP_INCLUDE_DIR})

ExternalProject_Add(
    opencl_clhpp
    # Log download/configure output to files instead of the console, for
    # consistency with the opencl_headers external project.
    ${EXTERNAL_PROJECT_LOG_ARGS}
    GIT_REPOSITORY    "https://github.com/KhronosGroup/OpenCL-CLHPP.git"
    GIT_TAG           "v2.0.10"
    PREFIX            "${OPENCL_CLHPP_SRCS_DIR}"
    CMAKE_ARGS        -DBUILD_DOCS=OFF
                      -DBUILD_EXAMPLES=OFF
                      -DBUILD_TESTS=OFF
                      -DCMAKE_INSTALL_PREFIX=${OPENCL_CLHPP_INSTALL_DIR}
    CMAKE_CACHE_ARGS  -DCMAKE_INSTALL_PREFIX:PATH=${OPENCL_CLHPP_INSTALL_DIR}
                      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)

# The C++ bindings include CL/cl.h, so the C headers must be fetched first.
ADD_DEPENDENCIES(opencl_clhpp opencl_headers)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
INCLUDE(ExternalProject)
# Fetch the Khronos OpenCL C API headers (CL/cl.h etc.) at a pinned commit.
# The project is header-only, so the configure/build/install/test steps are
# disabled and the include path points straight into the git checkout.
SET(OPENCL_HEADERS_SRCS_DIR ${THIRD_PARTY_PATH}/opencl-headers)
SET(OPENCL_HEADERS_INCLUDE_DIR "${OPENCL_HEADERS_SRCS_DIR}/src/opencl_headers" CACHE PATH "opencl-headers include directory." FORCE)
INCLUDE_DIRECTORIES(${OPENCL_HEADERS_INCLUDE_DIR})
ExternalProject_Add(
opencl_headers
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/KhronosGroup/OpenCL-Headers.git"
GIT_TAG "c5a4bbeabb10d8ed3d1c651b93aa31737bc473dd"
PREFIX ${OPENCL_HEADERS_SRCS_DIR}
DOWNLOAD_NAME "OpenCL-Headers"
# Header-only project: nothing to configure, build, install or test.
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
......@@ -24,8 +24,7 @@ function(lite_download_and_uncompress INSTALL_DIR URL FILENAME)
${EXTERNAL_PROJECT_NAME}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${INSTALL_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} &&
${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME}
DOWNLOAD_COMMAND wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} && ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME}
DOWNLOAD_DIR ${INSTALL_DIR}
DOWNLOAD_NO_PROGRESS 1
CONFIGURE_COMMAND ""
......@@ -143,6 +142,8 @@ function(lite_cc_binary TARGET)
HVY_DEPS ${args_HVY_DEPS}
)
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS})
# collect targets need to compile for lite
add_dependencies(lite_compile_deps ${TARGET})
endfunction()
# Add a unit-test name to file for latter offline manual test.
......@@ -181,6 +182,7 @@ add_subdirectory(x86)
add_subdirectory(arm)
add_subdirectory(host)
add_subdirectory(cuda)
add_subdirectory(opencl)
add_subdirectory(model_parser)
add_subdirectory(utils)
add_subdirectory(api)
......
......@@ -12,7 +12,6 @@ lite_cc_library(lite_api_test_helper SRCS lite_api_test_helper.cc
CUDA_DEPS kernels_cuda
X86_DEPS ${x86_kernels}
)
lite_cc_library(cxx_api_lite SRCS cxx_api.cc DEPS lite_api_test_helper)
set(light_api_deps
scope_lite target_wrapper_host model_parser_lite program_lite)
......@@ -21,27 +20,34 @@ if(LITE_WITH_CUDA)
set(light_api_deps ${light_api_deps} target_wrapper_cuda)
endif()
lite_cc_library(light_api_lite SRCS light_api.cc
DEPS ${light_api_deps} ${ops_lite} ${host_kernels}
)
message(STATUS "get ops ${ops_lite}")
message(STATUS "get Host kernels ${host_kernels}")
message(STATUS "get ARM kernels ${arm_kernels}")
lite_cc_library(cxx_api_lite SRCS cxx_api.cc DEPS ${cxx_api_lite_deps} ${ops_lite} ${host_kernels} program_lite)
lite_cc_library(light_api_lite SRCS light_api.cc
DEPS scope_lite target_wrapper_host model_parser_lite
${light_api_deps} ${ops_lite} ${host_kernels} program_lite
CUDA_DEPS target_wrapper_cuda
X86_DEPS ${x86_kernels} operator
ARM_DEPS ${arm_kernels}
)
include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
"A path setting inference demo download directories.")
if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
lite_cc_test(test_cxx_api_lite SRCS cxx_api_test.cc
DEPS cxx_api_lite mir_passes
DEPS cxx_api_lite mir_passes lite_api_test_helper
${ops_lite} ${host_kernels} ${x86_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
add_dependencies(test_cxx_api_lite extern_lite_download_lite_naive_model_tar_gz)
endif()
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
set(lite_model_test_DEPS cxx_api_lite mir_passes ${ops_lite} ${host_kernels} ${arm_kernels})
......@@ -68,25 +74,18 @@ endif()
# These tests needs CLI arguments, and is not supported in ARM CI.
# TODO(Superjomn) support latter.
if(NOT LITE_ON_MOBILE)
lite_cc_test(test_light_api SRCS light_api_test.cc
DEPS light_api_lite mir_passes
X86_DEPS ${x86_kernels}
lite_cc_test(test_light_api_lite SRCS light_api_test.cc
DEPS light_api_lite program_lite mir_passes
ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt
SERIAL)
lite_cc_test(test_apis_lite SRCS apis_test.cc
DEPS cxx_api_lite light_api_lite ${ops_lite} mir_passes
X86_DEPS ${x86_kernels}
lite_cc_test(test_apis_lite SRCS apis_test.cc
DEPS cxx_api_lite light_api_lite ${ops_lite}
X86_DEPS ${x86_kernels} operator
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
endif()
lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc
DEPS
cxx_api_lite
model_parser_lite
target_wrapper_host
mir_passes
${ops_lite} ${host_kernels}
ARM_DEPS ${arm_kernels})
#lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc
#X86_DEPS operator
#DEPS light_api_lite model_parser_lite target_wrapper_host mir_passes
#ARM_DEPS ${arm_kernels})
......@@ -39,23 +39,41 @@ void SetConstInput(lite::Tensor* x) {
}
}
bool CompareTensors(const std::string& name, const ExecutorLite& cxx_api,
bool CompareTensors(const std::string& name, const Predictor& cxx_api,
const LightPredictor& light_api) {
const auto* a = cxx_api.GetTensor(name);
const auto* b = light_api.GetTensor(name);
return TensorCompareWith(*a, *b);
}
#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
TEST(CXXApi_LightApi, optim_model) {
lite::Predictor cxx_api;
std::vector<Place> valid_places({
Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}, // Both works on X86 and ARM
});
// On ARM devices, the preferred X86 target not works, but it can still
// select ARM kernels.
cxx_api.Build(FLAGS_model_dir, Place{TARGET(kX86), PRECISION(kFloat)},
valid_places);
cxx_api.SaveModel(FLAGS_optimized_model);
}
TEST(CXXApi_LightApi, save_and_load_model) {
lite::ExecutorLite cxx_api;
lite::LightPredictor light_api;
lite::Predictor cxx_api;
lite::LightPredictor light_api(FLAGS_optimized_model);
// CXXAPi
{
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
cxx_api.Build(FLAGS_model_dir, Place{TARGET(kCUDA), PRECISION(kFloat)},
std::vector<Place> valid_places({
Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}, // Both works on X86 and ARM
});
// On ARM devices, the preferred X86 target not works, but it can still
// select ARM kernels.
cxx_api.Build(FLAGS_model_dir, Place{TARGET(kX86), PRECISION(kFloat)},
valid_places);
auto* x = cxx_api.GetInput(0);
......@@ -69,8 +87,6 @@ TEST(CXXApi_LightApi, save_and_load_model) {
// LightApi
{
light_api.Build(FLAGS_optimized_model);
auto* x = light_api.GetInput(0);
SetConstInput(x);
......@@ -89,7 +105,6 @@ TEST(CXXApi_LightApi, save_and_load_model) {
ASSERT_TRUE(CompareTensors(tensor_name, cxx_api, light_api));
}
}
#endif // LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
} // namespace lite
} // namespace paddle
......@@ -17,19 +17,66 @@
#include <string>
#include <utility>
#include <vector>
#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
#include "paddle/fluid/platform/port.h"
#endif
#include "paddle/fluid/lite/utils/io.h"
namespace paddle {
namespace lite {
#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
void ExecutorLite::SaveModel(const std::string &dir) {
MkDirRecursively(dir.c_str());
void Predictor::SaveModel(const std::string &dir) {
#ifndef LITE_WITH_ARM
MkDirRecur(dir);
#else
#endif
program_->PersistModel(dir, program_desc_);
LOG(INFO) << "Save model to " << dir;
}
// Returns a mutable pointer to the offset-th input ("feed") tensor.
// The feed list is grown on demand, so callers may set inputs in any order.
lite::Tensor *Predictor::GetInput(size_t offset) {
auto *_feed_list = program_->exec_scope()->FindVar("feed");
CHECK(_feed_list) << "no feed variable in exec_scope";
auto *feed_list = _feed_list->GetMutable<std::vector<lite::Tensor>>();
// Grow lazily; slots created here hold default-constructed tensors.
if (offset >= feed_list->size()) {
feed_list->resize(offset + 1);
}
return &feed_list->at(offset);
}
// Returns a read-only pointer to the offset-th output ("fetch") tensor.
// Unlike GetInput(), the fetch list is NOT grown on demand: requesting an
// out-of-range output is a hard error.
const lite::Tensor *Predictor::GetOutput(size_t offset) {
  auto *_fetch_list = program_->exec_scope()->FindVar("fetch");
  // Fixed typo in the error message ("fatch" -> "fetch").
  CHECK(_fetch_list) << "no fetch variable in exec_scope";
  auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
  CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
  return &fetch_list.at(offset);
}
// Loads the model from `model_path` into the predictor's scope, then runs
// the optimize-and-build pipeline (the ProgramDesc overload) with the given
// place preferences.
void Predictor::Build(const std::string &model_path, const Place &prefer_place,
const std::vector<Place> &valid_places) {
LoadModel(model_path, scope_.get(), &program_desc_);
Build(program_desc_, prefer_place, valid_places);
}
// Returns the ProgramDesc currently held by the predictor; used for
// debugging and when saving the model.
const framework::proto::ProgramDesc &Predictor::program_desc() const {
return program_desc_;
}
// Builds the runtime program from a ProgramDesc: constructs the Program over
// `valid_places`, configures kernel picking (both target and precision are
// considered), runs the optimizer, and caches the resulting runtime program.
void Predictor::Build(const framework::proto::ProgramDesc &desc,
const Place &prefer_place,
const std::vector<Place> &valid_places) {
program_desc_ = desc;
Program program(desc, scope_, valid_places);
optimizer_.KernelPickPreferPlace(prefer_place);
core::KernelPickFactor factor;
factor.ConsiderTarget();
factor.ConsiderPrecision();
// `program` is consumed by the optimizer; Run()/GetInput()/GetOutput()
// operate on the generated runtime program afterwards.
optimizer_.Run(std::move(program), valid_places, factor);
program_ = optimizer_.GenRuntimeProgram();
}
// Looks up a tensor by variable name in the execution scope.
// NOTE(review): FindVar may return nullptr for an unknown name, which would
// be dereferenced here -- confirm callers only pass names that exist.
const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
auto *var = program_->exec_scope()->FindVar(name);
return &var->Get<lite::Tensor>();
}
#endif
} // namespace lite
} // namespace paddle
......@@ -26,68 +26,39 @@
namespace paddle {
namespace lite {
struct Config {};
class ExecutorLite {
/*
* Predictor for inference, input a model, it will optimize and execute it.
*/
class Predictor {
public:
ExecutorLite() { scope_ = std::make_shared<Scope>(); }
explicit ExecutorLite(const std::shared_ptr<lite::Scope>& root_scope) {
scope_ = root_scope;
}
// Create an empty predictor.
Predictor() { scope_ = std::make_shared<Scope>(); }
// Create a predictor with the weight variable scope set.
explicit Predictor(const std::shared_ptr<lite::Scope>& root_scope)
: scope_(root_scope) {}
// Build from a model, with places set for hardware config.
void Build(const std::string& model_path, const Place& prefer_place,
const std::vector<Place>& valid_places) {
LoadModel(model_path, scope_.get(), &program_desc_);
Build(program_desc_, prefer_place, valid_places);
}
const std::vector<Place>& valid_places);
void Build(const framework::proto::ProgramDesc& desc,
const Place& prefer_place,
const std::vector<Place>& valid_places) {
program_desc_ = desc;
Program program(desc, scope_, valid_places);
optimizer_.KernelPickPreferPlace(prefer_place);
core::KernelPickFactor factor;
factor.ConsiderTarget();
optimizer_.Run(std::move(program), valid_places, factor);
program_ = optimizer_.GenRuntimeProgram();
}
const Place& prefer_place, const std::vector<Place>& valid_places);
// This method is disabled in mobile, or unnecessary dependencies required.
#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
void SaveModel(const std::string& dir);
#endif
// Get offset-th col of feed.
lite::Tensor* GetInput(size_t offset) {
auto* _feed_list = program_->exec_scope()->FindVar("feed");
CHECK(_feed_list) << "no feed variable in exec_scope";
auto* feed_list = _feed_list->GetMutable<std::vector<lite::Tensor>>();
if (offset >= feed_list->size()) {
feed_list->resize(offset + 1);
}
return &feed_list->at(offset);
}
// Run the predictor for a single batch of data.
void Run() { program_->Run(); }
const lite::Tensor* GetOutput(size_t offset) {
auto* _fetch_list = program_->exec_scope()->FindVar("fetch");
CHECK(_fetch_list) << "no fatch variable in exec_scope";
auto& fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
return &fetch_list.at(offset);
}
// Get offset-th col of feed inputs.
lite::Tensor* GetInput(size_t offset);
const lite::Tensor* GetTensor(const std::string& name) const {
auto* var = program_->exec_scope()->FindVar(name);
return &var->Get<lite::Tensor>();
}
// Get offset-th col of fetch results.
const lite::Tensor* GetOutput(size_t offset);
void Run() { program_->Run(); }
// Return the program desc for debug.
const framework::proto::ProgramDesc& program_desc() const;
const lite::Tensor* GetTensor(const std::string& name) const;
const framework::proto::ProgramDesc& program_desc() const {
return program_desc_;
}
// This method is disabled in mobile, for unnecessary dependencies required.
void SaveModel(const std::string& dir);
private:
Optimizer optimizer_;
......@@ -96,6 +67,7 @@ class ExecutorLite {
std::unique_ptr<RuntimeProgram> program_;
};
#ifdef LITE_WITH_X86
/*
* An executor for training.
*
......@@ -119,13 +91,13 @@ class CXXTrainer {
: scope_(root_scope),
preferred_place_(preferred_place),
valid_places_(valid_places),
main_program_executor_(ExecutorLite(scope_)) {}
main_program_executor_(Predictor(scope_)) {}
// Build the RuntimeProgram cache for the main program. The cache will run
// multiple times for the epoches.
// NOTE Just support to execute the 0-th block currently.
ExecutorLite& BuildMainProgramExecutor(
const framework::proto::ProgramDesc& desc, int block_id = 0) {
Predictor& BuildMainProgramExecutor(const framework::proto::ProgramDesc& desc,
int block_id = 0) {
main_program_executor_.Build(desc, preferred_place_, valid_places_);
return main_program_executor_;
}
......@@ -133,7 +105,7 @@ class CXXTrainer {
// Run the startup program. It just executes once, no cache needed.
void RunStartupProgram(const framework::proto::ProgramDesc& desc,
int block_id = 0) {
ExecutorLite exe(scope_);
Predictor exe(scope_);
exe.Build(desc, preferred_place_, valid_places_);
exe.Run();
}
......@@ -145,8 +117,9 @@ class CXXTrainer {
std::vector<Place> valid_places_;
// The training program.
ExecutorLite main_program_executor_;
Predictor main_program_executor_;
};
#endif
} // namespace lite
} // namespace paddle
......@@ -34,7 +34,7 @@ void Run(const char* model_dir, int repeat, int thread_num) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, thread_num);
#endif
lite::ExecutorLite predictor;
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
......
......@@ -42,7 +42,7 @@ TEST(CXXApi, test) {
}
TEST(CXXApi, save_model) {
lite::ExecutorLite predictor;
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
predictor.Build(FLAGS_model_dir, Place{TARGET(kCUDA), PRECISION(kFloat)},
......
......@@ -16,21 +16,20 @@
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/api/test_helper.h"
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/kernels/use_kernels.h"
#include "paddle/fluid/lite/operators/use_ops.h"
// for eval
DEFINE_string(model_dir, "", "");
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
TEST(InceptionV4, test) {
DeviceInfo::Init();
lite::ExecutorLite predictor;
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
......@@ -44,7 +43,20 @@ TEST(InceptionV4, test) {
data[i] = 1;
}
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor.Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor.Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
auto* out = predictor.GetOutput(0);
std::vector<float> results({0.00078033, 0.00083865, 0.00060029, 0.00057083,
......
......@@ -13,3 +13,67 @@
// limitations under the License.
#include "paddle/fluid/lite/api/light_api.h"
namespace paddle {
namespace lite {
// Loads an already-optimized model from `model_dir` into the scope and
// builds the runtime program from it. No MIR optimization is performed here;
// the model is expected to have been optimized by the CXX API beforehand.
void LightPredictor::Build(const std::string& model_dir) {
framework::proto::ProgramDesc desc;
LoadModel(model_dir, scope_.get(), &desc);
BuildRuntimeProgram(desc);
}
// Returns a mutable pointer to the offset-th input ("feed") tensor.
// The feed list is grown on demand, so callers may set inputs in any order.
Tensor* LightPredictor::GetInput(size_t offset) {
auto* _feed_list = program_->exec_scope()->FindVar("feed");
CHECK(_feed_list) << "no feed variable in exec_scope";
auto* feed_list = _feed_list->GetMutable<std::vector<Tensor>>();
// Grow lazily; slots created here hold default-constructed tensors.
if (offset >= feed_list->size()) {
feed_list->resize(offset + 1);
}
return &feed_list->at(offset);
}
// Returns a read-only pointer to the offset-th output ("fetch") tensor.
// The fetch list is NOT resized on demand: requesting an out-of-range
// output is a hard error.
const Tensor* LightPredictor::GetOutput(size_t offset) {
  auto* _fetch_list = program_->exec_scope()->FindVar("fetch");
  // Fixed typo in the error message ("fatch" -> "fetch").
  CHECK(_fetch_list) << "no fetch variable in exec_scope";
  auto& fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
  CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
  return &fetch_list.at(offset);
}
// Builds the RuntimeProgram directly from a serialized ProgramDesc produced
// by the full (CXX) API: each op carries a kKernelTypeAttr attribute naming
// the exact kernel (op type / alias / place) picked at optimization time, so
// no kernel-selection pass is needed here.
void LightPredictor::BuildRuntimeProgram(
const framework::proto::ProgramDesc& prog) {
std::vector<Instruction> insts;
// 1. Create op first
// An empty valid-places list: kernels come from the serialized attribute.
Program program(prog, scope_, {});
// 2. Create Instructs
// Create the kernels of the target places, and filter out the specific
// kernel with the target alias.
for (auto& op : program.ops()) {
auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr);
std::string op_type, alias;
Place place;
KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
auto kernels = op->CreateKernels({place});
// filter out a kernel
auto it = std::find_if(
kernels.begin(), kernels.end(),
[&](std::unique_ptr<KernelBase>& it) { return it->alias() == alias; });
CHECK(it != kernels.end());
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
insts.emplace_back(op, std::move(*it));
}
program_.reset(new RuntimeProgram(std::move(insts)));
// The runtime program executes in the scope created by `program` above.
CHECK(program.exec_scope());
program_->set_exec_scope(program.exec_scope());
}
// Constructs a light predictor and immediately loads and builds the
// optimized model found in `model_dir`.
LightPredictor::LightPredictor(const std::string& model_dir) {
scope_ = std::make_shared<Scope>();
Build(model_dir);
}
} // namespace lite
} // namespace paddle
......@@ -32,36 +32,21 @@
namespace paddle {
namespace lite {
/*
* The light weight predictor, mainly for mobile. It loads an optimized model,
* and will not depend on the MIR or perform latter optimization.
*/
class LightPredictor {
public:
LightPredictor() { scope_ = std::make_shared<Scope>(); }
void Build(const std::string& model_dir) {
framework::proto::ProgramDesc desc;
LoadModel(model_dir, scope_.get(), &desc);
BuildRuntimeProgram(desc);
}
explicit LightPredictor(const std::string& model_dir);
void Run() { program_->Run(); }
// Get offset-th col of feed.
Tensor* GetInput(size_t offset) {
auto* _feed_list = program_->exec_scope()->FindVar("feed");
CHECK(_feed_list) << "no feed variable in exec_scope";
auto* feed_list = _feed_list->GetMutable<std::vector<Tensor>>();
if (offset >= feed_list->size()) {
feed_list->resize(offset + 1);
}
return &feed_list->at(offset);
}
// Get offset-th col of feed inputs.
Tensor* GetInput(size_t offset);
const Tensor* GetOutput(size_t offset) {
auto* _fetch_list = program_->exec_scope()->FindVar("fetch");
CHECK(_fetch_list) << "no fatch variable in exec_scope";
auto& fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
return &fetch_list.at(offset);
}
// Get offset-th col of fetch outputs.
const Tensor* GetOutput(size_t offset);
const lite::Tensor* GetTensor(const std::string& name) const {
auto* var = program_->exec_scope()->FindVar(name);
......@@ -69,34 +54,8 @@ class LightPredictor {
}
private:
void BuildRuntimeProgram(const framework::proto::ProgramDesc& prog) {
std::vector<Instruction> insts;
// 1. Create op first
Program program(prog, scope_, {});
// 2. Create Instructs
// Create the kernels of the target places, and filter out the specific
// kernel with the target alias.
for (auto& op : program.ops()) {
auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr);
std::string op_type, alias;
Place place;
KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
auto kernels = op->CreateKernels({place});
// filter out a kernel
auto it = std::find_if(kernels.begin(), kernels.end(),
[&](std::unique_ptr<KernelBase>& it) {
return it->alias() == alias;
});
CHECK(it != kernels.end());
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
insts.emplace_back(op, std::move(*it));
}
program_.reset(new RuntimeProgram(std::move(insts)));
CHECK(program.exec_scope());
program_->set_exec_scope(program.exec_scope());
}
void Build(const std::string& model_dir);
void BuildRuntimeProgram(const framework::proto::ProgramDesc& prog);
private:
std::shared_ptr<Scope> scope_;
......
......@@ -25,8 +25,10 @@ namespace paddle {
namespace lite {
TEST(LightAPI, load) {
LightPredictor predictor;
predictor.Build(FLAGS_optimized_model);
if (FLAGS_optimized_model.empty()) {
FLAGS_optimized_model = "lite_naive_model";
}
LightPredictor predictor(FLAGS_optimized_model);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<int64_t>({100, 100})));
......
......@@ -22,7 +22,7 @@ namespace paddle {
namespace lite {
const lite::Tensor* RunHvyModel() {
lite::ExecutorLite predictor;
lite::Predictor predictor;
#ifndef LITE_WITH_CUDA
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
......
......@@ -16,21 +16,20 @@
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/api/test_helper.h"
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/kernels/use_kernels.h"
#include "paddle/fluid/lite/operators/use_ops.h"
// for eval
DEFINE_string(model_dir, "", "");
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
TEST(MobileNetV1, test) {
DeviceInfo::Init();
lite::ExecutorLite predictor;
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
......@@ -44,7 +43,20 @@ TEST(MobileNetV1, test) {
data[i] = 1;
}
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor.Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor.Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
auto* out = predictor.GetOutput(0);
std::vector<float> results({1.91308980e-04, 5.92055148e-04, 1.12303176e-04,
......
......@@ -16,21 +16,20 @@
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/api/test_helper.h"
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/kernels/use_kernels.h"
#include "paddle/fluid/lite/operators/use_ops.h"
// for eval
DEFINE_string(model_dir, "", "");
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
TEST(MobileNetV2, test) {
DeviceInfo::Init();
lite::ExecutorLite predictor;
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
......@@ -44,7 +43,20 @@ TEST(MobileNetV2, test) {
data[i] = 1;
}
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor.Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor.Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
auto* out = predictor.GetOutput(0);
std::vector<float> results({0.00097802, 0.00099822, 0.00103093, 0.00100121,
......
......@@ -16,21 +16,20 @@
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/api/test_helper.h"
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/kernels/use_kernels.h"
#include "paddle/fluid/lite/operators/use_ops.h"
// for eval
DEFINE_string(model_dir, "", "");
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
TEST(ResNet50, test) {
DeviceInfo::Init();
lite::ExecutorLite predictor;
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
......@@ -44,7 +43,20 @@ TEST(ResNet50, test) {
data[i] = 1;
}
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor.Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor.Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
auto* out = predictor.GetOutput(0);
std::vector<float> results({2.41399175e-04, 4.13724629e-04, 2.64324830e-04,
......
......@@ -13,13 +13,24 @@
// limitations under the License.
#pragma once
#include "paddle/fluid/lite/core/op_registry.h"
USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(feed, kARM, kAny, kAny, def);
USE_LITE_KERNEL(fetch, kARM, kAny, kAny, def);
#include <sys/time.h>
#include <time.h>

#include <chrono>

#include <gflags/gflags.h>
// for eval
DEFINE_string(model_dir, "", "model dir");
DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 1, "repeats times");
DEFINE_int32(threads, 1, "threads num");
namespace paddle {
namespace lite {
// Returns wall-clock time in microseconds since the Unix epoch, as a double.
// Rewritten with std::chrono for portability: the previous gettimeofday()
// implementation relied on <sys/time.h>, which this header never included.
inline double GetCurrentUS() {
  const auto since_epoch = std::chrono::system_clock::now().time_since_epoch();
  return static_cast<double>(
      std::chrono::duration_cast<std::chrono::microseconds>(since_epoch)
          .count());
}
} // namespace lite
} // namespace paddle
......@@ -35,6 +35,8 @@ cc_library(math_arm SRCS
split.cc
activation.cc
dropout.cc
gemm_prepacked_int8.cc
gemv_arm_int8.cc
DEPS ${lite_kernel_deps} eigen3 framework_proto_lite)
# TODO(TJ): fix me do not deps proto
......
......@@ -25,7 +25,7 @@ cc_library(op_registry_lite SRCS op_registry.cc DEPS framework_proto_lite)
cc_library(scope_lite SRCS scope.cc DEPS ${tensor_lite})
cc_library(cpu_info_lite SRCS cpu_info.cc)
lite_cc_library(context_lite SRCS context.cc DEPS ${tensor_lite} any_lite cpu_info_lite eigen3)
cc_library(op_lite SRCS op_lite.cc DEPS scope_lite op_registry_lite target_wrapper_lite
cc_library(op_lite SRCS op_lite.cc DEPS scope_lite op_registry_lite target_wrapper_lite kernel_lite
cpp_op_desc_lite ${tensor_lite})
cc_library(types_lite SRCS types.cc)
cc_library(type_system SRCS type_system.cc DEPS ${tensor_lite} target_wrapper_lite)
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/lite/core/kernel.h"
#include <cstdlib>
namespace paddle {
namespace lite {
......@@ -49,6 +50,36 @@ std::string KernelBase::GenParamTypeKey() const {
return ss.str();
}
// Splits a serialized kernel type string produced by SerializeKernelType
// ("op/alias/target/precision/layout", place fields encoded as integers)
// back into its components.
void KernelBase::ParseKernelType(const std::string &kernel_type,
                                 std::string *op_type, std::string *alias,
                                 Place *place) {
  std::stringstream ss(kernel_type);
  // Pulls the next '/'-delimited token out of the stream.
  auto next_token = [&ss]() {
    std::string token;
    std::getline(ss, token, '/');
    return token;
  };
  *op_type = next_token();
  *alias = next_token();
  place->target = static_cast<TargetType>(std::atoi(next_token().c_str()));
  place->precision =
      static_cast<PrecisionType>(std::atoi(next_token().c_str()));
  place->layout = static_cast<DataLayoutType>(std::atoi(next_token().c_str()));
}
// Encodes (op_type, alias, place) into the "op/alias/target/precision/layout"
// string consumed by ParseKernelType.
std::string KernelBase::SerializeKernelType(const std::string &op_type,
                                            const std::string &alias,
                                            const Place &place) {
  // The place fields are serialized as integer values rather than their
  // string representations so deserialization is a plain atoi.
  std::stringstream ss;
  ss << op_type << '/' << alias << '/'                //
     << static_cast<int>(place.target) << '/'         //
     << static_cast<int>(place.precision) << '/'      //
     << static_cast<int>(place.layout);
  return ss.str();
}
bool ParamTypeRegistry::KeyCmp::operator()(
const ParamTypeRegistry::key_t &a,
const ParamTypeRegistry::key_t &b) const {
......
......@@ -118,33 +118,11 @@ class KernelBase {
static std::string SerializeKernelType(const std::string& op_type,
const std::string& alias,
const Place& place) {
std::stringstream ss;
ss << op_type << "/";
ss << alias << "/";
// We serialize the place value not the string representation here for
// easier deserialization.
ss << static_cast<int>(place.target) << "/";
ss << static_cast<int>(place.precision) << "/";
ss << static_cast<int>(place.layout);
return ss.str();
}
const Place& place);
static void ParseKernelType(const std::string& kernel_type,
std::string* op_type, std::string* alias,
Place* place) {
std::stringstream ss(kernel_type);
std::getline(ss, *op_type, '/');
std::getline(ss, *alias, '/');
std::string target, precision, layout;
std::getline(ss, target, '/');
std::getline(ss, precision, '/');
std::getline(ss, layout, '/');
place->target = static_cast<TargetType>(std::stoi(target));
place->precision = static_cast<PrecisionType>(std::stoi(precision));
place->layout = static_cast<DataLayoutType>(std::stoi(layout));
}
Place* place);
virtual ~KernelBase() = default;
void Torch() {}
......
......@@ -28,7 +28,7 @@ namespace lite {
namespace mir {
TEST(fc_fuse_pass, fuse_test) {
lite::ExecutorLite predictor;
lite::Predictor predictor;
#ifndef LITE_WITH_CUDA
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
......@@ -69,7 +69,7 @@ TEST(fc_fuse_pass, fuse_test) {
#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
TEST(fc_fuse_pass, save_model_test) {
lite::ExecutorLite predictor;
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
predictor.Build(FLAGS_model_dir, Place{TARGET(kX86), PRECISION(kFloat)},
......
......@@ -41,7 +41,7 @@ void FuseBase::DeleteInterNodes(SSAGraph *graph) {
}
}
LOG(INFO) << "keys: " << key2nodes_.size();
VLOG(4) << "keys: " << key2nodes_.size();
std::unordered_set<const Node *> nodes2rm;
for (auto &matched : key2nodes_) {
for (const auto &key : keys) {
......
......@@ -80,6 +80,8 @@ class KernelRegistry final {
KernelRegistryForTarget<TARGET(kARM), PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kARM), PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kARM), PRECISION(kInt8),
DATALAYOUT(kNCHW)> * //
>;
......
......@@ -58,7 +58,6 @@ class Optimizer {
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
"lite_elementwise_add_activation_fuse_pass", //
#endif
"lite_fc_fuse_pass", //
"static_kernel_pick_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass", //
......
......@@ -38,6 +38,7 @@ enum class PrecisionType : int {
kUnk = 0,
kFloat,
kInt8,
kInt32,
kAny, // any precision
NUM, // number of fields.
};
......@@ -48,6 +49,19 @@ enum class DataLayoutType : int {
NUM, // number of fields.
};
static size_t PrecisionTypeLength(PrecisionType type) {
switch (type) {
case PrecisionType::kFloat:
return 4;
case PrecisionType::kInt8:
return 1;
case PrecisionType::kInt32:
return 4;
default:
return 4;
}
}
// Some helper macro to get a specific TargetType.
#define TARGET(item__) paddle::lite::TargetType::item__
// Some helper macro to get a specific PrecisionType.
......@@ -87,7 +101,7 @@ static const std::string& TargetRepr(TargetType target) {
static const std::string& PrecisionRepr(PrecisionType precision) {
static const std::string precision2string[] = {"kUnk", "kFloat", "kInt8",
"kAny"};
"kInt32", "kAny"};
auto x = static_cast<int>(precision);
CHECK_LT(x, static_cast<int>(PRECISION(NUM)));
return precision2string[x];
......
......@@ -51,5 +51,3 @@ set(arm_kernels
)
set(arm_kernels "${arm_kernels}" CACHE INTERNAL "arm kernels")
......@@ -92,6 +92,9 @@ void ConvCompute::Run() {
// }
}
// TODO(review): int8 convolution is not implemented yet -- both hooks below
// are empty placeholders so the kInt8 kernel registrations link.
void ConvComputeInt8::PrepareForRun() {}
void ConvComputeInt8::Run() {}
} // namespace arm
} // namespace kernels
} // namespace lite
......@@ -112,3 +115,23 @@ REGISTER_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW,
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Output", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW,
paddle::lite::kernels::arm::ConvComputeInt8, def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("Filter",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
.Finalize();
REGISTER_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW,
paddle::lite::kernels::arm::ConvComputeInt8, def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("Filter",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
.Finalize();
......@@ -41,6 +41,25 @@ class ConvCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
nullptr};
};
// ARM int8 convolution kernel. NOTE: the .cc currently defines
// PrepareForRun()/Run() as empty stubs, so this kernel is a placeholder.
class ConvComputeInt8 : public KernelLite<TARGET(kARM), PRECISION(kInt8)> {
 public:
  using param_t = operators::ConvParam;

  void PrepareForRun() override;

  void Run() override;

  // NOTE(review): impl_ is a raw owning pointer and the implicit copy
  // constructor would double-delete it -- confirm kernels are never copied.
  ~ConvComputeInt8() {
    if (impl_ != nullptr) {
      delete impl_;
    }
  }

 private:
  // Owned implementation, selected at PrepareForRun time; nullptr until then.
  lite::arm::math::ImplBase<TARGET(kARM), PRECISION(kInt8), param_t>* impl_{
      nullptr};
};
} // namespace arm
} // namespace kernels
} // namespace lite
......
......@@ -12,14 +12,33 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
/*
* ATTENTION this header file can only include in .cc file.
*/
#pragma once
#include "paddle/fluid/lite/core/op_registry.h"
USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
#ifdef LITE_WITH_ARM
USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(split, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(transpose, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(transpose2, kARM, kFloat, kNCHW, def);
#endif
#ifdef LITE_WITH_X86
USE_LITE_KERNEL(relu, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def);
......@@ -36,21 +55,6 @@ USE_LITE_KERNEL(depthwise_conv2d, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool2d, kX86, kFloat, kNCHW, def);
#endif
#ifdef LITE_WITH_ARM
USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def);
#endif
#ifdef LITE_WITH_CUDA
USE_LITE_KERNEL(mul, kCUDA, kFloat, kNCHW, def);
USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device);
......
......@@ -50,4 +50,3 @@ set(x86_kernels
)
set(x86_kernels "${x86_kernels}" CACHE INTERNAL "x86 kernels")
if (NOT LITE_WITH_OPENCL)
return()
endif()
if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
cc_library(cl_wrapper SRCS cl_wrapper.cc)
cc_library(cl_tool SRCS cl_tool.cc)
target_compile_options(cl_tool BEFORE PUBLIC -Wno-ignored-qualifiers)
cc_library(cl_half SRCS cl_half.cc)
target_compile_options(cl_half BEFORE PUBLIC -fno-strict-aliasing)
cc_library(cl_engine SRCS cl_engine.cc DEPS cl_tool)
cc_library(cl_context SRCS cl_context.cc DEPS cl_engine)
cc_library(cl_helper SRCS cl_helper.cc DEPS cl_context)
cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS cl_half lite_tensor)
cc_library(cl_image SRCS cl_image.cc DEPS cl_half lite_tensor cl_image_converter cl_engine)
cc_library(cl_caller SRCS cl_caller.cc DEPS cl_helper cl_image)
lite_cc_test(test_cl_runtime SRCS cl_test.cc DEPS cl_helper cl_image cl_caller cl_wrapper)
add_dependencies(cl_tool opencl_clhpp)
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#define CL_TARGET_OPENCL_VERSION 200
#define CL_HPP_TARGET_OPENCL_VERSION 200
#define CL_HPP_MINIMUM_OPENCL_VERSION 110
#include <CL/cl2.hpp>
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/lite/opencl/cl_caller.h"
#include <array>
#include <string>
#include <vector>
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/opencl/cl_context.h"
#include "paddle/fluid/lite/opencl/cl_engine.h"
#include "paddle/fluid/lite/opencl/cl_helper.h"
#include "paddle/fluid/lite/opencl/cl_image.h"
#include "paddle/fluid/lite/opencl/cl_tool.h"
namespace paddle {
namespace lite {
static void CopyImageData(const CLImage& cl_image, float* out) {
int width = cl_image.image_dims()[0];
int height = cl_image.image_dims()[1];
half_t* image_data = new half_t[height * width * 4];
cl::Image* image = cl_image.cl_image();
const std::array<size_t, 3> origin{0, 0, 0};
const std::array<size_t, 3> region{static_cast<size_t>(width),
static_cast<size_t>(height), 1};
cl_int err = CLEngine::Global()->command_queue().enqueueReadImage(
*image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr);
CL_CHECK_ERRORS(err);
auto* converter = cl_image.image_converter();
converter->ImageToNCHW(image_data, out, cl_image.image_dims(),
cl_image.tensor_dims());
delete[] image_data;
}
// Points the global CLEngine at the directory that holds the .cl kernel
// sources, then reports whether platform/device initialization succeeded.
bool InitOpenCLEngine(std::string cl_path) {
  CLEngine::Global()->set_cl_path(cl_path);
  return CLEngine::Global()->IsInitSuccess();
}
// Runs an element-wise add on the GPU: out = in + bias.
// `in`, `bias` and `out` are host float buffers; the inputs are uploaded as
// OpenCL images, the kernel is enqueued over the input image extent, and the
// result is copied back into `out` (blocking, via CopyImageData).
void elementwise_add(CLContext* context, float* in, const DDim& in_dim,
                     float* bias, const DDim& bias_dim, float* out,
                     const DDim& out_dim) {
  CLHelper helper(context);
  helper.AddKernel("elementwise_add", "elementwise_add_kernel.cl");
  auto kernel = helper.KernelAt(0);
  CLImage in_image;
  in_image.set_tensor_data(in, in_dim);
  in_image.InitNormalCLImage(helper.OpenCLContext());
  // Fixed log typo: "Inpu" -> "Input".
  VLOG(3) << " --- Input image: " << in_image << " --- ";
  CLImage bias_image;
  bias_image.set_tensor_data(bias, bias_dim);
  bias_image.InitNormalCLImage(helper.OpenCLContext());
  VLOG(3) << " --- Bias image: " << bias_image << " --- ";
  CLImage out_image;
  out_image.InitEmptyImage(helper.OpenCLContext(), out_dim);
  cl_int status;
  status = kernel.setArg(0, *in_image.cl_image());
  CL_CHECK_ERRORS(status);
  status = kernel.setArg(1, *bias_image.cl_image());
  CL_CHECK_ERRORS(status);
  status = kernel.setArg(2, *out_image.cl_image());
  CL_CHECK_ERRORS(status);
  // One work-item per input-image texel.
  size_t width = in_image.ImageWidth();
  size_t height = in_image.ImageHeight();
  auto global_work_size = cl::NDRange{width, height};
  status = helper.OpenCLCommandQueue().enqueueNDRangeKernel(
      kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr);
  CL_CHECK_ERRORS(status);
  VLOG(3) << " --- Out image: " << out_image << " --- ";
  CopyImageData(out_image, out);
}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/opencl/cl_context.h"
namespace paddle {
namespace lite {
// Binds the global CLEngine to `cl_path` (the directory containing the
// cl_kernel/ sources) and returns whether OpenCL init succeeded.
bool InitOpenCLEngine(std::string cl_path);
// Host-facing helper that computes out = in + bias on the GPU (blocking);
// all three pointers are host float buffers sized by their DDims.
void elementwise_add(CLContext* context, float* in, const DDim& in_dim,
                     float* bias, const DDim& bias_dim, float* out,
                     const DDim& out_dim);
}  // namespace lite
}  // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <memory>
#include <string>
#include <utility>
#include "paddle/fluid/lite/opencl/cl_context.h"
#include "paddle/fluid/lite/opencl/cl_engine.h"
#include "paddle/fluid/lite/opencl/cl_tool.h"
namespace paddle {
namespace lite {
// Thin forwarders: CLContext does not own the queue or the context; both
// live in the process-wide CLEngine singleton.
cl::CommandQueue &CLContext::GetCommandQueue() {
  return CLEngine::Global()->command_queue();
}
cl::Context &CLContext::GetContext() { return CLEngine::Global()->context(); }
// Returns a cached, built program for (file_name + options), building and
// caching it on first use. The returned reference stays valid for this
// CLContext's lifetime because programs_ owns the cl::Program objects.
cl::Program &CLContext::GetProgram(const std::string &file_name,
                                   const std::string &options) {
  const std::string program_key =
      options.empty() ? file_name : file_name + options;
  auto found = programs_.find(program_key);
  if (found != programs_.end()) {
    VLOG(3) << " --- program -> " << program_key << " has been built --- ";
    return *(found->second);
  }
  auto program = CLEngine::Global()->CreateProgram(
      GetContext(), CLEngine::Global()->cl_path() + "/cl_kernel/" + file_name);
  VLOG(3) << " --- begin build program -> " << program_key << " --- ";
  CLEngine::Global()->BuildProgram(program.get(), options);
  VLOG(3) << " --- end build program -> " << program_key << " --- ";
  auto &slot = programs_[program_key];
  slot = std::move(program);
  return *slot;
}
// Builds (or fetches the cached) program for `file_name`/`options`, then
// creates a kernel named `kernel_name` from it. Ownership of the kernel
// passes to the caller.
std::unique_ptr<cl::Kernel> CLContext::GetKernel(const std::string &kernel_name,
                                                 const std::string &file_name,
                                                 const std::string &options) {
  cl_int status{CL_SUCCESS};
  VLOG(3) << " --- to get program " << file_name << " --- ";
  auto program = GetProgram(file_name, options);
  VLOG(3) << " --- end get program --- ";
  VLOG(3) << " --- to create kernel: " << kernel_name << " --- ";
  std::unique_ptr<cl::Kernel> kernel(
      new cl::Kernel(program, kernel_name.c_str(), &status));
  CL_CHECK_ERRORS(status);
  VLOG(3) << " --- end create kernel --- ";
  // Return the local directly: `return std::move(kernel)` is a pessimizing
  // move that inhibits copy elision; a named local is implicitly moved.
  return kernel;
}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/lite/opencl/cl2_header.h"
namespace paddle {
namespace lite {
// Program cache plus kernel factory layered on top of the global CLEngine;
// the underlying context/queue are owned by CLEngine, not by this class.
class CLContext {
 public:
  cl::CommandQueue &GetCommandQueue();
  cl::Context &GetContext();
  // Returns a cached program, building it from cl_path/cl_kernel/file_name
  // on first use; the cache key is file_name (+ options when non-empty).
  cl::Program &GetProgram(const std::string &file_name,
                          const std::string &options);
  // Creates a fresh kernel from the (cached) program; caller takes ownership.
  std::unique_ptr<cl::Kernel> GetKernel(const std::string &kernel_name,
                                        const std::string &file_name,
                                        const std::string &options);

 private:
  // Built programs, keyed by file_name (+ options); owned by this object.
  std::unordered_map<std::string, std::unique_ptr<cl::Program>> programs_;
};
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/lite/opencl/cl_engine.h"
#include <glog/logging.h>
#include <string>
#include <utility>
#include <vector>
namespace paddle {
namespace lite {
// Meyers-singleton accessor. Init() is invoked on every call but returns
// immediately once initialized_ is set.
// NOTE(review): the magic static's construction is thread-safe, but Init()
// itself is unsynchronized -- concurrent first calls could race on
// initialized_; confirm initialization happens on a single thread.
CLEngine* CLEngine::Global() {
  static CLEngine cl_engine_;
  cl_engine_.Init();
  return &cl_engine_;
}
// Drains any pending GPU work, then releases the OpenCL objects in reverse
// dependency order (queue -> context -> device -> platform).
CLEngine::~CLEngine() {
  if (command_queue_ != nullptr) {
    command_queue_->finish();  // block until all enqueued work completes
  }
  // For controlling the destruction order:
  command_queue_.reset();
  context_.reset();
  device_.reset();
  platform_.reset();
}
// One-time platform/device discovery. Always returns true after the first
// call; query IsInitSuccess() for the actual outcome. Both Initialize*()
// calls are made unconditionally so each can log its own failure.
bool CLEngine::Init() {
  if (initialized_) {
    return true;
  }
  const bool platform_ok = InitializePlatform();
  const bool device_ok = InitializeDevice();
  is_init_success_ = platform_ok && device_ok;
  initialized_ = true;
  return initialized_;
}
// Accessors: platform()/device() CHECK that discovery already happened,
// while context()/command_queue() lazily create their object on first use.
cl::Platform& CLEngine::platform() {
  CHECK(platform_ != nullptr) << "platform_ is not initialized!";
  return *platform_;
}
cl::Context& CLEngine::context() {
  if (context_ == nullptr) {
    context_ = CreateContext();  // lazily built from the chosen device
  }
  return *context_;
}
cl::Device& CLEngine::device() {
  CHECK(device_ != nullptr) << "device_ is not initialized!";
  return *device_;
}
cl::CommandQueue& CLEngine::command_queue() {
  if (command_queue_ == nullptr) {
    command_queue_ = CreateCommandQueue(context());  // lazy creation
  }
  return *command_queue_;
}
// Reads an OpenCL source file from disk and wraps it in an (unbuilt)
// cl::Program -- call BuildProgram afterwards. CHECK-fails when the file is
// missing or empty; status_ records the OpenCL error code, if any.
std::unique_ptr<cl::Program> CLEngine::CreateProgram(const cl::Context& context,
                                                     std::string file_name) {
  // Open at the end (ios::ate) so tellg() yields the file size directly.
  std::ifstream file{file_name, std::ios::binary | std::ios::ate};
  CHECK(file.is_open()) << "Can't open file from " << file_name;
  auto size = file.tellg();
  CHECK(size > 0) << "size is too small.";
  std::string content(size, '\0');
  file.seekg(0);
  file.read(&content[0], size);
  cl::Program::Sources sources;
  sources.push_back(content);
  auto prog =
      std::unique_ptr<cl::Program>(new cl::Program(context, sources, &status_));
  LOG(INFO) << "OpenCL kernel file name: " << file_name;
  LOG(INFO) << "Program source size: " << content.size();
  CL_CHECK_ERRORS(status_);
  // Return the local directly: `return std::move(prog)` is a pessimizing
  // move that blocks copy elision.
  return prog;
}
// Creates a user event bound to `context`; status_ records the OpenCL error
// code, if any. Caller takes ownership.
std::unique_ptr<cl::UserEvent> CLEngine::CreateEvent(
    const cl::Context& context) {
  auto event =
      std::unique_ptr<cl::UserEvent>(new cl::UserEvent(context, &status_));
  CL_CHECK_ERRORS(status_);
  // No std::move on return: a named local is implicitly moved, and the
  // explicit move would inhibit copy elision.
  return event;
}
// Compiles `program` for the selected device, appending the fast-math flag
// and the cl_kernel include path to the caller's options. On build failure
// the device's build log is dumped via LOG(INFO) and false is returned.
bool CLEngine::BuildProgram(cl::Program* program, const std::string& options) {
  const std::string build_option = options + " -cl-fast-relaxed-math -I " +
                                   CLEngine::Global()->cl_path() +
                                   "/cl_kernel";
  status_ = program->build({*device_}, build_option.c_str());
  CL_CHECK_ERRORS(status_);
  if (status_ == CL_SUCCESS) {
    return true;
  }
  if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device()) ==
      CL_BUILD_ERROR) {
    std::string log = program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(device());
    LOG(INFO) << "Program build error: " << log;
  }
  return false;
}
bool CLEngine::InitializePlatform() {
std::vector<cl::Platform> all_platforms;
status_ = cl::Platform::get(&all_platforms);
CL_CHECK_ERRORS(status_);
if (all_platforms.empty()) {
LOG(ERROR) << "No OpenCL platform found!";
return false;
}
platform_ = std::make_shared<cl::Platform>();
*platform_ = all_platforms[0];
return true;
}
// Selects the first GPU device of the chosen platform and verifies the
// capabilities this backend relies on: image support and the cl_khr_fp16
// extension (tensors are transferred as half-precision image texels -- see
// the half_t buffers in cl_image.cc). Returns false with an ERROR log when
// any requirement is missing.
bool CLEngine::InitializeDevice() {
  std::vector<cl::Device> all_devices;
  status_ = platform_->getDevices(CL_DEVICE_TYPE_GPU, &all_devices);
  CL_CHECK_ERRORS(status_);
  if (all_devices.empty()) {
    LOG(ERROR) << "No OpenCL GPU device found!";
    return false;
  }
  device_ = std::make_shared<cl::Device>();
  *device_ = all_devices[0];
  auto device_name = device_->getInfo<CL_DEVICE_NAME>();
  LOG(INFO) << "Using device: " << device_name;
  // Image support is mandatory for this backend.
  auto image_support = device_->getInfo<CL_DEVICE_IMAGE_SUPPORT>();
  if (image_support) {
    LOG(INFO) << "The chosen device supports image processing.";
  } else {
    LOG(ERROR) << "The chosen device doesn't support image processing!";
    return false;
  }
  // Half-precision (cl_khr_fp16) is mandatory as well.
  auto ext_data = device_->getInfo<CL_DEVICE_EXTENSIONS>();
  LOG(INFO) << "The extensions supported by this device: " << ext_data;
  if (ext_data.find("cl_khr_fp16") != std::string::npos) {
    LOG(INFO) << "The chosen device supports the half data type.";
  } else {
    LOG(ERROR) << "The chosen device doesn't support the half data type!";
    return false;
  }
  // The remaining queries are informational only.
  auto max_units = device_->getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
  LOG(INFO) << "The chosen device has " << max_units << " compute units.";
  auto local_mem = device_->getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
  LOG(INFO) << "The local memory size of the chosen device is "
            << static_cast<float>(local_mem) / 1024 << " KB.";
  return true;
}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <fstream>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/lite/opencl/cl2_header.h"
#include "paddle/fluid/lite/opencl/cl_tool.h"
namespace paddle {
namespace lite {
// Process-wide owner of all OpenCL state: platform, device, context and
// command queue. Obtain via Global(); instances cannot be created, copied
// or moved by clients.
class CLEngine {
 public:
  static CLEngine* Global();

  // Non-copyable: the implicit copy would let two destructors finish and
  // release the same CL handles (see ~CLEngine). Deleting the copy members
  // also suppresses the moves.
  CLEngine(const CLEngine&) = delete;
  CLEngine& operator=(const CLEngine&) = delete;

  // Idempotent platform/device discovery; outcome via IsInitSuccess().
  bool Init();

  cl::Platform& platform();
  cl::Context& context();
  cl::Device& device();
  cl::CommandQueue& command_queue();

  // Reads an OpenCL source file and wraps it in an (unbuilt) program.
  std::unique_ptr<cl::Program> CreateProgram(const cl::Context& context,
                                             std::string file_name);

  std::unique_ptr<cl::UserEvent> CreateEvent(const cl::Context& context);

  // Builds `program` for the chosen device; returns false on build error.
  bool BuildProgram(cl::Program* program, const std::string& options = "");

  bool IsInitSuccess() { return is_init_success_; }

  std::string cl_path() { return cl_path_; }

  void set_cl_path(std::string cl_path) { cl_path_ = cl_path; }

 private:
  CLEngine() = default;
  ~CLEngine();

  bool InitializePlatform();
  bool InitializeDevice();

  std::shared_ptr<cl::Context> CreateContext() {
    auto context = std::make_shared<cl::Context>(
        std::vector<cl::Device>{device()}, nullptr, nullptr, nullptr, &status_);
    CL_CHECK_ERRORS(status_);
    return context;
  }

  std::shared_ptr<cl::CommandQueue> CreateCommandQueue(
      const cl::Context& context) {
    auto queue =
        std::make_shared<cl::CommandQueue>(context, device(), 0, &status_);
    CL_CHECK_ERRORS(status_);
    return queue;
  }

  std::string cl_path_;
  std::shared_ptr<cl::Platform> platform_{nullptr};
  std::shared_ptr<cl::Context> context_{nullptr};
  std::shared_ptr<cl::Device> device_{nullptr};
  std::shared_ptr<cl::CommandQueue> command_queue_{nullptr};
  cl_int status_{CL_SUCCESS};
  bool initialized_{false};
  bool is_init_success_{false};
};
} // namespace lite
} // namespace paddle
此差异已折叠。
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstdint>
namespace paddle {
namespace lite {
// IEEE-754 binary16 value stored in a plain 16-bit integer.
typedef uint16_t half_t;
// Scalar float <-> half conversions.
half_t Float2Half(float f);
float Half2Float(half_t h);
// Bulk conversions over `count` elements.
// NOTE(review): rounding behavior is defined in cl_half.cc (collapsed in
// this view) -- confirm before relying on exact converted values.
void FloatArray2HalfArray(float *f_array, half_t *h_array, int count);
void HalfArray2FloatArray(half_t *h_array, float *f_array, int count);
}  // namespace lite
}  // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/lite/opencl/cl_helper.h"
#include <glog/logging.h>
#include <string>
#include <utility>
#include <vector>
namespace paddle {
namespace lite {
// Late-binding alternative to the CLHelper(CLContext*) constructor.
void CLHelper::set_context(CLContext *context) { context_ = context; }
// Compiles (or fetches from the context's cache) `file_name` with `options`
// and appends a kernel named `kernel_name` to the list; retrieve it later
// with KernelAt(index) in AddKernel-call order.
void CLHelper::AddKernel(const std::string &kernel_name,
                         const std::string &file_name,
                         const std::string &options) {
  CHECK(context_ != nullptr) << "Please use set_context first!";
  VLOG(3) << " --- begin to add kernel ---";
  kernels.emplace_back(context_->GetKernel(kernel_name, file_name, options));
  VLOG(3) << " --- end to add kernel --- ";
}
// Returns the index-th kernel previously registered via AddKernel.
// CHECK-fails on an out-of-range index or a null kernel entry.
cl::Kernel &CLHelper::KernelAt(const int index) {
  VLOG(3) << " --- kernel count: " << kernels.size() << " --- ";
  CHECK(static_cast<size_t>(index) < kernels.size())
      << "The index must be less than the size of kernels.";
  auto &kernel_ptr = kernels[index];
  CHECK(kernel_ptr != nullptr) << "The target kernel pointer cannot be null.";
  return *kernel_ptr;
}
// Forwarders into the bound CLContext; both require the context to have been
// set (constructor or set_context) first.
cl::CommandQueue &CLHelper::OpenCLCommandQueue() {
  CHECK(context_ != nullptr) << "Please use set_context first!";
  return context_->GetCommandQueue();
}
cl::Context &CLHelper::OpenCLContext() {
  CHECK(context_ != nullptr) << "Please use set_context first!";
  return context_->GetContext();
}
// Derives a 3-D global work size for `image` from its tensor rank
// (1-4 supported); dies via LOG(FATAL) for any other rank.
cl::NDRange CLHelper::DefaultWorkSize(const CLImage &image) {
  // Tensor layout is n c h w.
  const auto dims = image.tensor_dims();
  switch (dims.size()) {
    case 4: {
      auto n = dims[0];
      auto h = dims[2];
      auto w = dims[3];
      auto image_width = image.ImageWidth();
      return cl::NDRange{static_cast<size_t>(image_width / w),
                         static_cast<size_t>(w),
                         static_cast<size_t>(n * h)};
    }
    case 3: {
      auto c = dims[0];
      auto h = dims[1];
      auto w = dims[2];
      // Channels are packed four to a texel, hence the (c + 3) / 4.
      return cl::NDRange{static_cast<size_t>((c + 3) / 4),
                         static_cast<size_t>(w),
                         static_cast<size_t>(h)};
    }
    case 2:
      return cl::NDRange{static_cast<size_t>(1),
                         static_cast<size_t>(image.ImageWidth()),
                         static_cast<size_t>(image.ImageHeight())};
    case 1:
      return cl::NDRange{static_cast<size_t>(1),
                         static_cast<size_t>(image.ImageWidth()),
                         static_cast<size_t>(1)};
    default:
      LOG(FATAL) << "Not support this dimension, need to be implemented!";
      return cl::NDRange{};
  }
}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/lite/opencl/cl2_header.h"
#include "paddle/fluid/lite/opencl/cl_context.h"
#include "paddle/fluid/lite/opencl/cl_image.h"
namespace paddle {
namespace lite {
// Convenience bundle of one CLContext plus the kernels a single op uses:
// add kernels by name, fetch them by index, and derive default work sizes.
class CLHelper {
 public:
  CLHelper() = default;
  explicit CLHelper(CLContext *context) : context_(context) {}
  void set_context(CLContext *context);
  // Builds `file_name` (cached in the context) and appends the kernel.
  void AddKernel(const std::string &kernel_name, const std::string &file_name,
                 const std::string &options = "");
  // `index` follows AddKernel call order and must be in range.
  cl::Kernel &KernelAt(const int index);
  cl::CommandQueue &OpenCLCommandQueue();
  cl::Context &OpenCLContext();
  // 3-D NDRange derived from the image's tensor rank (1-4 supported).
  cl::NDRange DefaultWorkSize(const CLImage &image);

 private:
  CLContext *context_{nullptr};
  // Kernels in AddKernel order; owned by this helper.
  std::vector<std::unique_ptr<cl::Kernel>> kernels;
};
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/lite/opencl/cl_image.h"
#include <glog/logging.h>
#include <array>
#include "paddle/fluid/lite/opencl/cl_engine.h"
#include "paddle/fluid/lite/opencl/cl_half.h"
#include "paddle/fluid/lite/opencl/cl_tool.h"
namespace paddle {
namespace lite {
// Debug printer: blocking read-back of the image from the device, conversion
// to NCHW floats, then streams the dims plus ~20 evenly spaced samples.
// NOTE(review): the two new[] buffers leak if enqueueReadImage or the
// converter throws -- acceptable for a debug path, but worth confirming.
std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) {
  int width = cl_image.image_dims_[0];
  int height = cl_image.image_dims_[1];
  half_t* image_data = new half_t[height * width * 4];  // RGBA half texels
  cl::Image* image = cl_image.cl_image();
  const std::array<size_t, 3> origin{0, 0, 0};
  const std::array<size_t, 3> region{static_cast<size_t>(width),
                                     static_cast<size_t>(height), 1};
  // CL_TRUE = blocking read; image_data is complete once this returns.
  cl_int err = CLEngine::Global()->command_queue().enqueueReadImage(
      *image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr);
  CL_CHECK_ERRORS(err);
  float* tensor_data = new float[cl_image.numel()];
  auto* converter = cl_image.image_converter();
  converter->ImageToNCHW(image_data, tensor_data, cl_image.image_dims_,
                         cl_image.tensor_dims_);
  // Sample at most ~20 elements so large tensors stay readable.
  int stride = cl_image.numel() / 20;
  stride = stride > 0 ? stride : 1;
  os << " dims: " << cl_image.tensor_dims_ << "\n";
  for (int i = 0; i < cl_image.numel(); i += stride) {
    os << tensor_data[i] << " ";
  }
  delete[] tensor_data;
  delete[] image_data;
  return os;
}
// Copies |tensor_data| into an internally-owned host buffer and records the
// tensor shape; the caller keeps ownership of its own pointer.
void CLImage::set_tensor_data(float* tensor_data, const DDim& dim) {
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
  const auto element_count = dim.product();
#else
  const auto element_count = dim.production();
#endif
  float* owned_copy = new float[element_count];
  memcpy(owned_copy, tensor_data, element_count * sizeof(float));
  tensor_data_.reset(owned_copy);
  tensor_dims_ = dim;
}
// Initializes the device image from the previously copied tensor data using
// the "folder" layout (optimized for 1-D/2-D tensors).
void CLImage::InitCLImage(const cl::Context& context) {
  // Fix: the original CHECK message was corrupted by a stray paste
  // ("set_tensohelper->DefaultWorkSize(out_image)r_data first!").
  CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";
  image_converter_.reset(new CLImageConverterFolder);
  InitCLImage(context, image_converter_.get());
}
// Initializes the device image using the default ("normal") NCHW layout.
// Requires set_tensor_data() to have been called first.
void CLImage::InitNormalCLImage(const cl::Context& context) {
  CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";
  auto* converter = new CLImageConverterNormal;
  image_converter_.reset(converter);
  InitCLImage(context, converter);
}
// Initializes the device image using the N/W-blocked layout (rank-4 only).
// Requires set_tensor_data() to have been called first.
void CLImage::InitNImage(const cl::Context& context) {
  CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";
  CHECK(tensor_dims_.size() == 4) << " Tensor dim is not 4.";
  auto* converter = new CLImageConverterNWBlock;
  image_converter_.reset(converter);
  InitCLImage(context, converter);
}
// Initializes the device image using the depthwise-blocked layout (rank-4
// only). Requires set_tensor_data() to have been called first.
void CLImage::InitDWImage(const cl::Context& context) {
  CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";
  CHECK(tensor_dims_.size() == 4) << " Tensor dim is not 4.";
  auto* converter = new CLImageConverterDWBlock;
  image_converter_.reset(converter);
  InitCLImage(context, converter);
}
// Allocates a device image sized for tensor shape |dim| without uploading
// any host data (typical for op outputs). Must not be mixed with
// set_tensor_data(): there would be nothing to upload it with.
void CLImage::InitEmptyImage(const cl::Context& context, const DDim& dim) {
  CHECK(tensor_data_ == nullptr)
      << " Empty image tensor data shouldn't have value";
  tensor_dims_ = dim;
  image_converter_.reset(new CLImageConverterNormal);
  VLOG(3) << " to get image dims ";
  // Image (width, height) derived from the tensor shape via the normal layout.
  image_dims_ = image_converter_->InitImageDimInfoWith(tensor_dims_);
  VLOG(3) << " end get image dims " << image_dims_;
  // data == nullptr => allocate only, no host copy.
  InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
  cl_event_ = CLEngine::Global()->CreateEvent(context);
  initialized_ = true;
  VLOG(3) << " end init cl image ";
}
// Allocates a device image with an explicitly given image (width, height),
// bypassing the tensor-shape-to-image-shape computation. No data upload,
// and tensor_dims_ is left untouched.
void CLImage::InitEmptyWithImageDim(const cl::Context& context,
                                    const DDim& image_dims) {
  VLOG(3) << " to get image dims ";
  image_dims_ = image_dims;
  VLOG(3) << " end get image dims " << image_dims_;
  // data == nullptr => allocate only, no host copy.
  InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
  cl_event_ = CLEngine::Global()->CreateEvent(context);
  initialized_ = true;
  VLOG(3) << " end init cl image";
}
// Shared initialization path: computes the image shape for the current
// tensor dims, converts the host NCHW buffer into the half-precision image
// layout, uploads it, and releases the host-side tensor copy.
void CLImage::InitCLImage(const cl::Context& context,
                          CLImageConverterBase* converter) {
  CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";
  VLOG(3) << " begin init cl image ";
  image_dims_ = converter->InitImageDimInfoWith(tensor_dims_);
  // RAII buffer (was raw new[]/delete[]): no leak if the conversion or the
  // upload throws. Each image texel holds 4 half values (RGBA).
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
  std::vector<half_t> image_data(image_dims_.product() * 4);
#else
  std::vector<half_t> image_data(image_dims_.production() * 4);
#endif
  VLOG(3) << " convert to image ";
  converter->NCHWToImage(tensor_data_.get(), image_data.data(), tensor_dims_);
  VLOG(3) << " end convert to image ";
  InitCLImage(context, image_dims_[0], image_dims_[1], image_data.data());
  // The host tensor copy is no longer needed once the data is on the device.
  tensor_data_ = nullptr;
  cl_event_ = CLEngine::Global()->CreateEvent(context);
  initialized_ = true;
  VLOG(3) << " end init cl image ";
}
// Creates the backing cl::Image2D (RGBA, fp16). When |data| is non-null the
// host buffer is copied into the image at creation time.
void CLImage::InitCLImage(const cl::Context& context, int width, int height,
                          void* data) {
  const cl::ImageFormat img_format(CL_RGBA, CL_HALF_FLOAT);
  cl_mem_flags flags = CL_MEM_READ_WRITE;
  if (data != nullptr) {
    flags |= CL_MEM_COPY_HOST_PTR;
  }
  cl_int err;
  cl_image_.reset(new cl::Image2D(context, flags, img_format, width, height,
                                  0, data, &err));
  CL_CHECK_ERRORS(err);
  CHECK(err == CL_SUCCESS) << " Create image 2d error.";
}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <iostream>
#include <memory>
#include <vector>
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/opencl/cl2_header.h"
#include "paddle/fluid/lite/opencl/cl_image_converter.h"
namespace paddle {
namespace lite {
// Host-side handle for a GPU image (cl::Image2D) holding tensor data in a
// half-precision RGBA layout chosen by one of the CLImageConverter*
// strategies. Typical flow: set_tensor_data() then one Init*Image() call,
// after which the host copy is released and only the device image remains.
class CLImage {
  // For debug
  friend std::ostream& operator<<(std::ostream& os, const CLImage& image);
 public:
  CLImage() = default;
  /*
   * Will not hold input tensor data, memcpy in this method.
   * */
  void set_tensor_data(float* tensor_data, const DDim& dim);
  bool IsInit() { return initialized_; }
  /*
   * Need call set_tensor_data first.
   * Folder when one dim or two dim.
   * */
  void InitCLImage(const cl::Context& context);
  void InitNormalCLImage(const cl::Context& context);
  void InitNImage(const cl::Context& context);
  void InitDWImage(const cl::Context& context);
  void InitEmptyImage(const cl::Context& context, const DDim& dim);
  void InitEmptyWithImageDim(const cl::Context& context,
                             const DDim& image_dims);
  cl::Image* cl_image() const { return cl_image_.get(); }
  const DDim& image_dims() const { return image_dims_; }
  inline size_t ImageWidth() const { return image_dims_[0]; }
  inline size_t ImageHeight() const { return image_dims_[1]; }
  const DDim& tensor_dims() const { return tensor_dims_; }
  /*
   * Resize original tensor dim.
   * */
  inline CLImage& Resize(const DDim& dims) {
    tensor_dims_ = dims;
    return *this;
  }
  // Host tensor data; only valid before an Init*Image() call (the host copy
  // is released once the data has been uploaded to the device).
  template <typename T>
  T* data() const {
    CHECK(!initialized_) << "CL image has initialized, tensor data has been "
                            "deleted, can't use tensor data!";
    // Fix: cast the raw pointer, not the unique_ptr object itself — the
    // original reinterpret_cast<T*>(tensor_data_) would fail to compile
    // when this template is instantiated.
    return reinterpret_cast<T*>(tensor_data_.get());
  }
  /*
   * Numel of tensor dim
   * */
  inline int64_t numel() const {
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
    return tensor_dims_.product();
#else
    return tensor_dims_.production();
#endif
  }
  // NOTE(review): dereferences cl_event_ — only valid after an Init*Image()
  // call has created the event.
  cl::UserEvent& cl_event() const { return *cl_event_; }
  CLImageConverterBase* image_converter() const {
    return image_converter_.get();
  }
 private:
  void InitCLImage(const cl::Context& context, CLImageConverterBase* converter);
  void InitCLImage(const cl::Context& context, int width, int height,
                   void* data);
  bool initialized_ = false;
  std::unique_ptr<cl::Image2D> cl_image_{nullptr};
  std::unique_ptr<cl::UserEvent> cl_event_{nullptr};
  DDim tensor_dims_;
  DDim image_dims_;
  // Fix: array form unique_ptr<float[]> — the buffer is allocated with
  // new float[] in set_tensor_data(), so the single-object form would free
  // it with delete instead of delete[] (undefined behavior).
  std::unique_ptr<float[]> tensor_data_{nullptr};
  std::unique_ptr<CLImageConverterBase> image_converter_{nullptr};
};
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/lite/opencl/cl_image_converter.h"
#include <glog/logging.h>
#include <vector>
namespace paddle {
namespace lite {
// Maps an NCHW tensor shape (rank <= 4, left-padded with 1s) to the 2-D
// image size: width = W * ceil(C/4), height = N * H.
DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) {
  size_t dims4[4] = {1, 1, 1, 1};
  const size_t rank = tensor_dim.size();
  for (size_t i = 0; i < rank; ++i) {
    dims4[4 - rank + i] = tensor_dim[i];
  }
  const size_t batch = dims4[0];
  const size_t channel = dims4[1];
  const size_t h = dims4[2];
  const size_t w = dims4[3];
  const size_t image_width = w * ((channel + 3) / 4);
  const size_t image_height = h * batch;
  return DDim(std::vector<DDim::value_type>(
      {static_cast<DDim::value_type>(image_width),
       static_cast<DDim::value_type>(image_height)}));
}
// Packs a host NCHW float tensor into the half-precision RGBA image layout:
// texel (x, y) with x = (c/4)*W + w, y = n*H + h holds channels
// [4*(c/4) .. 4*(c/4)+3] at spatial position (h, w). Channel lanes beyond C
// are zero-filled padding.
void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image,
                                          const DDim &tensor_dim) {
  // Left-pad the shape with 1s so any rank <= 4 behaves like NCHW.
  size_t new_dims[] = {1, 1, 1, 1};
  for (size_t j = 0; j < tensor_dim.size(); ++j) {
    new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
  }
  size_t N, C, H, W;
  N = new_dims[0];
  C = new_dims[1];
  H = new_dims[2];
  W = new_dims[3];
  DDim in_image_dim = InitImageDimInfoWith(tensor_dim);
  VLOG(3) << " tensor dim: " << tensor_dim;
  VLOG(3) << " image dim: " << in_image_dim;
  size_t width = in_image_dim[0];
  // Number of 4-channel blocks laid out along the image width.
  size_t w_block = width / W;
  float *p = nchw;
  // i0/i1 are running texel indices; i2 is in half units (texel*4 + lane).
  size_t i0 = 0;
  for (size_t n = 0; n < N; n++) {
    // Iterate channels including the zero-padding lanes up to w_block*4.
    for (size_t c = 0; c < w_block * 4; c++) {
      size_t i1 = i0 + (c / 4) * W;
      for (size_t h = 0; h < H; h++) {
        // (i1 << 2) converts a texel index to a half index; + c%4 selects
        // the RGBA lane for this channel.
        size_t i2 = (i1 << 2) + c % 4;
        for (size_t w = 0; w < W; w++) {
          if (c < C) {
            // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
            // (c % 4);
            image[i2] = Float2Half(*p);
            i2 += 4;
            p++;
          } else {
            // Zero-fill padded channel lanes so kernels can safely read them.
            image[i2] = 0.0;
            i2 += 4;
          }
        }
        i1 += width;
      }
    }
    i0 += width * H;
  }
}
// Inverse of NCHWToImage: unpacks the RGBA half image back into an NCHW
// float tensor, visiting only the first C channel lanes (padding ignored).
void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor,
                                          const DDim &image_dim,
                                          const DDim &tensor_dim) {
  // Left-pad the shape with 1s so any rank <= 4 behaves like NCHW.
  size_t new_dims[] = {1, 1, 1, 1};
  for (size_t j = 0; j < tensor_dim.size(); ++j) {
    new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
  }
  size_t N, C, H, W;
  N = new_dims[0];
  C = new_dims[1];
  H = new_dims[2];
  W = new_dims[3];
  size_t width = image_dim[0];
  float *p = tensor;
  // i0/i1 are running texel indices; i2 is in half units (texel*4 + lane);
  // indexing mirrors NCHWToImage above.
  size_t i0 = 0;
  for (size_t n = 0; n < N; n++) {
    for (size_t c = 0; c < C; c++) {
      size_t i1 = i0 + (c / 4) * W;
      for (size_t h = 0; h < H; h++) {
        size_t i2 = (i1 << 2) + c % 4;
        for (size_t w = 0; w < W; w++) {
          *p = Half2Float(image[i2]);
          i2 += 4;
          p++;
        }
        i1 += width;
      }
    }
    i0 += width * H;
  }
}
// Image shape for the "folder" layout: rank-1/2 tensors fold columns into
// RGBA lanes (width = ceil(cols/4), height = rows); higher ranks use the
// standard layout (width = W * ceil(C/4), height = N * H). Also records the
// per-block geometry used by the accessors.
DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) {
  size_t image_width = 0;
  size_t image_height = 0;
  if (tensor_dim.size() <= 2) {
    size_t rows = 1;
    size_t cols = 1;
    if (tensor_dim.size() == 1) {
      cols = tensor_dim[0];
    } else {
      rows = tensor_dim[0];
      cols = tensor_dim[1];
    }
    image_width = (cols + 3) / 4;
    image_height = rows;
    width_of_one_block_ = image_width;
    height_of_one_block_ = image_height;
    c_block_ = 1;
  } else {
    size_t dims4[4] = {1, 1, 1, 1};
    const size_t rank = tensor_dim.size();
    for (size_t i = 0; i < rank; ++i) {
      dims4[4 - rank + i] = tensor_dim[i];
    }
    const size_t batch = dims4[0];
    const size_t channel = dims4[1];
    const size_t h = dims4[2];
    const size_t w = dims4[3];
    image_width = w * ((channel + 3) / 4);
    image_height = h * batch;
    width_of_one_block_ = w;
    height_of_one_block_ = h;
    c_block_ = image_width / w;
  }
  return DDim(std::vector<DDim::value_type>(
      {static_cast<DDim::value_type>(image_width),
       static_cast<DDim::value_type>(image_height)}));
}
// For rank > 2 delegate to the default layout. For rank 1/2 use the compact
// "folder" layout: element (h, w) lives in texel (h, w/4), RGBA lane w%4.
void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image,
                                         const DDim &tensor_dim) {
  CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
      << " Tensor dim is not support!";
  if (tensor_dim.size() > 2) {
    CLImageConverterDefault default_converter;
    default_converter.NCHWToImage(tensor, image, tensor_dim);
  } else {
    // tdim = {rows, cols}; a rank-1 tensor is treated as a single row.
    size_t tdim[2] = {1, 1};
    if (tensor_dim.size() == 1) {
      tdim[1] = tensor_dim[0];
    } else {
      tdim[0] = tensor_dim[0];
      tdim[1] = tensor_dim[1];
    }
    DDim image_dim = InitImageDimInfoWith(tensor_dim);
    size_t width = image_dim[0];
    for (size_t h = 0; h < tdim[0]; h++) {
      for (size_t w = 0; w < tdim[1]; w++) {
        // Texel (h, w/4), lane w%4; index is in half units (texel*4 + lane).
        image[(h * width + w / 4) * 4 + (w % 4)] =
            Float2Half(tensor[h * tdim[1] + w]);
      }
    }
  }
}
// Inverse of the folder NCHWToImage: rank > 2 delegates to the default
// converter; rank 1/2 reads element (h, w) from texel (h, w/4), lane w%4.
void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor,
                                         const DDim &image_dim,
                                         const DDim &tensor_dim) {
  if (tensor_dim.size() > 2) {
    CLImageConverterDefault default_converter;
    default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim);
  } else {
    size_t width = image_dim[0];
    // H = rows, W = cols; a rank-1 tensor is treated as a single row.
    size_t H = 1, W = 1;
    if (tensor_dim.size() == 2) {
      H = tensor_dim[0];
      W = tensor_dim[1];
    } else if (tensor_dim.size() == 1) {
      W = tensor_dim[0];
    }
    float *p = tensor;
    for (size_t h = 0; h < H; h++) {
      for (size_t w = 0; w < W; w++) {
        p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]);
      }
    }
  }
}
// N/W-blocked layout (rank-4 only): batches are folded into RGBA lanes —
// width = W * ceil(N/4), height = C * H.
DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
  CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
  const size_t batch = tensor_dim[0];
  const size_t channel = tensor_dim[1];
  const size_t h = tensor_dim[2];
  const size_t w = tensor_dim[3];
  const size_t image_width = w * ((batch + 3) / 4);
  const size_t image_height = channel * h;
  return DDim(std::vector<DDim::value_type>(
      {static_cast<DDim::value_type>(image_width),
       static_cast<DDim::value_type>(image_height)}));
}
// Packs an NCHW tensor with N folded into RGBA lanes: texel
// (x = (n/4)*W + w, y = c*H + h) holds batches [4*(n/4) .. 4*(n/4)+3].
// Lanes beyond N (up to block*4) are zero-filled padding.
void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image,
                                          const DDim &tensor_dim) {
  CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
  auto image_dim = InitImageDimInfoWith(tensor_dim);
  float *p = tensor;
  size_t N = tensor_dim[0];
  size_t C = tensor_dim[1];
  size_t H = tensor_dim[2];
  size_t W = tensor_dim[3];
  size_t width = image_dim[0];
  size_t height = image_dim[1];
  // Number of 4-batch blocks along the image width (= ceil(N/4)).
  size_t block = image_dim[0] / tensor_dim[3];
  for (size_t n = 0; n < block * 4; n++) {
    for (size_t c = 0; c < C; c++) {
      for (size_t h = 0; h < H; ++h) {
        for (size_t w = 0; w < W; ++w) {
          // Flat half-unit index: row y = c*H + h, texel x = (n/4)*W + w,
          // lane n%4.
          size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
                         w * 4 + n % 4;
          if (n < N) {
            image[index] = Float2Half(*p);
            p++;
          } else {
            // Zero-fill the padded batch lanes.
            image[index] = 0.0;
          }
          // Defensive bounds check: logs only, and only after the write.
          if (index >= (width * height * 4)) {
            LOG(INFO) << " index out of range ";
          }
        }
      }
    }
  }
  VLOG(3) << " init done";
}
// Inverse of the NW-block NCHWToImage: reads only the first N batch lanes
// (padding ignored) back into the NCHW float tensor.
void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor,
                                          const DDim &image_dim,
                                          const DDim &tensor_dim) {
  CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
  float *p = tensor;
  size_t N = tensor_dim[0];
  size_t C = tensor_dim[1];
  size_t H = tensor_dim[2];
  size_t W = tensor_dim[3];
  size_t width = image_dim[0];
  size_t height = image_dim[1];
  for (size_t n = 0; n < N; n++) {
    for (size_t c = 0; c < C; c++) {
      for (size_t h = 0; h < H; ++h) {
        for (size_t w = 0; w < W; ++w) {
          // Same flat half-unit index as NCHWToImage above.
          size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
                         w * 4 + n % 4;
          *p = Half2Float(image[index]);
          p++;
          // Defensive bounds check: logs only, after the read.
          if (index >= (width * height * 4)) {
            LOG(INFO) << " index out of range ";
          }
        }
      }
    }
  }
  VLOG(3) << " init done";
}
// Depthwise-blocked layout (rank-4 only): same arithmetic as the NW block —
// width = W * ceil(dim0 + 3) / 4), height = dim1 * H — computed from the
// raw tensor dims.
DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
  CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
  const size_t dim0 = tensor_dim[0];
  const size_t dim1 = tensor_dim[1];
  const size_t h = tensor_dim[2];
  const size_t w = tensor_dim[3];
  const size_t image_width = w * ((dim0 + 3) / 4);
  const size_t image_height = dim1 * h;
  return DDim(std::vector<DDim::value_type>(
      {static_cast<DDim::value_type>(image_width),
       static_cast<DDim::value_type>(image_height)}));
}
// Same packing loop as CLImageConverterDefault::NCHWToImage, but with the
// roles of dims 0 and 1 swapped (N = new_dims[1], C = new_dims[0]) —
// presumably to suit depthwise filter tensors shaped (C, 1, H, W); confirm
// against the depthwise conv kernels.
void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image,
                                          const DDim &tensor_dim) {
  // Left-pad the shape with 1s so any rank <= 4 behaves like NCHW.
  size_t new_dims[] = {1, 1, 1, 1};
  for (size_t j = 0; j < tensor_dim.size(); ++j) {
    new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
  }
  size_t N, C, H, W;
  // Note the deliberate swap relative to the default converter.
  N = new_dims[1];
  C = new_dims[0];
  H = new_dims[2];
  W = new_dims[3];
  DDim in_image_dim = InitImageDimInfoWith(tensor_dim);
  VLOG(3) << " tensor dim: " << tensor_dim;
  VLOG(3) << " image dim: " << in_image_dim;
  size_t width = in_image_dim[0];
  // Number of 4-channel blocks laid out along the image width.
  size_t w_block = width / W;
  float *p = tensor;
  // i0/i1 are running texel indices; i2 is in half units (texel*4 + lane).
  size_t i0 = 0;
  for (size_t n = 0; n < N; n++) {
    for (size_t c = 0; c < w_block * 4; c++) {
      size_t i1 = i0 + (c / 4) * W;
      for (size_t h = 0; h < H; h++) {
        size_t i2 = (i1 << 2) + c % 4;
        for (size_t w = 0; w < W; w++) {
          if (c < C) {
            // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
            // (c % 4);
            image[i2] = Float2Half(*p);
            i2 += 4;
            p++;
          } else {
            // Zero-fill padded channel lanes.
            image[i2] = 0.0;
            i2 += 4;
          }
        }
        i1 += width;
      }
    }
    i0 += width * H;
  }
}
// Inverse of the DW-block NCHWToImage; uses the same swapped dims
// (N = tensor_dim[1], C = tensor_dim[0]) and ignores the padding lanes.
void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor,
                                          const DDim &image_dim,
                                          const DDim &tensor_dim) {
  CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
  float *p = tensor;
  // Deliberate swap relative to the default converter (see NCHWToImage).
  size_t N = tensor_dim[1];
  size_t C = tensor_dim[0];
  size_t H = tensor_dim[2];
  size_t W = tensor_dim[3];
  size_t width = image_dim[0];
  // i0/i1 are running texel indices; i2 is in half units (texel*4 + lane).
  size_t i0 = 0;
  for (size_t n = 0; n < N; n++) {
    for (size_t c = 0; c < C; c++) {
      size_t i1 = i0 + (c / 4) * W;
      for (size_t h = 0; h < H; h++) {
        size_t i2 = (i1 << 2) + c % 4;
        for (size_t w = 0; w < W; w++) {
          *p = Half2Float(image[i2]);
          i2 += 4;
          p++;
        }
        i1 += width;
      }
    }
    i0 += width * H;
  }
}
// Standard layout: width = W * ceil(C/4), height = N * H. Also records the
// per-block geometry exposed through the accessors.
DDim CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) {
  size_t dims4[4] = {1, 1, 1, 1};
  const size_t rank = tensor_dim.size();
  for (size_t i = 0; i < rank; ++i) {
    dims4[4 - rank + i] = tensor_dim[i];
  }
  const size_t batch = dims4[0];
  const size_t channel = dims4[1];
  const size_t h = dims4[2];
  const size_t w = dims4[3];
  const size_t image_width = w * ((channel + 3) / 4);
  const size_t image_height = h * batch;
  width_of_one_block_ = w;
  height_of_one_block_ = h;
  c_block_ = image_width / w;
  return DDim(std::vector<DDim::value_type>(
      {static_cast<DDim::value_type>(image_width),
       static_cast<DDim::value_type>(image_height)}));
}
void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image,
const DDim &tensor_dim) {
CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
<< " Tensor dim is not support!";
CLImageConverterDefault default_converter;
default_converter.NCHWToImage(tensor, image, tensor_dim);
}
// Unpacking for the normal layout simply reuses the default converter.
void CLImageConverterNormal::ImageToNCHW(half_t *image, float *tensor,
                                         const DDim &image_dim,
                                         const DDim &tensor_dim) {
  CLImageConverterDefault delegate;
  delegate.ImageToNCHW(image, tensor, image_dim, tensor_dim);
}
// Winograd-transformed weight layout: width = ceil(C/4), height = 16 * N.
DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith(
    const DDim &tensor_dim) {
  CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
  const size_t batch = tensor_dim[0];
  const size_t channel = tensor_dim[1];
  const size_t image_width = (channel + 3) / 4;
  // N * (wino_blk_size + 2) * (wino_blk_size + 2)
  const size_t image_height = batch * 16;
  return DDim(std::vector<DDim::value_type>(
      {static_cast<DDim::value_type>(image_width),
       static_cast<DDim::value_type>(image_height)}));
}
// TODO(review): winograd weight packing is not implemented yet — this is an
// intentional no-op stub.
void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, half_t *image,
                                                  const DDim &tensor_dim) {}
// TODO(review): winograd weight unpacking is not implemented yet — this is
// an intentional no-op stub.
void CLImageConverterWinoTransWeight::ImageToNCHW(half_t *image, float *tensor,
                                                  const DDim &image_dim,
                                                  const DDim &tensor_dim) {}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/opencl/cl_half.h"
namespace paddle {
namespace lite {
// Abstract strategy for converting between host NCHW float tensors and the
// half-precision RGBA image layout used on the GPU.
class CLImageConverterBase {
 public:
  virtual ~CLImageConverterBase() {}
  // Packs |nchw| into |image|; the image must be sized per
  // InitImageDimInfoWith(tensor_dim).
  virtual void NCHWToImage(float *nchw, half_t *image,
                           const DDim &tensor_dim) = 0;
  // Inverse of NCHWToImage.
  virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim,
                           const DDim &tensor_dim) = 0;
  // Returns the 2-D image (width, height) required for |tensor_dim|.
  virtual DDim InitImageDimInfoWith(const DDim &tensor_dim) = 0;
};
class CLImageConverterDefault : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
class CLImageConverterFolder : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
/*
* width of original tensor
* */
inline size_t WidthOfOneBlock() const { return width_of_one_block_; }
/*
* height of original tensor
* */
inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
int GetCBlock() const { return c_block_; }
private:
int c_block_;
int width_of_one_block_;
int height_of_one_block_;
};
class CLImageConverterNormal : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
/*
* width of original tensor
* */
inline size_t WidthOfOneBlock() const { return width_of_one_block_; }
/*
* height of original tensor
* */
inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
int GetCBlock() const { return c_block_; }
private:
int c_block_;
int width_of_one_block_;
int height_of_one_block_;
};
class CLImageConverterNWBlock : public CLImageConverterBase {
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
class CLImageConverterDWBlock : public CLImageConverterBase {
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
class CLImageConverterWinoTransWeight : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
// Fused activation helper; PRELU / RELU are selected at compile time via
// kernel build options.
// Fix: when neither macro is defined the original returned an
// *uninitialized* `output` (undefined behavior). Default to the identity
// (pass `in` through) instead.
inline half4 activation(half4 in
#ifdef PRELU
                        ,
                        half4 prelu_alpha
#endif
                        ) {
  half4 output = in;
#ifdef PRELU
  // PReLU: alpha * x for negative lanes, x otherwise.
  output = select(prelu_alpha * in, in, in >= (half4)0.0);
#endif
#ifdef RELU
  output = fmax(in, (half4)(0.0f));
#endif
  return output;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
// Element-wise add of two fp16 images; one work-item per texel.
// Fixes: (1) image kernel args must be access-qualified — the inputs were
// declared `__global` instead of `__read_only`; (2) read_imageh with
// integer coordinates requires CLK_NORMALIZED_COORDS_FALSE per the
// OpenCL C spec (the original used CLK_NORMALIZED_COORDS_TRUE).
__kernel void elementwise_add(__read_only image2d_t input,
                              __read_only image2d_t bias,
                              __write_only image2d_t outputImage) {
  int x = get_global_id(0);
  int y = get_global_id(1);
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  int2 coords;
  coords.x = x;
  coords.y = y;
  half4 in = read_imageh(input, sampler, coords);
  half4 biase = read_imageh(bias, sampler, coords);
  half4 output = in + biase;
  write_imageh(outputImage, coords, output);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define MIN_VALUE -FLT_MAX
// Max pooling over a ksize_h x ksize_w window. One work-item per output
// texel: out_c indexes the channel block along x, out_nh packs
// (batch, out_h) along y.
// Fix: read_imageh with integer coordinates requires
// CLK_NORMALIZED_COORDS_FALSE per the OpenCL C spec (was ..._TRUE).
__kernel void pool_max(
    __private const int in_height, __private const int in_width,
    __private const int out_height, __private const int out_width,
    __private const int pad_top, __private const int pad_left,
    __private const int stride_h, __private const int stride_w,
    __private const int ksize_h, __private const int ksize_w,
    __read_only image2d_t input, __write_only image2d_t output) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);
  const int out_n = out_nh / out_height;
  const int out_h = out_nh % out_height;
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  // Window end is computed from the *unclamped* start so padding shrinks
  // the window instead of shifting it; then clamp the start to the input.
  int start_h = out_h * stride_h - pad_top;
  int end_h = min(start_h + ksize_h, in_height);
  start_h = max(start_h, 0);
  int start_w = out_w * stride_w - pad_left;
  int end_w = min(start_w + ksize_w, in_width);
  start_w = max(start_w, 0);
  // Base offset of this (channel block, batch) tile inside the image.
  const int pos_in_x = out_c * in_width;
  const int pos_in_y = out_n * in_height;
  half4 max_value = (half4)(MIN_VALUE);
  for (int y = start_h; y < end_h; ++y) {
    for (int x = start_w; x < end_w; ++x) {
      half4 tmp =
          read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
      max_value = max(max_value, tmp);
    }
  }
  const int pos_out_x = mad24(out_c, out_width, out_w);
  write_imageh(output, (int2)(pos_out_x, out_nh), max_value);
}
// Average pooling (exclusive of padding): `num` counts only the input
// elements actually visited.
// Fixes: (1) the window end was computed from the *clamped* start, so
// padded windows extended past their true extent and skewed the average —
// now matches pool_max's window computation; (2) read_imageh with integer
// coordinates requires CLK_NORMALIZED_COORDS_FALSE per the OpenCL C spec.
__kernel void pool_avg(
    __private const int in_height, __private const int in_width,
    __private const int out_height, __private const int out_width,
    __private const int pad_top, __private const int pad_left,
    __private const int stride_h, __private const int stride_w,
    __private const int ksize_h, __private const int ksize_w,
    __read_only image2d_t input, __write_only image2d_t output) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);
  const int out_n = out_nh / out_height;
  const int out_h = out_nh % out_height;
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  // End from the unclamped start, then clamp the start (as in pool_max).
  int start_h = out_h * stride_h - pad_top;
  int end_h = min(start_h + ksize_h, in_height);
  start_h = max(start_h, 0);
  int start_w = out_w * stride_w - pad_left;
  int end_w = min(start_w + ksize_w, in_width);
  start_w = max(start_w, 0);
  // Base offset of this (channel block, batch) tile inside the image.
  const int pos_in_x = out_c * in_width;
  const int pos_in_y = out_n * in_height;
  half4 sum = (half4)(0.0f);
  int num = 0;
  for (int y = start_h; y < end_h; ++y) {
    for (int x = start_w; x < end_w; ++x) {
      sum += read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
      num++;
    }
  }
  half4 avg = sum / num;
  const int pos_out_x = mad24(out_c, out_width, out_w);
  write_imageh(output, (int2)(pos_out_x, out_nh), avg);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <memory>
#include <random>
#include <vector>
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/opencl/cl_caller.h"
#include "paddle/fluid/lite/opencl/cl_context.h"
#include "paddle/fluid/lite/opencl/cl_engine.h"
#include "paddle/fluid/lite/opencl/cl_helper.h"
#include "paddle/fluid/lite/opencl/cl_image.h"
DEFINE_string(cl_path, "/data/local/tmp/opencl", "The OpenCL kernels path.");
namespace paddle {
namespace lite {
// Smoke test: the engine singleton initializes, the lazily-created OpenCL
// handles construct, and a program can be created and built from one of the
// kernel source files under FLAGS_cl_path.
TEST(cl_test, engine_test) {
  auto* engine = CLEngine::Global();
  CHECK(engine->IsInitSuccess());
  engine->set_cl_path(FLAGS_cl_path);
  // Touch the accessors so their lazy construction paths run.
  engine->platform();
  engine->device();
  engine->command_queue();
  auto& context = engine->context();
  auto program = engine->CreateProgram(
      context, engine->cl_path() + "/cl_kernel/" + "elementwise_add_kernel.cl");
  auto event = engine->CreateEvent(context);
  CHECK(engine->BuildProgram(program.get()));
}
// Verifies kernels can be fetched through CLContext, including requesting
// the same kernel twice (the second lookup presumably hits a program/kernel
// cache — confirm in CLContext::GetKernel).
TEST(cl_test, context_test) {
  auto* engine = CLEngine::Global();
  CHECK(engine->IsInitSuccess());
  engine->set_cl_path(FLAGS_cl_path);
  CLContext context;
  context.GetKernel("pool_max", "pool_kernel.cl", "");
  context.GetKernel("elementwise_add", "elementwise_add_kernel.cl", "");
  context.GetKernel("elementwise_add", "elementwise_add_kernel.cl", "");
}
// End-to-end run of the elementwise_add kernel: uploads two 1024x512
// tensors as images, launches one work-item per texel, and prints the
// output image plus the kernel's profiled runtime.
TEST(cl_test, kernel_test) {
  auto* engine = CLEngine::Global();
  CHECK(engine->IsInitSuccess());
  engine->set_cl_path(FLAGS_cl_path);
  std::unique_ptr<CLContext> context(new CLContext);
  // std::unique_ptr<CLHelper> helper(new CLHelper(context.get()));
  std::unique_ptr<CLHelper> helper(new CLHelper);
  helper->set_context(context.get());
  helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
  helper->AddKernel("pool_max", "pool_kernel.cl");
  helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
  // Index 2 is the second elementwise_add added above (insertion order).
  auto kernel = helper->KernelAt(2);
  std::unique_ptr<float[]> in_data(new float[1024 * 512]);
  for (int i = 0; i < 1024 * 512; i++) {
    in_data[i] = 1.f;
  }
  const DDim in_dim = DDim(std::vector<DDim::value_type>{1024, 512});
  CLImage in_image;
  in_image.set_tensor_data(in_data.get(), in_dim);
  in_image.InitNormalCLImage(helper->OpenCLContext());
  LOG(INFO) << in_image;
  std::unique_ptr<float[]> bias_data(new float[1024 * 512]);
  for (int i = 0; i < 1024 * 512; i++) {
    bias_data[i] = 2.f;
  }
  const DDim bias_dim = DDim(std::vector<DDim::value_type>{1024, 512});
  CLImage bias_image;
  bias_image.set_tensor_data(bias_data.get(), bias_dim);
  bias_image.InitNormalCLImage(helper->OpenCLContext());
  LOG(INFO) << bias_image;
  // Output image is allocated empty (no host upload).
  CLImage out_image;
  const DDim out_dim = DDim(std::vector<DDim::value_type>{1024, 512});
  out_image.InitEmptyImage(helper->OpenCLContext(), out_dim);
  LOG(INFO) << out_image;
  cl_int status;
  status = kernel.setArg(0, *in_image.cl_image());
  CL_CHECK_ERRORS(status);
  status = kernel.setArg(1, *bias_image.cl_image());
  CL_CHECK_ERRORS(status);
  status = kernel.setArg(2, *out_image.cl_image());
  CL_CHECK_ERRORS(status);
  // auto global_work_size = helper->DefaultWorkSize(out_image);
  // One work-item per texel of the input image.
  size_t width = in_image.ImageWidth();
  size_t height = in_image.ImageHeight();
  auto global_work_size = cl::NDRange{width, height};
  cl::Event event;
  status = helper->OpenCLCommandQueue().enqueueNDRangeKernel(
      kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event);
  CL_CHECK_ERRORS(status);
  // NOTE(review): getProfilingInfo is only valid if the command queue was
  // created with CL_QUEUE_PROFILING_ENABLE — confirm in CLEngine.
  double start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
  double stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
  double elapsed_micros = (stop_nanos - start_nanos) / 1000.0;
  LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us.";
  LOG(INFO) << out_image;
}
// Runs the high-level elementwise_add wrapper on random 1024x512 inputs and
// prints ~20 samples of the result (no numeric assertion — manual check).
TEST(cl_test, elementwise_add_test) {
  std::default_random_engine engine;
  std::uniform_real_distribution<float> dist(-5, 5);
  const DDim in_dim = DDim(std::vector<DDim::value_type>{1024, 512});
  std::unique_ptr<float[]> in_data(new float[1024 * 512]);
  for (int i = 0; i < 1024 * 512; i++) {
    in_data[i] = dist(engine);
  }
  const DDim bias_dim = DDim(std::vector<DDim::value_type>{1024, 512});
  std::unique_ptr<float[]> bias_data(new float[1024 * 512]);
  for (int i = 0; i < 1024 * 512; i++) {
    bias_data[i] = dist(engine);
  }
  const DDim out_dim = DDim(std::vector<DDim::value_type>{1024, 512});
  std::unique_ptr<float[]> out(new float[1024 * 512]);
  bool status = InitOpenCLEngine(FLAGS_cl_path);
  CHECK(status) << "Fail to initialize OpenCL engine.";
  CLContext context;
  elementwise_add(&context, in_data.get(), in_dim, bias_data.get(), bias_dim,
                  out.get(), out_dim);
  // Sample ~20 evenly spaced outputs for eyeballing.
  int stride = 1024 * 512 / 20;
  for (int i = 0; i < 1024 * 512; i += stride) {
    std::cout << out[i] << " ";
  }
  std::cout << std::endl;
}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/lite/opencl/cl_tool.h"
namespace paddle {
namespace lite {
// Maps an OpenCL status code to the name of its symbolic constant, e.g.
// CL_INVALID_VALUE -> "CL_INVALID_VALUE". Unrecognized codes produce a
// fallback string. Assumes |error| is a single status code, never a bit
// combination of several codes.
const char *opencl_error_to_str(cl_int error) {
// Expands to one switch case that returns the stringified constant name.
#define CL_ERROR_CASE(CONSTANT) \
  case CONSTANT:                \
    return #CONSTANT;
  switch (error) {
    CL_ERROR_CASE(CL_SUCCESS)
    CL_ERROR_CASE(CL_DEVICE_NOT_FOUND)
    CL_ERROR_CASE(CL_DEVICE_NOT_AVAILABLE)
    CL_ERROR_CASE(CL_COMPILER_NOT_AVAILABLE)
    CL_ERROR_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE)
    CL_ERROR_CASE(CL_OUT_OF_RESOURCES)
    CL_ERROR_CASE(CL_OUT_OF_HOST_MEMORY)
    CL_ERROR_CASE(CL_PROFILING_INFO_NOT_AVAILABLE)
    CL_ERROR_CASE(CL_MEM_COPY_OVERLAP)
    CL_ERROR_CASE(CL_IMAGE_FORMAT_MISMATCH)
    CL_ERROR_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED)
    CL_ERROR_CASE(CL_BUILD_PROGRAM_FAILURE)
    CL_ERROR_CASE(CL_MAP_FAILURE)
    CL_ERROR_CASE(CL_MISALIGNED_SUB_BUFFER_OFFSET)
    CL_ERROR_CASE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)
    CL_ERROR_CASE(CL_INVALID_VALUE)
    CL_ERROR_CASE(CL_INVALID_DEVICE_TYPE)
    CL_ERROR_CASE(CL_INVALID_PLATFORM)
    CL_ERROR_CASE(CL_INVALID_DEVICE)
    CL_ERROR_CASE(CL_INVALID_CONTEXT)
    CL_ERROR_CASE(CL_INVALID_QUEUE_PROPERTIES)
    CL_ERROR_CASE(CL_INVALID_COMMAND_QUEUE)
    CL_ERROR_CASE(CL_INVALID_HOST_PTR)
    CL_ERROR_CASE(CL_INVALID_MEM_OBJECT)
    CL_ERROR_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)
    CL_ERROR_CASE(CL_INVALID_IMAGE_SIZE)
    CL_ERROR_CASE(CL_INVALID_SAMPLER)
    CL_ERROR_CASE(CL_INVALID_BINARY)
    CL_ERROR_CASE(CL_INVALID_BUILD_OPTIONS)
    CL_ERROR_CASE(CL_INVALID_PROGRAM)
    CL_ERROR_CASE(CL_INVALID_PROGRAM_EXECUTABLE)
    CL_ERROR_CASE(CL_INVALID_KERNEL_NAME)
    CL_ERROR_CASE(CL_INVALID_KERNEL_DEFINITION)
    CL_ERROR_CASE(CL_INVALID_KERNEL)
    CL_ERROR_CASE(CL_INVALID_ARG_INDEX)
    CL_ERROR_CASE(CL_INVALID_ARG_VALUE)
    CL_ERROR_CASE(CL_INVALID_ARG_SIZE)
    CL_ERROR_CASE(CL_INVALID_KERNEL_ARGS)
    CL_ERROR_CASE(CL_INVALID_WORK_DIMENSION)
    CL_ERROR_CASE(CL_INVALID_WORK_GROUP_SIZE)
    CL_ERROR_CASE(CL_INVALID_WORK_ITEM_SIZE)
    CL_ERROR_CASE(CL_INVALID_GLOBAL_OFFSET)
    CL_ERROR_CASE(CL_INVALID_EVENT_WAIT_LIST)
    CL_ERROR_CASE(CL_INVALID_EVENT)
    CL_ERROR_CASE(CL_INVALID_OPERATION)
    CL_ERROR_CASE(CL_INVALID_GL_OBJECT)
    CL_ERROR_CASE(CL_INVALID_BUFFER_SIZE)
    CL_ERROR_CASE(CL_INVALID_MIP_LEVEL)
    CL_ERROR_CASE(CL_INVALID_GLOBAL_WORK_SIZE)
    CL_ERROR_CASE(CL_INVALID_PROPERTY)
    default:
      return "UNKNOWN ERROR CODE";
  }
#undef CL_ERROR_CASE
}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/lite/opencl/cl2_header.h"
namespace paddle {
namespace lite {
const char* opencl_error_to_str(cl_int error);
// Logs a human-readable OpenCL error message when ERR is not CL_SUCCESS.
// Wrapped in do { } while (0) so the macro acts as a single statement and
// is safe inside an unbraced if/else (the original bare `if` had a
// dangling-else hazard); ERR is also evaluated exactly once.
#define CL_CHECK_ERRORS(ERR)                                           \
  do {                                                                 \
    cl_int cl_check_err_value = (ERR);                                 \
    if (cl_check_err_value != CL_SUCCESS) {                            \
      printf(                                                          \
          "OpenCL error with code %s happened in file %s at line %d. " \
          "Exiting.\n",                                                \
          opencl_error_to_str(cl_check_err_value), __FILE__,           \
          __LINE__);                                                   \
    }                                                                  \
  } while (0)
} // namespace lite
} // namespace paddle
此差异已折叠。
......@@ -13,9 +13,10 @@
// limitations under the License.
#pragma once
/*
* ATTENTION this header file can only include in .cc file.
*/
// ATTENTION: this header can only be included in a .cc file.
#include "paddle/fluid/lite/core/op_registry.h"
USE_LITE_OP(mul);
USE_LITE_OP(fc);
......
......@@ -25,6 +25,23 @@ function cmake_x86 {
cmake .. -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DLITE_WITH_X86=ON ${common_flags}
}
# Configures an OpenCL-enabled ARM build of Paddle Lite in the current
# build directory.
# $1: ARM_TARGET_OS in "android", "armlinux"
# $2: ARM_TARGET_ARCH_ABI in "arm64-v8a", "armeabi-v7a", "armeabi-v7a-hf"
function cmake_opencl {
    local arm_os=$1
    local arm_abi=$2
    cmake .. \
        -DLITE_WITH_OPENCL=ON \
        -DWITH_GPU=OFF \
        -DWITH_MKL=OFF \
        -DWITH_LITE=ON \
        -DLITE_WITH_CUDA=OFF \
        -DLITE_WITH_X86=OFF \
        -DLITE_WITH_ARM=ON \
        -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
        -DWITH_TESTING=ON \
        -DARM_TARGET_OS=${arm_os} -DARM_TARGET_ARCH_ABI=${arm_abi}
}
# This method is only called in CI.
function cmake_x86_for_CI {
prepare_for_codegen # fake an empty __generated_code__.cc to pass cmake.
......@@ -85,8 +102,8 @@ function build_test_server {
# test_arm_android <some_test_name> <adb_port_number>
function test_arm_android {
test_name=$1
port=$2
local test_name=$1
local port=$2
if [[ "${test_name}x" == "x" ]]; then
echo "test_name can not be empty"
exit 1
......@@ -99,12 +116,18 @@ function test_arm_android {
echo "test name: ${test_name}"
adb_work_dir="/data/local/tmp"
skip_list=("test_model_parser_lite" "test_mobilenetv1_lite" "test_mobilenetv2_lite" "test_resnet50_lite" "test_inceptionv4_lite")
skip_list=("test_model_parser_lite" "test_mobilenetv1_lite" "test_mobilenetv2_lite" "test_resnet50_lite" "test_inceptionv4_lite" "test_light_api_lite" "test_apis_lite")
for skip_name in ${skip_list[@]} ; do
[[ $skip_name =~ (^|[[:space:]])$test_name($|[[:space:]]) ]] && echo "skip $test_name" && return
done
testpath=$(find ./paddle/fluid -name ${test_name})
local testpath=$(find ./paddle/fluid -name ${test_name})
# if [[ "$test_name" == "test_light_api" ]]; then
# local model_path=$(find . -name "lite_naive_model")
# arm_push_necessary_file $port $model_path $adb_work_dir
# fi
adb -s emulator-${port} push ${testpath} ${adb_work_dir}
adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}"
adb -s emulator-${port} shell "./${adb_work_dir}/${test_name}"
......@@ -204,6 +227,7 @@ function test_arm {
abi=$2
lang=$3
port=$4
if [[ ${os} == "armlinux" ]]; then
# TODO(hongming): enable test armlinux on armv8, armv7 and armv7hf
echo "Skip test arm linux yet. armlinux must in another docker"
......@@ -221,6 +245,7 @@ function test_arm {
return 0
fi
echo "test file: ${TESTS_FILE}"
for _test in $(cat $TESTS_FILE); do
test_arm_android $_test $port
......@@ -235,13 +260,21 @@ function prepare_emulator {
adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
# start android armv8 and armv7 emulators first
echo n | avdmanager create avd -f -n paddle-armv8 -k "system-images;android-24;google_apis;arm64-v8a"
echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv8 -noaudio -no-window -gpu off -verbose -port ${port_armv8} &
echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv8 -noaudio -no-window -gpu off -port ${port_armv8} &
sleep 1m
echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a"
echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -verbose -port ${port_armv7} &
echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -port ${port_armv7} &
sleep 1m
}
# Pushes one local file to a directory on the given Android emulator.
# $1: emulator adb port; $2: local file path; $3: device work directory.
function arm_push_necessary_file {
    local port=$1
    local testpath=$2
    local adb_work_dir=$3
    # Quote every expansion so paths containing spaces survive word
    # splitting (the unquoted form silently broke such paths).
    adb -s "emulator-${port}" push "${testpath}" "${adb_work_dir}"
}
# We split the arm unittests into several sub-tasks to run them in parallel and reduce the overall CI time.
# sub-task1
......@@ -286,20 +319,22 @@ function build_test_arm_subtask_armlinux {
prepare_emulator $port_armv8 $port_armv7
cur=$PWD
# job 5
build_arm "armlinux" "armv8"
test_arm "armlinux" "armv8"
cd -
build_arm "armlinux" "armv8" "gcc" $port_armv8
test_arm "armlinux" "armv8" "gcc" $port_armv8
cd $cur
# job 6
build_arm "armlinux" "armv7"
test_arm "armlinux" "armv7"
cd -
build_arm "armlinux" "armv7" "gcc" $port_armv8
test_arm "armlinux" "armv7" "gcc" $port_armv8
cd $cur
# job 7
build_arm "armlinux" "armv7hf"
test_arm "armlinux" "armv7hf"
cd -
build_arm "armlinux" "armv7hf" "gcc" $port_armv8
test_arm "armlinux" "armv7hf" "gcc" $port_armv8
cd $cur
adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
echo "Done"
......@@ -333,6 +368,22 @@ function build_test_arm_subtask_model {
echo "Done"
}
# This test loads a model, optimizes it and checks the prediction result
# of both the cxx and light APIs.
# $1: emulator adb port; $2: device work directory; $3: local model path.
function test_arm_predict_apis {
    local port=$1
    local workspace=$2
    local naive_model_path=$3
    local api_test_path=$(find . -name "test_apis_lite")
    # the model ends up at ${workspace}/lite_naive_model on the device
    adb -s "emulator-${port}" push "${naive_model_path}" "${workspace}"
    adb -s "emulator-${port}" push "${api_test_path}" "${workspace}"
    # test cxx_api first to store the optimized model. cd into the
    # workspace before running: the binary and the relative model paths
    # live there (a bare "./test_apis_lite" would resolve against the
    # adb shell's default working directory instead).
    adb -s "emulator-${port}" shell "cd ${workspace} && ./test_apis_lite --model_dir ./lite_naive_model --optimized_model ./lite_naive_model_opt"
}
# Build the code and run lite arm tests. This is executed in the CI system.
function build_test_arm {
########################################################################
......@@ -404,6 +455,10 @@ function main {
cmake_x86
shift
;;
cmake_opencl)
cmake_opencl $ARM_OS $ARM_ABI
shift
;;
cmake_cuda)
cmake_cuda
shift
......
......@@ -18,11 +18,12 @@
#include <sys/stat.h>
#include <sys/types.h>
#include <cerrno>
#include <fstream>
#include <string>
#include "paddle/fluid/lite/utils/cp_logging.h"
#include "paddle/fluid/lite/utils/string.h"
namespace paddle {
namespace lite {
static bool IsFileExists(const std::string &path) {
static bool IsFileExists(const std::string& path) {
std::ifstream file(path);
bool res = file.is_open();
if (res) {
......@@ -31,5 +32,15 @@ static bool IsFileExists(const std::string &path) {
return res;
}
// Creates |path| and every missing parent directory (like `mkdir -p`),
// succeeding when the directories already exist.
// ARM mobile does not support shelling out via system(), so it uses
// mkdir(2) directly.
static void MkDirRecur(const std::string& path) {
#ifndef LITE_WITH_ARM
  // Host builds can shell out; `mkdir -p` is recursive and idempotent.
  CHECK_EQ(system(string_format("mkdir -p %s", path.c_str()).c_str()), 0)
      << "Can't mkdir " << path;
#else  // On ARM
  // mkdir(2) creates a single level only and fails with EEXIST on an
  // existing directory, so walk the path and create each component in
  // turn, tolerating EEXIST to match the `mkdir -p` branch above.
  size_t pos = 0;
  do {
    pos = path.find('/', pos + 1);
    const std::string dir = path.substr(0, pos);
    if (!dir.empty() && mkdir(dir.c_str(), S_IRWXU) == -1) {
      CHECK_EQ(errno, EEXIST) << "Can't mkdir " << dir;
    }
  } while (pos != std::string::npos);
#endif
}
} // namespace lite
} // namespace paddle
......@@ -74,5 +74,15 @@ static std::string Repr(const std::vector<std::string>& v) {
return "{" + Join(tmp, ",") + "}";
}
// Splits |s| into pieces on every occurrence of |delim|.
// Consecutive delimiters yield empty pieces; a trailing delimiter does
// not append a final empty piece; an empty input yields an empty vector.
static std::vector<std::string> Split(const std::string& s, char delim) {
  std::vector<std::string> pieces;
  std::istringstream input(s);
  for (std::string piece; std::getline(input, piece, delim);) {
    pieces.push_back(piece);
  }
  return pieces;
}
} // namespace lite
} // namespace paddle
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册