未验证 提交 e7d31e1f 编写于 作者: S Santa An 提交者: GitHub

Merge branch 'develop' into baolei/bitmain

...@@ -63,6 +63,16 @@ test/models/ ...@@ -63,6 +63,16 @@ test/models/
test/images/ test/images/
*.pyc
# model
*.nb
*.svg
*.dot
# vim intermediate files
*.swp
# Emacs intermediate files # Emacs intermediate files
*~ *~
......
...@@ -45,7 +45,7 @@ else() ...@@ -45,7 +45,7 @@ else()
# we changed the source code to adapt for windows compiling # we changed the source code to adapt for windows compiling
# git diffs : (1) unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h # git diffs : (1) unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
###################################################################################################### ######################################################################################################
URL https://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip URL http://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip
DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR} DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR}
DOWNLOAD_NO_PROGRESS 1 DOWNLOAD_NO_PROGRESS 1
PREFIX ${EIGEN_SOURCE_DIR} PREFIX ${EIGEN_SOURCE_DIR}
......
...@@ -48,7 +48,7 @@ cuda的编译结果位于 `build_cuda/inference_lite_lib` ...@@ -48,7 +48,7 @@ cuda的编译结果位于 `build_cuda/inference_lite_lib`
4、 `demo` 文件夹:c++ demo. 4、 `demo` 文件夹:c++ demo.
如果编译打开了python选项,则会在 `build_cuda/inference_lite_lib/python/lib/` 目录下生成 `lite_core.so` 如果编译打开了python选项,则会在 `build_cuda/inference_lite_lib/python/lib/` 目录下生成 `lite.so`
## 运行 ## 运行
...@@ -66,7 +66,7 @@ wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/kite.jpg ...@@ -66,7 +66,7 @@ wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/kite.jpg
二: 运行 二: 运行
**NOTE:**此处示例使用的是python接口。 **NOTE:** 此处示例使用的是python接口。
``` python ``` python
#-*- coding: utf-8 -*- #-*- coding: utf-8 -*-
...@@ -75,7 +75,7 @@ import sys ...@@ -75,7 +75,7 @@ import sys
import numpy as np import numpy as np
import cv2 import cv2
sys.path.append('build_cuda/inference_lite_lib/python/lib') sys.path.append('build_cuda/inference_lite_lib/python/lib')
from lite_core import * from lite import *
def read_img(im_path, resize_h, resize_w): def read_img(im_path, resize_h, resize_w):
im = cv2.imread(im_path).astype('float32') im = cv2.imread(im_path).astype('float32')
......
...@@ -369,6 +369,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) ...@@ -369,6 +369,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mask_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mask_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mask_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mask_detection/Makefile" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mask_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mask_detection/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_libs" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_libs/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_libs/Makefile"
) )
add_dependencies(publish_inference_android_cxx_demos logging gflags) add_dependencies(publish_inference_android_cxx_demos logging gflags)
add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos) add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)
......
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR LITE_SHUTDOWN_LOG)
lite_cc_library(place SRCS paddle_place.cc DEPS logging) lite_cc_library(place SRCS paddle_place.cc DEPS logging)
else() else()
lite_cc_library(place SRCS paddle_place.cc DEPS glog) lite_cc_library(place SRCS paddle_place.cc DEPS glog)
......
...@@ -151,6 +151,11 @@ std::vector<std::string> Predictor::GetInputNames() { return input_names_; } ...@@ -151,6 +151,11 @@ std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
// get outputnames // get outputnames
std::vector<std::string> Predictor::GetOutputNames() { return output_names_; } std::vector<std::string> Predictor::GetOutputNames() { return output_names_; }
// get param names
// Returns the names of all attribute variables (i.e. the model's
// parameters/weights) registered in the predictor's execution scope.
// NOTE(review): assumes exec_scope_ has been initialized (by Build());
// no null-check is performed here — confirm callers always Build() first.
std::vector<std::string> Predictor::GetParamNames() {
return exec_scope_->AttributeVarNames();
}
// append the names of inputs and outputs into input_names_ and output_names_ // append the names of inputs and outputs into input_names_ and output_names_
void Predictor::PrepareFeedFetch() { void Predictor::PrepareFeedFetch() {
if (!program_) { if (!program_) {
...@@ -293,6 +298,7 @@ void Predictor::Build(const cpp::ProgramDesc &desc, ...@@ -293,6 +298,7 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
// `inner_places` is used to optimize passes // `inner_places` is used to optimize passes
std::vector<Place> inner_places = valid_places; std::vector<Place> inner_places = valid_places;
for (auto &valid_place : valid_places) { for (auto &valid_place : valid_places) {
if (valid_place.target == TARGET(kOpenCL)) continue;
inner_places.emplace_back( inner_places.emplace_back(
Place(TARGET(kHost), valid_place.precision, valid_place.layout)); Place(TARGET(kHost), valid_place.precision, valid_place.layout));
} }
...@@ -345,9 +351,16 @@ void Predictor::GenRuntimeProgram() { ...@@ -345,9 +351,16 @@ void Predictor::GenRuntimeProgram() {
const lite::Tensor *Predictor::GetTensor(const std::string &name) const { const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
auto *var = exec_scope_->FindVar(name); auto *var = exec_scope_->FindVar(name);
CHECK(var) << "no variable named with " << name << " in exec_scope";
return &var->Get<lite::Tensor>(); return &var->Get<lite::Tensor>();
} }
// Look up the variable `name` in the execution scope and return it as a
// mutable lite::Tensor (non-owning pointer; the scope retains ownership).
// Aborts via CHECK if no variable with that name exists.
lite::Tensor *Predictor::GetMutableTensor(const std::string &name) {
auto *var = exec_scope_->FindVar(name);
CHECK(var) << "no variable named with " << name << " in exec_scope";
return var->GetMutable<lite::Tensor>();
}
// get input by name // get input by name
lite::Tensor *Predictor::GetInputByName(const std::string &name) { lite::Tensor *Predictor::GetInputByName(const std::string &name) {
auto element = std::find(input_names_.begin(), input_names_.end(), name); auto element = std::find(input_names_.begin(), input_names_.end(), name);
......
...@@ -85,6 +85,9 @@ class LITE_API Predictor { ...@@ -85,6 +85,9 @@ class LITE_API Predictor {
// get inputnames and get outputnames. // get inputnames and get outputnames.
std::vector<std::string> GetInputNames(); std::vector<std::string> GetInputNames();
std::vector<std::string> GetOutputNames(); std::vector<std::string> GetOutputNames();
// get param names
std::vector<std::string> GetParamNames();
void PrepareFeedFetch(); void PrepareFeedFetch();
// Get offset-th col of fetch results. // Get offset-th col of fetch results.
...@@ -92,6 +95,9 @@ class LITE_API Predictor { ...@@ -92,6 +95,9 @@ class LITE_API Predictor {
std::vector<const lite::Tensor*> GetOutputs() const; std::vector<const lite::Tensor*> GetOutputs() const;
const cpp::ProgramDesc& program_desc() const; const cpp::ProgramDesc& program_desc() const;
// get a mutable tensor according to its name
lite::Tensor* GetMutableTensor(const std::string& name);
// get a const tensor according to its name
const lite::Tensor* GetTensor(const std::string& name) const; const lite::Tensor* GetTensor(const std::string& name) const;
const RuntimeProgram& runtime_program() const; const RuntimeProgram& runtime_program() const;
...@@ -142,9 +148,15 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor { ...@@ -142,9 +148,15 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
// get inputs names and get outputs names // get inputs names and get outputs names
std::vector<std::string> GetInputNames() override; std::vector<std::string> GetInputNames() override;
std::vector<std::string> GetOutputNames() override; std::vector<std::string> GetOutputNames() override;
// get param names
std::vector<std::string> GetParamNames() override;
// get tensor according to tensor's name
std::unique_ptr<const lite_api::Tensor> GetTensor( std::unique_ptr<const lite_api::Tensor> GetTensor(
const std::string& name) const override; const std::string& name) const override;
// get a mutable tensor according to tensor's name
std::unique_ptr<lite_api::Tensor> GetMutableTensor(
const std::string& name) override;
// Get InputTebsor by name // Get InputTebsor by name
std::unique_ptr<lite_api::Tensor> GetInputByName( std::unique_ptr<lite_api::Tensor> GetInputByName(
......
...@@ -97,6 +97,10 @@ std::vector<std::string> CxxPaddleApiImpl::GetInputNames() { ...@@ -97,6 +97,10 @@ std::vector<std::string> CxxPaddleApiImpl::GetInputNames() {
return raw_predictor_.GetInputNames(); return raw_predictor_.GetInputNames();
} }
// Public-API shim: forwards to the underlying Predictor to fetch the
// names of the model's parameter variables.
std::vector<std::string> CxxPaddleApiImpl::GetParamNames() {
return raw_predictor_.GetParamNames();
}
std::vector<std::string> CxxPaddleApiImpl::GetOutputNames() { std::vector<std::string> CxxPaddleApiImpl::GetOutputNames() {
return raw_predictor_.GetOutputNames(); return raw_predictor_.GetOutputNames();
} }
...@@ -123,6 +127,12 @@ std::unique_ptr<const lite_api::Tensor> CxxPaddleApiImpl::GetTensor( ...@@ -123,6 +127,12 @@ std::unique_ptr<const lite_api::Tensor> CxxPaddleApiImpl::GetTensor(
return std::unique_ptr<const lite_api::Tensor>(new lite_api::Tensor(x)); return std::unique_ptr<const lite_api::Tensor>(new lite_api::Tensor(x));
} }
// Public-API shim: returns a mutable tensor by name, wrapped in the
// lite_api::Tensor facade. The unique_ptr owns only the wrapper;
// NOTE(review): the underlying tensor storage appears to remain owned by
// the predictor's scope — confirm lifetime contract with callers.
std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetMutableTensor(
const std::string &name) {
return std::unique_ptr<lite_api::Tensor>(
new lite_api::Tensor(raw_predictor_.GetMutableTensor(name)));
}
std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetInputByName( std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetInputByName(
const std::string &name) { const std::string &name) {
return std::unique_ptr<lite_api::Tensor>( return std::unique_ptr<lite_api::Tensor>(
......
...@@ -36,7 +36,7 @@ DEFINE_string(model_dir_0, "", "model_dir_0"); ...@@ -36,7 +36,7 @@ DEFINE_string(model_dir_0, "", "model_dir_0");
DEFINE_string(input_shape_0, DEFINE_string(input_shape_0,
"1,3,224,224", "1,3,224,224",
"input shapes another, separated by colon and comma"); "input shapes another, separated by colon and comma");
DEFINE_string(target, "arm", "main target for Predictor: arm, opencl");
DEFINE_bool(use_optimize_nb, DEFINE_bool(use_optimize_nb,
false, false,
"optimized & naive buffer model for mobile devices"); "optimized & naive buffer model for mobile devices");
...@@ -51,9 +51,19 @@ void OutputOptModel(const std::string& load_model_dir, ...@@ -51,9 +51,19 @@ void OutputOptModel(const std::string& load_model_dir,
const std::vector<std::vector<int64_t>>& input_shapes) { const std::vector<std::vector<int64_t>>& input_shapes) {
lite_api::CxxConfig config; lite_api::CxxConfig config;
config.set_model_dir(load_model_dir); config.set_model_dir(load_model_dir);
if (FLAGS_target == "arm") {
config.set_valid_places({ config.set_valid_places({
Place{TARGET(kARM), PRECISION(kFloat)}, Place{TARGET(kARM), PRECISION(kFloat)},
}); });
} else if (FLAGS_target == "opencl") {
config.set_valid_places({
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
Place{TARGET(kARM)}, // enable kARM CPU kernel when no opencl kernel
});
}
auto predictor = lite_api::CreatePaddlePredictor(config); auto predictor = lite_api::CreatePaddlePredictor(config);
// delete old optimized model // delete old optimized model
...@@ -78,7 +88,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes, ...@@ -78,7 +88,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
int tid, int tid,
const int warmup_times = 5) { const int warmup_times = 5) {
lite_api::MobileConfig config; lite_api::MobileConfig config;
config.set_model_dir(model_dir); config.set_model_from_file(model_dir + ".nb");
config.set_power_mode(power_mode); config.set_power_mode(power_mode);
config.set_threads(thread_num); config.set_threads(thread_num);
...@@ -197,7 +207,7 @@ void RunTestType_10(const std::vector<std::vector<int64_t>>& input_shapes, ...@@ -197,7 +207,7 @@ void RunTestType_10(const std::vector<std::vector<int64_t>>& input_shapes,
const int repeat, const int repeat,
int warmup = 5) { int warmup = 5) {
lite_api::MobileConfig config; lite_api::MobileConfig config;
config.set_model_dir(model_dir); config.set_model_from_file(model_dir + ".nb");
config.set_power_mode(power_mode); config.set_power_mode(power_mode);
config.set_threads(thread_num); config.set_threads(thread_num);
...@@ -218,13 +228,13 @@ void RunTestType_11(const std::vector<std::vector<int64_t>>& input_shapes, ...@@ -218,13 +228,13 @@ void RunTestType_11(const std::vector<std::vector<int64_t>>& input_shapes,
const int repeat, const int repeat,
int warmup = 5) { int warmup = 5) {
lite_api::MobileConfig config; lite_api::MobileConfig config;
config.set_model_dir(model_dir); config.set_model_from_file(model_dir + ".nb");
config.set_power_mode(power_mode); config.set_power_mode(power_mode);
config.set_threads(thread_num); config.set_threads(thread_num);
auto predictor = lite_api::CreatePaddlePredictor(config); auto predictor = lite_api::CreatePaddlePredictor(config);
config.set_model_dir(model_dir_0); config.set_model_from_file(model_dir_0 + ".nb");
auto predictor_0 = lite_api::CreatePaddlePredictor(config); auto predictor_0 = lite_api::CreatePaddlePredictor(config);
for (int i = 0; i < 2 * repeat; i += 2) { for (int i = 0; i < 2 * repeat; i += 2) {
...@@ -246,7 +256,8 @@ int main(int argc, char** argv) { ...@@ -246,7 +256,8 @@ int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true); gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir == "") { if (FLAGS_model_dir == "") {
LOG(INFO) << "usage: " LOG(INFO) << "usage: "
<< "--model_dir /path/to/your/model"; << "--model_dir /path/to/your/model --model_dir_0 "
"/path/to/your/model0 --target `arm` or `opencl`";
exit(0); exit(0);
} }
std::string save_optimized_model_dir = ""; std::string save_optimized_model_dir = "";
......
...@@ -55,7 +55,7 @@ DEFINE_string(model_file, "", "model file path of the combined-param model"); ...@@ -55,7 +55,7 @@ DEFINE_string(model_file, "", "model file path of the combined-param model");
DEFINE_string(param_file, "", "param file path of the combined-param model"); DEFINE_string(param_file, "", "param file path of the combined-param model");
DEFINE_string( DEFINE_string(
optimize_out_type, optimize_out_type,
"protobuf", "naive_buffer",
"store type of the output optimized model. protobuf/naive_buffer"); "store type of the output optimized model. protobuf/naive_buffer");
DEFINE_bool(display_kernels, false, "Display kernel information"); DEFINE_bool(display_kernels, false, "Display kernel information");
DEFINE_bool(record_tailoring_info, DEFINE_bool(record_tailoring_info,
...@@ -207,7 +207,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) { ...@@ -207,7 +207,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
} }
std::cout << std::setiosflags(std::ios::internal); std::cout << std::setiosflags(std::ios::internal);
std::cout << std::setw(maximum_optype_length) << "OP_name"; std::cout << std::setw(maximum_optype_length) << "OP_name";
for (int i = 0; i < targets.size(); i++) { for (size_t i = 0; i < targets.size(); i++) {
std::cout << std::setw(10) << targets[i].substr(1); std::cout << std::setw(10) << targets[i].substr(1);
} }
std::cout << std::endl; std::cout << std::endl;
...@@ -215,7 +215,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) { ...@@ -215,7 +215,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) { for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) {
std::cout << std::setw(maximum_optype_length) << it->first; std::cout << std::setw(maximum_optype_length) << it->first;
auto ops_valid_places = it->second; auto ops_valid_places = it->second;
for (int i = 0; i < targets.size(); i++) { for (size_t i = 0; i < targets.size(); i++) {
if (std::find(ops_valid_places.begin(), if (std::find(ops_valid_places.begin(),
ops_valid_places.end(), ops_valid_places.end(),
targets[i]) != ops_valid_places.end()) { targets[i]) != ops_valid_places.end()) {
...@@ -235,7 +235,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) { ...@@ -235,7 +235,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
} }
// Print OP info. // Print OP info.
auto ops_valid_places = supported_ops.at(*op); auto ops_valid_places = supported_ops.at(*op);
for (int i = 0; i < targets.size(); i++) { for (size_t i = 0; i < targets.size(); i++) {
if (std::find(ops_valid_places.begin(), if (std::find(ops_valid_places.begin(),
ops_valid_places.end(), ops_valid_places.end(),
targets[i]) != ops_valid_places.end()) { targets[i]) != ops_valid_places.end()) {
...@@ -288,11 +288,11 @@ void ParseInputCommand() { ...@@ -288,11 +288,11 @@ void ParseInputCommand() {
auto valid_places = paddle::lite_api::ParserValidPlaces(); auto valid_places = paddle::lite_api::ParserValidPlaces();
// get valid_targets string // get valid_targets string
std::vector<TargetType> target_types = {}; std::vector<TargetType> target_types = {};
for (int i = 0; i < valid_places.size(); i++) { for (size_t i = 0; i < valid_places.size(); i++) {
target_types.push_back(valid_places[i].target); target_types.push_back(valid_places[i].target);
} }
std::string targets_str = TargetToStr(target_types[0]); std::string targets_str = TargetToStr(target_types[0]);
for (int i = 1; i < target_types.size(); i++) { for (size_t i = 1; i < target_types.size(); i++) {
targets_str = targets_str + TargetToStr(target_types[i]); targets_str = targets_str + TargetToStr(target_types[i]);
} }
...@@ -301,7 +301,7 @@ void ParseInputCommand() { ...@@ -301,7 +301,7 @@ void ParseInputCommand() {
target_types.push_back(TARGET(kUnk)); target_types.push_back(TARGET(kUnk));
std::set<std::string> valid_ops; std::set<std::string> valid_ops;
for (int i = 0; i < target_types.size(); i++) { for (size_t i = 0; i < target_types.size(); i++) {
auto ops = supported_ops_target[static_cast<int>(target_types[i])]; auto ops = supported_ops_target[static_cast<int>(target_types[i])];
valid_ops.insert(ops.begin(), ops.end()); valid_ops.insert(ops.begin(), ops.end());
} }
...@@ -318,7 +318,7 @@ void CheckIfModelSupported() { ...@@ -318,7 +318,7 @@ void CheckIfModelSupported() {
auto valid_unktype_ops = supported_ops_target[static_cast<int>(TARGET(kUnk))]; auto valid_unktype_ops = supported_ops_target[static_cast<int>(TARGET(kUnk))];
valid_ops.insert( valid_ops.insert(
valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end()); valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end());
for (int i = 0; i < valid_places.size(); i++) { for (size_t i = 0; i < valid_places.size(); i++) {
auto target = valid_places[i].target; auto target = valid_places[i].target;
auto ops = supported_ops_target[static_cast<int>(target)]; auto ops = supported_ops_target[static_cast<int>(target)];
valid_ops.insert(valid_ops.end(), ops.begin(), ops.end()); valid_ops.insert(valid_ops.end(), ops.begin(), ops.end());
...@@ -340,7 +340,7 @@ void CheckIfModelSupported() { ...@@ -340,7 +340,7 @@ void CheckIfModelSupported() {
std::set<std::string> unsupported_ops; std::set<std::string> unsupported_ops;
std::set<std::string> input_model_ops; std::set<std::string> input_model_ops;
for (int index = 0; index < cpp_prog.BlocksSize(); index++) { for (size_t index = 0; index < cpp_prog.BlocksSize(); index++) {
auto current_block = cpp_prog.GetBlock<lite::cpp::BlockDesc>(index); auto current_block = cpp_prog.GetBlock<lite::cpp::BlockDesc>(index);
for (size_t i = 0; i < current_block->OpsSize(); ++i) { for (size_t i = 0; i < current_block->OpsSize(); ++i) {
auto& op_desc = *current_block->GetOp<lite::cpp::OpDesc>(i); auto& op_desc = *current_block->GetOp<lite::cpp::OpDesc>(i);
...@@ -364,13 +364,13 @@ void CheckIfModelSupported() { ...@@ -364,13 +364,13 @@ void CheckIfModelSupported() {
unsupported_ops_str = unsupported_ops_str + ", " + *op_str; unsupported_ops_str = unsupported_ops_str + ", " + *op_str;
} }
std::vector<TargetType> targets = {}; std::vector<TargetType> targets = {};
for (int i = 0; i < valid_places.size(); i++) { for (size_t i = 0; i < valid_places.size(); i++) {
targets.push_back(valid_places[i].target); targets.push_back(valid_places[i].target);
} }
std::sort(targets.begin(), targets.end()); std::sort(targets.begin(), targets.end());
targets.erase(unique(targets.begin(), targets.end()), targets.end()); targets.erase(unique(targets.begin(), targets.end()), targets.end());
std::string targets_str = TargetToStr(targets[0]); std::string targets_str = TargetToStr(targets[0]);
for (int i = 1; i < targets.size(); i++) { for (size_t i = 1; i < targets.size(); i++) {
targets_str = targets_str + "," + TargetToStr(targets[i]); targets_str = targets_str + "," + TargetToStr(targets[i]);
} }
......
...@@ -82,27 +82,56 @@ void OptBase::SetValidPlaces(const std::string& valid_places) { ...@@ -82,27 +82,56 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
"command argument 'valid_targets'"; "command argument 'valid_targets'";
} }
void OptBase::SetOptimizeOut(const std::string& optimized_out_path) { void OptBase::SetLiteOut(const std::string& lite_out_name) {
optimize_out_path_ = optimized_out_path; lite_out_name_ = lite_out_name;
} }
void OptBase::RunOptimize(bool record_strip_info) { void OptBase::RecordModelInfo(bool record_strip_info) {
record_strip_info_ = record_strip_info;
}
void OptBase::Run() {
CheckIfModelSupported(false); CheckIfModelSupported(false);
OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
opt_config_.set_valid_places(valid_places_); opt_config_.set_valid_places(valid_places_);
if (model_set_dir_ != "") { if (model_set_dir_ != "") {
RunOptimizeFromModelSet(record_strip_info); RunOptimizeFromModelSet(record_strip_info_);
} else { } else {
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel( opt_predictor->SaveOptimizedModel(
optimize_out_path_, model_type_, record_strip_info); lite_out_name_, model_type_, record_strip_info_);
auto resulted_model_name = auto resulted_model_name =
record_strip_info ? "information of striped model" : "optimized model"; record_strip_info_ ? "information of striped model" : "optimized model";
std::cout << "Save the " << resulted_model_name std::cout << "Save the " << resulted_model_name
<< " into :" << optimize_out_path_ << "successfully"; << " into :" << lite_out_name_ << "successfully";
} }
} }
// One-shot convenience overload: configures every input (model dir,
// combined model/param files, target places, output path) from a single
// call, then performs the model transformation and saves the optimized
// model. Empty-string arguments leave the corresponding setting at its
// prior/default value via the individual setters.
// NOTE(review): the body after SetLiteOut() duplicates Run() verbatim —
// consider delegating to Run() to keep the two paths from drifting.
void OptBase::RunOptimize(const std::string& model_dir_path,
const std::string& model_path,
const std::string& param_path,
const std::string& valid_places,
const std::string& optimized_out_path) {
// Stage all user-provided settings through the standard setters so that
// validation/parsing (e.g. of valid_places) happens in one place.
SetModelDir(model_dir_path);
SetModelFile(model_path);
SetParamFile(param_path);
SetValidPlaces(valid_places);
SetLiteOut(optimized_out_path);
CheckIfModelSupported(false);
OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
opt_config_.set_valid_places(valid_places_);
// A non-empty model-set dir means "optimize a directory of models";
// otherwise optimize the single configured model.
if (model_set_dir_ != "") {
RunOptimizeFromModelSet(record_strip_info_);
} else {
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel(
lite_out_name_, model_type_, record_strip_info_);
auto resulted_model_name =
record_strip_info_ ? "information of striped model" : "optimized model";
std::cout << "Save the " << resulted_model_name
<< " into :" << lite_out_name_ << "successfully";
}
}
// collect ops info of modelset // collect ops info of modelset
void CollectModelMetaInfo(const std::string& output_dir, void CollectModelMetaInfo(const std::string& output_dir,
const std::vector<std::string>& models, const std::vector<std::string>& models,
...@@ -125,7 +154,7 @@ void OptBase::SetModelSetDir(const std::string& model_set_path) { ...@@ -125,7 +154,7 @@ void OptBase::SetModelSetDir(const std::string& model_set_path) {
} }
void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
// 1. mkdir of outputed optimized model set. // 1. mkdir of outputed optimized model set.
lite::MkDirRecur(optimize_out_path_); lite::MkDirRecur(lite_out_name_);
auto model_dirs = lite::ListDir(model_set_dir_, true); auto model_dirs = lite::ListDir(model_set_dir_, true);
if (model_dirs.size() == 0) { if (model_dirs.size() == 0) {
LOG(FATAL) << "[" << model_set_dir_ << "] does not contain any model"; LOG(FATAL) << "[" << model_set_dir_ << "] does not contain any model";
...@@ -138,7 +167,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { ...@@ -138,7 +167,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
std::string input_model_dir = std::string input_model_dir =
lite::Join<std::string>({model_set_dir_, name}, "/"); lite::Join<std::string>({model_set_dir_, name}, "/");
std::string output_model_dir = std::string output_model_dir =
lite::Join<std::string>({optimize_out_path_, name}, "/"); lite::Join<std::string>({lite_out_name_, name}, "/");
if (opt_config_.model_file() != "" && opt_config_.param_file() != "") { if (opt_config_.model_file() != "" && opt_config_.param_file() != "") {
auto model_file_path = auto model_file_path =
...@@ -155,7 +184,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { ...@@ -155,7 +184,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel( opt_predictor->SaveOptimizedModel(
optimize_out_path_, model_type_, record_strip_info); lite_out_name_, model_type_, record_strip_info);
std::cout << "Optimize done. "; std::cout << "Optimize done. ";
} }
...@@ -164,46 +193,60 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { ...@@ -164,46 +193,60 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
if (record_strip_info) { if (record_strip_info) {
// Collect all models information // Collect all models information
CollectModelMetaInfo( CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME); lite_out_name_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(
lite_out_name_, model_dirs, lite::TAILORD_OPS_LIST_NAME);
CollectModelMetaInfo( CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_OPS_LIST_NAME); lite_out_name_, model_dirs, lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(optimize_out_path_,
model_dirs,
lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo( CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME); lite_out_name_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME);
std::cout << "Record the information of stripped models into :" std::cout << "Record the information of stripped models into :"
<< optimize_out_path_ << "successfully"; << lite_out_name_ << "successfully";
} }
} }
void OptBase::PrintHelpInfo() { void OptBase::PrintHelpInfo() {
const std::string opt_version = lite::version(); const std::string opt_version = lite::version();
const char help_info[] = const char help_info[] =
"At least one argument should be inputed. Valid arguments are listed " "------------------------------------------------------------------------"
"below:\n" "-----------------------------------------------------------\n"
" Valid arguments of Paddle-Lite opt are listed below:\n"
"------------------------------------------------------------------------"
"-----------------------------------------------------------\n"
" Arguments of help information:\n" " Arguments of help information:\n"
" `help()` Print help infomation\n" " `help()` Print help infomation\n"
" Arguments of model optimization:\n" "\n"
" Arguments of model transformation:\n"
" `set_model_dir(model_dir)`\n" " `set_model_dir(model_dir)`\n"
" `set_model_file(model_file_path)`\n" " `set_model_file(model_file_path)`\n"
" `set_param_file(param_file_path)`\n" " `set_param_file(param_file_path)`\n"
" `set_model_type(protobuf|naive_buffer)`\n" " `set_model_type(protobuf|naive_buffer)`: naive_buffer by "
" `set_optimize_out(output_optimize_model_dir)`\n" "default\n"
" `set_lite_out(output_optimize_model_dir)`\n"
" `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n" " `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" `run_optimize(false|true)`\n" " `record_model_info(false|true)`: refer to whether to record ops "
" ` ----fasle&true refer to whether to record ops info for " "info for striping lib, false by default`\n"
"tailoring lib, false by default`\n" " `run() : start model transformation`\n"
" Arguments of model checking and ops information:\n" " eg. `opt.set_model_dir(\"./mobilenetv1\"); "
"opt.set_lite_out(\"mobilenetv1_opt\"); opt.set_valid_places(\"arm\"); "
"opt.run();`\n"
"\n"
" You can also transform model through a single input argument:\n"
" `run_optimize(model_dir, model_file_path, param_file_path, "
"model_type, valid_places, lite_out_name) `\n"
" eg. `opt.run_optimize(\"./mobilenetv1\", \"\", \"\", "
"\"naive_buffer\", \"arm\", \"mobilenetv1_opt\");`"
"\n"
" Arguments of checking model and printing ops information:\n"
" `print_all_ops()` Display all the valid operators of " " `print_all_ops()` Display all the valid operators of "
"Paddle-Lite\n" "Paddle-Lite\n"
" `print_supported_ops` Display supported operators of valid " " `print_supported_ops` Display supported operators of valid "
"places\n" "places\n"
" `check_if_model_supported()` Check if the input model is " " `check_if_model_supported()` Check if the input model is "
"supported\n"; "supported\n"
"------------------------------------------------------------------------"
std::cout << "opt version:" << opt_version << std::endl "-----------------------------------------------------------\n";
<< help_info << std::endl; std::cout << "opt version:" << opt_version << std::endl << help_info;
} }
// 2. Print supported info of inputed ops // 2. Print supported info of inputed ops
void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) { void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
......
...@@ -44,16 +44,21 @@ class LITE_API OptBase { ...@@ -44,16 +44,21 @@ class LITE_API OptBase {
public: public:
OptBase() = default; OptBase() = default;
void SetModelSetDir(const std::string &model_set_path); void SetModelSetDir(const std::string &model_set_path);
void SetModelDir(const std::string &model_path); void SetModelDir(const std::string &model_dir_path);
void SetModelFile(const std::string &model_path); void SetModelFile(const std::string &model_path);
void SetParamFile(const std::string &param_path); void SetParamFile(const std::string &param_path);
void SetValidPlaces(const std::string &valid_places); void SetValidPlaces(const std::string &valid_places);
void SetOptimizeOut(const std::string &optimized_out_path); void SetLiteOut(const std::string &lite_out_name);
void RecordModelInfo(bool record_strip_info = true);
// set optimized_model type // set optimized_model type
void SetModelType(std::string model_type); void SetModelType(std::string model_type);
// transform and save the optimized model // transform and save the optimized model
void RunOptimize(bool record_strip_info = false); void Run();
void RunOptimize(const std::string &model_dir_path = "",
const std::string &model_path = "",
const std::string &param_path = "",
const std::string &valid_places = "",
const std::string &optimized_out_path = "");
// fuctions of printing info // fuctions of printing info
// 1. help info // 1. help info
void PrintHelpInfo(); void PrintHelpInfo();
...@@ -71,12 +76,12 @@ class LITE_API OptBase { ...@@ -71,12 +76,12 @@ class LITE_API OptBase {
// valid places for the optimized_model // valid places for the optimized_model
std::vector<Place> valid_places_; std::vector<Place> valid_places_;
// filename of the optimized_model // filename of the optimized_model
std::string optimize_out_path_; std::string lite_out_name_;
// type of the optimized_model, kNaiveBuffer default. // type of the optimized_model, kNaiveBuffer default.
LiteModelType model_type_{LiteModelType::kNaiveBuffer}; LiteModelType model_type_{LiteModelType::kNaiveBuffer};
// Dir path of a set of models, this should be combined with model // Dir path of a set of models, this should be combined with model
std::string model_set_dir_; std::string model_set_dir_;
bool record_strip_info_{false};
void RunOptimizeFromModelSet(bool record_strip_info = false); void RunOptimizeFromModelSet(bool record_strip_info = false);
}; };
......
...@@ -167,6 +167,20 @@ lod_t Tensor::lod() const { return ctensor(raw_tensor_)->lod(); } ...@@ -167,6 +167,20 @@ lod_t Tensor::lod() const { return ctensor(raw_tensor_)->lod(); }
void Tensor::SetLoD(const lod_t &lod) { tensor(raw_tensor_)->set_lod(lod); } void Tensor::SetLoD(const lod_t &lod) { tensor(raw_tensor_)->set_lod(lod); }
std::unique_ptr<Tensor> PaddlePredictor::GetMutableTensor(
const std::string &name) {
LOG(FATAL)
<< "The GetMutableTensor API is only supported by CxxConfig predictor.";
return nullptr;
}
std::vector<std::string> PaddlePredictor::GetParamNames() {
std::vector<std::string> null_result = {};
LOG(FATAL)
<< "The GetParamNames API is only supported by CxxConfig predictor.";
return null_result;
}
void PaddlePredictor::SaveOptimizedModel(const std::string &model_dir, void PaddlePredictor::SaveOptimizedModel(const std::string &model_dir,
LiteModelType model_type, LiteModelType model_type,
bool record_info) { bool record_info) {
......
...@@ -86,6 +86,8 @@ class LITE_API PaddlePredictor { ...@@ -86,6 +86,8 @@ class LITE_API PaddlePredictor {
virtual std::vector<std::string> GetInputNames() = 0; virtual std::vector<std::string> GetInputNames() = 0;
// Get output names // Get output names
virtual std::vector<std::string> GetOutputNames() = 0; virtual std::vector<std::string> GetOutputNames() = 0;
// Get output names
virtual std::vector<std::string> GetParamNames();
// Get Input by name // Get Input by name
virtual std::unique_ptr<Tensor> GetInputByName(const std::string& name) = 0; virtual std::unique_ptr<Tensor> GetInputByName(const std::string& name) = 0;
...@@ -93,6 +95,9 @@ class LITE_API PaddlePredictor { ...@@ -93,6 +95,9 @@ class LITE_API PaddlePredictor {
/// Get a readonly tensor, return null if no one called `name` exists. /// Get a readonly tensor, return null if no one called `name` exists.
virtual std::unique_ptr<const Tensor> GetTensor( virtual std::unique_ptr<const Tensor> GetTensor(
const std::string& name) const = 0; const std::string& name) const = 0;
/// Get a mutable tensor, return null if on one called `name` exists
/// internal infereces API, not recommanded.
virtual std::unique_ptr<Tensor> GetMutableTensor(const std::string& name);
/// Persist the optimized model to disk. This API is only supported by /// Persist the optimized model to disk. This API is only supported by
/// CxxConfig, and the persisted model can be reused for MobileConfig. /// CxxConfig, and the persisted model can be reused for MobileConfig.
...@@ -176,7 +181,7 @@ class LITE_API CxxConfig : public ConfigBase { ...@@ -176,7 +181,7 @@ class LITE_API CxxConfig : public ConfigBase {
#endif #endif
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; } void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; }
int multi_stream() const { return multi_stream_; } bool multi_stream() const { return multi_stream_; }
#endif #endif
#ifdef LITE_WITH_MLU #ifdef LITE_WITH_MLU
...@@ -208,6 +213,8 @@ class LITE_API CxxConfig : public ConfigBase { ...@@ -208,6 +213,8 @@ class LITE_API CxxConfig : public ConfigBase {
// current thread. // current thread.
void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00); void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00);
// XPU only, specify the target device ID for the current thread. // XPU only, specify the target device ID for the current thread.
// **DEPRECATED**, use xpu_set_device() at the very beginning of each worker
// thread
void set_xpu_dev_per_thread(int dev_no = 0); void set_xpu_dev_per_thread(int dev_no = 0);
}; };
......
...@@ -19,7 +19,13 @@ ...@@ -19,7 +19,13 @@
#pragma once #pragma once
// some platform-independent defintion // some platform-independent defintion
#include "lite/utils/macros.h"
#if defined(_WIN32)
#define UNUSED
#define __builtin_expect(EXP, C) (EXP)
#else
#define UNUSED __attribute__((unused))
#endif
#define USE_LITE_OP(op_type__) \ #define USE_LITE_OP(op_type__) \
extern int touch_op_##op_type__(); \ extern int touch_op_##op_type__(); \
......
...@@ -33,6 +33,7 @@ USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass); ...@@ -33,6 +33,7 @@ USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass);
USE_MIR_PASS(lite_interpolate_fuse_pass); USE_MIR_PASS(lite_interpolate_fuse_pass);
USE_MIR_PASS(lite_sequence_pool_concat_fuse_pass); USE_MIR_PASS(lite_sequence_pool_concat_fuse_pass);
USE_MIR_PASS(identity_scale_eliminate_pass); USE_MIR_PASS(identity_scale_eliminate_pass);
USE_MIR_PASS(identity_dropout_eliminate_pass);
USE_MIR_PASS(lite_conv_elementwise_fuse_pass); USE_MIR_PASS(lite_conv_elementwise_fuse_pass);
USE_MIR_PASS(lite_conv_activation_fuse_pass); USE_MIR_PASS(lite_conv_activation_fuse_pass);
USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass); USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass);
...@@ -51,5 +52,8 @@ USE_MIR_PASS(mlu_postprocess_pass); ...@@ -51,5 +52,8 @@ USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass); USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(apu_subgraph_pass); USE_MIR_PASS(apu_subgraph_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(lite_scale_activation_fuse_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass); USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass);
USE_MIR_PASS(__xpu__fc_fuse_pass);
...@@ -62,8 +62,10 @@ void BindLiteOpt(py::module *m) { ...@@ -62,8 +62,10 @@ void BindLiteOpt(py::module *m) {
.def("set_model_file", &OptBase::SetModelFile) .def("set_model_file", &OptBase::SetModelFile)
.def("set_param_file", &OptBase::SetParamFile) .def("set_param_file", &OptBase::SetParamFile)
.def("set_valid_places", &OptBase::SetValidPlaces) .def("set_valid_places", &OptBase::SetValidPlaces)
.def("set_optimize_out", &OptBase::SetOptimizeOut) .def("set_lite_out", &OptBase::SetLiteOut)
.def("set_model_type", &OptBase::SetModelType) .def("set_model_type", &OptBase::SetModelType)
.def("record_model_info", &OptBase::RecordModelInfo)
.def("run", &OptBase::Run)
.def("run_optimize", &OptBase::RunOptimize) .def("run_optimize", &OptBase::RunOptimize)
.def("help", &OptBase::PrintHelpInfo) .def("help", &OptBase::PrintHelpInfo)
.def("print_supported_ops", &OptBase::PrintSupportedOps) .def("print_supported_ops", &OptBase::PrintSupportedOps)
......
...@@ -50,7 +50,7 @@ if '${WITH_MKL}' == 'ON': ...@@ -50,7 +50,7 @@ if '${WITH_MKL}' == 'ON':
# link lite.so to paddlelite.libs # link lite.so to paddlelite.libs
if os.name != 'nt': if os.name != 'nt':
COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\ COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\
/inference_lite_lib/python/install/lite/lite.so" /inference_lite_lib/python/install/lite/lite.so"
if os.system(COMMAND) != 0: if os.system(COMMAND) != 0:
raise Exception("patch third_party libs failed, command: %s" % COMMAND) raise Exception("patch third_party libs failed, command: %s" % COMMAND)
......
...@@ -80,8 +80,10 @@ void conv_compute_6x6_3x3(const float* input, ...@@ -80,8 +80,10 @@ void conv_compute_6x6_3x3(const float* input,
const operators::ConvParam& param, const operators::ConvParam& param,
ARMContext* ctx) { ARMContext* ctx) {
auto act_param = param.activation_param; auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0]; const int pad_h0 = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2]; const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space = float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float); ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
...@@ -96,8 +98,8 @@ void conv_compute_6x6_3x3(const float* input, ...@@ -96,8 +98,8 @@ void conv_compute_6x6_3x3(const float* input,
int tile_h = (hout + 5) / 6; int tile_h = (hout + 5) / 6;
int size_tile = tile_h * tile_w; int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2; int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h * 2; int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad; const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT float zero_ptr[zero_len]; // NOLINT
...@@ -127,10 +129,10 @@ void conv_compute_6x6_3x3(const float* input, ...@@ -127,10 +129,10 @@ void conv_compute_6x6_3x3(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride, prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride, input_c4 + i * new_c_stride,
i * 4, i * 4,
-pad_h, -pad_h0,
hin + pad_h, hin + pad_h1,
-pad_w, -pad_w0,
win + pad_w, win + pad_w1,
chin, chin,
win, win,
hin, hin,
...@@ -367,8 +369,10 @@ void conv_compute_2x2_3x3(const float* input, ...@@ -367,8 +369,10 @@ void conv_compute_2x2_3x3(const float* input,
const operators::ConvParam& param, const operators::ConvParam& param,
ARMContext* ctx) { ARMContext* ctx) {
auto act_param = param.activation_param; auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0]; const int pad_h0 = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2]; const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space = float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float); ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
...@@ -383,8 +387,8 @@ void conv_compute_2x2_3x3(const float* input, ...@@ -383,8 +387,8 @@ void conv_compute_2x2_3x3(const float* input,
int tile_h = (hout + 1) / 2; int tile_h = (hout + 1) / 2;
int size_tile = tile_h * tile_w; int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2; int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h * 2; int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad; const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT float zero_ptr[zero_len]; // NOLINT
...@@ -414,10 +418,10 @@ void conv_compute_2x2_3x3(const float* input, ...@@ -414,10 +418,10 @@ void conv_compute_2x2_3x3(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride, prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride, input_c4 + i * new_c_stride,
i * 4, i * 4,
-pad_h, -pad_h0,
hin + pad_h, hin + pad_h1,
-pad_w, -pad_w0,
win + pad_w, win + pad_w1,
chin, chin,
win, win,
hin, hin,
...@@ -628,8 +632,10 @@ void conv_compute_2x2_3x3_small(const float* input, ...@@ -628,8 +632,10 @@ void conv_compute_2x2_3x3_small(const float* input,
const operators::ConvParam& param, const operators::ConvParam& param,
ARMContext* ctx) { ARMContext* ctx) {
auto act_param = param.activation_param; auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0]; const int pad_h0 = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2]; const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space = float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float); ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
...@@ -644,8 +650,8 @@ void conv_compute_2x2_3x3_small(const float* input, ...@@ -644,8 +650,8 @@ void conv_compute_2x2_3x3_small(const float* input,
int tile_h = (hout + 1) / 2; int tile_h = (hout + 1) / 2;
int size_tile = tile_h * tile_w; int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2; int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h * 2; int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad; const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT float zero_ptr[zero_len]; // NOLINT
...@@ -676,10 +682,10 @@ void conv_compute_2x2_3x3_small(const float* input, ...@@ -676,10 +682,10 @@ void conv_compute_2x2_3x3_small(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride, prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride, input_c4 + i * new_c_stride,
i * 4, i * 4,
-pad_h, -pad_h0,
hin + pad_h, hin + pad_h1,
-pad_w, -pad_w0,
win + pad_w, win + pad_w1,
chin, chin,
win, win,
hin, hin,
......
...@@ -33,6 +33,7 @@ void add_bias_rowwise(Tensor* input, ...@@ -33,6 +33,7 @@ void add_bias_rowwise(Tensor* input,
for (int w = start_w; w < w_adds; ++w) { for (int w = start_w; w < w_adds; ++w) {
i_data[w] += b_data[w]; i_data[w] += b_data[w];
} }
i_data += width;
} }
} }
void vector_dot( void vector_dot(
...@@ -67,15 +68,8 @@ void vector_dot( ...@@ -67,15 +68,8 @@ void vector_dot(
for (int i = 0; i < remain; ++i) { for (int i = 0; i < remain; ++i) {
if (!v2) { if (!v2) {
out_ptr[i] = in_ptr[i] * v1_ptr[i]; out_ptr[i] = in_ptr[i] * v1_ptr[i];
++out_ptr;
++in_ptr;
++v1_ptr;
} else { } else {
out_ptr[i] = in_ptr[i] + v1_ptr[i] * v2_ptr[i]; out_ptr[i] = in_ptr[i] + v1_ptr[i] * v2_ptr[i];
++out_ptr;
++in_ptr;
++v1_ptr;
++v2_ptr;
} }
} }
} }
......
...@@ -21,6 +21,17 @@ namespace paddle { ...@@ -21,6 +21,17 @@ namespace paddle {
namespace lite { namespace lite {
namespace arm { namespace arm {
namespace math { namespace math {
int AdaptStartIndex(int ph, int input_size, int output_size) {
return static_cast<int>(
floor(static_cast<double>(ph * input_size) / output_size));
}
int AdaptEndIndex(int ph, int input_size, int output_size) {
return static_cast<int>(
ceil(static_cast<double>((ph + 1) * input_size) / output_size));
}
void pooling_basic(const float* din, void pooling_basic(const float* din,
float* dout, float* dout,
int num, int num,
...@@ -88,15 +99,27 @@ void pooling_basic(const float* din, ...@@ -88,15 +99,27 @@ void pooling_basic(const float* din,
#pragma omp parallel for #pragma omp parallel for
for (int ind_c = 0; ind_c < chin; ++ind_c) { for (int ind_c = 0; ind_c < chin; ++ind_c) {
for (int ind_h = 0; ind_h < hout; ++ind_h) { for (int ind_h = 0; ind_h < hout; ++ind_h) {
int sh = ind_h * stride_h; int sh, eh;
int eh = sh + kernel_h; if (adaptive) {
sh = AdaptStartIndex(ind_h, hin, hout);
eh = AdaptEndIndex(ind_h, hin, hout);
} else {
sh = ind_h * stride_h;
eh = sh + kernel_h;
sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; sh = (sh - pad_h) < 0 ? 0 : sh - pad_h;
eh = (eh - pad_h) > hin ? hin : eh - pad_h; eh = (eh - pad_h) > hin ? hin : eh - pad_h;
}
for (int ind_w = 0; ind_w < wout; ++ind_w) { for (int ind_w = 0; ind_w < wout; ++ind_w) {
int sw = ind_w * stride_w; int sw, ew;
int ew = sw + kernel_w; if (adaptive) {
sw = AdaptStartIndex(ind_w, win, wout);
ew = AdaptEndIndex(ind_w, win, wout);
} else {
sw = ind_w * stride_w;
ew = sw + kernel_w;
sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; sw = (sw - pad_w) < 0 ? 0 : sw - pad_w;
ew = (ew - pad_w) > win ? win : ew - pad_w; ew = (ew - pad_w) > win ? win : ew - pad_w;
}
float result = static_cast<float>(0); float result = static_cast<float>(0);
int dst_ind = (ind_n * chout + ind_c) * size_channel_out + int dst_ind = (ind_n * chout + ind_c) * size_channel_out +
ind_h * wout + ind_w; ind_h * wout + ind_w;
......
此差异已折叠。
...@@ -40,6 +40,15 @@ void scale_compute_basic(const operators::ScaleParam& param) { ...@@ -40,6 +40,15 @@ void scale_compute_basic(const operators::ScaleParam& param) {
template <typename T> template <typename T>
void scale(const T* din, T* dout, int num, T scale, T bias); void scale(const T* din, T* dout, int num, T scale, T bias);
template <typename T>
void scale_relu(const T* din, T* dout, int num, T scale, T bias);
template <typename T>
void scale_relu6(const T* din, T* dout, int num, T scale, T bias, T alpha);
template <typename T>
void scale_leaky_relu(const T* din, T* dout, int num, T scale, T bias, T alpha);
template <typename T> template <typename T>
void scale(const T* din, void scale(const T* din,
T* dout, T* dout,
......
...@@ -28,6 +28,7 @@ namespace lite { ...@@ -28,6 +28,7 @@ namespace lite {
class CLContext { class CLContext {
public: public:
~CLContext() { ~CLContext() {
GetCommandQueue().finish();
for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) { for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) {
// Note(ysh329): Don't need `clReleaseKernel` // Note(ysh329): Don't need `clReleaseKernel`
kernels_[kidx].reset(); kernels_[kidx].reset();
......
...@@ -100,16 +100,18 @@ TEST(cl_test, kernel_test) { ...@@ -100,16 +100,18 @@ TEST(cl_test, kernel_test) {
size_t width = in_image.ImageWidth(); size_t width = in_image.ImageWidth();
size_t height = in_image.ImageHeight(); size_t height = in_image.ImageHeight();
auto global_work_size = cl::NDRange{width, height}; auto global_work_size = cl::NDRange{width, height};
cl::Event event;
status = context->GetCommandQueue().enqueueNDRangeKernel( status = context->GetCommandQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event); kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = context->GetCommandQueue().finish(); status = context->GetCommandQueue().finish();
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
#if 0
double start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>(); double start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
double stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>(); double stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
double elapsed_micros = (stop_nanos - start_nanos) / 1000.0; double elapsed_micros = (stop_nanos - start_nanos) / 1000.0;
LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us."; LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us.";
#endif
LOG(INFO) << out_image; LOG(INFO) << out_image;
} }
......
...@@ -73,7 +73,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, ...@@ -73,7 +73,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw,
i2 += 4; i2 += 4;
p++; p++;
} else { } else {
image[i2] = 0.0; image[i2] = Float2Half(0.f);
i2 += 4; i2 += 4;
} }
} }
...@@ -261,7 +261,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, ...@@ -261,7 +261,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor,
image[index] = Float2Half(*p); image[index] = Float2Half(*p);
p++; p++;
} else { } else {
image[index] = 0.0; image[index] = Float2Half(0.f);
} }
if (index >= (width * height * 4)) { if (index >= (width * height * 4)) {
LOG(INFO) << " index out of range "; LOG(INFO) << " index out of range ";
......
...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
///////////////////////////////// /////////////////////////////////
...@@ -108,7 +107,8 @@ inline CL_DTYPE4 activation_type4(CL_DTYPE4 in ...@@ -108,7 +107,8 @@ inline CL_DTYPE4 activation_type4(CL_DTYPE4 in
#endif #endif
#ifdef RELU6 #ifdef RELU6
output = clamp(in, (CL_DTYPE4)0, (CL_DTYPE4)6); in = fmax((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in);
output = fmin((CL_DTYPE4)(6.0f, 6.0f, 6.0f, 6.0f), in);
#endif #endif
return output; return output;
} }
...@@ -14,36 +14,30 @@ limitations under the License. */ ...@@ -14,36 +14,30 @@ limitations under the License. */
#include <cl_common.h> #include <cl_common.h>
__kernel void relu(__read_only image2d_t input, __kernel void relu(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
in = max((CL_DTYPE4)(0.0f), in); in = max((CL_DTYPE4)(0.0f), in);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
} }
__kernel void relu6(__read_only image2d_t input, __kernel void relu6(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale){ __private const float scale) {
const int x = get_global_id(0); const int x = get_global_id(0);
const int y = get_global_id(1); const int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
in = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in); in = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in);
...@@ -51,7 +45,6 @@ __kernel void relu6(__read_only image2d_t input, ...@@ -51,7 +45,6 @@ __kernel void relu6(__read_only image2d_t input,
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
} }
__kernel void sigmoid(__read_only image2d_t input, __kernel void sigmoid(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
...@@ -64,10 +57,11 @@ __kernel void sigmoid(__read_only image2d_t input, ...@@ -64,10 +57,11 @@ __kernel void sigmoid(__read_only image2d_t input,
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out; CL_DTYPE4 out;
out.x = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.x)));
out.y = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.y))); out.x = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.x))));
out.z = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.z))); out.y = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.y))));
out.w = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.w))); out.z = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.z))));
out.w = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.w))));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
} }
...@@ -79,22 +73,21 @@ __kernel void leaky_relu(__read_only image2d_t input, ...@@ -79,22 +73,21 @@ __kernel void leaky_relu(__read_only image2d_t input,
const int x = get_global_id(0); const int x = get_global_id(0);
const int y = get_global_id(1); const int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 s_val = CONVERT_TYPE_TO(scale, CL_DTYPE) * in; CL_DTYPE4 s_val = CONVERT_TYPE_TO(scale, CL_DTYPE) * in;
if (in.x < 0.0f){ if (in.x < 0.0f) {
in.x = s_val.x; in.x = s_val.x;
} }
if (in.y < 0.0f){ if (in.y < 0.0f) {
in.y = s_val.y; in.y = s_val.y;
} }
if (in.z < 0.0f){ if (in.z < 0.0f) {
in.z = s_val.z; in.z = s_val.z;
} }
if (in.w < 0.0f){ if (in.w < 0.0f) {
in.w = s_val.w; in.w = s_val.w;
} }
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
...@@ -104,16 +97,14 @@ __kernel void tanh_act(__read_only image2d_t input, ...@@ -104,16 +97,14 @@ __kernel void tanh_act(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out= (exp(in) - exp(-in))/ (exp(in) + exp(-in)); CL_DTYPE4 out = (exp(in) - exp(-in)) / (exp(in) + exp(-in));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
} }
...@@ -121,13 +112,11 @@ __kernel void exp_act(__read_only image2d_t input, ...@@ -121,13 +112,11 @@ __kernel void exp_act(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out = exp(in); CL_DTYPE4 out = exp(in);
...@@ -138,16 +127,13 @@ __kernel void swish(__read_only image2d_t input, ...@@ -138,16 +127,13 @@ __kernel void swish(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out = in / (1 + exp(-(CL_DTYPE)scale * in)); CL_DTYPE4 out = in / (1 + exp(-(CL_DTYPE)scale * in));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
} }
#include <cl_common.h> #include <cl_common.h>
__kernel void conv2d_1x1_opt(__private const int global_size_dim0, __kernel void conv2d_1x1_opt(
__private const int global_size_dim0,
__private const int global_size_dim1, __private const int global_size_dim1,
__private const int global_size_dim2, __private const int global_size_dim2,
__read_only image2d_t input_image, __read_only image2d_t input_image,
...@@ -9,7 +10,7 @@ __kernel void conv2d_1x1_opt(__private const int global_size_dim0, ...@@ -9,7 +10,7 @@ __kernel void conv2d_1x1_opt(__private const int global_size_dim0,
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale,
__read_only image2d_t new_biase, __read_only image2d_t new_biase,
#endif #endif
__write_only image2d_t output_image, __write_only image2d_t output_image,
...@@ -287,7 +288,7 @@ __kernel void conv2d_1x1_simple( ...@@ -287,7 +288,7 @@ __kernel void conv2d_1x1_simple(
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale,
__read_only image2d_t new_biase, __read_only image2d_t new_biase,
#endif #endif
__write_only image2d_t output_image, __write_only image2d_t output_image,
......
...@@ -27,33 +27,33 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, ...@@ -27,33 +27,33 @@ __kernel void conv2d_3x3(__private const int global_size_dim0,
__private const int offset, __private const int offset,
__private const int input_c, __private const int input_c,
__private const int dilation, __private const int dilation,
__private const int input_width,/* of one block */ __private const int input_width, /* of one block */
__private const int input_height,/* of one block */ __private const int input_height, /* of one block */
__private const int output_width, __private const int output_width,
__private const int output_height, __private const int output_height,
__private const int output_c, __private const int output_c,
__private const int filter_channel, __private const int filter_channel,
__private const int filter_width, __private const int filter_width,
__private const int filter_height, __private const int filter_height,
__private const int group) { __private const int group,
__private const int input_tensor_c
) {
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 || if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) { out_nh >= global_size_dim2) {
return; return;
} }
int2 stride_xy; int2 stride_xy;
stride_xy.x = stride; stride_xy.x = stride;
stride_xy.y = stride; stride_xy.y = stride;
...@@ -67,80 +67,167 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, ...@@ -67,80 +67,167 @@ __kernel void conv2d_3x3(__private const int global_size_dim0,
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
#ifdef BIASE_CH #ifdef BIASE_CH
CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); CL_DTYPE4 output =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE) #elif defined(BIASE_ELE)
CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
#else #else
CL_DTYPE4 output = 0.0f; CL_DTYPE4 output = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
#endif #endif
CL_DTYPE4 input[9]; // 3x3 region of input CL_DTYPE4 input[9]; // 3x3 region of input
if (group == 1) { if (group == 1) {
for (int i = 0; i < input_c; ++i) { // each run for 3x3 for (int i = 0; i < input_c; ++i) { // each run for 3x3
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x,
in_pos_in_one_block.y);
input[0] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input[0] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)), (int2)(pos_in.x - dilation, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[1] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input[1] =
select(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x, pos_in.y - dilation)), (int2)(pos_in.x, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[2] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input[2] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)), (int2)(pos_in.x + dilation, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[3] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input[3] =
select(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x - dilation, pos_in.y)), (int2)(pos_in.x - dilation, pos_in.y)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[4] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input[4] = select(
(int2)(pos_in.x, pos_in.y)), READ_IMG_TYPE(
(CL_DTYPE4)(0.0f), CL_DTYPE_CHAR, input_image, sampler, (int2)(pos_in.x, pos_in.y)),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[5] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input[5] =
select(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x + dilation, pos_in.y)), (int2)(pos_in.x + dilation, pos_in.y)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[6] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input[6] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)), (int2)(pos_in.x - dilation, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[7] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input[7] =
select(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x, pos_in.y + dilation)), (int2)(pos_in.x, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[8] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input[8] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)), (int2)(pos_in.x + dilation, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
if (i == input_c - 1) {
int c_shr = input_tensor_c % 4;
if (c_shr == 1) {
for (int k = 0; k < 9; k++) {
input[k].y = (half)0.f;
input[k].z = (half)0.f;
input[k].w = (half)0.f;
}
} else if (c_shr == 2) {
for (int k = 0; k < 9; k++) {
input[k].z = (half)0.f;
input[k].w = (half)0.f;
}
} else if (c_shr == 3) {
for (int k = 0; k < 9; k++) {
input[k].w = (half)0.f;
}
} else if (c_shr == 0) {
}
}
int j = 0; int j = 0;
int2 pos_of_weight; int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3; pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
CL_DTYPE4 weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); CL_DTYPE4 weight_x =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x); output.x += dot(input[j], weight_x);
pos_of_weight.y += 3; pos_of_weight.y += 3;
CL_DTYPE4 weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); CL_DTYPE4 weight_y =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y); output.y += dot(input[j], weight_y);
pos_of_weight.y += 3; pos_of_weight.y += 3;
CL_DTYPE4 weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); CL_DTYPE4 weight_z =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z); output.z += dot(input[j], weight_z);
pos_of_weight.y += 3; pos_of_weight.y += 3;
CL_DTYPE4 weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); CL_DTYPE4 weight_w =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w); output.w += dot(input[j], weight_w);
j = 1; j = 1;
...@@ -297,79 +384,98 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, ...@@ -297,79 +384,98 @@ __kernel void conv2d_3x3(__private const int global_size_dim0,
int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x, int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x,
in_pos_in_one_block.y); in_pos_in_one_block.y);
input[0] = select( input[0] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)), (int2)(pos_in.x - dilation, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height) in_pos_in_one_block.y - dilation >= input_height)
<< 15)); << 15));
input[1] = input[1] =
select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, select(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x, pos_in.y - dilation)), (int2)(pos_in.x, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || (ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x >= input_width || in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y - dilation >= input_height) in_pos_in_one_block.y - dilation >= input_height)
<< 15)); << 15));
input[2] = select( input[2] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)), (int2)(pos_in.x + dilation, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height) in_pos_in_one_block.y - dilation >= input_height)
<< 15)); << 15));
input[3] = select( input[3] =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, select(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x - dilation, pos_in.y)), (int2)(pos_in.x - dilation, pos_in.y)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y >= input_height) in_pos_in_one_block.y >= input_height)
<< 15)); << 15));
input[4] = select( input[4] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(pos_in.x, pos_in.y)), READ_IMG_TYPE(CL_DTYPE_CHAR,
(CL_DTYPE4)(0.0f), input_image,
sampler,
(int2)(pos_in.x, pos_in.y)),
(CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x >= input_width || in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y >= input_height) in_pos_in_one_block.y >= input_height)
<< 15)); << 15));
input[5] = input[5] =
select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, select(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x + dilation, pos_in.y)), (int2)(pos_in.x + dilation, pos_in.y)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y >= input_height) in_pos_in_one_block.y >= input_height)
<< 15)); << 15));
input[6] = select( input[6] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)), (int2)(pos_in.x - dilation, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height) in_pos_in_one_block.y + dilation >= input_height)
<< 15)); << 15));
input[7] = input[7] =
select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, select(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x, pos_in.y + dilation)), (int2)(pos_in.x, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || (ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x >= input_width || in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y + dilation >= input_height) in_pos_in_one_block.y + dilation >= input_height)
<< 15)); << 15));
input[8] = select( input[8] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)), (int2)(pos_in.x + dilation, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.x + dilation >= input_width ||
...@@ -381,7 +487,8 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, ...@@ -381,7 +487,8 @@ __kernel void conv2d_3x3(__private const int global_size_dim0,
int2 pos_of_weight; int2 pos_of_weight;
pos_of_weight.x = (f_c / 4) * 3 + j % 3; pos_of_weight.x = (f_c / 4) * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3; pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3;
CL_DTYPE4 weight = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); CL_DTYPE4 weight =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
int f_c_offset = f_c % 4; int f_c_offset = f_c % 4;
CL_DTYPE f_value; CL_DTYPE f_value;
......
...@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <cl_common.h> #include <cl_common.h>
__kernel void depth_conv2d_3x3(__private const int global_size_dim0, __kernel void depth_conv2d_3x3(
__private const int global_size_dim0,
__private const int global_size_dim1, __private const int global_size_dim1,
__private const int global_size_dim2, __private const int global_size_dim2,
__read_only image2d_t input, __read_only image2d_t input,
...@@ -28,7 +28,7 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0, ...@@ -28,7 +28,7 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0,
__private const int offset, __private const int offset,
__private const int dilation, __private const int dilation,
__private const int input_c, __private const int input_c,
__private const int input_width,/* of one block */ __private const int input_width, /* of one block */
__private const int input_height, /* of one block */ __private const int input_height, /* of one block */
__private const int output_width, __private const int output_width,
__private const int output_height) { __private const int output_height) {
...@@ -39,23 +39,22 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0, ...@@ -39,23 +39,22 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0,
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
const sampler_t sampler =
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
const int batch_index = out_nh / output_height; const int batch_index = out_nh / output_height;
const int out_nh_in_one_batch = out_nh % output_height; const int out_nh_in_one_batch = out_nh % output_height;
int2 stride_xy = (int2)(stride, stride); int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); int2 in_pos_in_one_block =
ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
#ifdef BIASE_CH #ifdef BIASE_CH
CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); CL_DTYPE4 output =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE) #elif defined(BIASE_ELE)
CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
#else #else
...@@ -65,30 +64,66 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0, ...@@ -65,30 +64,66 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0,
const int filter_width = 3; const int filter_width = 3;
const int filter_height = 3; const int filter_height = 3;
int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height); int2 pos_in_input_block =
(int2)(out_c * input_width, batch_index * input_height);
int2 pos_in_filter_block = (int2)(out_c * filter_width, batch_index * filter_height); int2 pos_in_filter_block =
(int2)(out_c * filter_width, batch_index * filter_height);
int filter_x = pos_in_filter_block.x ; int filter_x = pos_in_filter_block.x;
int filter_y = pos_in_filter_block.y ; int filter_y = pos_in_filter_block.y;
CL_DTYPE4 inputs[9]; CL_DTYPE4 inputs[9];
inputs[0] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), inputs[0] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x - 1 < 0 ||
in_pos_in_one_block.y - 1 < 0 ||
inputs[1] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)), in_pos_in_one_block.x - 1 >= input_width ||
in_pos_in_one_block.y - 1 >= input_height)
<< 15));
inputs[1] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x,
pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 ||
in_pos_in_one_block.x >= input_width ||
inputs[2] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), in_pos_in_one_block.y - 1 >= input_height)
<< 15));
inputs[2] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x + 1 < 0 ||
in_pos_in_one_block.y - 1 < 0 ||
inputs[3] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)), in_pos_in_one_block.x + 1 >= input_width ||
in_pos_in_one_block.y - 1 >= input_height)
<< 15));
inputs[3] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
pos_in_input_block.y + in_pos_in_one_block.y)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - 1 >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
/* /*
if (output_pos.x == 112 && output_pos.y == 0) { if (output_pos.x == 112 && output_pos.y == 0) {
CL_DTYPE4 input1 = inputs[3]; CL_DTYPE4 input1 = inputs[3];
...@@ -98,45 +133,94 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0, ...@@ -98,45 +133,94 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0,
} }
*/ */
inputs[4] = select(
inputs[4] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)), READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x,
pos_in_input_block.y + in_pos_in_one_block.y)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x >= input_width ||
inputs[5] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)), in_pos_in_one_block.y >= input_height)
<< 15));
inputs[5] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
pos_in_input_block.y + in_pos_in_one_block.y)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + 1 >= input_width ||
inputs[6] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), in_pos_in_one_block.y >= input_height)
<< 15));
inputs[6] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x - 1 < 0 ||
in_pos_in_one_block.y + 1 < 0 ||
inputs[7] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)), in_pos_in_one_block.x - 1 >= input_width ||
in_pos_in_one_block.y + 1 >= input_height)
<< 15));
inputs[7] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 ||
in_pos_in_one_block.x >= input_width ||
inputs[8] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), in_pos_in_one_block.y + 1 >= input_height)
<< 15));
inputs[8] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x + 1 < 0 ||
in_pos_in_one_block.y + 1 < 0 ||
in_pos_in_one_block.x + 1 >= input_width ||
in_pos_in_one_block.y + 1 >= input_height)
<< 15));
CL_DTYPE4 filters[9]; CL_DTYPE4 filters[9];
filters[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y)); filters[0] =
filters[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y)); READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y));
filters[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y)); filters[1] = READ_IMG_TYPE(
filters[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 1)); CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y));
filters[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 1)); filters[2] = READ_IMG_TYPE(
filters[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 1)); CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y));
filters[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 2)); filters[3] = READ_IMG_TYPE(
filters[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 2)); CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 1));
filters[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 2)); filters[4] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 1));
for(int i = 0 ;i < 9 ; i++){ filters[5] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 1));
filters[6] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 2));
filters[7] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 2));
filters[8] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 2));
for (int i = 0; i < 9; i++) {
output += inputs[i] * filters[i]; output += inputs[i] * filters[i];
} }
output = activation_type4(output); output = activation_type4(output);
/* /*
if (output_pos.x == 112 && output_pos.y == 0) { if (output_pos.x == 112 && output_pos.y == 0) {
...@@ -158,11 +242,8 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0, ...@@ -158,11 +242,8 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0,
*/ */
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
} }
__kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
__private const int ou_w_blk, __private const int ou_w_blk,
__private const int ou_nh, __private const int ou_nh,
...@@ -176,7 +257,7 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, ...@@ -176,7 +257,7 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
__private const int pad, __private const int pad,
__private const int dilation, __private const int dilation,
__private const int in_ch, __private const int in_ch,
__private const int in_w,/* of one block */ __private const int in_w, /* of one block */
__private const int in_h, /* of one block */ __private const int in_h, /* of one block */
__private const int ou_w, __private const int ou_w,
__private const int ou_h) { __private const int ou_h) {
...@@ -195,19 +276,21 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, ...@@ -195,19 +276,21 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
int col_id = ou_col_id - pad; int col_id = ou_col_id - pad;
int row_id = ou_row_id - pad; int row_id = ou_row_id - pad;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
#ifdef BIASE_CH #ifdef BIASE_CH
CL_DTYPE4 output[2]; CL_DTYPE4 output[2];
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_ch_blk_id, 0)); output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_ch_blk_id, 0));
output[1] = output[0]; output[1] = output[0];
#elif defined(BIASE_ELE) #elif defined(BIASE_ELE)
CL_DTYPE4 output[2]; CL_DTYPE4 output[2];
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x, ou_nh_id)); output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x, ou_nh_id));
if (ou_col_id + 1 < ou_w) { if (ou_col_id + 1 < ou_w) {
output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x + 1, ou_nh_id)); output[1] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x + 1, ou_nh_id));
} }
#else #else
CL_DTYPE4 output[2] = {0.0f}; CL_DTYPE4 output[2] = {0.0f};
...@@ -218,9 +301,12 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, ...@@ -218,9 +301,12 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
int filter_x = ou_ch_blk_id * 3; int filter_x = ou_ch_blk_id * 3;
int filter_y = 0; int filter_y = 0;
CL_DTYPE4 filters[9]; CL_DTYPE4 filters[9];
filters[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y)); filters[0] =
filters[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y)); READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y));
filters[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y)); filters[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y));
filters[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y));
int in_x = mad24(ou_ch_blk_id, in_w, col_id); int in_x = mad24(ou_ch_blk_id, in_w, col_id);
int in_y = mad24(batch_id, in_h, row_id); int in_y = mad24(batch_id, in_h, row_id);
...@@ -244,11 +330,12 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, ...@@ -244,11 +330,12 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
output[0] = mad(inputs[2], filters[2], output[0]); output[0] = mad(inputs[2], filters[2], output[0]);
output[1] = mad(inputs[3], filters[2], output[1]); output[1] = mad(inputs[3], filters[2], output[1]);
filters[3] = READ_IMG_TYPE(
filters[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 1)); CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 1));
filters[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 1)); filters[4] = READ_IMG_TYPE(
filters[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 1)); CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 1));
filters[5] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 1));
int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h); int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h);
inputs[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y1)); inputs[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y1));
...@@ -256,7 +343,6 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, ...@@ -256,7 +343,6 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
inputs[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y1)); inputs[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y1));
inputs[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y1)); inputs[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y1));
output[0] = mad(inputs[4], filters[3], output[0]); output[0] = mad(inputs[4], filters[3], output[0]);
output[1] = mad(inputs[5], filters[3], output[1]); output[1] = mad(inputs[5], filters[3], output[1]);
...@@ -266,10 +352,12 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, ...@@ -266,10 +352,12 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
output[0] = mad(inputs[6], filters[5], output[0]); output[0] = mad(inputs[6], filters[5], output[0]);
output[1] = mad(inputs[7], filters[5], output[1]); output[1] = mad(inputs[7], filters[5], output[1]);
filters[6] = READ_IMG_TYPE(
filters[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 2)); CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 2));
filters[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 2)); filters[7] = READ_IMG_TYPE(
filters[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 2)); CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 2));
filters[8] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 2));
int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h); int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h);
inputs[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y2)); inputs[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y2));
...@@ -277,7 +365,6 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, ...@@ -277,7 +365,6 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
inputs[10] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y2)); inputs[10] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y2));
inputs[11] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y2)); inputs[11] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y2));
output[0] = mad(inputs[8], filters[6], output[0]); output[0] = mad(inputs[8], filters[6], output[0]);
output[1] = mad(inputs[9], filters[6], output[1]); output[1] = mad(inputs[9], filters[6], output[1]);
...@@ -290,10 +377,10 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, ...@@ -290,10 +377,10 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
output[0] = activation_type4(output[0]); output[0] = activation_type4(output[0]);
output[1] = activation_type4(output[1]); output[1] = activation_type4(output[1]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x, ou_nh_id), output[0]); WRITE_IMG_TYPE(
CL_DTYPE_CHAR, output_image, (int2)(ou_x, ou_nh_id), output[0]);
if (ou_col_id + 1 < ou_w) { if (ou_col_id + 1 < ou_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x + 1, ou_nh_id), output[1]); WRITE_IMG_TYPE(
CL_DTYPE_CHAR, output_image, (int2)(ou_x + 1, ou_nh_id), output[1]);
} }
} }
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// buffer -> image2d // buffer -> image2d
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
__kernel void buffer_to_image2d(__global CL_DTYPE *in, __kernel void buffer_to_image2d(__global CL_DTYPE* in,
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int out_H, __private const int out_H,
__private const int out_W, __private const int out_W,
...@@ -26,7 +26,6 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, ...@@ -26,7 +26,6 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
__private const int Stride0, __private const int Stride0,
__private const int Stride1, __private const int Stride1,
__private const int Stride2) { __private const int Stride2) {
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
...@@ -66,16 +65,25 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, ...@@ -66,16 +65,25 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
#ifdef DEBUG #ifdef DEBUG
if (out_w > 2045) { if (out_w > 2045) {
printf("out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f %.2f\n", printf(
"out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f "
"%.2f\n",
out_w, out_w,
out_C - 4 * out_c, out_C - 4 * out_c,
(float)(in[input_pos0]), (float)(in[input_pos0]),
(float)(in[input_pos1]), (float)(in[input_pos1]),
(float)(in[input_pos2]), (float)(in[input_pos2]),
(float)(in[input_pos3])); (float)(in[input_pos3]));
printf("buffer2image ===> %d,%d,%d, out(%d,%d): %.2f %.2f %.2f %.2f \n", out_c, out_w, out_nh, printf("buffer2image ===> %d,%d,%d, out(%d,%d): %.2f %.2f %.2f %.2f \n",
output_pos.x, output_pos.y, out_c,
(float)(output.x), (float)(output.y), (float)(output.z), (float)(output.w)); out_w,
out_nh,
output_pos.x,
output_pos.y,
(float)(output.x),
(float)(output.y),
(float)(output.z),
(float)(output.w));
} }
#endif #endif
...@@ -104,30 +112,38 @@ __kernel void image2d_to_buffer(__read_only image2d_t input, ...@@ -104,30 +112,38 @@ __kernel void image2d_to_buffer(__read_only image2d_t input,
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w); const int pos_x = mad24(in_c, in_width, in_w);
CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)); CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(
CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh));
#ifdef DEBUG #ifdef DEBUG
if (in_w > 2045) { if (in_w > 2045) {
printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n", in_c, in_w, in_nh, printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n",
pos_x, in_nh, in_c,
(float)(in.x), (float)(in.y), (float)(in.z), (float)(in.w)); in_w,
in_nh,
pos_x,
in_nh,
(float)(in.x),
(float)(in.y),
(float)(in.z),
(float)(in.w));
} }
#endif #endif
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; const int index =
in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE); out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE);
if (C - 4 * in_c >= 2) { if (C - 4 * in_c >= 2) {
out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE); out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE);
} }
if(C - 4 * in_c >= 3) { if (C - 4 * in_c >= 3) {
out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE); out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE);
} }
if(C - 4 * in_c >= 4) { if (C - 4 * in_c >= 4) {
out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE); out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE);
} }
} }
#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile #if 0 // NOTE(ysh329): keep, un-used from paddle-mobile
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// buffer -> image2d_nw // buffer -> image2d_nw
...@@ -182,7 +198,6 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in, ...@@ -182,7 +198,6 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in,
} }
#endif #endif
#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile #if 0 // NOTE(ysh329): keep, un-used from paddle-mobile
// image2d -> buffer // image2d -> buffer
__kernel void image2d_to_buffer_2d(__private const int in_height, __kernel void image2d_to_buffer_2d(__private const int in_height,
...@@ -208,15 +223,14 @@ __kernel void image2d_to_buffer_2d(__private const int in_height, ...@@ -208,15 +223,14 @@ __kernel void image2d_to_buffer_2d(__private const int in_height,
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// buffer -> image2d (divide by 255 to normalize) // buffer -> image2d (divide by 255 to normalize)
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
__kernel void buffer_to_image2d_with_pre255(__global uchar *in, __kernel void buffer_to_image2d_with_pre255(__global uchar* in,
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int out_H, __private const int out_H,
__private const int out_W, __private const int out_W,
__private const int out_C, __private const int out_C,
__private const int Stride0, __private const int Stride0,
__private const int Stride1, __private const int Stride1,
__private const int Stride2){ __private const int Stride2) {
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
...@@ -231,7 +245,6 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in, ...@@ -231,7 +245,6 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in,
const int in_h = out_h; const int in_h = out_h;
const int in_w = out_w; const int in_w = out_w;
int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w; int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w;
int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w; int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w;
int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w; int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w;
...@@ -243,19 +256,18 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in, ...@@ -243,19 +256,18 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in,
CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)0.0f; CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)0.0f;
output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE) / 255; output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE) / 255;
if(out_C - 4 * out_c>=2){ if (out_C - 4 * out_c >= 2) {
output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE) / 255; output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE) / 255;
} }
if(out_C - 4 * out_c>=3){ if (out_C - 4 * out_c >= 3) {
output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE) / 255; output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE) / 255;
} }
if(out_C - 4 * out_c>=4){ if (out_C - 4 * out_c >= 4) {
output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE) / 255; output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE) / 255;
} }
WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output); WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output);
} }
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// image2d -> buffer (multiply by 255 to de-normalize) // image2d -> buffer (multiply by 255 to de-normalize)
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
...@@ -277,22 +289,34 @@ __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input, ...@@ -277,22 +289,34 @@ __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input,
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w); const int pos_x = mad24(in_c, in_width, in_w);
CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)) * 255; CL_COMPUTE_DTYPE4 in =
READ_IMG_TYPE(
CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)) *
255;
#ifdef DEBUG #ifdef DEBUG
printf("in_c:%d, in_w:%d, in_nh:%d ===> in(%d,%d): %.2f %.2f %.2f %.2f\n", printf("in_c:%d, in_w:%d, in_nh:%d ===> in(%d,%d): %.2f %.2f %.2f %.2f\n",
in_c, in_w, in_nh, pos_x, in_nh, in.x, in.y, in.z, in.w); in_c,
in_w,
in_nh,
pos_x,
in_nh,
in.x,
in.y,
in.z,
in.w);
#endif #endif
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; const int index =
in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = convert_uchar_sat(in.x); out[index] = convert_uchar_sat(in.x);
if(C - 4 * in_c>=2){ if (C - 4 * in_c >= 2) {
out[index + size_ch] = convert_uchar_sat(in.y); out[index + size_ch] = convert_uchar_sat(in.y);
} }
if(C - 4 * in_c>=3){ if (C - 4 * in_c >= 3) {
out[index + size_ch * 2] = convert_uchar_sat(in.z); out[index + size_ch * 2] = convert_uchar_sat(in.z);
} }
if(C - 4 * in_c>=4){ if (C - 4 * in_c >= 4) {
out[index + size_ch * 3] = convert_uchar_sat(in.w); out[index + size_ch * 3] = convert_uchar_sat(in.w);
} }
} }
...@@ -45,6 +45,9 @@ bool CLRuntime::Init() { ...@@ -45,6 +45,9 @@ bool CLRuntime::Init() {
bool is_device_init = InitializeDevice(); bool is_device_init = InitializeDevice();
is_init_success_ = is_platform_init && is_device_init; is_init_success_ = is_platform_init && is_device_init;
initialized_ = true; initialized_ = true;
context_ = CreateContext();
command_queue_ = CreateCommandQueue(context());
return initialized_; return initialized_;
} }
...@@ -55,7 +58,7 @@ cl::Platform& CLRuntime::platform() { ...@@ -55,7 +58,7 @@ cl::Platform& CLRuntime::platform() {
cl::Context& CLRuntime::context() { cl::Context& CLRuntime::context() {
if (context_ == nullptr) { if (context_ == nullptr) {
context_ = CreateContext(); LOG(FATAL) << "context_ create failed. ";
} }
return *context_; return *context_;
} }
...@@ -67,7 +70,7 @@ cl::Device& CLRuntime::device() { ...@@ -67,7 +70,7 @@ cl::Device& CLRuntime::device() {
cl::CommandQueue& CLRuntime::command_queue() { cl::CommandQueue& CLRuntime::command_queue() {
if (command_queue_ == nullptr) { if (command_queue_ == nullptr) {
command_queue_ = CreateCommandQueue(context()); LOG(FATAL) << "command_queue_ create failed. ";
} }
return *command_queue_; return *command_queue_;
} }
...@@ -96,7 +99,7 @@ std::unique_ptr<cl::UserEvent> CLRuntime::CreateEvent( ...@@ -96,7 +99,7 @@ std::unique_ptr<cl::UserEvent> CLRuntime::CreateEvent(
bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) { bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) {
/* -I +CLRuntime::Global()->cl_path() + "/cl_kernel"*/ /* -I +CLRuntime::Global()->cl_path() + "/cl_kernel"*/
std::string build_option = options + " -cl-fast-relaxed-math "; std::string build_option = options + " -cl-fast-relaxed-math -cl-mad-enable";
VLOG(4) << "OpenCL build_option: " << build_option; VLOG(4) << "OpenCL build_option: " << build_option;
status_ = program->build({*device_}, build_option.c_str()); status_ = program->build({*device_}, build_option.c_str());
CL_CHECK_ERROR(status_); CL_CHECK_ERROR(status_);
......
...@@ -66,7 +66,8 @@ void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width, ...@@ -66,7 +66,8 @@ void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width,
cl_int status; cl_int status;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(), new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR
: CL_MEM_ALLOC_HOST_PTR),
img_format, img_format,
cl_image2d_width, cl_image2d_width,
cl_image2d_height, cl_image2d_height,
...@@ -89,7 +90,8 @@ void *TargetWrapperCL::MallocImage<uint16_t>(const size_t cl_image2d_width, ...@@ -89,7 +90,8 @@ void *TargetWrapperCL::MallocImage<uint16_t>(const size_t cl_image2d_width,
cl_int status; cl_int status;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(), new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR
: CL_MEM_ALLOC_HOST_PTR),
img_format, img_format,
cl_image2d_width, cl_image2d_width,
cl_image2d_height, cl_image2d_height,
...@@ -112,7 +114,8 @@ void *TargetWrapperCL::MallocImage<int32_t>(const size_t cl_image2d_width, ...@@ -112,7 +114,8 @@ void *TargetWrapperCL::MallocImage<int32_t>(const size_t cl_image2d_width,
cl_int status; cl_int status;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(), new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR
: CL_MEM_ALLOC_HOST_PTR),
img_format, img_format,
cl_image2d_width, cl_image2d_width,
cl_image2d_height, cl_image2d_height,
...@@ -192,7 +195,6 @@ void TargetWrapperCL::MemcpySync(void *dst, ...@@ -192,7 +195,6 @@ void TargetWrapperCL::MemcpySync(void *dst,
size_t size, size_t size,
IoDirection dir) { IoDirection dir) {
cl_int status; cl_int status;
cl::Event event;
auto stream = CLRuntime::Global()->command_queue(); auto stream = CLRuntime::Global()->command_queue();
switch (dir) { switch (dir) {
case IoDirection::DtoD: case IoDirection::DtoD:
...@@ -202,9 +204,9 @@ void TargetWrapperCL::MemcpySync(void *dst, ...@@ -202,9 +204,9 @@ void TargetWrapperCL::MemcpySync(void *dst,
0, 0,
size, size,
nullptr, nullptr,
&event); nullptr);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
event.wait(); CLRuntime::Global()->command_queue().finish();
break; break;
case IoDirection::HtoD: case IoDirection::HtoD:
status = stream.enqueueWriteBuffer(*static_cast<cl::Buffer *>(dst), status = stream.enqueueWriteBuffer(*static_cast<cl::Buffer *>(dst),
...@@ -283,7 +285,6 @@ void TargetWrapperCL::ImgcpySync(void *dst, ...@@ -283,7 +285,6 @@ void TargetWrapperCL::ImgcpySync(void *dst,
cl::array<size_t, 3> origin = {0, 0, 0}; cl::array<size_t, 3> origin = {0, 0, 0};
cl::array<size_t, 3> region = {cl_image2d_width, cl_image2d_height, 1}; cl::array<size_t, 3> region = {cl_image2d_width, cl_image2d_height, 1};
cl_int status; cl_int status;
cl::Event event;
auto stream = CLRuntime::Global()->command_queue(); auto stream = CLRuntime::Global()->command_queue();
switch (dir) { switch (dir) {
case IoDirection::DtoD: case IoDirection::DtoD:
...@@ -293,9 +294,9 @@ void TargetWrapperCL::ImgcpySync(void *dst, ...@@ -293,9 +294,9 @@ void TargetWrapperCL::ImgcpySync(void *dst,
origin, origin,
region, region,
nullptr, nullptr,
&event); nullptr);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
event.wait(); CLRuntime::Global()->command_queue().finish();
break; break;
case IoDirection::HtoD: case IoDirection::HtoD:
status = stream.enqueueWriteImage(*static_cast<cl::Image2D *>(dst), status = stream.enqueueWriteImage(*static_cast<cl::Image2D *>(dst),
......
...@@ -129,8 +129,7 @@ struct RowwiseAdd<lite::TargetType::kX86, T> { ...@@ -129,8 +129,7 @@ struct RowwiseAdd<lite::TargetType::kX86, T> {
T* output_data = output->template mutable_data<T>(); T* output_data = output->template mutable_data<T>();
for (int64_t i = 0; i < in_dims[0]; ++i) { for (int64_t i = 0; i < in_dims[0]; ++i) {
for (int64_t j = 0; j < size; ++j) { for (int64_t j = 0; j < size; ++j) {
output_data[i * in_dims[0] + j] = output_data[i * size + j] = input_data[i * size + j] + vector_data[j];
input_data[i * in_dims[0] + j] + vector_data[j];
} }
} }
} }
......
...@@ -279,7 +279,7 @@ struct MergeAdd<lite::TargetType::kX86, T> { ...@@ -279,7 +279,7 @@ struct MergeAdd<lite::TargetType::kX86, T> {
} }
} }
if (has_value_input == nullptr) { if (has_value_input == nullptr) {
VLOG(3) << "no input has value! just return" << std::endl; VLOG(3) << "no input has value! just return";
return; return;
} }
auto input_width = has_value_input->value().dims()[1]; auto input_width = has_value_input->value().dims()[1];
......
...@@ -19,6 +19,7 @@ namespace lite { ...@@ -19,6 +19,7 @@ namespace lite {
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XPU
thread_local xdnn::Context* Context<TargetType::kXPU>::_tls_raw_ctx{nullptr}; thread_local xdnn::Context* Context<TargetType::kXPU>::_tls_raw_ctx{nullptr};
int Context<TargetType::kXPU>::_workspace_l3_size_per_thread{0};
#endif #endif
} // namespace lite } // namespace lite
......
...@@ -151,14 +151,23 @@ class Context<TargetType::kXPU> { ...@@ -151,14 +151,23 @@ class Context<TargetType::kXPU> {
if (_tls_raw_ctx == nullptr) { if (_tls_raw_ctx == nullptr) {
_tls_raw_ctx = xdnn::create_context(); _tls_raw_ctx = xdnn::create_context();
CHECK(_tls_raw_ctx); CHECK(_tls_raw_ctx);
int r = xdnn::set_workspace_l3_size(_tls_raw_ctx,
_workspace_l3_size_per_thread);
if (r != 0) {
LOG(WARNING) << "xdnn::set_workspace_l3_size() failed, r = " << r
<< ", _workspace_l3_size_per_thread = "
<< _workspace_l3_size_per_thread;
}
} }
return _tls_raw_ctx; return _tls_raw_ctx;
} }
static void SetWorkspaceL3Size(int l3_size = 0xfffc00) { static void SetWorkspaceL3Size(int l3_size = 0xfffc00) {
xdnn::set_workspace_l3_size(GetRawContext(), l3_size); _workspace_l3_size_per_thread = l3_size;
} }
// **DEPRECATED**, use xpu_set_device() at the very beginning of each worker
// thread
static void SetDev(int dev_no = 0) { static void SetDev(int dev_no = 0) {
const char* dev_env = getenv("LITE_XPU_DEV"); const char* dev_env = getenv("LITE_XPU_DEV");
if (dev_env) { if (dev_env) {
...@@ -173,6 +182,7 @@ class Context<TargetType::kXPU> { ...@@ -173,6 +182,7 @@ class Context<TargetType::kXPU> {
private: private:
static thread_local xdnn::Context* _tls_raw_ctx; static thread_local xdnn::Context* _tls_raw_ctx;
static int _workspace_l3_size_per_thread;
}; };
#endif #endif
...@@ -340,27 +350,17 @@ class Context<TargetType::kX86> { ...@@ -340,27 +350,17 @@ class Context<TargetType::kX86> {
template <> template <>
class Context<TargetType::kOpenCL> { class Context<TargetType::kOpenCL> {
std::shared_ptr<CLContext> cl_context_; std::shared_ptr<CLContext> cl_context_;
using WaitListType =
std::unordered_map<decltype(static_cast<const void*>(nullptr)),
std::shared_ptr<cl::Event>>;
std::shared_ptr<WaitListType> cl_wait_list_;
public: public:
CLContext* cl_context() { return cl_context_.get(); } CLContext* cl_context() { return cl_context_.get(); }
WaitListType* cl_wait_list() { return cl_wait_list_.get(); }
void InitOnce() { void InitOnce() {
// Init cl runtime. // Init cl runtime.
CHECK(CLRuntime::Global()->IsInitSuccess()) << "OpenCL runtime init failed"; CHECK(CLRuntime::Global()->IsInitSuccess()) << "OpenCL runtime init failed";
cl_context_ = std::make_shared<CLContext>(); cl_context_ = std::make_shared<CLContext>();
cl_wait_list_ = std::make_shared<WaitListType>();
} }
void CopySharedTo(OpenCLContext* ctx) { void CopySharedTo(OpenCLContext* ctx) { ctx->cl_context_ = cl_context_; }
ctx->cl_context_ = cl_context_;
ctx->cl_wait_list_ = cl_wait_list_;
}
}; };
#endif #endif
......
...@@ -21,9 +21,13 @@ lite_cc_library(mir_passes ...@@ -21,9 +21,13 @@ lite_cc_library(mir_passes
fusion/elementwise_add_activation_fuse_pass.cc fusion/elementwise_add_activation_fuse_pass.cc
fusion/quant_dequant_fuse_pass.cc fusion/quant_dequant_fuse_pass.cc
fusion/sequence_pool_concat_fuse_pass.cc fusion/sequence_pool_concat_fuse_pass.cc
fusion/scale_activation_fuse_pass.cc
fusion/__xpu__resnet_fuse_pass.cc fusion/__xpu__resnet_fuse_pass.cc
fusion/__xpu__multi_encoder_fuse_pass.cc fusion/__xpu__multi_encoder_fuse_pass.cc
fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc
fusion/__xpu__fc_fuse_pass.cc
elimination/identity_scale_eliminate_pass.cc elimination/identity_scale_eliminate_pass.cc
elimination/identity_dropout_eliminate_pass.cc
elimination/elementwise_mul_constant_eliminate_pass.cc elimination/elementwise_mul_constant_eliminate_pass.cc
static_kernel_pick_pass.cc static_kernel_pick_pass.cc
variable_place_inference_pass.cc variable_place_inference_pass.cc
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/pass.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace {
// Matches an inference-time "identity" dropout and removes it from the graph.
// With attribute is_test == 1 and dropout_implementation ==
// "upscale_in_train", dropout's Out equals its X at inference time, so the op
// (and its Mask output) can be elided by rewiring the preceding op to write
// directly into dropout's Out variable.
class Eliminator : public FuseBase {
 public:
  void BuildPattern() override {
    // The previous op's output needs updating after the elimination.
    // conditional_block is excluded: its outputs cannot be renamed this way.
    auto* pre_op = OpNode("preop")->assert_is_not_op_type("conditional_block");
    // TODO(Superjomn) check has only one output
    auto* x = VarNode("x")->assert_is_op_input("dropout", "X");
    // Only dropouts that are provably no-ops at inference time are matched.
    auto* dropout_op = OpNode("dropout", "dropout")
                           ->assert_op_attr<int>("is_test", 1)
                           ->assert_op_attr<std::string>(
                               "dropout_implementation", "upscale_in_train");
    auto* out = VarNode("out")->assert_is_op_output("dropout", "Out");
    auto* mask = VarNode("mask")->assert_is_op_output("dropout", "Mask");

    // Topology: pre_op -> x -> dropout -> {out, mask}.
    *pre_op >> *x >> *dropout_op >> *out;
    *dropout_op >> *mask;

    // The pre_op will be eliminated, and a new output-updated op will insert.
    x->AsIntermediate();  // x is pre_op's output, need to update
    dropout_op->AsIntermediate();
    mask->AsIntermediate();
  }

 private:
  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
    auto& pre_op = matched.at("preop")->AsStmt();
    auto op_info = *pre_op.op_info();

    // Rename pre_op's output "x" to dropout's "out" so downstream consumers
    // of the dropout result now read pre_op's output directly.
    op_info.UpdateAllOutputs(matched.at("x")->AsArg().name,
                             matched.at("out")->AsArg().name);
    pre_op.ResetOp(op_info, graph->valid_places());

    // Re-link the graph: pre_op now produces "out".
    IR_NODE_LINK_TO(matched.at("preop"), matched.at("out"));
  }
};
} // namespace
// Program pass that runs the identity-dropout Eliminator once over the graph.
class IdentityDropoutEliminatePass : public ProgramPass {
 public:
  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
    // A temporary fuser is enough: all state lives in the matched pattern.
    Eliminator()(graph.get());
  }
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(identity_dropout_eliminate_pass,
paddle::lite::mir::IdentityDropoutEliminatePass)
.BindTargets({TARGET(kXPU)});
...@@ -31,6 +31,9 @@ lite_cc_library(fuse_interpolate ...@@ -31,6 +31,9 @@ lite_cc_library(fuse_interpolate
lite_cc_library(fuse_sequence_pool_concat lite_cc_library(fuse_sequence_pool_concat
SRCS sequence_pool_concat_fuser.cc SRCS sequence_pool_concat_fuser.cc
DEPS pattern_matcher_high_api) DEPS pattern_matcher_high_api)
lite_cc_library(fuse_scale_activation
SRCS scale_activation_fuser.cc
DEPS pattern_matcher_high_api)
set(mir_fusers set(mir_fusers
fuse_fc fuse_fc
...@@ -44,6 +47,7 @@ set(mir_fusers ...@@ -44,6 +47,7 @@ set(mir_fusers
fuse_transpose_softmax_transpose fuse_transpose_softmax_transpose
fuse_interpolate fuse_interpolate
fuse_sequence_pool_concat fuse_sequence_pool_concat
fuse_scale_activation
CACHE INTERNAL "fusers") CACHE INTERNAL "fusers")
if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <vector>
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/xpu_pattern_matcher_high_api.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
// Fuses a chain of n_embedding_ lookup_table ops whose outputs are summed by
// a cascade of elementwise_add ops into a single
// __xpu__embedding_with_eltwise_add op. The matched pattern is:
//   lookup_table(ids0, table0) + lookup_table(ids1, table1)  -> ewadd01
//   ewadd01 + lookup_table(ids2, table2)                     -> ewadd12
//   ... and so on, up to n_embedding_ tables.
class XPUEmbeddingWithEltwiseAddFuser : public FuseBase {
 public:
  // n_embedding: number of lookup_table ops in the chain to match.
  explicit XPUEmbeddingWithEltwiseAddFuser(int n_embedding)
      : n_embedding_(n_embedding) {}

  void BuildPattern() override {
    // First two embeddings plus the first elementwise_add are spelled out
    // explicitly; the remaining (i >= 2) are generated in the loop below.
    auto* ids0 =
        VarNode("ids0")->assert_is_op_input("lookup_table", "Ids")->AsInput();
    auto* table0 =
        VarNode("table0")->assert_is_op_input("lookup_table", "W")->AsInput();
    // NOTE: embedding0 is deliberately NOT marked intermediate — it is reused
    // as the host node of the fused op in InsertNewNode().
    auto* embedding0 = OpNode("embedding0", "lookup_table");
    auto* embedding_out0 = VarNode("embedding_out0")
                               ->assert_is_op_output("lookup_table", "Out")
                               ->assert_is_op_input("elementwise_add", "X")
                               ->AsIntermediate();

    auto* ids1 =
        VarNode("ids1")->assert_is_op_input("lookup_table", "Ids")->AsInput();
    auto* table1 =
        VarNode("table1")->assert_is_op_input("lookup_table", "W")->AsInput();
    auto* embedding1 = OpNode("embedding1", "lookup_table")->AsIntermediate();
    auto* embedding_out1 = VarNode("embedding_out1")
                               ->assert_is_op_output("lookup_table", "Out")
                               ->assert_is_op_input("elementwise_add", "Y")
                               ->AsIntermediate();

    auto* ewadd01 = OpNode("ewadd01", "elementwise_add")->AsIntermediate();
    auto* ewadd01_out = VarNode("ewadd01_out")
                            ->assert_is_op_output("elementwise_add", "Out")
                            ->AsIntermediate();

    embedding0->LinksFrom({ids0, table0});
    embedding0->LinksTo({embedding_out0});
    embedding1->LinksFrom({ids1, table1});
    embedding1->LinksTo({embedding_out1});
    ewadd01->LinksFrom({embedding_out0, embedding_out1});
    ewadd01->LinksTo({ewadd01_out});

    // Extend the chain: each further embedding output is added (as "Y") to
    // the running sum produced by the previous elementwise_add.
    auto* last_ewadd_out = ewadd01_out;
    for (int i = 2; i < n_embedding_; ++i) {
      auto ids_name = paddle::lite::string_format("ids%d", i);
      auto table_name = paddle::lite::string_format("table%d", i);
      auto embedding_name = paddle::lite::string_format("embedding%d", i);
      auto embedding_out_name =
          paddle::lite::string_format("embedding_out%d", i);

      auto* new_ids = VarNode(ids_name)
                          ->assert_is_op_input("lookup_table", "Ids")
                          ->AsInput();
      auto* new_table = VarNode(table_name)
                            ->assert_is_op_input("lookup_table", "W")
                            ->AsInput();
      auto* new_embedding =
          OpNode(embedding_name, "lookup_table")->AsIntermediate();
      auto* new_embedding_out = VarNode(embedding_out_name)
                                    ->assert_is_op_output("lookup_table", "Out")
                                    ->assert_is_op_input("elementwise_add", "Y")
                                    ->AsIntermediate();

      new_embedding->LinksFrom({new_ids, new_table});
      new_embedding->LinksTo({new_embedding_out});

      // Adder names follow "ewadd<i-1><i>", e.g. ewadd12, ewadd23, ...
      auto ewadd_name = paddle::lite::string_format("ewadd%d%d", i - 1, i);
      auto ewadd_out_name = ewadd_name + "_out";

      auto* new_ewadd = OpNode(ewadd_name, "elementwise_add")->AsIntermediate();
      auto* new_ewadd_out = VarNode(ewadd_out_name)
                                ->assert_is_op_output("elementwise_add", "Out")
                                ->AsIntermediate();

      new_ewadd->LinksFrom({last_ewadd_out, new_embedding_out});
      new_ewadd->LinksTo({new_ewadd_out});
      last_ewadd_out = new_ewadd_out;
    }
    // The final running sum is the pattern's (and the fused op's) output.
    last_ewadd_out->AsOutput();
  }

  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
    // Build the fused op's description: all Ids/Tables inputs in chain order,
    // single Output, and attributes carried over from the pattern.
    cpp::OpDesc op_desc;
    op_desc.SetType("__xpu__embedding_with_eltwise_add");
    std::vector<std::string> ids_names;
    std::vector<std::string> table_names;
    for (int i = 0; i < n_embedding_; ++i) {
      auto ids_name = paddle::lite::string_format("ids%d", i);
      ids_names.push_back(matched.at(ids_name)->arg()->name);
      auto table_name = paddle::lite::string_format("table%d", i);
      table_names.push_back(matched.at(table_name)->arg()->name);
    }
    op_desc.SetInput("Ids", ids_names);
    op_desc.SetInput("Tables", table_names);
    // The fused op writes to the last elementwise_add's output variable.
    auto output_name = paddle::lite::string_format(
        "ewadd%d%d_out", n_embedding_ - 2, n_embedding_ - 1);
    op_desc.SetOutput("Output", {matched.at(output_name)->arg()->name});
    op_desc.SetAttr<int>("n_embedding", n_embedding_);
    // padding_idx is taken from the first embedding; presumably all matched
    // lookup_tables share it — not asserted by the pattern (review note).
    auto* embedding0_op_info = matched.at("embedding0")->stmt()->op_info();
    op_desc.SetAttr<int64_t>(
        "padding_idx", embedding0_op_info->GetAttr<int64_t>("padding_idx"));

    // Repurpose the embedding0 statement node to host the fused op: create
    // the op, attach it to the original scope/places, and swap in kernels.
    auto* new_stmt = matched.at("embedding0")->stmt();
    auto new_op = LiteOpRegistry::Global().Create(op_desc.Type());
    new_op->Attach(op_desc, new_stmt->op()->scope());
    new_op->SetValidPlaces(new_stmt->op()->valid_places());
    auto kernels = new_op->CreateKernels(new_op->valid_places());
    new_stmt->SetOp(new_op);
    new_stmt->SetKernels(std::move(kernels));

    // Wire every Ids/Tables var into the fused node and link its output.
    for (int i = 0; i < n_embedding_; ++i) {
      auto ids_name = paddle::lite::string_format("ids%d", i);
      auto table_name = paddle::lite::string_format("table%d", i);
      DirectedLink(matched.at(ids_name), matched.at("embedding0"));
      DirectedLink(matched.at(table_name), matched.at("embedding0"));
    }
    IR_OP_VAR_LINK(matched.at("embedding0"), matched.at(output_name));
  }

 private:
  // Number of lookup_table ops in the matched chain.
  int n_embedding_;
};
} // namespace fusion
class XPUEmbeddingWithEltwiseAddFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
for (int n_embedding : {4, 3}) {
fusion::XPUEmbeddingWithEltwiseAddFuser fuser(n_embedding);
fuser(graph.get());
}
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass,
paddle::lite::mir::XPUEmbeddingWithEltwiseAddFusePass)
.BindTargets({TARGET(kXPU)})
.BindKernel("lookup_table");
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include "lite/backends/xpu/math.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
// Fuses mul + elementwise_add (+ optional trailing relu) into a single
// XPU __xpu__fc op. The weight W is quantized to int16 in place and
// transposed so the XPU kernel can consume it directly.
class XPUFcFuser : public FuseBase {
 public:
  // with_relu: when true, the pattern additionally requires and absorbs a
  // relu following the elementwise_add.
  explicit XPUFcFuser(bool with_relu) : with_relu_(with_relu) {}

  void BuildPattern() override {
    // create nodes.
    auto* x = VarNode("x")->assert_is_op_input("mul", "X");
    auto* W = VarNode("W")->assert_is_op_input("mul", "Y");
    auto* b = VarNode("b")->assert_is_persistable_var();
    auto* mul = OpNode("mul", "mul");
    auto* mul_out = VarNode("mul_out");
    auto* add = OpNode("add", "elementwise_add");
    auto* Out = VarNode("Out");

    // create topology: mul(x, W) -> mul_out; add(mul_out, b) -> ...
    std::vector<PMNode*> mul_inputs{W, x};
    std::vector<PMNode*> add_inputs{mul_out, b};
    mul_inputs >> *mul >> *mul_out;

    // Some op specialities: all matched internals are consumed by the fusion.
    mul_out->AsIntermediate();
    mul->AsIntermediate();
    add->AsIntermediate();

    if (with_relu_) {
      // add -> add_out -> relu -> Out; add_out and relu disappear too.
      auto* add_out = VarNode("add_out");
      auto* relu = OpNode("relu", "relu");
      std::vector<PMNode*> relu_inputs{add_out};
      add_inputs >> *add >> *add_out;
      relu_inputs >> *relu >> *Out;
      add_out->AsIntermediate();
      relu->AsIntermediate();
    } else {
      add_inputs >> *add >> *Out;
    }
  }

  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
    auto mul = matched.at("mul")->stmt()->op();
    auto* scope = mul->scope();

    // convert W from float to int16, and transpose W
    auto weight_name = matched.at("W")->arg()->name;
    auto* weight_t = scope->FindMutableTensor(weight_name);
    auto weight_dims = weight_t->dims();
    int weight_len = weight_t->numel();
    float* weight_on_host = weight_t->mutable_data<float>();
    // Per-tensor quantization scale = max(|W|).
    float max_f =
        paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len);
    std::unique_ptr<int16_t[]> weight_int16(new int16_t[weight_len]);
    std::unique_ptr<int16_t[]> weight_trans_int16(new int16_t[weight_len]);
    paddle::lite::xpu::math::ConvertFP32ToInt16(
        weight_on_host, weight_int16.get(), max_f, weight_len);
    paddle::lite::xpu::math::Transpose(weight_int16.get(),
                                       weight_trans_int16.get(),
                                       weight_dims[0],
                                       weight_dims[1]);
    // NOTE(review): the int16 payload is copied back into the tensor's
    // float storage; the tensor still claims float dtype but presumably the
    // XPU fc kernel reinterprets it as int16 (scale travels via "w_max",
    // layout via "transpose_w") -- confirm against the kernel.
    memcpy(
        weight_on_host, weight_trans_int16.get(), weight_len * sizeof(int16_t));

    // Create the fused op on a fresh graph node and wire it up.
    auto op_desc = GenOpDesc(matched, max_f, true);
    auto fc_op = LiteOpRegistry::Global().Create("__xpu__fc");
    auto& valid_places = mul->valid_places();
    fc_op->Attach(op_desc, scope);
    auto* new_op_node = graph->GraphCreateInstructNode(fc_op, valid_places);

    IR_NODE_LINK_TO(matched.at("W"), new_op_node);
    IR_NODE_LINK_TO(matched.at("x"), new_op_node);
    IR_NODE_LINK_TO(matched.at("b"), new_op_node);
    IR_NODE_LINK_TO(new_op_node, matched.at("Out"));
  }

 private:
  // Builds the __xpu__fc op desc, carrying over mul's attributes and adding
  // the quantization scale (w_max) and layout flag (transpose_w).
  cpp::OpDesc GenOpDesc(const key2nodes_t& matched,
                        float w_max,
                        bool transpose_w) {
    // Start from mul's op_info so untouched attributes are preserved.
    cpp::OpDesc op_desc = *matched.at("mul")->stmt()->op_info();
    op_desc.mutable_inputs()->clear();
    op_desc.mutable_outputs()->clear();
    op_desc.SetType("__xpu__fc");
    op_desc.SetInput("Input", {matched.at("x")->arg()->name});
    op_desc.SetInput("W", {matched.at("W")->arg()->name});
    op_desc.SetInput("Bias", {matched.at("b")->arg()->name});
    op_desc.SetOutput("Out", {matched.at("Out")->arg()->name});
    op_desc.SetAttr(
        "in_num_col_dims",
        matched.at("mul")->stmt()->op_info()->GetAttr<int>("x_num_col_dims"));
    op_desc.SetAttr("w_max", w_max);
    op_desc.SetAttr("transpose_w", transpose_w);
    if (with_relu_) {
      op_desc.SetAttr("activation_type", std::string{"relu"});
    }
    return op_desc;
  }

  // Whether the matched pattern includes (and the fused op performs) relu.
  bool with_relu_;
};
} // namespace fusion
class XPUFcFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
fusion::XPUFcFuser fuser(true /* with_relu */);
fuser(graph.get());
fusion::XPUFcFuser fuser2(false /* with_relu */);
fuser2(graph.get());
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(__xpu__fc_fuse_pass, paddle::lite::mir::XPUFcFusePass)
.BindTargets({TARGET(kXPU)})
.BindKernel("fc");
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <vector> #include <vector>
#include "lite/backends/xpu/math.h" #include "lite/backends/xpu/math.h"
#include "lite/core/mir/pass_registry.h" #include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/type_precision_cast_pass.h" // For UpdateInputs()
#include "lite/core/mir/xpu_pattern_matcher_high_api.h" #include "lite/core/mir/xpu_pattern_matcher_high_api.h"
#include "lite/operators/subgraph_op.h" #include "lite/operators/subgraph_op.h"
...@@ -588,8 +589,7 @@ class XPUMultiEncoderFuser { ...@@ -588,8 +589,7 @@ class XPUMultiEncoderFuser {
multi_encoder_stmt->SetOp(multi_encoder_op); multi_encoder_stmt->SetOp(multi_encoder_op);
multi_encoder_stmt->SetKernels(std::move(kernels)); multi_encoder_stmt->SetKernels(std::move(kernels));
// temp remove useless cast // remove dangling/useless cast
std::unordered_set<const Node*> to_remove2;
Node* stack = nullptr; Node* stack = nullptr;
for (auto* node : graph->StmtTopologicalOrder()) { for (auto* node : graph->StmtTopologicalOrder()) {
CHECK(node->IsStmt()); CHECK(node->IsStmt());
...@@ -597,17 +597,40 @@ class XPUMultiEncoderFuser { ...@@ -597,17 +597,40 @@ class XPUMultiEncoderFuser {
stack = node; stack = node;
} }
} }
if (stack) {
std::unordered_set<const Node*> to_remove2;
Node* stack_out = stack->outlinks.front(); Node* stack_out = stack->outlinks.front();
for (Node* cast : stack_out->outlinks) { // avoid modification while traversing
auto stack_out_outlinks = stack_out->outlinks;
for (Node* cast : stack_out_outlinks) {
if (cast->stmt()->op_info()->Type() != "cast") {
continue;
}
Node* cast_out = cast->outlinks.front(); Node* cast_out = cast->outlinks.front();
if (cast_out->outlinks.size() == 0) { if (cast_out->outlinks.size() == 0) {
// remove // dangling cast
to_remove2.insert(cast);
to_remove2.insert(cast_out); to_remove2.insert(cast_out);
VLOG(3) << "Remove dangling cast [" << cast_out->arg()->name << "]";
} else if (cast_out->outlinks.size() == 1) {
// useless cast
to_remove2.insert(cast); to_remove2.insert(cast);
to_remove2.insert(cast_out);
VLOG(3) << "Remove useless cast [" << cast_out->arg()->name << "]";
auto* multi_encoder = cast_out->outlinks.front();
DirectedLink(stack_out, multi_encoder);
UpdateInputs(multi_encoder->stmt()->op().get(),
cast_out->arg()->name,
stack_out->arg()->name);
auto update_op_info = *multi_encoder->stmt()->op_info();
multi_encoder->stmt()->ResetOp(update_op_info, graph->valid_places());
} }
} }
GraphSafeRemoveNodes(graph, to_remove2); GraphSafeRemoveNodes(graph, to_remove2);
} }
}
}; };
} // namespace fusion } // namespace fusion
......
...@@ -103,9 +103,12 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { ...@@ -103,9 +103,12 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
std::string conv_weight_name = matched.at("conv_weight")->arg()->name; std::string conv_weight_name = matched.at("conv_weight")->arg()->name;
auto conv_weight_t = auto conv_weight_t =
scope->FindVar(conv_weight_name)->GetMutable<lite::Tensor>(); scope->FindVar(conv_weight_name)->GetMutable<lite::Tensor>();
auto groups = conv_op_desc->GetAttr<int>("groups");
bool depthwise = false;
if (conv_type_ == "conv2d_transpose") { if (conv_type_ == "conv2d_transpose") {
depthwise = (conv_weight_t->dims()[0] == conv_weight_t->dims()[1] * groups);
CHECK_EQ(static_cast<size_t>(bn_scale_t->data_size()), CHECK_EQ(static_cast<size_t>(bn_scale_t->data_size()),
static_cast<size_t>(conv_weight_t->dims()[1])) static_cast<size_t>(conv_weight_t->dims()[1] * groups))
<< "The BN bias's size should be equal to the size of the first " << "The BN bias's size should be equal to the size of the first "
<< "dim size of the conv weights"; << "dim size of the conv weights";
} else { } else {
...@@ -159,7 +162,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { ...@@ -159,7 +162,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
// compute new conv_weight for int8 // compute new conv_weight for int8
auto weight_scale = auto weight_scale =
conv_op_desc->GetAttr<std::vector<float>>("weight_scale"); conv_op_desc->GetAttr<std::vector<float>>("weight_scale");
if (conv_type_ == "conv2d_transpose") { if (conv_type_ == "conv2d_transpose" && !depthwise) {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3]; conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
...@@ -199,7 +202,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { ...@@ -199,7 +202,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
} else { } else {
// compute new conv_weight // compute new conv_weight
auto conv_weight_d = conv_weight_t->mutable_data<float>(); auto conv_weight_d = conv_weight_t->mutable_data<float>();
if (conv_type_ == "conv2d_transpose") { if (conv_type_ == "conv2d_transpose" && !depthwise) {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3]; conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/scale_activation_fuse_pass.h"
#include <memory>
#include <vector>
#include "lite/core/mir/fusion/scale_activation_fuser.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void ScaleActivationFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
for (auto act_type : {"relu", "relu6", "leaky_relu"}) {
fusion::ScaleActivationFuser fuser(act_type);
fuser(graph.get());
}
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(lite_scale_activation_fuse_pass,
paddle::lite::mir::ScaleActivationFusePass)
.BindTargets({TARGET(kARM)})
.BindKernel("scale");
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
// Pass that fuses a `scale` op with a directly-following activation
// (relu / relu6 / leaky_relu) into a single `scale` op carrying the
// activation as attributes.
class ScaleActivationFusePass : public ProgramPass {
 public:
  // Applies the fusion over the whole program graph.
  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/scale_activation_fuser.h"
#include <memory>
#include <vector>
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
// Declares the pattern x -> scale -> scale_out -> act -> out, where the
// scale op, the activation op and the connecting var are all marked
// intermediate (i.e. consumed by the fusion).
void ScaleActivationFuser::BuildPattern() {
  // create input nodes.
  auto* x = VarNode("x")->assert_is_op_input("scale", "X")->AsInput();

  // create op nodes: both disappear after fusion.
  auto* scale =
      OpNode("scale", "scale")->assert_is_op("scale")->AsIntermediate();
  auto* act =
      OpNode("act", act_type_)->assert_is_op(act_type_)->AsIntermediate();

  // create intermediate nodes: scale's output must feed the activation's X.
  auto* scale_out = VarNode("scale_out")
                        ->assert_is_op_output("scale", "Out")
                        ->assert_is_op_input(act_type_, "X")
                        ->AsIntermediate();

  // create output node
  auto* out =
      VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput();

  // create topology.
  *x >> *scale >> *scale_out;
  *scale_out >> *act >> *out;
}
// Creates the fused `scale` op node and rewires input/output vars to it.
void ScaleActivationFuser::InsertNewNode(SSAGraph* graph,
                                         const key2nodes_t& matched) {
  // Build the fused description, then a fresh scale op from it.
  auto fused_desc = GenOpDesc(matched);
  auto fused_op = LiteOpRegistry::Global().Create("scale");
  // Reuse scope and valid places from the matched scale op.
  auto matched_scale = matched.at("scale")->stmt()->op();
  auto* scope = matched_scale->scope();
  auto& valid_places = matched_scale->valid_places();
  fused_op->Attach(fused_desc, scope);
  auto* fused_node = graph->GraphCreateInstructNode(fused_op, valid_places);

  // Rewire: input var -> fused op -> output var.
  IR_NODE_LINK_TO(matched.at("x"), fused_node);
  IR_NODE_LINK_TO(fused_node, matched.at("output"));
}
// Builds the fused op desc: the matched scale op's desc plus the
// activation type and its parameter.
cpp::OpDesc ScaleActivationFuser::GenOpDesc(const key2nodes_t& matched) {
  // Start from the matched scale op so its attributes carry over, then
  // point its output at the activation's output var.
  cpp::OpDesc fused_desc = *matched.at("scale")->stmt()->op_info();
  fused_desc.SetOutput("Out", {matched.at("output")->arg()->name});
  fused_desc.SetAttr("activation_type", act_type_);

  const cpp::OpDesc& act_desc = *matched.at("act")->stmt()->op_info();
  if (act_type_ == "relu") {
    fused_desc.SetAttr("fuse_relu", true);
  } else if (act_type_ == "relu6") {
    // relu6 stores its upper bound as "threshold".
    fused_desc.SetAttr("alpha", act_desc.GetAttr<float>("threshold"));
  } else if (act_type_ == "leaky_relu") {
    // leaky_relu stores its negative slope as "alpha".
    fused_desc.SetAttr("alpha", act_desc.GetAttr<float>("alpha"));
  }
  return fused_desc;
}
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
// Matches `scale` followed by an activation (relu / relu6 / leaky_relu)
// and replaces the pair with a single fused `scale` op.
class ScaleActivationFuser : public FuseBase {
 public:
  // act_type must be one of: "relu", "relu6", "leaky_relu".
  // Member-initializer list instead of default-construct-then-assign.
  explicit ScaleActivationFuser(const std::string& act_type)
      : act_type_(act_type) {}

  // Declares the x -> scale -> act subgraph pattern.
  void BuildPattern() override;
  // Creates the fused op node and rewires inputs/outputs.
  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;

 private:
  // Generates the fused op desc: scale attrs + activation attrs.
  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
  // The activation op type absorbed by the fusion.
  std::string act_type_;
};
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
...@@ -25,16 +25,16 @@ namespace lite { ...@@ -25,16 +25,16 @@ namespace lite {
bool OpLite::InferShape() { bool OpLite::InferShape() {
// if input_tensor_ptrs and output_tensor_ptrs are overloaded in param_ // if input_tensor_ptrs and output_tensor_ptrs are overloaded in param_
// InferShapeByMemoryInternal will be applied. // InferShapeByMemoryInternal will be applied.
if (param_.input_tensor_ptrs() && param_.output_tensor_ptrs()) { if (op_param_ && op_param_->input_tensor_ptrs() &&
op_param_->output_tensor_ptrs()) {
return this->InferShapeWithCache(); return this->InferShapeWithCache();
} else { } else {
// otherwise, InferShapeImpl is applied directly.
return this->InferShapeImpl(); return this->InferShapeImpl();
} }
} }
bool OpLite::InferShapeWithCache() { bool OpLite::InferShapeWithCache() {
// 1. Get vector of current input tensors // 1. Get vector of current input tensors
auto *current_inputs = param_.input_tensor_ptrs(); auto *current_inputs = op_param_->input_tensor_ptrs();
// 2. Get hash value of current inputs shape and lod // 2. Get hash value of current inputs shape and lod
size_t new_hash = 0; size_t new_hash = 0;
for (auto iter = current_inputs->begin(); iter != current_inputs->end(); for (auto iter = current_inputs->begin(); iter != current_inputs->end();
...@@ -59,7 +59,7 @@ bool OpLite::InferShapeWithCache() { ...@@ -59,7 +59,7 @@ bool OpLite::InferShapeWithCache() {
if (new_hash == io_shape_lod_hash_ && new_hash != 0) { if (new_hash == io_shape_lod_hash_ && new_hash != 0) {
// if current hash value is consistent with io_shape_lod_hash_, // if current hash value is consistent with io_shape_lod_hash_,
// previous outputs shape and lod are reused. // previous outputs shape and lod are reused.
auto *current_outputs = param_.output_tensor_ptrs(); auto *current_outputs = op_param_->output_tensor_ptrs();
for (size_t i = 0; i < current_outputs->size(); i++) { for (size_t i = 0; i < current_outputs->size(); i++) {
current_outputs->at(i)->Resize(last_output_shapes[i]); current_outputs->at(i)->Resize(last_output_shapes[i]);
current_outputs->at(i)->set_lod(last_output_lods[i]); current_outputs->at(i)->set_lod(last_output_lods[i]);
...@@ -68,10 +68,12 @@ bool OpLite::InferShapeWithCache() { ...@@ -68,10 +68,12 @@ bool OpLite::InferShapeWithCache() {
// otherwise, current hash value is changed, InferShapeImpl will apply. // otherwise, current hash value is changed, InferShapeImpl will apply.
io_shape_lod_hash_ = new_hash; io_shape_lod_hash_ = new_hash;
this->InferShapeImpl(); this->InferShapeImpl();
auto *current_outputs = param_.output_tensor_ptrs(); auto *current_outputs = op_param_->output_tensor_ptrs();
last_output_shapes.clear();
last_output_lods.clear();
for (size_t i = 0; i < current_outputs->size(); i++) { for (size_t i = 0; i < current_outputs->size(); i++) {
last_output_shapes[i] = current_outputs->at(i)->dims(); last_output_shapes.push_back(current_outputs->at(i)->dims());
last_output_lods[i] = current_outputs->at(i)->lod(); last_output_lods.push_back(current_outputs->at(i)->lod());
} }
} }
return true; return true;
......
...@@ -77,6 +77,11 @@ class OpLite : public Registry { ...@@ -77,6 +77,11 @@ class OpLite : public Registry {
// Link the external execution environ to internal context. // Link the external execution environ to internal context.
bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope); bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope);
template <typename T>
inline void AttachParam(T *param) {
op_param_ = static_cast<T *>(param);
}
const OpInfo *op_info() const { return op_info_.get(); } const OpInfo *op_info() const { return op_info_.get(); }
OpInfo *mutable_op_info() { return op_info_.get(); } OpInfo *mutable_op_info() { return op_info_.get(); }
...@@ -167,11 +172,10 @@ class OpLite : public Registry { ...@@ -167,11 +172,10 @@ class OpLite : public Registry {
std::vector<Place> valid_places_; std::vector<Place> valid_places_;
Place kernel_place_{TARGET(kHost), PRECISION(kFloat)}; Place kernel_place_{TARGET(kHost), PRECISION(kFloat)};
std::unique_ptr<OpInfo> op_info_; std::unique_ptr<OpInfo> op_info_;
std::vector<DDimLite> last_output_shapes{}; std::vector<DDimLite> last_output_shapes{};
std::vector<std::vector<std::vector<uint64_t>>> last_output_lods{}; std::vector<std::vector<std::vector<uint64_t>>> last_output_lods{};
size_t io_shape_lod_hash_{}; size_t io_shape_lod_hash_{};
mutable operators::ParamBase param_; mutable operators::ParamBase *op_param_{nullptr};
private: private:
// Infer Shape according to memory, if current input shapes are consistent // Infer Shape according to memory, if current input shapes are consistent
......
...@@ -111,18 +111,23 @@ class KernelRegistry final { ...@@ -111,18 +111,23 @@ class KernelRegistry final {
KernelRegistryForTarget<TARGET(kCUDA), KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNHWC)> *, // DATALAYOUT(kNHWC)> *, //
KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kCUDA), KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kCUDA), KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNHWC)> *, // DATALAYOUT(kNHWC)> *, //
KernelRegistryForTarget<TARGET(kX86), KernelRegistryForTarget<TARGET(kX86),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kX86), KernelRegistryForTarget<TARGET(kX86),
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kHost), KernelRegistryForTarget<TARGET(kHost),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
...@@ -141,9 +146,7 @@ class KernelRegistry final { ...@@ -141,9 +146,7 @@ class KernelRegistry final {
KernelRegistryForTarget<TARGET(kHost), KernelRegistryForTarget<TARGET(kHost),
PRECISION(kInt64), PRECISION(kInt64),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kARM), KernelRegistryForTarget<TARGET(kARM),
PRECISION(kAny), PRECISION(kAny),
DATALAYOUT(kAny)> *, // DATALAYOUT(kAny)> *, //
......
...@@ -71,12 +71,17 @@ class Optimizer { ...@@ -71,12 +71,17 @@ class Optimizer {
"identity_scale_eliminate_pass", // "identity_scale_eliminate_pass", //
"elementwise_mul_constant_eliminate_pass", // "elementwise_mul_constant_eliminate_pass", //
"lite_sequence_pool_concat_fuse_pass", // "lite_sequence_pool_concat_fuse_pass", //
"lite_scale_activation_fuse_pass", //
#if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \ #if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \
(defined LITE_WITH_ARM) (defined LITE_WITH_ARM)
"lite_elementwise_add_activation_fuse_pass", // "lite_elementwise_add_activation_fuse_pass", //
#endif #endif
"__xpu__resnet_fuse_pass", "__xpu__resnet_fuse_pass",
"__xpu__multi_encoder_fuse_pass", "__xpu__multi_encoder_fuse_pass",
"__xpu__embedding_with_eltwise_add_fuse_pass",
"__xpu__fc_fuse_pass",
"identity_dropout_eliminate_pass", // should be placed after
// xpu fusion
"quantized_op_attributes_inference_pass", // Only for fully "quantized_op_attributes_inference_pass", // Only for fully
// quantized model, infer // quantized model, infer
// the output scale and // the output scale and
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "lite/core/program.h" #include "lite/core/program.h"
#include "lite/fluid/float16.h"
#ifdef LITE_WITH_OPENCL #ifdef LITE_WITH_OPENCL
#include "lite/backends/opencl/cl_image_converter.h" #include "lite/backends/opencl/cl_image_converter.h"
...@@ -52,6 +53,24 @@ static bool write_tensorfile(const Tensor* tensor, const std::string& locate) { ...@@ -52,6 +53,24 @@ static bool write_tensorfile(const Tensor* tensor, const std::string& locate) {
return true; return true;
} }
// Appends `content` as one line to the file at `log_dir`.
// Returns false (with an INFO log) when `log_dir` is empty or the file
// cannot be opened; returns true after a successful write.
static bool write_precision_summary_tofile(const std::string& content,
                                           const std::string& log_dir = "") {
  // `.empty()` instead of comparison with ""; early returns flatten the
  // original if/else.
  if (log_dir.empty()) {
    LOG(INFO) << "The `log_dir` of precision summary file is not set. log_dir:"
              << log_dir;
    return false;
  }
  // "a": append so successive summaries accumulate in one file.
  FILE* fp = fopen(log_dir.c_str(), "a");
  if (fp == nullptr) {
    // Fixed missing space before "failed" in the log message.
    LOG(INFO) << "Open precision summary file:" << log_dir << " failed.";
    return false;
  }
  fprintf(fp, "%s\n", content.c_str());
  fclose(fp);
  return true;
}
class PrecisionProfiler { class PrecisionProfiler {
public: public:
// TODO(ysh329): need to remove `explicit PrecisionProfiler` // TODO(ysh329): need to remove `explicit PrecisionProfiler`
...@@ -67,7 +86,7 @@ class PrecisionProfiler { ...@@ -67,7 +86,7 @@ class PrecisionProfiler {
using std::left; using std::left;
using std::fixed; using std::fixed;
STL::stringstream ss; STL::stringstream ss;
ss << "========================================= " ss << "\n\n========================================= "
<< "Detailed Precision Profiler Summary " << "Detailed Precision Profiler Summary "
<< "=========================================" << std::endl; << "=========================================" << std::endl;
ss << setw(45) << left << "operator:(kernel_info)" ss << setw(45) << left << "operator:(kernel_info)"
...@@ -77,6 +96,13 @@ class PrecisionProfiler { ...@@ -77,6 +96,13 @@ class PrecisionProfiler {
<< " " << setw(15) << left << "std_deviation" << " " << setw(15) << left << "std_deviation"
<< " " << setw(15) << left << "ave_grow_rate*" << std::endl; << " " << setw(15) << left << "ave_grow_rate*" << std::endl;
// write to file with path: `log_dir`
if (log_dir_ != "") {
FILE* fp = fopen(log_dir_.c_str(), "a");
std::string header_str{ss.str()};
fprintf(fp, "%s\n", header_str.c_str());
fclose(fp);
}
return ss.str(); return ss.str();
} }
...@@ -194,6 +220,7 @@ class PrecisionProfiler { ...@@ -194,6 +220,7 @@ class PrecisionProfiler {
} }
#ifdef LITE_WITH_OPENCL #ifdef LITE_WITH_OPENCL
} else if (target_type == TARGET(kOpenCL)) { } else if (target_type == TARGET(kOpenCL)) {
CLRuntime::Global()->command_queue().finish();
switch (layout_type) { switch (layout_type) {
case DATALAYOUT(kImageDefault): { case DATALAYOUT(kImageDefault): {
paddle::lite::CLImageConverterDefault default_convertor; paddle::lite::CLImageConverterDefault default_convertor;
...@@ -360,8 +387,12 @@ class PrecisionProfiler { ...@@ -360,8 +387,12 @@ class PrecisionProfiler {
} }
} }
} }
write_precision_summary_tofile(ss.str(), log_dir_);
return ss.str(); return ss.str();
} }
private:
std::string log_dir_{"/storage/emulated/0/precision.log"};
}; };
} // namespace profile } // namespace profile
......
...@@ -60,6 +60,29 @@ Variable *Scope::FindLocalVar(const std::string &name) const { ...@@ -60,6 +60,29 @@ Variable *Scope::FindLocalVar(const std::string &name) const {
return nullptr; return nullptr;
} }
// AttributeVarNames returns the names of persistable attribute variables
// stored in the parent scopes (weights live in ancestors of the execution
// scope). The bookkeeping variables "feed" and "fetch" are excluded.
std::vector<std::string> Scope::AttributeVarNames() const {
  std::vector<std::string> resulted_keys;
  const Scope *cur_scope = this;
  // Walk up the scope chain; only ancestor scopes are inspected, the
  // current scope's own locals are deliberately not included.
  while (cur_scope->parent()) {
    cur_scope = cur_scope->parent();
    auto keys = cur_scope->LocalVarNames();
    resulted_keys.insert(resulted_keys.end(), keys.begin(), keys.end());
  }
  // Remove every occurrence of feed/fetch via the erase-remove idiom
  // (single pass per name instead of repeated find+erase), which also
  // avoids the signed/unsigned loop-index comparison of the original.
  const std::vector<std::string> skipped_vars = {"feed", "fetch"};
  for (const auto &skipped : skipped_vars) {
    resulted_keys.erase(
        std::remove(resulted_keys.begin(), resulted_keys.end(), skipped),
        resulted_keys.end());
  }
  return resulted_keys;
}
std::vector<std::string> Scope::LocalVarNames() const { std::vector<std::string> Scope::LocalVarNames() const {
std::vector<std::string> keys; std::vector<std::string> keys;
for (const auto &item : vars_) { for (const auto &item : vars_) {
......
...@@ -45,6 +45,8 @@ class Scope final { ...@@ -45,6 +45,8 @@ class Scope final {
const Scope* parent() const { return parent_; } const Scope* parent() const { return parent_; }
// Get attribute params stored in parent scopes.
std::vector<std::string> AttributeVarNames() const;
// Following the legacy scope interface. // Following the legacy scope interface.
std::vector<std::string> LocalVarNames() const; std::vector<std::string> LocalVarNames() const;
......
project(demo CXX C)
cmake_minimum_required(VERSION 2.8) cmake_minimum_required(VERSION 2.8)
project(demo CXX C)
add_definitions(-DLITE_WITH_CUDA)
set(TARGET demo) set(TARGET demo)
set(CMAKE_CXX_FLAGS "-std=c++11 -O3") set(CMAKE_CXX_FLAGS "-std=c++11 -O3")
set(LITE_LIB "${PROJECT_SOURCE_DIR}/../../cxx") set(LITE_ROOT "${PROJECT_SOURCE_DIR}/../../cxx")
set(PROTOBUF_LIB "${PROJECT_SOURCE_DIR}/../../third_party/protobuf") set(PROTOBUF_ROOT "${PROJECT_SOURCE_DIR}/../../third_party/protobuf")
include_directories("${LITE_LIB}/include") include_directories("${LITE_ROOT}/include")
link_directories("${LITE_LIB}/lib") link_directories("${LITE_ROOT}/lib")
link_directories("${PROTOBUF_LIB}/lib") link_directories("${PROTOBUF_ROOT}/lib")
# cuda lib
link_directories("/usr/local/cuda/lib64/")
add_executable(${TARGET} ${TARGET}.cc) add_executable(${TARGET} ${TARGET}.cc)
set(DEPS ${LITE_LIB}/lib/libpaddle_full_api_shared.so) set(DEPS ${LITE_ROOT}/lib/libpaddle_full_api_shared.so)
set(DEPS ${DEPS} protobuf-lite) set(DEPS ${DEPS} protobuf-lite)
set(DEPS ${DEPS} "-lrt -lpthread -ldl") set(DEPS ${DEPS} "-lrt -lpthread -ldl -lcudart")
target_link_libraries(${TARGET} ${DEPS}) target_link_libraries(${TARGET} ${DEPS})
# Demo Makefile for Android armeabi-v7a (ARMv7) builds of the classification
# and yolov3 test binaries, linked against Paddle-Lite (full/light API,
# shared/static variants) and a prebuilt static OpenCV.
ARM_ABI = arm7
export ARM_ABI

include ../Makefile.def

LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party

OPENCV_VERSION=opencv4.1.0

# Static OpenCV libraries plus the bundled 3rdparty archives they depend on.
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a

OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include

CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include -I${THIRD_PARTY_DIR}/gflags/include

CXX_LIBS = ${OPENCV_LIBS} ${THIRD_PARTY_DIR}/gflags/lib/libgflags.a $(SYSTEM_LIBS)

# Paddle-Lite link options (full vs light API, shared vs static).
# Renamed from the misspelled *_SHAPRED_* variables; all uses are file-local.
LITE_FULL_SHARED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared
LITE_FULL_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a
LITE_LIGHT_SHARED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared
LITE_LIGHT_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a

##########
# Download and unpack the prebuilt OpenCV package if not already present.
fetch_opencv:
	@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
	@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
	(echo "fetch opencv libs" && \
	wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
	@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
	tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}

test_helper.o: test_helper.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_helper.o -c test_helper.cc

classification_full.o: classification_full.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_full.o -c classification_full.cc

classification_light.o: classification_light.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_light.o -c classification_light.cc

classification_full_shared: fetch_opencv classification_full.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHARED_LIBS}

classification_full_static: fetch_opencv classification_full.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)

classification_light_shared: fetch_opencv classification_light.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHARED_LIBS}

classification_light_static: fetch_opencv classification_light.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)

######
yolov3_full.o: yolov3_full.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_full.o -c yolov3_full.cc

yolov3_light.o: yolov3_light.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_light.o -c yolov3_light.cc

yolov3_full_shared: fetch_opencv yolov3_full.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHARED_LIBS}

yolov3_full_static: fetch_opencv yolov3_full.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)

yolov3_light_shared: fetch_opencv yolov3_light.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHARED_LIBS}

# Fix: prerequisite was yolov3_full.o while the link line uses yolov3_light.o,
# so yolov3_light.o was never built by this rule.
yolov3_light_static: fetch_opencv yolov3_light.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)

#####
all: classification_full_shared classification_full_static classification_light_shared classification_light_static yolov3_full_shared yolov3_full_static yolov3_light_shared yolov3_light_static

# Fix: use `rm -f` uniformly (two targets wrongly used `rm -r`, which fails
# on regular files with some rm implementations and errors if absent).
clean:
	rm -f *.o
	rm -f classification_full_shared
	rm -f classification_full_static
	rm -f classification_light_shared
	rm -f classification_light_static
	rm -f yolov3_full_shared
	rm -f yolov3_full_static
	rm -f yolov3_light_shared
	rm -f yolov3_light_static
# Demo Makefile for Android arm64-v8a (ARMv8) builds of the classification
# and yolov3 test binaries, linked against Paddle-Lite (full/light API,
# shared/static variants) and a prebuilt static OpenCV.
ARM_ABI = arm8
export ARM_ABI

include ../Makefile.def

LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party

OPENCV_VERSION=opencv4.1.0

# Static OpenCV libraries plus the bundled 3rdparty archives they depend on.
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a

OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include

CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include -I${THIRD_PARTY_DIR}/gflags/include

CXX_LIBS = ${OPENCV_LIBS} ${THIRD_PARTY_DIR}/gflags/lib/libgflags.a $(SYSTEM_LIBS)

# Paddle-Lite link options (full vs light API, shared vs static).
# Renamed from the misspelled *_SHAPRED_* variables; all uses are file-local.
LITE_FULL_SHARED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared
LITE_FULL_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a
LITE_LIGHT_SHARED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared
LITE_LIGHT_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a

##########
# Download and unpack the prebuilt OpenCV package if not already present.
fetch_opencv:
	@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
	@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
	(echo "fetch opencv libs" && \
	wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
	@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
	tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}

test_helper.o: test_helper.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_helper.o -c test_helper.cc

classification_full.o: classification_full.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_full.o -c classification_full.cc

classification_light.o: classification_light.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_light.o -c classification_light.cc

classification_full_shared: fetch_opencv classification_full.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHARED_LIBS}

classification_full_static: fetch_opencv classification_full.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)

classification_light_shared: fetch_opencv classification_light.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHARED_LIBS}

classification_light_static: fetch_opencv classification_light.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)

######
yolov3_full.o: yolov3_full.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_full.o -c yolov3_full.cc

yolov3_light.o: yolov3_light.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_light.o -c yolov3_light.cc

yolov3_full_shared: fetch_opencv yolov3_full.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHARED_LIBS}

yolov3_full_static: fetch_opencv yolov3_full.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)

yolov3_light_shared: fetch_opencv yolov3_light.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHARED_LIBS}

# Fix: prerequisite was yolov3_full.o while the link line uses yolov3_light.o,
# so yolov3_light.o was never built by this rule.
yolov3_light_static: fetch_opencv yolov3_light.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)

#####
all: classification_full_shared classification_full_static classification_light_shared classification_light_static yolov3_full_shared yolov3_full_static yolov3_light_shared yolov3_light_static

# Fix: use `rm -f` uniformly (two targets wrongly used `rm -r`).
clean:
	rm -f *.o
	rm -f classification_full_shared
	rm -f classification_full_static
	rm -f classification_light_shared
	rm -f classification_light_static
	rm -f yolov3_full_shared
	rm -f yolov3_full_static
	rm -f yolov3_light_shared
	rm -f yolov3_light_static
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <fstream>
#include <iostream>
#include "paddle_api.h" // NOLINT
#include "test_helper.h" // NOLINT
DEFINE_string(model_dir,
"",
"the path of the model, the model and param files is under "
"model_dir.");
DEFINE_string(model_filename,
"",
"the filename of model file. When the model is combined formate, "
"please set model_file.");
DEFINE_string(param_filename,
"",
"the filename of param file, set param_file when the model is "
"combined formate.");
DEFINE_string(img_path, "", "the path of input image");
DEFINE_string(img_txt_path,
"",
"the path of input image, the image is processed "
" and saved in txt file");
DEFINE_double(out_max_value, 0.0, "The max value in output tensor");
DEFINE_double(threshold,
1e-3,
"If the max value diff is smaller than threshold, pass test");
DEFINE_int32(out_max_value_index, 65, "The max value index in output tensor");
// Optimize model for ARM CPU.
// If the model is not combined, set model_filename and params_filename as empty
void OptModel(const std::string& load_model_dir,
const std::string& model_filename,
const std::string& params_filename,
const std::string& save_model_path) {
paddle::lite_api::CxxConfig config;
config.set_model_dir(load_model_dir);
if (!model_filename.empty() && !params_filename.empty()) {
config.set_model_file(load_model_dir + "/" + model_filename);
config.set_param_file(load_model_dir + "/" + params_filename);
}
std::vector<paddle::lite_api::Place> vaild_places = {
paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt32)},
paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt64)},
};
config.set_valid_places(vaild_places);
auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
std::string cmd_str = "rm -rf " + save_model_path;
int ret = system(cmd_str.c_str());
if (ret == 0) {
std::cout << "Delete old optimized model " << save_model_path << std::endl;
}
predictor->SaveOptimizedModel(save_model_path,
paddle::lite_api::LiteModelType::kNaiveBuffer);
std::cout << "Load model from " << load_model_dir << std::endl;
std::cout << "Save optimized model to " << save_model_path << std::endl;
}
// Run the optimized model once and check the top-1 classification result
// against ground truth.
//
// model_path: path of the optimized .nb model.
// img_path: raw input image, preprocessed with OpenCV.
// img_txt_path: preprocessed input stored as whitespace-separated floats;
//               takes precedence over img_path when non-empty.
// out_max_value / out_max_value_index: expected top-1 score and its index.
// threshold: max allowed absolute diff between score and ground truth.
// height / width: model input spatial size.
void Run(const std::string& model_path,
         const std::string& img_path,
         const std::string& img_txt_path,
         const float out_max_value,
         const int out_max_value_index,
         const float threshold,
         const int height,
         const int width) {
  // set config and create predictor
  paddle::lite_api::MobileConfig config;
  config.set_threads(3);
  config.set_model_from_file(model_path);
  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);

  // set input (NCHW float)
  auto input_tensor = predictor->GetInput(0);
  input_tensor->Resize({1, 3, height, width});
  auto input_data = input_tensor->mutable_data<float>();
  if (img_txt_path.size() > 0) {
    std::fstream fs(img_txt_path);
    if (!fs.is_open()) {
      std::cerr << "Fail to open img txt file:" << img_txt_path << std::endl;
      // Bug fix: previously execution fell through and kept reading from the
      // unopened stream, silently feeding garbage to the model.
      exit(1);
    }
    int num = 1 * 3 * height * width;
    for (int i = 0; i < num; i++) {
      fs >> input_data[i];
    }
  } else {
    cv::Mat img = imread(img_path, cv::IMREAD_COLOR);
    if (!img.data) {
      std::cerr << "Fail to open img:" << img_path << std::endl;
      exit(1);
    }
    // ImageNet normalization constants.
    float means[3] = {0.485f, 0.456f, 0.406f};
    float scales[3] = {0.229f, 0.224f, 0.225f};
    process_img(img, width, height, input_data, means, scales);
  }

  predictor->Run();

  // Find the argmax of the output tensor.
  auto out_tensor = predictor->GetOutput(0);
  auto* out_data = out_tensor->data<float>();
  int64_t output_num = ShapeProduction(out_tensor->shape());
  float max_value = out_data[0];
  int max_index = 0;
  for (int i = 0; i < output_num; i++) {
    if (max_value < out_data[i]) {
      max_value = out_data[i];
      max_index = i;
    }
  }
  std::cout << "max_value:" << max_value << std::endl;
  std::cout << "max_index:" << max_index << std::endl;
  std::cout << "max_value_ground_truth:" << out_max_value << std::endl;
  std::cout << "max_index_ground_truth:" << out_max_value_index << std::endl;
  if (max_index != out_max_value_index ||
      fabs(max_value - out_max_value) > threshold) {
    std::cerr << "----------Fail Test.---------- \n\n";
  } else {
    std::cout << "----------Pass Test.---------- \n\n";
  }
}
// Entry point: optimize the model given by --model_dir for ARM CPU, then run
// the classification test on the optimized model.
int main(int argc, char** argv) {
  // Check inputs
  google::ParseCommandLineFlags(&argc, &argv, true);
  if (FLAGS_model_dir.empty() ||
      (FLAGS_img_path.empty() && FLAGS_img_txt_path.empty())) {
    std::cerr << "Input error." << std::endl;
    std::cerr
        << "Usage: " << argv[0] << std::endl
        << "--model_dir: the path of not optimized model \n"
           "--model_filename: the model filename of not optimized model \n"
           "--param_filename: the param filename of not optimized model \n"
           "--img_txt_path: the path of input image, the image is processed \n"
           " and saved in txt file \n"
           "--img_path: the path of input image \n"
           "--out_max_value: The max value in output tensor \n"
           "--threshold: If the max value diff is smaller than threshold,\n"
           " pass test. Default 1e-3.\n"
           "--out_max_value_index: The max value index in output tensor \n";
    exit(1);
  }

  // Model input size is fixed at 224x224 (ImageNet-style classification).
  const int height = 224;
  const int width = 224;
  // Strip a trailing '/' so the "_opt2" suffix lands on the directory name.
  std::string model_dir = FLAGS_model_dir;
  if (model_dir.back() == '/') {
    model_dir.pop_back();
  }
  std::string optimized_model_path = model_dir + "_opt2";
  OptModel(FLAGS_model_dir,
           FLAGS_model_filename,
           FLAGS_param_filename,
           optimized_model_path);
  // SaveOptimizedModel appends ".nb" to the path it is given.
  std::string run_model_path = optimized_model_path + ".nb";
  // Run test
  Run(run_model_path,
      FLAGS_img_path,
      FLAGS_img_txt_path,
      FLAGS_out_max_value,
      FLAGS_out_max_value_index,
      FLAGS_threshold,
      height,
      width);
  return 0;
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <fstream>
#include <iostream>
#include "paddle_api.h" // NOLINT
#include "test_helper.h" // NOLINT
DEFINE_string(optimized_model_path, "", "the path of optimized model");
DEFINE_string(img_path, "", "the path of input image");
DEFINE_string(img_txt_path,
"",
"the path of input image, the image is processed "
" and saved in txt file");
DEFINE_double(out_max_value, 0.0, "The max value in output tensor");
DEFINE_double(threshold,
1e-3,
"If the max value diff is smaller than threshold, pass test");
DEFINE_int32(out_max_value_index, -1, "The max value index in output tensor");
// Run an already-optimized (.nb) model once and check the top-1
// classification result against ground truth.
//
// model_path: path of the optimized .nb model.
// img_path: raw input image, preprocessed with OpenCV.
// img_txt_path: preprocessed input stored as whitespace-separated floats;
//               takes precedence over img_path when non-empty.
// out_max_value / out_max_value_index: expected top-1 score and its index.
// threshold: max allowed absolute diff between score and ground truth.
// height / width: model input spatial size.
void Run(const std::string& model_path,
         const std::string& img_path,
         const std::string& img_txt_path,
         const float out_max_value,
         const int out_max_value_index,
         const float threshold,
         const int height,
         const int width) {
  // set config and create predictor
  paddle::lite_api::MobileConfig config;
  config.set_threads(3);
  config.set_model_from_file(model_path);
  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);

  // set input (NCHW float)
  auto input_tensor = predictor->GetInput(0);
  input_tensor->Resize({1, 3, height, width});
  auto input_data = input_tensor->mutable_data<float>();
  if (img_txt_path.size() > 0) {
    std::fstream fs(img_txt_path);
    if (!fs.is_open()) {
      std::cerr << "Fail to open img txt file:" << img_txt_path << std::endl;
      // Bug fix: previously execution fell through and kept reading from the
      // unopened stream, silently feeding garbage to the model.
      exit(1);
    }
    int num = 1 * 3 * height * width;
    for (int i = 0; i < num; i++) {
      fs >> input_data[i];
    }
  } else {
    cv::Mat img = imread(img_path, cv::IMREAD_COLOR);
    if (!img.data) {
      std::cerr << "Fail to open img:" << img_path << std::endl;
      exit(1);
    }
    // ImageNet normalization constants.
    float means[3] = {0.485f, 0.456f, 0.406f};
    float scales[3] = {0.229f, 0.224f, 0.225f};
    process_img(img, width, height, input_data, means, scales);
  }

  predictor->Run();

  // Find the argmax of the output tensor.
  auto out_tensor = predictor->GetOutput(0);
  auto* out_data = out_tensor->data<float>();
  int64_t output_num = ShapeProduction(out_tensor->shape());
  float max_value = out_data[0];
  int max_index = 0;
  for (int i = 0; i < output_num; i++) {
    if (max_value < out_data[i]) {
      max_value = out_data[i];
      max_index = i;
    }
  }
  std::cout << "max_value:" << max_value << std::endl;
  std::cout << "max_index:" << max_index << std::endl;
  std::cout << "max_value_ground_truth:" << out_max_value << std::endl;
  std::cout << "max_index_ground_truth:" << out_max_value_index << std::endl;
  if (max_index != out_max_value_index ||
      fabs(max_value - out_max_value) > threshold) {
    std::cerr << "----------Fail Test---------- \n\n";
  } else {
    std::cout << "----------Pass Test---------- \n\n";
  }
}
// Entry point: run the classification test directly on an already-optimized
// (.nb) model given by --optimized_model_path.
int main(int argc, char** argv) {
  // Check inputs
  google::ParseCommandLineFlags(&argc, &argv, true);
  if (FLAGS_optimized_model_path.empty() ||
      (FLAGS_img_path.empty() && FLAGS_img_txt_path.empty())) {
    std::cerr << "Input error." << std::endl;
    std::cerr
        << "Usage: " << argv[0] << std::endl
        << "--optimized_model_path: the path of optimized model \n"
           "--img_txt_path: the path of input image, the image is processed \n"
           " and saved in txt file \n"
           "--img_path: the path of input image \n"
           "--out_max_value: The max value in output tensor \n"
           "--threshold: If the max value diff is smaller than threshold,\n"
           " pass test. Default 1e-3.\n"
           "--out_max_value_index: The max value index in output tensor \n";
    exit(1);
  }

  // Model input size is fixed at 224x224 (ImageNet-style classification).
  const int height = 224;
  const int width = 224;
  // Run test
  Run(FLAGS_optimized_model_path,
      FLAGS_img_path,
      FLAGS_img_txt_path,
      FLAGS_out_max_value,
      FLAGS_out_max_value_index,
      FLAGS_threshold,
      height,
      width);
  return 0;
}
# Build all demo binaries and package them, together with the Paddle-Lite
# C++ libraries and the test models/images, into ${gf} for pushing to a device.
make clean
make all -j

gf=test_lite_lib_files
# Start from a clean staging directory.
if [ -d ${gf} ];then
    rm -rf ${gf}
fi
mkdir ${gf}

# Collect the freshly built executables.
mv classification_full_shared ${gf}
mv classification_full_static ${gf}
mv classification_light_shared ${gf}
mv classification_light_static ${gf}
mv yolov3_full_shared ${gf}
mv yolov3_full_static ${gf}
mv yolov3_light_shared ${gf}
mv yolov3_light_static ${gf}
cp run.sh ${gf}
make clean

# Bundle the Paddle-Lite C++ libs as ${gf}/lite.
cp -r ../../../cxx/ ${gf}
mv ${gf}/cxx ${gf}/lite

# Fetch test models and images once; the tarball is cached locally.
if [ ! -f "test_libs_models_imgs.tgz" ];then
    wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/test_libs_models_imgs.tgz
fi
tar zxvf test_libs_models_imgs.tgz
mv test_libs_models_imgs ${gf}
mv ${gf}/test_libs_models_imgs ${gf}/models_imgs
# On-device test driver: run every packaged binary against the bundled
# models/images. Each invocation prints Pass/Fail against the ground-truth
# top-1 value/index (classification) or box values (yolov3).
# Shared-library builds need lite/lib on LD_LIBRARY_PATH.
export LD_LIBRARY_PATH=$PWD/lite/lib/:${LD_LIBRARY_PATH}

# mobilenetv1
./classification_light_shared \
    --optimized_model_path=models_imgs/models/mobilenetv1.nb \
    --img_txt_path=models_imgs/images/classification.jpg.txt \
    --out_max_value=0.936887 \
    --out_max_value_index=65

./classification_light_static \
    --optimized_model_path=models_imgs/models/mobilenetv1.nb \
    --img_txt_path=models_imgs/images/classification.jpg.txt \
    --out_max_value=0.936887 \
    --out_max_value_index=65

./classification_full_static \
    --model_dir=models_imgs/models/mobilenetv1 \
    --img_txt_path=models_imgs/images/classification.jpg.txt \
    --out_max_value=0.936887 \
    --out_max_value_index=65

./classification_full_shared \
    --model_dir=models_imgs/models/mobilenetv1 \
    --img_txt_path=models_imgs/images/classification.jpg.txt \
    --out_max_value=0.936887 \
    --out_max_value_index=65

# mobilenetv2
./classification_light_shared \
    --optimized_model_path=models_imgs/models/mobilenetv2.nb \
    --img_txt_path=models_imgs/images/classification.jpg.txt \
    --out_max_value=0.868888 \
    --out_max_value_index=65

./classification_light_static \
    --optimized_model_path=models_imgs/models/mobilenetv2.nb \
    --img_txt_path=models_imgs/images/classification.jpg.txt \
    --out_max_value=0.868888 \
    --out_max_value_index=65

./classification_full_static \
    --model_dir=models_imgs/models/mobilenetv2 \
    --img_txt_path=models_imgs/images/classification.jpg.txt \
    --out_max_value=0.868888 \
    --out_max_value_index=65

./classification_full_shared \
    --model_dir=models_imgs/models/mobilenetv2 \
    --img_txt_path=models_imgs/images/classification.jpg.txt \
    --out_max_value=0.868888 \
    --out_max_value_index=65

# yolov3
./yolov3_light_shared \
    --optimized_model_path=models_imgs/models/yolov3_mobilenetv1.nb \
    --img_txt_path=models_imgs/images/yolov3.jpg.txt \
    --out_values=0,0.153605,174.494,199.729,562.075,604.014

./yolov3_light_static \
    --optimized_model_path=models_imgs/models/yolov3_mobilenetv1.nb \
    --img_txt_path=models_imgs/images/yolov3.jpg.txt \
    --out_values=0,0.153605,174.494,199.729,562.075,604.014

./yolov3_full_static \
    --model_dir=models_imgs/models/yolov3_mobilenetv1 \
    --img_txt_path=models_imgs/images/yolov3.jpg.txt \
    --out_values=0,0.153605,174.494,199.729,562.075,604.014

./yolov3_full_shared \
    --model_dir=models_imgs/models/yolov3_mobilenetv1 \
    --img_txt_path=models_imgs/images/yolov3.jpg.txt \
    --out_values=0,0.153605,174.494,199.729,562.075,604.014
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "test_helper.h" // NOLINT
#include <sys/time.h>
#include <time.h>
#include <algorithm>
#include <cstdio>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <numeric>
#include <string>
#include <vector>
#include "opencv2/core.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"
// Returns the current wall-clock time in microseconds.
double GetCurrentUS() {
  struct timeval now;
  gettimeofday(&now, NULL);
  return static_cast<double>(now.tv_sec) * 1e+6 +
         static_cast<double>(now.tv_usec);
}
// Product of all dims in `shape`; returns 1 for an empty shape.
int64_t ShapeProduction(const std::vector<int64_t>& shape) {
  return std::accumulate(shape.begin(),
                         shape.end(),
                         static_cast<int64_t>(1),
                         std::multiplies<int64_t>());
}
// Parse a comma-separated integer list, e.g. "1,3,224,224".
// Returns one entry per comma-separated field; an empty string yields an
// empty vector. Bug fix: parsing previously used atoi (int), silently
// truncating values outside 32-bit range even though the function returns
// int64_t; atoll preserves the full 64-bit value.
std::vector<int64_t> GetIntNumsFromStr(const std::string& str) {
  std::vector<int64_t> nums;
  std::string tmp_str = str;
  while (!tmp_str.empty()) {
    int64_t num = atoll(tmp_str.data());
    nums.push_back(num);
    size_t next_offset = tmp_str.find(",");
    if (next_offset == std::string::npos) {
      break;
    } else {
      tmp_str = tmp_str.substr(next_offset + 1);
    }
  }
  return nums;
}
// Parse a comma-separated floating-point list, e.g. "0.5,1.25,-3".
// Returns one entry per comma-separated field; an empty string yields an
// empty vector.
std::vector<double> GetDoubleNumsFromStr(const std::string& str) {
  std::vector<double> result;
  std::string rest = str;
  for (;;) {
    if (rest.empty()) {
      break;
    }
    result.push_back(atof(rest.data()));
    const size_t comma_pos = rest.find(",");
    if (comma_pos == std::string::npos) {
      break;
    }
    rest = rest.substr(comma_pos + 1);
  }
  return result;
}
// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up
//
// din: interleaved RGB float input, `size` pixels (3 * size floats).
// dout: planar output; channel c occupies [dout + c*size, dout + (c+1)*size).
// mean/scale: per-channel normalization, out = (in - mean[c]) / scale[c].
void neon_mean_scale(
    const float* din, float* dout, int size, float* mean, float* scale) {
  float32x4_t vmean0 = vdupq_n_f32(mean[0]);
  float32x4_t vmean1 = vdupq_n_f32(mean[1]);
  float32x4_t vmean2 = vdupq_n_f32(mean[2]);
  // Multiply by reciprocal instead of dividing per pixel.
  float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]);
  float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]);
  float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]);

  float* dout_c0 = dout;
  float* dout_c1 = dout + size;
  float* dout_c2 = dout + size * 2;

  // Vectorized main loop: 4 pixels (12 floats) per iteration; vld3q
  // de-interleaves the RGB triples into three registers.
  int i = 0;
  for (; i < size - 3; i += 4) {
    float32x4x3_t vin3 = vld3q_f32(din);
    float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
    float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
    float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
    float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
    float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
    float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
    vst1q_f32(dout_c0, vs0);
    vst1q_f32(dout_c1, vs1);
    vst1q_f32(dout_c2, vs2);
    din += 12;
    dout_c0 += 4;
    dout_c1 += 4;
    dout_c2 += 4;
  }
  // Scalar tail for the remaining (size % 4) pixels.
  // Bug fix: all three channel values were previously written through
  // dout_c0, scrambling the planar layout and overrunning channel 0's
  // region into channel 1.
  for (; i < size; i++) {
    *(dout_c0++) = (*(din++) - mean[0]) / scale[0];
    *(dout_c1++) = (*(din++) - mean[1]) / scale[1];
    *(dout_c2++) = (*(din++) - mean[2]) / scale[2];
  }
}
// Process img and set it as input
// Converts BGR -> RGB, resizes to (width, height), scales pixel values to
// [0, 1], then normalizes with means/scales while transposing NHWC -> NCHW
// into dest_data (which must hold 3 * width * height floats).
void process_img(const cv::Mat& img,
                 int width,
                 int height,
                 float* dest_data,
                 float* means,
                 float* scales) {
  cv::Mat rgb_img;
  // OpenCV loads images as BGR; the model expects RGB.
  cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB);
  cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f);
  cv::Mat imgf;
  // Convert uint8 [0,255] to float [0,1].
  rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f);
  const float* dimg = reinterpret_cast<const float*>(imgf.data);
  neon_mean_scale(dimg, dest_data, width * height, means, scales);
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#include <string>
#include <vector>
#include "opencv2/core.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"

// Returns the current wall-clock time in microseconds.
double GetCurrentUS();

// Product of all dims in `shape`; returns 1 for an empty shape.
int64_t ShapeProduction(const std::vector<int64_t>& shape);

// Parse a comma-separated integer list, e.g. "1,3,224,224".
std::vector<int64_t> GetIntNumsFromStr(const std::string& str);

// Parse a comma-separated floating-point list, e.g. "0.5,1.25".
std::vector<double> GetDoubleNumsFromStr(const std::string& str);

// Normalize ((x - mean[c]) / scale[c]) and transpose NHWC -> NCHW,
// NEON-accelerated; `size` is the pixel count per channel.
void neon_mean_scale(
    const float* din, float* dout, int size, float* mean, float* scale);

// BGR -> RGB, resize to (width, height), scale to [0,1], then normalize into
// dst_data laid out as NCHW (3 * width * height floats).
void process_img(const cv::Mat& img,
                 int width,
                 int height,
                 float* dst_data,
                 float* means,
                 float* scales);
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册