未验证 提交 e7d31e1f 编写于 作者: S Santa An 提交者: GitHub

Merge branch 'develop' into baolei/bitmain

...@@ -63,6 +63,16 @@ test/models/ ...@@ -63,6 +63,16 @@ test/models/
test/images/ test/images/
*.pyc
# model
*.nb
*.svg
*.dot
# vim intermediate files
*.swp
# Emacs intermediate files # Emacs intermediate files
*~ *~
......
...@@ -45,7 +45,7 @@ else() ...@@ -45,7 +45,7 @@ else()
# we changed the source code to adapt for windows compiling # we changed the source code to adapt for windows compiling
# git diffs : (1) unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h # git diffs : (1) unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
###################################################################################################### ######################################################################################################
URL https://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip URL http://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip
DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR} DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR}
DOWNLOAD_NO_PROGRESS 1 DOWNLOAD_NO_PROGRESS 1
PREFIX ${EIGEN_SOURCE_DIR} PREFIX ${EIGEN_SOURCE_DIR}
......
...@@ -48,7 +48,7 @@ cuda的编译结果位于 `build_cuda/inference_lite_lib` ...@@ -48,7 +48,7 @@ cuda的编译结果位于 `build_cuda/inference_lite_lib`
4、 `demo` 文件夹:c++ demo. 4、 `demo` 文件夹:c++ demo.
如果编译打开了python选项,则会在 `build_cuda/inference_lite_lib/python/lib/` 目录下生成 `lite_core.so` 如果编译打开了python选项,则会在 `build_cuda/inference_lite_lib/python/lib/` 目录下生成 `lite.so`
## 运行 ## 运行
...@@ -66,7 +66,7 @@ wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/kite.jpg ...@@ -66,7 +66,7 @@ wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/kite.jpg
二: 运行 二: 运行
**NOTE:**此处示例使用的是python接口。 **NOTE:** 此处示例使用的是python接口。
``` python ``` python
#-*- coding: utf-8 -*- #-*- coding: utf-8 -*-
...@@ -75,7 +75,7 @@ import sys ...@@ -75,7 +75,7 @@ import sys
import numpy as np import numpy as np
import cv2 import cv2
sys.path.append('build_cuda/inference_lite_lib/python/lib') sys.path.append('build_cuda/inference_lite_lib/python/lib')
from lite_core import * from lite import *
def read_img(im_path, resize_h, resize_w): def read_img(im_path, resize_h, resize_w):
im = cv2.imread(im_path).astype('float32') im = cv2.imread(im_path).astype('float32')
......
...@@ -369,6 +369,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) ...@@ -369,6 +369,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mask_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mask_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mask_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mask_detection/Makefile" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mask_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mask_detection/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_libs" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_libs/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_libs/Makefile"
) )
add_dependencies(publish_inference_android_cxx_demos logging gflags) add_dependencies(publish_inference_android_cxx_demos logging gflags)
add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos) add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)
......
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR LITE_SHUTDOWN_LOG)
lite_cc_library(place SRCS paddle_place.cc DEPS logging) lite_cc_library(place SRCS paddle_place.cc DEPS logging)
else() else()
lite_cc_library(place SRCS paddle_place.cc DEPS glog) lite_cc_library(place SRCS paddle_place.cc DEPS glog)
......
...@@ -151,6 +151,11 @@ std::vector<std::string> Predictor::GetInputNames() { return input_names_; } ...@@ -151,6 +151,11 @@ std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
// get outputnames // get outputnames
std::vector<std::string> Predictor::GetOutputNames() { return output_names_; } std::vector<std::string> Predictor::GetOutputNames() { return output_names_; }
// get param names
std::vector<std::string> Predictor::GetParamNames() {
return exec_scope_->AttributeVarNames();
}
// append the names of inputs and outputs into input_names_ and output_names_ // append the names of inputs and outputs into input_names_ and output_names_
void Predictor::PrepareFeedFetch() { void Predictor::PrepareFeedFetch() {
if (!program_) { if (!program_) {
...@@ -293,6 +298,7 @@ void Predictor::Build(const cpp::ProgramDesc &desc, ...@@ -293,6 +298,7 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
// `inner_places` is used to optimize passes // `inner_places` is used to optimize passes
std::vector<Place> inner_places = valid_places; std::vector<Place> inner_places = valid_places;
for (auto &valid_place : valid_places) { for (auto &valid_place : valid_places) {
if (valid_place.target == TARGET(kOpenCL)) continue;
inner_places.emplace_back( inner_places.emplace_back(
Place(TARGET(kHost), valid_place.precision, valid_place.layout)); Place(TARGET(kHost), valid_place.precision, valid_place.layout));
} }
...@@ -345,9 +351,16 @@ void Predictor::GenRuntimeProgram() { ...@@ -345,9 +351,16 @@ void Predictor::GenRuntimeProgram() {
const lite::Tensor *Predictor::GetTensor(const std::string &name) const { const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
auto *var = exec_scope_->FindVar(name); auto *var = exec_scope_->FindVar(name);
CHECK(var) << "no variable named with " << name << " in exec_scope";
return &var->Get<lite::Tensor>(); return &var->Get<lite::Tensor>();
} }
lite::Tensor *Predictor::GetMutableTensor(const std::string &name) {
auto *var = exec_scope_->FindVar(name);
CHECK(var) << "no variable named with " << name << " in exec_scope";
return var->GetMutable<lite::Tensor>();
}
// get input by name // get input by name
lite::Tensor *Predictor::GetInputByName(const std::string &name) { lite::Tensor *Predictor::GetInputByName(const std::string &name) {
auto element = std::find(input_names_.begin(), input_names_.end(), name); auto element = std::find(input_names_.begin(), input_names_.end(), name);
......
...@@ -85,6 +85,9 @@ class LITE_API Predictor { ...@@ -85,6 +85,9 @@ class LITE_API Predictor {
// get inputnames and get outputnames. // get inputnames and get outputnames.
std::vector<std::string> GetInputNames(); std::vector<std::string> GetInputNames();
std::vector<std::string> GetOutputNames(); std::vector<std::string> GetOutputNames();
// get param names
std::vector<std::string> GetParamNames();
void PrepareFeedFetch(); void PrepareFeedFetch();
// Get offset-th col of fetch results. // Get offset-th col of fetch results.
...@@ -92,6 +95,9 @@ class LITE_API Predictor { ...@@ -92,6 +95,9 @@ class LITE_API Predictor {
std::vector<const lite::Tensor*> GetOutputs() const; std::vector<const lite::Tensor*> GetOutputs() const;
const cpp::ProgramDesc& program_desc() const; const cpp::ProgramDesc& program_desc() const;
// get a mutable tensor according to its name
lite::Tensor* GetMutableTensor(const std::string& name);
// get a const tensor according to its name
const lite::Tensor* GetTensor(const std::string& name) const; const lite::Tensor* GetTensor(const std::string& name) const;
const RuntimeProgram& runtime_program() const; const RuntimeProgram& runtime_program() const;
...@@ -142,9 +148,15 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor { ...@@ -142,9 +148,15 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
// get inputs names and get outputs names // get inputs names and get outputs names
std::vector<std::string> GetInputNames() override; std::vector<std::string> GetInputNames() override;
std::vector<std::string> GetOutputNames() override; std::vector<std::string> GetOutputNames() override;
// get param names
std::vector<std::string> GetParamNames() override;
// get tensor according to tensor's name
std::unique_ptr<const lite_api::Tensor> GetTensor( std::unique_ptr<const lite_api::Tensor> GetTensor(
const std::string& name) const override; const std::string& name) const override;
// get a mutable tensor according to tensor's name
std::unique_ptr<lite_api::Tensor> GetMutableTensor(
const std::string& name) override;
// Get InputTebsor by name // Get InputTebsor by name
std::unique_ptr<lite_api::Tensor> GetInputByName( std::unique_ptr<lite_api::Tensor> GetInputByName(
......
...@@ -97,6 +97,10 @@ std::vector<std::string> CxxPaddleApiImpl::GetInputNames() { ...@@ -97,6 +97,10 @@ std::vector<std::string> CxxPaddleApiImpl::GetInputNames() {
return raw_predictor_.GetInputNames(); return raw_predictor_.GetInputNames();
} }
std::vector<std::string> CxxPaddleApiImpl::GetParamNames() {
return raw_predictor_.GetParamNames();
}
std::vector<std::string> CxxPaddleApiImpl::GetOutputNames() { std::vector<std::string> CxxPaddleApiImpl::GetOutputNames() {
return raw_predictor_.GetOutputNames(); return raw_predictor_.GetOutputNames();
} }
...@@ -123,6 +127,12 @@ std::unique_ptr<const lite_api::Tensor> CxxPaddleApiImpl::GetTensor( ...@@ -123,6 +127,12 @@ std::unique_ptr<const lite_api::Tensor> CxxPaddleApiImpl::GetTensor(
return std::unique_ptr<const lite_api::Tensor>(new lite_api::Tensor(x)); return std::unique_ptr<const lite_api::Tensor>(new lite_api::Tensor(x));
} }
std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetMutableTensor(
const std::string &name) {
return std::unique_ptr<lite_api::Tensor>(
new lite_api::Tensor(raw_predictor_.GetMutableTensor(name)));
}
std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetInputByName( std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetInputByName(
const std::string &name) { const std::string &name) {
return std::unique_ptr<lite_api::Tensor>( return std::unique_ptr<lite_api::Tensor>(
......
...@@ -36,7 +36,7 @@ DEFINE_string(model_dir_0, "", "model_dir_0"); ...@@ -36,7 +36,7 @@ DEFINE_string(model_dir_0, "", "model_dir_0");
DEFINE_string(input_shape_0, DEFINE_string(input_shape_0,
"1,3,224,224", "1,3,224,224",
"input shapes another, separated by colon and comma"); "input shapes another, separated by colon and comma");
DEFINE_string(target, "arm", "main target for Predictor: arm, opencl");
DEFINE_bool(use_optimize_nb, DEFINE_bool(use_optimize_nb,
false, false,
"optimized & naive buffer model for mobile devices"); "optimized & naive buffer model for mobile devices");
...@@ -51,9 +51,19 @@ void OutputOptModel(const std::string& load_model_dir, ...@@ -51,9 +51,19 @@ void OutputOptModel(const std::string& load_model_dir,
const std::vector<std::vector<int64_t>>& input_shapes) { const std::vector<std::vector<int64_t>>& input_shapes) {
lite_api::CxxConfig config; lite_api::CxxConfig config;
config.set_model_dir(load_model_dir); config.set_model_dir(load_model_dir);
config.set_valid_places({ if (FLAGS_target == "arm") {
Place{TARGET(kARM), PRECISION(kFloat)}, config.set_valid_places({
}); Place{TARGET(kARM), PRECISION(kFloat)},
});
} else if (FLAGS_target == "opencl") {
config.set_valid_places({
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
Place{TARGET(kARM)}, // enable kARM CPU kernel when no opencl kernel
});
}
auto predictor = lite_api::CreatePaddlePredictor(config); auto predictor = lite_api::CreatePaddlePredictor(config);
// delete old optimized model // delete old optimized model
...@@ -78,7 +88,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes, ...@@ -78,7 +88,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
int tid, int tid,
const int warmup_times = 5) { const int warmup_times = 5) {
lite_api::MobileConfig config; lite_api::MobileConfig config;
config.set_model_dir(model_dir); config.set_model_from_file(model_dir + ".nb");
config.set_power_mode(power_mode); config.set_power_mode(power_mode);
config.set_threads(thread_num); config.set_threads(thread_num);
...@@ -197,7 +207,7 @@ void RunTestType_10(const std::vector<std::vector<int64_t>>& input_shapes, ...@@ -197,7 +207,7 @@ void RunTestType_10(const std::vector<std::vector<int64_t>>& input_shapes,
const int repeat, const int repeat,
int warmup = 5) { int warmup = 5) {
lite_api::MobileConfig config; lite_api::MobileConfig config;
config.set_model_dir(model_dir); config.set_model_from_file(model_dir + ".nb");
config.set_power_mode(power_mode); config.set_power_mode(power_mode);
config.set_threads(thread_num); config.set_threads(thread_num);
...@@ -218,13 +228,13 @@ void RunTestType_11(const std::vector<std::vector<int64_t>>& input_shapes, ...@@ -218,13 +228,13 @@ void RunTestType_11(const std::vector<std::vector<int64_t>>& input_shapes,
const int repeat, const int repeat,
int warmup = 5) { int warmup = 5) {
lite_api::MobileConfig config; lite_api::MobileConfig config;
config.set_model_dir(model_dir); config.set_model_from_file(model_dir + ".nb");
config.set_power_mode(power_mode); config.set_power_mode(power_mode);
config.set_threads(thread_num); config.set_threads(thread_num);
auto predictor = lite_api::CreatePaddlePredictor(config); auto predictor = lite_api::CreatePaddlePredictor(config);
config.set_model_dir(model_dir_0); config.set_model_from_file(model_dir_0 + ".nb");
auto predictor_0 = lite_api::CreatePaddlePredictor(config); auto predictor_0 = lite_api::CreatePaddlePredictor(config);
for (int i = 0; i < 2 * repeat; i += 2) { for (int i = 0; i < 2 * repeat; i += 2) {
...@@ -246,7 +256,8 @@ int main(int argc, char** argv) { ...@@ -246,7 +256,8 @@ int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true); gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir == "") { if (FLAGS_model_dir == "") {
LOG(INFO) << "usage: " LOG(INFO) << "usage: "
<< "--model_dir /path/to/your/model"; << "--model_dir /path/to/your/model --model_dir_0 "
"/path/to/your/model0 --target `arm` or `opencl`";
exit(0); exit(0);
} }
std::string save_optimized_model_dir = ""; std::string save_optimized_model_dir = "";
......
...@@ -55,7 +55,7 @@ DEFINE_string(model_file, "", "model file path of the combined-param model"); ...@@ -55,7 +55,7 @@ DEFINE_string(model_file, "", "model file path of the combined-param model");
DEFINE_string(param_file, "", "param file path of the combined-param model"); DEFINE_string(param_file, "", "param file path of the combined-param model");
DEFINE_string( DEFINE_string(
optimize_out_type, optimize_out_type,
"protobuf", "naive_buffer",
"store type of the output optimized model. protobuf/naive_buffer"); "store type of the output optimized model. protobuf/naive_buffer");
DEFINE_bool(display_kernels, false, "Display kernel information"); DEFINE_bool(display_kernels, false, "Display kernel information");
DEFINE_bool(record_tailoring_info, DEFINE_bool(record_tailoring_info,
...@@ -207,7 +207,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) { ...@@ -207,7 +207,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
} }
std::cout << std::setiosflags(std::ios::internal); std::cout << std::setiosflags(std::ios::internal);
std::cout << std::setw(maximum_optype_length) << "OP_name"; std::cout << std::setw(maximum_optype_length) << "OP_name";
for (int i = 0; i < targets.size(); i++) { for (size_t i = 0; i < targets.size(); i++) {
std::cout << std::setw(10) << targets[i].substr(1); std::cout << std::setw(10) << targets[i].substr(1);
} }
std::cout << std::endl; std::cout << std::endl;
...@@ -215,7 +215,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) { ...@@ -215,7 +215,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) { for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) {
std::cout << std::setw(maximum_optype_length) << it->first; std::cout << std::setw(maximum_optype_length) << it->first;
auto ops_valid_places = it->second; auto ops_valid_places = it->second;
for (int i = 0; i < targets.size(); i++) { for (size_t i = 0; i < targets.size(); i++) {
if (std::find(ops_valid_places.begin(), if (std::find(ops_valid_places.begin(),
ops_valid_places.end(), ops_valid_places.end(),
targets[i]) != ops_valid_places.end()) { targets[i]) != ops_valid_places.end()) {
...@@ -235,7 +235,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) { ...@@ -235,7 +235,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
} }
// Print OP info. // Print OP info.
auto ops_valid_places = supported_ops.at(*op); auto ops_valid_places = supported_ops.at(*op);
for (int i = 0; i < targets.size(); i++) { for (size_t i = 0; i < targets.size(); i++) {
if (std::find(ops_valid_places.begin(), if (std::find(ops_valid_places.begin(),
ops_valid_places.end(), ops_valid_places.end(),
targets[i]) != ops_valid_places.end()) { targets[i]) != ops_valid_places.end()) {
...@@ -288,11 +288,11 @@ void ParseInputCommand() { ...@@ -288,11 +288,11 @@ void ParseInputCommand() {
auto valid_places = paddle::lite_api::ParserValidPlaces(); auto valid_places = paddle::lite_api::ParserValidPlaces();
// get valid_targets string // get valid_targets string
std::vector<TargetType> target_types = {}; std::vector<TargetType> target_types = {};
for (int i = 0; i < valid_places.size(); i++) { for (size_t i = 0; i < valid_places.size(); i++) {
target_types.push_back(valid_places[i].target); target_types.push_back(valid_places[i].target);
} }
std::string targets_str = TargetToStr(target_types[0]); std::string targets_str = TargetToStr(target_types[0]);
for (int i = 1; i < target_types.size(); i++) { for (size_t i = 1; i < target_types.size(); i++) {
targets_str = targets_str + TargetToStr(target_types[i]); targets_str = targets_str + TargetToStr(target_types[i]);
} }
...@@ -301,7 +301,7 @@ void ParseInputCommand() { ...@@ -301,7 +301,7 @@ void ParseInputCommand() {
target_types.push_back(TARGET(kUnk)); target_types.push_back(TARGET(kUnk));
std::set<std::string> valid_ops; std::set<std::string> valid_ops;
for (int i = 0; i < target_types.size(); i++) { for (size_t i = 0; i < target_types.size(); i++) {
auto ops = supported_ops_target[static_cast<int>(target_types[i])]; auto ops = supported_ops_target[static_cast<int>(target_types[i])];
valid_ops.insert(ops.begin(), ops.end()); valid_ops.insert(ops.begin(), ops.end());
} }
...@@ -318,7 +318,7 @@ void CheckIfModelSupported() { ...@@ -318,7 +318,7 @@ void CheckIfModelSupported() {
auto valid_unktype_ops = supported_ops_target[static_cast<int>(TARGET(kUnk))]; auto valid_unktype_ops = supported_ops_target[static_cast<int>(TARGET(kUnk))];
valid_ops.insert( valid_ops.insert(
valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end()); valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end());
for (int i = 0; i < valid_places.size(); i++) { for (size_t i = 0; i < valid_places.size(); i++) {
auto target = valid_places[i].target; auto target = valid_places[i].target;
auto ops = supported_ops_target[static_cast<int>(target)]; auto ops = supported_ops_target[static_cast<int>(target)];
valid_ops.insert(valid_ops.end(), ops.begin(), ops.end()); valid_ops.insert(valid_ops.end(), ops.begin(), ops.end());
...@@ -340,7 +340,7 @@ void CheckIfModelSupported() { ...@@ -340,7 +340,7 @@ void CheckIfModelSupported() {
std::set<std::string> unsupported_ops; std::set<std::string> unsupported_ops;
std::set<std::string> input_model_ops; std::set<std::string> input_model_ops;
for (int index = 0; index < cpp_prog.BlocksSize(); index++) { for (size_t index = 0; index < cpp_prog.BlocksSize(); index++) {
auto current_block = cpp_prog.GetBlock<lite::cpp::BlockDesc>(index); auto current_block = cpp_prog.GetBlock<lite::cpp::BlockDesc>(index);
for (size_t i = 0; i < current_block->OpsSize(); ++i) { for (size_t i = 0; i < current_block->OpsSize(); ++i) {
auto& op_desc = *current_block->GetOp<lite::cpp::OpDesc>(i); auto& op_desc = *current_block->GetOp<lite::cpp::OpDesc>(i);
...@@ -364,13 +364,13 @@ void CheckIfModelSupported() { ...@@ -364,13 +364,13 @@ void CheckIfModelSupported() {
unsupported_ops_str = unsupported_ops_str + ", " + *op_str; unsupported_ops_str = unsupported_ops_str + ", " + *op_str;
} }
std::vector<TargetType> targets = {}; std::vector<TargetType> targets = {};
for (int i = 0; i < valid_places.size(); i++) { for (size_t i = 0; i < valid_places.size(); i++) {
targets.push_back(valid_places[i].target); targets.push_back(valid_places[i].target);
} }
std::sort(targets.begin(), targets.end()); std::sort(targets.begin(), targets.end());
targets.erase(unique(targets.begin(), targets.end()), targets.end()); targets.erase(unique(targets.begin(), targets.end()), targets.end());
std::string targets_str = TargetToStr(targets[0]); std::string targets_str = TargetToStr(targets[0]);
for (int i = 1; i < targets.size(); i++) { for (size_t i = 1; i < targets.size(); i++) {
targets_str = targets_str + "," + TargetToStr(targets[i]); targets_str = targets_str + "," + TargetToStr(targets[i]);
} }
......
...@@ -82,27 +82,56 @@ void OptBase::SetValidPlaces(const std::string& valid_places) { ...@@ -82,27 +82,56 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
"command argument 'valid_targets'"; "command argument 'valid_targets'";
} }
void OptBase::SetOptimizeOut(const std::string& optimized_out_path) { void OptBase::SetLiteOut(const std::string& lite_out_name) {
optimize_out_path_ = optimized_out_path; lite_out_name_ = lite_out_name;
} }
void OptBase::RunOptimize(bool record_strip_info) { void OptBase::RecordModelInfo(bool record_strip_info) {
record_strip_info_ = record_strip_info;
}
void OptBase::Run() {
CheckIfModelSupported(false); CheckIfModelSupported(false);
OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
opt_config_.set_valid_places(valid_places_); opt_config_.set_valid_places(valid_places_);
if (model_set_dir_ != "") { if (model_set_dir_ != "") {
RunOptimizeFromModelSet(record_strip_info); RunOptimizeFromModelSet(record_strip_info_);
} else { } else {
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel( opt_predictor->SaveOptimizedModel(
optimize_out_path_, model_type_, record_strip_info); lite_out_name_, model_type_, record_strip_info_);
auto resulted_model_name = auto resulted_model_name =
record_strip_info ? "information of striped model" : "optimized model"; record_strip_info_ ? "information of striped model" : "optimized model";
std::cout << "Save the " << resulted_model_name std::cout << "Save the " << resulted_model_name
<< " into :" << optimize_out_path_ << "successfully"; << " into :" << lite_out_name_ << "successfully";
} }
} }
void OptBase::RunOptimize(const std::string& model_dir_path,
const std::string& model_path,
const std::string& param_path,
const std::string& valid_places,
const std::string& optimized_out_path) {
SetModelDir(model_dir_path);
SetModelFile(model_path);
SetParamFile(param_path);
SetValidPlaces(valid_places);
SetLiteOut(optimized_out_path);
CheckIfModelSupported(false);
OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
opt_config_.set_valid_places(valid_places_);
if (model_set_dir_ != "") {
RunOptimizeFromModelSet(record_strip_info_);
} else {
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel(
lite_out_name_, model_type_, record_strip_info_);
auto resulted_model_name =
record_strip_info_ ? "information of striped model" : "optimized model";
std::cout << "Save the " << resulted_model_name
<< " into :" << lite_out_name_ << "successfully";
}
}
// collect ops info of modelset // collect ops info of modelset
void CollectModelMetaInfo(const std::string& output_dir, void CollectModelMetaInfo(const std::string& output_dir,
const std::vector<std::string>& models, const std::vector<std::string>& models,
...@@ -125,7 +154,7 @@ void OptBase::SetModelSetDir(const std::string& model_set_path) { ...@@ -125,7 +154,7 @@ void OptBase::SetModelSetDir(const std::string& model_set_path) {
} }
void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
// 1. mkdir of outputed optimized model set. // 1. mkdir of outputed optimized model set.
lite::MkDirRecur(optimize_out_path_); lite::MkDirRecur(lite_out_name_);
auto model_dirs = lite::ListDir(model_set_dir_, true); auto model_dirs = lite::ListDir(model_set_dir_, true);
if (model_dirs.size() == 0) { if (model_dirs.size() == 0) {
LOG(FATAL) << "[" << model_set_dir_ << "] does not contain any model"; LOG(FATAL) << "[" << model_set_dir_ << "] does not contain any model";
...@@ -138,7 +167,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { ...@@ -138,7 +167,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
std::string input_model_dir = std::string input_model_dir =
lite::Join<std::string>({model_set_dir_, name}, "/"); lite::Join<std::string>({model_set_dir_, name}, "/");
std::string output_model_dir = std::string output_model_dir =
lite::Join<std::string>({optimize_out_path_, name}, "/"); lite::Join<std::string>({lite_out_name_, name}, "/");
if (opt_config_.model_file() != "" && opt_config_.param_file() != "") { if (opt_config_.model_file() != "" && opt_config_.param_file() != "") {
auto model_file_path = auto model_file_path =
...@@ -155,7 +184,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { ...@@ -155,7 +184,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel( opt_predictor->SaveOptimizedModel(
optimize_out_path_, model_type_, record_strip_info); lite_out_name_, model_type_, record_strip_info);
std::cout << "Optimize done. "; std::cout << "Optimize done. ";
} }
...@@ -164,46 +193,60 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { ...@@ -164,46 +193,60 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
if (record_strip_info) { if (record_strip_info) {
// Collect all models information // Collect all models information
CollectModelMetaInfo( CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME); lite_out_name_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(
lite_out_name_, model_dirs, lite::TAILORD_OPS_LIST_NAME);
CollectModelMetaInfo( CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_OPS_LIST_NAME); lite_out_name_, model_dirs, lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(optimize_out_path_,
model_dirs,
lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo( CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME); lite_out_name_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME);
std::cout << "Record the information of stripped models into :" std::cout << "Record the information of stripped models into :"
<< optimize_out_path_ << "successfully"; << lite_out_name_ << "successfully";
} }
} }
void OptBase::PrintHelpInfo() { void OptBase::PrintHelpInfo() {
const std::string opt_version = lite::version(); const std::string opt_version = lite::version();
const char help_info[] = const char help_info[] =
"At least one argument should be inputed. Valid arguments are listed " "------------------------------------------------------------------------"
"below:\n" "-----------------------------------------------------------\n"
" Valid arguments of Paddle-Lite opt are listed below:\n"
"------------------------------------------------------------------------"
"-----------------------------------------------------------\n"
" Arguments of help information:\n" " Arguments of help information:\n"
" `help()` Print help infomation\n" " `help()` Print help infomation\n"
" Arguments of model optimization:\n" "\n"
" Arguments of model transformation:\n"
" `set_model_dir(model_dir)`\n" " `set_model_dir(model_dir)`\n"
" `set_model_file(model_file_path)`\n" " `set_model_file(model_file_path)`\n"
" `set_param_file(param_file_path)`\n" " `set_param_file(param_file_path)`\n"
" `set_model_type(protobuf|naive_buffer)`\n" " `set_model_type(protobuf|naive_buffer)`: naive_buffer by "
" `set_optimize_out(output_optimize_model_dir)`\n" "default\n"
" `set_lite_out(output_optimize_model_dir)`\n"
" `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n" " `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" `run_optimize(false|true)`\n" " `record_model_info(false|true)`: refer to whether to record ops "
" ` ----fasle&true refer to whether to record ops info for " "info for striping lib, false by default`\n"
"tailoring lib, false by default`\n" " `run() : start model transformation`\n"
" Arguments of model checking and ops information:\n" " eg. `opt.set_model_dir(\"./mobilenetv1\"); "
"opt.set_lite_out(\"mobilenetv1_opt\"); opt.set_valid_places(\"arm\"); "
"opt.run();`\n"
"\n"
" You can also transform model through a single input argument:\n"
" `run_optimize(model_dir, model_file_path, param_file_path, "
"model_type, valid_places, lite_out_name) `\n"
" eg. `opt.run_optimize(\"./mobilenetv1\", \"\", \"\", "
"\"naive_buffer\", \"arm\", \"mobilenetv1_opt\");`"
"\n"
" Arguments of checking model and printing ops information:\n"
" `print_all_ops()` Display all the valid operators of " " `print_all_ops()` Display all the valid operators of "
"Paddle-Lite\n" "Paddle-Lite\n"
" `print_supported_ops` Display supported operators of valid " " `print_supported_ops` Display supported operators of valid "
"places\n" "places\n"
" `check_if_model_supported()` Check if the input model is " " `check_if_model_supported()` Check if the input model is "
"supported\n"; "supported\n"
"------------------------------------------------------------------------"
std::cout << "opt version:" << opt_version << std::endl "-----------------------------------------------------------\n";
<< help_info << std::endl; std::cout << "opt version:" << opt_version << std::endl << help_info;
} }
// 2. Print supported info of inputed ops // 2. Print supported info of inputed ops
void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) { void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
......
...@@ -44,16 +44,21 @@ class LITE_API OptBase { ...@@ -44,16 +44,21 @@ class LITE_API OptBase {
public: public:
OptBase() = default; OptBase() = default;
void SetModelSetDir(const std::string &model_set_path); void SetModelSetDir(const std::string &model_set_path);
void SetModelDir(const std::string &model_path); void SetModelDir(const std::string &model_dir_path);
void SetModelFile(const std::string &model_path); void SetModelFile(const std::string &model_path);
void SetParamFile(const std::string &param_path); void SetParamFile(const std::string &param_path);
void SetValidPlaces(const std::string &valid_places); void SetValidPlaces(const std::string &valid_places);
void SetOptimizeOut(const std::string &optimized_out_path); void SetLiteOut(const std::string &lite_out_name);
void RecordModelInfo(bool record_strip_info = true);
// set optimized_model type // set optimized_model type
void SetModelType(std::string model_type); void SetModelType(std::string model_type);
// transform and save the optimized model // transform and save the optimized model
void RunOptimize(bool record_strip_info = false); void Run();
void RunOptimize(const std::string &model_dir_path = "",
const std::string &model_path = "",
const std::string &param_path = "",
const std::string &valid_places = "",
const std::string &optimized_out_path = "");
// fuctions of printing info // fuctions of printing info
// 1. help info // 1. help info
void PrintHelpInfo(); void PrintHelpInfo();
...@@ -71,12 +76,12 @@ class LITE_API OptBase { ...@@ -71,12 +76,12 @@ class LITE_API OptBase {
// valid places for the optimized_model // valid places for the optimized_model
std::vector<Place> valid_places_; std::vector<Place> valid_places_;
// filename of the optimized_model // filename of the optimized_model
std::string optimize_out_path_; std::string lite_out_name_;
// type of the optimized_model, kNaiveBuffer default. // type of the optimized_model, kNaiveBuffer default.
LiteModelType model_type_{LiteModelType::kNaiveBuffer}; LiteModelType model_type_{LiteModelType::kNaiveBuffer};
// Dir path of a set of models, this should be combined with model // Dir path of a set of models, this should be combined with model
std::string model_set_dir_; std::string model_set_dir_;
bool record_strip_info_{false};
void RunOptimizeFromModelSet(bool record_strip_info = false); void RunOptimizeFromModelSet(bool record_strip_info = false);
}; };
......
...@@ -167,6 +167,20 @@ lod_t Tensor::lod() const { return ctensor(raw_tensor_)->lod(); } ...@@ -167,6 +167,20 @@ lod_t Tensor::lod() const { return ctensor(raw_tensor_)->lod(); }
void Tensor::SetLoD(const lod_t &lod) { tensor(raw_tensor_)->set_lod(lod); } void Tensor::SetLoD(const lod_t &lod) { tensor(raw_tensor_)->set_lod(lod); }
std::unique_ptr<Tensor> PaddlePredictor::GetMutableTensor(
const std::string &name) {
LOG(FATAL)
<< "The GetMutableTensor API is only supported by CxxConfig predictor.";
return nullptr;
}
std::vector<std::string> PaddlePredictor::GetParamNames() {
std::vector<std::string> null_result = {};
LOG(FATAL)
<< "The GetParamNames API is only supported by CxxConfig predictor.";
return null_result;
}
void PaddlePredictor::SaveOptimizedModel(const std::string &model_dir, void PaddlePredictor::SaveOptimizedModel(const std::string &model_dir,
LiteModelType model_type, LiteModelType model_type,
bool record_info) { bool record_info) {
......
...@@ -86,6 +86,8 @@ class LITE_API PaddlePredictor { ...@@ -86,6 +86,8 @@ class LITE_API PaddlePredictor {
virtual std::vector<std::string> GetInputNames() = 0; virtual std::vector<std::string> GetInputNames() = 0;
// Get output names // Get output names
virtual std::vector<std::string> GetOutputNames() = 0; virtual std::vector<std::string> GetOutputNames() = 0;
// Get output names
virtual std::vector<std::string> GetParamNames();
// Get Input by name // Get Input by name
virtual std::unique_ptr<Tensor> GetInputByName(const std::string& name) = 0; virtual std::unique_ptr<Tensor> GetInputByName(const std::string& name) = 0;
...@@ -93,6 +95,9 @@ class LITE_API PaddlePredictor { ...@@ -93,6 +95,9 @@ class LITE_API PaddlePredictor {
/// Get a readonly tensor, return null if no one called `name` exists. /// Get a readonly tensor, return null if no one called `name` exists.
virtual std::unique_ptr<const Tensor> GetTensor( virtual std::unique_ptr<const Tensor> GetTensor(
const std::string& name) const = 0; const std::string& name) const = 0;
/// Get a mutable tensor, return null if on one called `name` exists
/// internal infereces API, not recommanded.
virtual std::unique_ptr<Tensor> GetMutableTensor(const std::string& name);
/// Persist the optimized model to disk. This API is only supported by /// Persist the optimized model to disk. This API is only supported by
/// CxxConfig, and the persisted model can be reused for MobileConfig. /// CxxConfig, and the persisted model can be reused for MobileConfig.
...@@ -176,7 +181,7 @@ class LITE_API CxxConfig : public ConfigBase { ...@@ -176,7 +181,7 @@ class LITE_API CxxConfig : public ConfigBase {
#endif #endif
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; } void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; }
int multi_stream() const { return multi_stream_; } bool multi_stream() const { return multi_stream_; }
#endif #endif
#ifdef LITE_WITH_MLU #ifdef LITE_WITH_MLU
...@@ -208,6 +213,8 @@ class LITE_API CxxConfig : public ConfigBase { ...@@ -208,6 +213,8 @@ class LITE_API CxxConfig : public ConfigBase {
// current thread. // current thread.
void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00); void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00);
// XPU only, specify the target device ID for the current thread. // XPU only, specify the target device ID for the current thread.
// **DEPRECATED**, use xpu_set_device() at the very beginning of each worker
// thread
void set_xpu_dev_per_thread(int dev_no = 0); void set_xpu_dev_per_thread(int dev_no = 0);
}; };
......
...@@ -19,7 +19,13 @@ ...@@ -19,7 +19,13 @@
#pragma once #pragma once
// some platform-independent defintion // some platform-independent defintion
#include "lite/utils/macros.h"
#if defined(_WIN32)
#define UNUSED
#define __builtin_expect(EXP, C) (EXP)
#else
#define UNUSED __attribute__((unused))
#endif
#define USE_LITE_OP(op_type__) \ #define USE_LITE_OP(op_type__) \
extern int touch_op_##op_type__(); \ extern int touch_op_##op_type__(); \
......
...@@ -33,6 +33,7 @@ USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass); ...@@ -33,6 +33,7 @@ USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass);
USE_MIR_PASS(lite_interpolate_fuse_pass); USE_MIR_PASS(lite_interpolate_fuse_pass);
USE_MIR_PASS(lite_sequence_pool_concat_fuse_pass); USE_MIR_PASS(lite_sequence_pool_concat_fuse_pass);
USE_MIR_PASS(identity_scale_eliminate_pass); USE_MIR_PASS(identity_scale_eliminate_pass);
USE_MIR_PASS(identity_dropout_eliminate_pass);
USE_MIR_PASS(lite_conv_elementwise_fuse_pass); USE_MIR_PASS(lite_conv_elementwise_fuse_pass);
USE_MIR_PASS(lite_conv_activation_fuse_pass); USE_MIR_PASS(lite_conv_activation_fuse_pass);
USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass); USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass);
...@@ -51,5 +52,8 @@ USE_MIR_PASS(mlu_postprocess_pass); ...@@ -51,5 +52,8 @@ USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass); USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(apu_subgraph_pass); USE_MIR_PASS(apu_subgraph_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(lite_scale_activation_fuse_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass); USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass);
USE_MIR_PASS(__xpu__fc_fuse_pass);
...@@ -62,8 +62,10 @@ void BindLiteOpt(py::module *m) { ...@@ -62,8 +62,10 @@ void BindLiteOpt(py::module *m) {
.def("set_model_file", &OptBase::SetModelFile) .def("set_model_file", &OptBase::SetModelFile)
.def("set_param_file", &OptBase::SetParamFile) .def("set_param_file", &OptBase::SetParamFile)
.def("set_valid_places", &OptBase::SetValidPlaces) .def("set_valid_places", &OptBase::SetValidPlaces)
.def("set_optimize_out", &OptBase::SetOptimizeOut) .def("set_lite_out", &OptBase::SetLiteOut)
.def("set_model_type", &OptBase::SetModelType) .def("set_model_type", &OptBase::SetModelType)
.def("record_model_info", &OptBase::RecordModelInfo)
.def("run", &OptBase::Run)
.def("run_optimize", &OptBase::RunOptimize) .def("run_optimize", &OptBase::RunOptimize)
.def("help", &OptBase::PrintHelpInfo) .def("help", &OptBase::PrintHelpInfo)
.def("print_supported_ops", &OptBase::PrintSupportedOps) .def("print_supported_ops", &OptBase::PrintSupportedOps)
......
...@@ -50,7 +50,7 @@ if '${WITH_MKL}' == 'ON': ...@@ -50,7 +50,7 @@ if '${WITH_MKL}' == 'ON':
# link lite.so to paddlelite.libs # link lite.so to paddlelite.libs
if os.name != 'nt': if os.name != 'nt':
COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\ COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\
/inference_lite_lib/python/install/lite/lite.so" /inference_lite_lib/python/install/lite/lite.so"
if os.system(COMMAND) != 0: if os.system(COMMAND) != 0:
raise Exception("patch third_party libs failed, command: %s" % COMMAND) raise Exception("patch third_party libs failed, command: %s" % COMMAND)
......
...@@ -80,8 +80,10 @@ void conv_compute_6x6_3x3(const float* input, ...@@ -80,8 +80,10 @@ void conv_compute_6x6_3x3(const float* input,
const operators::ConvParam& param, const operators::ConvParam& param,
ARMContext* ctx) { ARMContext* ctx) {
auto act_param = param.activation_param; auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0]; const int pad_h0 = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2]; const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space = float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float); ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
...@@ -96,8 +98,8 @@ void conv_compute_6x6_3x3(const float* input, ...@@ -96,8 +98,8 @@ void conv_compute_6x6_3x3(const float* input,
int tile_h = (hout + 5) / 6; int tile_h = (hout + 5) / 6;
int size_tile = tile_h * tile_w; int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2; int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h * 2; int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad; const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT float zero_ptr[zero_len]; // NOLINT
...@@ -127,10 +129,10 @@ void conv_compute_6x6_3x3(const float* input, ...@@ -127,10 +129,10 @@ void conv_compute_6x6_3x3(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride, prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride, input_c4 + i * new_c_stride,
i * 4, i * 4,
-pad_h, -pad_h0,
hin + pad_h, hin + pad_h1,
-pad_w, -pad_w0,
win + pad_w, win + pad_w1,
chin, chin,
win, win,
hin, hin,
...@@ -367,8 +369,10 @@ void conv_compute_2x2_3x3(const float* input, ...@@ -367,8 +369,10 @@ void conv_compute_2x2_3x3(const float* input,
const operators::ConvParam& param, const operators::ConvParam& param,
ARMContext* ctx) { ARMContext* ctx) {
auto act_param = param.activation_param; auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0]; const int pad_h0 = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2]; const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space = float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float); ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
...@@ -383,8 +387,8 @@ void conv_compute_2x2_3x3(const float* input, ...@@ -383,8 +387,8 @@ void conv_compute_2x2_3x3(const float* input,
int tile_h = (hout + 1) / 2; int tile_h = (hout + 1) / 2;
int size_tile = tile_h * tile_w; int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2; int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h * 2; int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad; const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT float zero_ptr[zero_len]; // NOLINT
...@@ -414,10 +418,10 @@ void conv_compute_2x2_3x3(const float* input, ...@@ -414,10 +418,10 @@ void conv_compute_2x2_3x3(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride, prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride, input_c4 + i * new_c_stride,
i * 4, i * 4,
-pad_h, -pad_h0,
hin + pad_h, hin + pad_h1,
-pad_w, -pad_w0,
win + pad_w, win + pad_w1,
chin, chin,
win, win,
hin, hin,
...@@ -628,8 +632,10 @@ void conv_compute_2x2_3x3_small(const float* input, ...@@ -628,8 +632,10 @@ void conv_compute_2x2_3x3_small(const float* input,
const operators::ConvParam& param, const operators::ConvParam& param,
ARMContext* ctx) { ARMContext* ctx) {
auto act_param = param.activation_param; auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0]; const int pad_h0 = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2]; const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space = float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float); ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
...@@ -644,8 +650,8 @@ void conv_compute_2x2_3x3_small(const float* input, ...@@ -644,8 +650,8 @@ void conv_compute_2x2_3x3_small(const float* input,
int tile_h = (hout + 1) / 2; int tile_h = (hout + 1) / 2;
int size_tile = tile_h * tile_w; int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2; int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h * 2; int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad; const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT float zero_ptr[zero_len]; // NOLINT
...@@ -676,10 +682,10 @@ void conv_compute_2x2_3x3_small(const float* input, ...@@ -676,10 +682,10 @@ void conv_compute_2x2_3x3_small(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride, prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride, input_c4 + i * new_c_stride,
i * 4, i * 4,
-pad_h, -pad_h0,
hin + pad_h, hin + pad_h1,
-pad_w, -pad_w0,
win + pad_w, win + pad_w1,
chin, chin,
win, win,
hin, hin,
......
...@@ -33,6 +33,7 @@ void add_bias_rowwise(Tensor* input, ...@@ -33,6 +33,7 @@ void add_bias_rowwise(Tensor* input,
for (int w = start_w; w < w_adds; ++w) { for (int w = start_w; w < w_adds; ++w) {
i_data[w] += b_data[w]; i_data[w] += b_data[w];
} }
i_data += width;
} }
} }
void vector_dot( void vector_dot(
...@@ -67,15 +68,8 @@ void vector_dot( ...@@ -67,15 +68,8 @@ void vector_dot(
for (int i = 0; i < remain; ++i) { for (int i = 0; i < remain; ++i) {
if (!v2) { if (!v2) {
out_ptr[i] = in_ptr[i] * v1_ptr[i]; out_ptr[i] = in_ptr[i] * v1_ptr[i];
++out_ptr;
++in_ptr;
++v1_ptr;
} else { } else {
out_ptr[i] = in_ptr[i] + v1_ptr[i] * v2_ptr[i]; out_ptr[i] = in_ptr[i] + v1_ptr[i] * v2_ptr[i];
++out_ptr;
++in_ptr;
++v1_ptr;
++v2_ptr;
} }
} }
} }
......
...@@ -21,6 +21,17 @@ namespace paddle { ...@@ -21,6 +21,17 @@ namespace paddle {
namespace lite { namespace lite {
namespace arm { namespace arm {
namespace math { namespace math {
int AdaptStartIndex(int ph, int input_size, int output_size) {
return static_cast<int>(
floor(static_cast<double>(ph * input_size) / output_size));
}
int AdaptEndIndex(int ph, int input_size, int output_size) {
return static_cast<int>(
ceil(static_cast<double>((ph + 1) * input_size) / output_size));
}
void pooling_basic(const float* din, void pooling_basic(const float* din,
float* dout, float* dout,
int num, int num,
...@@ -88,15 +99,27 @@ void pooling_basic(const float* din, ...@@ -88,15 +99,27 @@ void pooling_basic(const float* din,
#pragma omp parallel for #pragma omp parallel for
for (int ind_c = 0; ind_c < chin; ++ind_c) { for (int ind_c = 0; ind_c < chin; ++ind_c) {
for (int ind_h = 0; ind_h < hout; ++ind_h) { for (int ind_h = 0; ind_h < hout; ++ind_h) {
int sh = ind_h * stride_h; int sh, eh;
int eh = sh + kernel_h; if (adaptive) {
sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; sh = AdaptStartIndex(ind_h, hin, hout);
eh = (eh - pad_h) > hin ? hin : eh - pad_h; eh = AdaptEndIndex(ind_h, hin, hout);
} else {
sh = ind_h * stride_h;
eh = sh + kernel_h;
sh = (sh - pad_h) < 0 ? 0 : sh - pad_h;
eh = (eh - pad_h) > hin ? hin : eh - pad_h;
}
for (int ind_w = 0; ind_w < wout; ++ind_w) { for (int ind_w = 0; ind_w < wout; ++ind_w) {
int sw = ind_w * stride_w; int sw, ew;
int ew = sw + kernel_w; if (adaptive) {
sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; sw = AdaptStartIndex(ind_w, win, wout);
ew = (ew - pad_w) > win ? win : ew - pad_w; ew = AdaptEndIndex(ind_w, win, wout);
} else {
sw = ind_w * stride_w;
ew = sw + kernel_w;
sw = (sw - pad_w) < 0 ? 0 : sw - pad_w;
ew = (ew - pad_w) > win ? win : ew - pad_w;
}
float result = static_cast<float>(0); float result = static_cast<float>(0);
int dst_ind = (ind_n * chout + ind_c) * size_channel_out + int dst_ind = (ind_n * chout + ind_c) * size_channel_out +
ind_h * wout + ind_w; ind_h * wout + ind_w;
......
此差异已折叠。
...@@ -40,6 +40,15 @@ void scale_compute_basic(const operators::ScaleParam& param) { ...@@ -40,6 +40,15 @@ void scale_compute_basic(const operators::ScaleParam& param) {
template <typename T> template <typename T>
void scale(const T* din, T* dout, int num, T scale, T bias); void scale(const T* din, T* dout, int num, T scale, T bias);
template <typename T>
void scale_relu(const T* din, T* dout, int num, T scale, T bias);
template <typename T>
void scale_relu6(const T* din, T* dout, int num, T scale, T bias, T alpha);
template <typename T>
void scale_leaky_relu(const T* din, T* dout, int num, T scale, T bias, T alpha);
template <typename T> template <typename T>
void scale(const T* din, void scale(const T* din,
T* dout, T* dout,
......
...@@ -28,6 +28,7 @@ namespace lite { ...@@ -28,6 +28,7 @@ namespace lite {
class CLContext { class CLContext {
public: public:
~CLContext() { ~CLContext() {
GetCommandQueue().finish();
for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) { for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) {
// Note(ysh329): Don't need `clReleaseKernel` // Note(ysh329): Don't need `clReleaseKernel`
kernels_[kidx].reset(); kernels_[kidx].reset();
......
...@@ -100,16 +100,18 @@ TEST(cl_test, kernel_test) { ...@@ -100,16 +100,18 @@ TEST(cl_test, kernel_test) {
size_t width = in_image.ImageWidth(); size_t width = in_image.ImageWidth();
size_t height = in_image.ImageHeight(); size_t height = in_image.ImageHeight();
auto global_work_size = cl::NDRange{width, height}; auto global_work_size = cl::NDRange{width, height};
cl::Event event;
status = context->GetCommandQueue().enqueueNDRangeKernel( status = context->GetCommandQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event); kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = context->GetCommandQueue().finish(); status = context->GetCommandQueue().finish();
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
#if 0
double start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>(); double start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
double stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>(); double stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
double elapsed_micros = (stop_nanos - start_nanos) / 1000.0; double elapsed_micros = (stop_nanos - start_nanos) / 1000.0;
LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us."; LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us.";
#endif
LOG(INFO) << out_image; LOG(INFO) << out_image;
} }
......
...@@ -73,7 +73,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, ...@@ -73,7 +73,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw,
i2 += 4; i2 += 4;
p++; p++;
} else { } else {
image[i2] = 0.0; image[i2] = Float2Half(0.f);
i2 += 4; i2 += 4;
} }
} }
...@@ -261,7 +261,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, ...@@ -261,7 +261,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor,
image[index] = Float2Half(*p); image[index] = Float2Half(*p);
p++; p++;
} else { } else {
image[index] = 0.0; image[index] = Float2Half(0.f);
} }
if (index >= (width * height * 4)) { if (index >= (width * height * 4)) {
LOG(INFO) << " index out of range "; LOG(INFO) << " index out of range ";
......
...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
///////////////////////////////// /////////////////////////////////
...@@ -108,7 +107,8 @@ inline CL_DTYPE4 activation_type4(CL_DTYPE4 in ...@@ -108,7 +107,8 @@ inline CL_DTYPE4 activation_type4(CL_DTYPE4 in
#endif #endif
#ifdef RELU6 #ifdef RELU6
output = clamp(in, (CL_DTYPE4)0, (CL_DTYPE4)6); in = fmax((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in);
output = fmin((CL_DTYPE4)(6.0f, 6.0f, 6.0f, 6.0f), in);
#endif #endif
return output; return output;
} }
...@@ -14,36 +14,30 @@ limitations under the License. */ ...@@ -14,36 +14,30 @@ limitations under the License. */
#include <cl_common.h> #include <cl_common.h>
__kernel void relu(__read_only image2d_t input, __kernel void relu(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height
const int x = get_global_id(0); // image_width const sampler_t sampler =
const int y = get_global_id(1); // image_height CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
in = max((CL_DTYPE4)(0.0f), in); in = max((CL_DTYPE4)(0.0f), in);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
} }
__kernel void relu6(__read_only image2d_t input, __kernel void relu6(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale){ __private const float scale) {
const int x = get_global_id(0); const int x = get_global_id(0);
const int y = get_global_id(1); const int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
in = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in); in = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in);
...@@ -51,7 +45,6 @@ __kernel void relu6(__read_only image2d_t input, ...@@ -51,7 +45,6 @@ __kernel void relu6(__read_only image2d_t input,
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
} }
__kernel void sigmoid(__read_only image2d_t input, __kernel void sigmoid(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
...@@ -64,70 +57,66 @@ __kernel void sigmoid(__read_only image2d_t input, ...@@ -64,70 +57,66 @@ __kernel void sigmoid(__read_only image2d_t input,
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out; CL_DTYPE4 out;
out.x = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.x)));
out.y = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.y))); out.x = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.x))));
out.z = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.z))); out.y = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.y))));
out.w = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.w))); out.z = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.z))));
out.w = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.w))));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
} }
__kernel void leaky_relu(__read_only image2d_t input, __kernel void leaky_relu(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); const int x = get_global_id(0);
const int y = get_global_id(1); const int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 s_val = CONVERT_TYPE_TO(scale, CL_DTYPE) * in; CL_DTYPE4 s_val = CONVERT_TYPE_TO(scale, CL_DTYPE) * in;
if (in.x < 0.0f){ if (in.x < 0.0f) {
in.x = s_val.x; in.x = s_val.x;
} }
if (in.y < 0.0f){ if (in.y < 0.0f) {
in.y = s_val.y; in.y = s_val.y;
} }
if (in.z < 0.0f){ if (in.z < 0.0f) {
in.z = s_val.z; in.z = s_val.z;
} }
if (in.w < 0.0f){ if (in.w < 0.0f) {
in.w = s_val.w; in.w = s_val.w;
} }
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
} }
__kernel void tanh_act(__read_only image2d_t input, __kernel void tanh_act(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width
const int x = get_global_id(0); // image_width const int y = get_global_id(1); // image_height
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out= (exp(in) - exp(-in))/ (exp(in) + exp(-in)); CL_DTYPE4 out = (exp(in) - exp(-in)) / (exp(in) + exp(-in));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
} }
__kernel void exp_act(__read_only image2d_t input, __kernel void exp_act(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width
const int x = get_global_id(0); // image_width const int y = get_global_id(1); // image_height
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out = exp(in); CL_DTYPE4 out = exp(in);
...@@ -135,19 +124,16 @@ __kernel void exp_act(__read_only image2d_t input, ...@@ -135,19 +124,16 @@ __kernel void exp_act(__read_only image2d_t input,
} }
__kernel void swish(__read_only image2d_t input, __kernel void swish(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width
const int x = get_global_id(0); // image_width const int y = get_global_id(1); // image_height
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out = in / (1 + exp(-(CL_DTYPE)scale * in)); CL_DTYPE4 out = in / (1 + exp(-(CL_DTYPE)scale * in));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
} }
#include <cl_common.h> #include <cl_common.h>
__kernel void conv2d_1x1_opt(__private const int global_size_dim0, __kernel void conv2d_1x1_opt(
__private const int global_size_dim1, __private const int global_size_dim0,
__private const int global_size_dim2, __private const int global_size_dim1,
__read_only image2d_t input_image, __private const int global_size_dim2,
__read_only image2d_t filter, __read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE) #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale,
__read_only image2d_t new_biase, __read_only image2d_t new_biase,
#endif #endif
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int stride, __private const int stride,
__private const int offset, __private const int offset,
__private const int input_c_block, __private const int input_c_block,
__private const int input_c_origin, __private const int input_c_origin,
__private const int dilation, __private const int dilation,
__private const int input_width, /* of one block */ __private const int input_width, /* of one block */
__private const int input_height, /* of one block */ __private const int input_height, /* of one block */
__private const int output_width, __private const int output_width,
__private const int output_height, __private const int output_height,
__private const int old_w) { __private const int old_w) {
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
...@@ -287,7 +288,7 @@ __kernel void conv2d_1x1_simple( ...@@ -287,7 +288,7 @@ __kernel void conv2d_1x1_simple(
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale,
__read_only image2d_t new_biase, __read_only image2d_t new_biase,
#endif #endif
__write_only image2d_t output_image, __write_only image2d_t output_image,
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// buffer -> image2d // buffer -> image2d
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
__kernel void buffer_to_image2d(__global CL_DTYPE *in, __kernel void buffer_to_image2d(__global CL_DTYPE* in,
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int out_H, __private const int out_H,
__private const int out_W, __private const int out_W,
...@@ -26,7 +26,6 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, ...@@ -26,7 +26,6 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
__private const int Stride0, __private const int Stride0,
__private const int Stride1, __private const int Stride1,
__private const int Stride2) { __private const int Stride2) {
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
...@@ -66,16 +65,25 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, ...@@ -66,16 +65,25 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
#ifdef DEBUG #ifdef DEBUG
if (out_w > 2045) { if (out_w > 2045) {
printf("out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f %.2f\n", printf(
out_w, "out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f "
out_C - 4 * out_c, "%.2f\n",
(float)(in[input_pos0]), out_w,
(float)(in[input_pos1]), out_C - 4 * out_c,
(float)(in[input_pos2]), (float)(in[input_pos0]),
(float)(in[input_pos3])); (float)(in[input_pos1]),
printf("buffer2image ===> %d,%d,%d, out(%d,%d): %.2f %.2f %.2f %.2f \n", out_c, out_w, out_nh, (float)(in[input_pos2]),
output_pos.x, output_pos.y, (float)(in[input_pos3]));
(float)(output.x), (float)(output.y), (float)(output.z), (float)(output.w)); printf("buffer2image ===> %d,%d,%d, out(%d,%d): %.2f %.2f %.2f %.2f \n",
out_c,
out_w,
out_nh,
output_pos.x,
output_pos.y,
(float)(output.x),
(float)(output.y),
(float)(output.z),
(float)(output.w));
} }
#endif #endif
...@@ -101,34 +109,42 @@ __kernel void image2d_to_buffer(__read_only image2d_t input, ...@@ -101,34 +109,42 @@ __kernel void image2d_to_buffer(__read_only image2d_t input,
const int in_h = in_nh % in_height; const int in_h = in_nh % in_height;
const sampler_t sampler = const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w); const int pos_x = mad24(in_c, in_width, in_w);
CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)); CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(
CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh));
#ifdef DEBUG #ifdef DEBUG
if (in_w > 2045) { if (in_w > 2045) {
printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n", in_c, in_w, in_nh, printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n",
pos_x, in_nh, in_c,
(float)(in.x), (float)(in.y), (float)(in.z), (float)(in.w)); in_w,
in_nh,
pos_x,
in_nh,
(float)(in.x),
(float)(in.y),
(float)(in.z),
(float)(in.w));
} }
#endif #endif
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; const int index =
in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE); out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE);
if (C - 4 * in_c >= 2) { if (C - 4 * in_c >= 2) {
out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE); out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE);
} }
if(C - 4 * in_c >= 3) { if (C - 4 * in_c >= 3) {
out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE); out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE);
} }
if(C - 4 * in_c >= 4) { if (C - 4 * in_c >= 4) {
out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE); out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE);
} }
} }
#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile
#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// buffer -> image2d_nw // buffer -> image2d_nw
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
...@@ -182,8 +198,7 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in, ...@@ -182,8 +198,7 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in,
} }
#endif #endif
#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile
#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile
// image2d -> buffer // image2d -> buffer
__kernel void image2d_to_buffer_2d(__private const int in_height, __kernel void image2d_to_buffer_2d(__private const int in_height,
__private const int in_width, __private const int in_width,
...@@ -208,15 +223,14 @@ __kernel void image2d_to_buffer_2d(__private const int in_height, ...@@ -208,15 +223,14 @@ __kernel void image2d_to_buffer_2d(__private const int in_height,
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// buffer -> image2d (divide by 255 to normalize) // buffer -> image2d (divide by 255 to normalize)
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
__kernel void buffer_to_image2d_with_pre255(__global uchar *in, __kernel void buffer_to_image2d_with_pre255(__global uchar* in,
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int out_H, __private const int out_H,
__private const int out_W, __private const int out_W,
__private const int out_C, __private const int out_C,
__private const int Stride0, __private const int Stride0,
__private const int Stride1, __private const int Stride1,
__private const int Stride2){ __private const int Stride2) {
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
...@@ -231,7 +245,6 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in, ...@@ -231,7 +245,6 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in,
const int in_h = out_h; const int in_h = out_h;
const int in_w = out_w; const int in_w = out_w;
int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w; int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w;
int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w; int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w;
int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w; int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w;
...@@ -243,30 +256,29 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in, ...@@ -243,30 +256,29 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in,
CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)0.0f; CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)0.0f;
output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE) / 255; output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE) / 255;
if(out_C - 4 * out_c>=2){ if (out_C - 4 * out_c >= 2) {
output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE) / 255; output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE) / 255;
} }
if(out_C - 4 * out_c>=3){ if (out_C - 4 * out_c >= 3) {
output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE) / 255; output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE) / 255;
} }
if(out_C - 4 * out_c>=4){ if (out_C - 4 * out_c >= 4) {
output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE) / 255; output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE) / 255;
} }
WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output); WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output);
} }
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// image2d -> buffer (multiply by 255 to de-normalize) // image2d -> buffer (multiply by 255 to de-normalize)
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
__kernel void image2d_to_buffer_with_post255(__read_only image2d_t input, __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input,
__private const int in_width, __private const int in_width,
__private const int in_height, __private const int in_height,
__global uchar* out, __global uchar* out,
__private const int size_ch, __private const int size_ch,
__private const int size_block, __private const int size_block,
__private const int size_batch, __private const int size_batch,
__private const int C) { __private const int C) {
const int in_c = get_global_id(0); const int in_c = get_global_id(0);
const int in_w = get_global_id(1); const int in_w = get_global_id(1);
const int in_nh = get_global_id(2); const int in_nh = get_global_id(2);
...@@ -277,22 +289,34 @@ __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input, ...@@ -277,22 +289,34 @@ __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input,
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w); const int pos_x = mad24(in_c, in_width, in_w);
CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)) * 255; CL_COMPUTE_DTYPE4 in =
READ_IMG_TYPE(
CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)) *
255;
#ifdef DEBUG #ifdef DEBUG
printf("in_c:%d, in_w:%d, in_nh:%d ===> in(%d,%d): %.2f %.2f %.2f %.2f\n", printf("in_c:%d, in_w:%d, in_nh:%d ===> in(%d,%d): %.2f %.2f %.2f %.2f\n",
in_c, in_w, in_nh, pos_x, in_nh, in.x, in.y, in.z, in.w); in_c,
in_w,
in_nh,
pos_x,
in_nh,
in.x,
in.y,
in.z,
in.w);
#endif #endif
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; const int index =
in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = convert_uchar_sat(in.x); out[index] = convert_uchar_sat(in.x);
if(C - 4 * in_c>=2){ if (C - 4 * in_c >= 2) {
out[index + size_ch] = convert_uchar_sat(in.y); out[index + size_ch] = convert_uchar_sat(in.y);
} }
if(C - 4 * in_c>=3){ if (C - 4 * in_c >= 3) {
out[index + size_ch * 2] = convert_uchar_sat(in.z); out[index + size_ch * 2] = convert_uchar_sat(in.z);
} }
if(C - 4 * in_c>=4){ if (C - 4 * in_c >= 4) {
out[index + size_ch * 3] = convert_uchar_sat(in.w); out[index + size_ch * 3] = convert_uchar_sat(in.w);
} }
} }
...@@ -45,6 +45,9 @@ bool CLRuntime::Init() { ...@@ -45,6 +45,9 @@ bool CLRuntime::Init() {
bool is_device_init = InitializeDevice(); bool is_device_init = InitializeDevice();
is_init_success_ = is_platform_init && is_device_init; is_init_success_ = is_platform_init && is_device_init;
initialized_ = true; initialized_ = true;
context_ = CreateContext();
command_queue_ = CreateCommandQueue(context());
return initialized_; return initialized_;
} }
...@@ -55,7 +58,7 @@ cl::Platform& CLRuntime::platform() { ...@@ -55,7 +58,7 @@ cl::Platform& CLRuntime::platform() {
cl::Context& CLRuntime::context() { cl::Context& CLRuntime::context() {
if (context_ == nullptr) { if (context_ == nullptr) {
context_ = CreateContext(); LOG(FATAL) << "context_ create failed. ";
} }
return *context_; return *context_;
} }
...@@ -67,7 +70,7 @@ cl::Device& CLRuntime::device() { ...@@ -67,7 +70,7 @@ cl::Device& CLRuntime::device() {
cl::CommandQueue& CLRuntime::command_queue() { cl::CommandQueue& CLRuntime::command_queue() {
if (command_queue_ == nullptr) { if (command_queue_ == nullptr) {
command_queue_ = CreateCommandQueue(context()); LOG(FATAL) << "command_queue_ create failed. ";
} }
return *command_queue_; return *command_queue_;
} }
...@@ -96,7 +99,7 @@ std::unique_ptr<cl::UserEvent> CLRuntime::CreateEvent( ...@@ -96,7 +99,7 @@ std::unique_ptr<cl::UserEvent> CLRuntime::CreateEvent(
bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) { bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) {
/* -I +CLRuntime::Global()->cl_path() + "/cl_kernel"*/ /* -I +CLRuntime::Global()->cl_path() + "/cl_kernel"*/
std::string build_option = options + " -cl-fast-relaxed-math "; std::string build_option = options + " -cl-fast-relaxed-math -cl-mad-enable";
VLOG(4) << "OpenCL build_option: " << build_option; VLOG(4) << "OpenCL build_option: " << build_option;
status_ = program->build({*device_}, build_option.c_str()); status_ = program->build({*device_}, build_option.c_str());
CL_CHECK_ERROR(status_); CL_CHECK_ERROR(status_);
......
...@@ -66,7 +66,8 @@ void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width, ...@@ -66,7 +66,8 @@ void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width,
cl_int status; cl_int status;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(), new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR
: CL_MEM_ALLOC_HOST_PTR),
img_format, img_format,
cl_image2d_width, cl_image2d_width,
cl_image2d_height, cl_image2d_height,
...@@ -89,7 +90,8 @@ void *TargetWrapperCL::MallocImage<uint16_t>(const size_t cl_image2d_width, ...@@ -89,7 +90,8 @@ void *TargetWrapperCL::MallocImage<uint16_t>(const size_t cl_image2d_width,
cl_int status; cl_int status;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(), new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR
: CL_MEM_ALLOC_HOST_PTR),
img_format, img_format,
cl_image2d_width, cl_image2d_width,
cl_image2d_height, cl_image2d_height,
...@@ -112,7 +114,8 @@ void *TargetWrapperCL::MallocImage<int32_t>(const size_t cl_image2d_width, ...@@ -112,7 +114,8 @@ void *TargetWrapperCL::MallocImage<int32_t>(const size_t cl_image2d_width,
cl_int status; cl_int status;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(), new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR
: CL_MEM_ALLOC_HOST_PTR),
img_format, img_format,
cl_image2d_width, cl_image2d_width,
cl_image2d_height, cl_image2d_height,
...@@ -192,7 +195,6 @@ void TargetWrapperCL::MemcpySync(void *dst, ...@@ -192,7 +195,6 @@ void TargetWrapperCL::MemcpySync(void *dst,
size_t size, size_t size,
IoDirection dir) { IoDirection dir) {
cl_int status; cl_int status;
cl::Event event;
auto stream = CLRuntime::Global()->command_queue(); auto stream = CLRuntime::Global()->command_queue();
switch (dir) { switch (dir) {
case IoDirection::DtoD: case IoDirection::DtoD:
...@@ -202,9 +204,9 @@ void TargetWrapperCL::MemcpySync(void *dst, ...@@ -202,9 +204,9 @@ void TargetWrapperCL::MemcpySync(void *dst,
0, 0,
size, size,
nullptr, nullptr,
&event); nullptr);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
event.wait(); CLRuntime::Global()->command_queue().finish();
break; break;
case IoDirection::HtoD: case IoDirection::HtoD:
status = stream.enqueueWriteBuffer(*static_cast<cl::Buffer *>(dst), status = stream.enqueueWriteBuffer(*static_cast<cl::Buffer *>(dst),
...@@ -283,7 +285,6 @@ void TargetWrapperCL::ImgcpySync(void *dst, ...@@ -283,7 +285,6 @@ void TargetWrapperCL::ImgcpySync(void *dst,
cl::array<size_t, 3> origin = {0, 0, 0}; cl::array<size_t, 3> origin = {0, 0, 0};
cl::array<size_t, 3> region = {cl_image2d_width, cl_image2d_height, 1}; cl::array<size_t, 3> region = {cl_image2d_width, cl_image2d_height, 1};
cl_int status; cl_int status;
cl::Event event;
auto stream = CLRuntime::Global()->command_queue(); auto stream = CLRuntime::Global()->command_queue();
switch (dir) { switch (dir) {
case IoDirection::DtoD: case IoDirection::DtoD:
...@@ -293,9 +294,9 @@ void TargetWrapperCL::ImgcpySync(void *dst, ...@@ -293,9 +294,9 @@ void TargetWrapperCL::ImgcpySync(void *dst,
origin, origin,
region, region,
nullptr, nullptr,
&event); nullptr);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
event.wait(); CLRuntime::Global()->command_queue().finish();
break; break;
case IoDirection::HtoD: case IoDirection::HtoD:
status = stream.enqueueWriteImage(*static_cast<cl::Image2D *>(dst), status = stream.enqueueWriteImage(*static_cast<cl::Image2D *>(dst),
......
...@@ -129,8 +129,7 @@ struct RowwiseAdd<lite::TargetType::kX86, T> { ...@@ -129,8 +129,7 @@ struct RowwiseAdd<lite::TargetType::kX86, T> {
T* output_data = output->template mutable_data<T>(); T* output_data = output->template mutable_data<T>();
for (int64_t i = 0; i < in_dims[0]; ++i) { for (int64_t i = 0; i < in_dims[0]; ++i) {
for (int64_t j = 0; j < size; ++j) { for (int64_t j = 0; j < size; ++j) {
output_data[i * in_dims[0] + j] = output_data[i * size + j] = input_data[i * size + j] + vector_data[j];
input_data[i * in_dims[0] + j] + vector_data[j];
} }
} }
} }
......
...@@ -279,7 +279,7 @@ struct MergeAdd<lite::TargetType::kX86, T> { ...@@ -279,7 +279,7 @@ struct MergeAdd<lite::TargetType::kX86, T> {
} }
} }
if (has_value_input == nullptr) { if (has_value_input == nullptr) {
VLOG(3) << "no input has value! just return" << std::endl; VLOG(3) << "no input has value! just return";
return; return;
} }
auto input_width = has_value_input->value().dims()[1]; auto input_width = has_value_input->value().dims()[1];
......
...@@ -19,6 +19,7 @@ namespace lite { ...@@ -19,6 +19,7 @@ namespace lite {
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XPU
thread_local xdnn::Context* Context<TargetType::kXPU>::_tls_raw_ctx{nullptr}; thread_local xdnn::Context* Context<TargetType::kXPU>::_tls_raw_ctx{nullptr};
int Context<TargetType::kXPU>::_workspace_l3_size_per_thread{0};
#endif #endif
} // namespace lite } // namespace lite
......
...@@ -151,14 +151,23 @@ class Context<TargetType::kXPU> { ...@@ -151,14 +151,23 @@ class Context<TargetType::kXPU> {
if (_tls_raw_ctx == nullptr) { if (_tls_raw_ctx == nullptr) {
_tls_raw_ctx = xdnn::create_context(); _tls_raw_ctx = xdnn::create_context();
CHECK(_tls_raw_ctx); CHECK(_tls_raw_ctx);
int r = xdnn::set_workspace_l3_size(_tls_raw_ctx,
_workspace_l3_size_per_thread);
if (r != 0) {
LOG(WARNING) << "xdnn::set_workspace_l3_size() failed, r = " << r
<< ", _workspace_l3_size_per_thread = "
<< _workspace_l3_size_per_thread;
}
} }
return _tls_raw_ctx; return _tls_raw_ctx;
} }
static void SetWorkspaceL3Size(int l3_size = 0xfffc00) { static void SetWorkspaceL3Size(int l3_size = 0xfffc00) {
xdnn::set_workspace_l3_size(GetRawContext(), l3_size); _workspace_l3_size_per_thread = l3_size;
} }
// **DEPRECATED**, use xpu_set_device() at the very beginning of each worker
// thread
static void SetDev(int dev_no = 0) { static void SetDev(int dev_no = 0) {
const char* dev_env = getenv("LITE_XPU_DEV"); const char* dev_env = getenv("LITE_XPU_DEV");
if (dev_env) { if (dev_env) {
...@@ -173,6 +182,7 @@ class Context<TargetType::kXPU> { ...@@ -173,6 +182,7 @@ class Context<TargetType::kXPU> {
private: private:
static thread_local xdnn::Context* _tls_raw_ctx; static thread_local xdnn::Context* _tls_raw_ctx;
static int _workspace_l3_size_per_thread;
}; };
#endif #endif
...@@ -340,27 +350,17 @@ class Context<TargetType::kX86> { ...@@ -340,27 +350,17 @@ class Context<TargetType::kX86> {
template <> template <>
class Context<TargetType::kOpenCL> { class Context<TargetType::kOpenCL> {
std::shared_ptr<CLContext> cl_context_; std::shared_ptr<CLContext> cl_context_;
using WaitListType =
std::unordered_map<decltype(static_cast<const void*>(nullptr)),
std::shared_ptr<cl::Event>>;
std::shared_ptr<WaitListType> cl_wait_list_;
public: public:
CLContext* cl_context() { return cl_context_.get(); } CLContext* cl_context() { return cl_context_.get(); }
WaitListType* cl_wait_list() { return cl_wait_list_.get(); }
void InitOnce() { void InitOnce() {
// Init cl runtime. // Init cl runtime.
CHECK(CLRuntime::Global()->IsInitSuccess()) << "OpenCL runtime init failed"; CHECK(CLRuntime::Global()->IsInitSuccess()) << "OpenCL runtime init failed";
cl_context_ = std::make_shared<CLContext>(); cl_context_ = std::make_shared<CLContext>();
cl_wait_list_ = std::make_shared<WaitListType>();
} }
void CopySharedTo(OpenCLContext* ctx) { void CopySharedTo(OpenCLContext* ctx) { ctx->cl_context_ = cl_context_; }
ctx->cl_context_ = cl_context_;
ctx->cl_wait_list_ = cl_wait_list_;
}
}; };
#endif #endif
......
...@@ -21,9 +21,13 @@ lite_cc_library(mir_passes ...@@ -21,9 +21,13 @@ lite_cc_library(mir_passes
fusion/elementwise_add_activation_fuse_pass.cc fusion/elementwise_add_activation_fuse_pass.cc
fusion/quant_dequant_fuse_pass.cc fusion/quant_dequant_fuse_pass.cc
fusion/sequence_pool_concat_fuse_pass.cc fusion/sequence_pool_concat_fuse_pass.cc
fusion/scale_activation_fuse_pass.cc
fusion/__xpu__resnet_fuse_pass.cc fusion/__xpu__resnet_fuse_pass.cc
fusion/__xpu__multi_encoder_fuse_pass.cc fusion/__xpu__multi_encoder_fuse_pass.cc
fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc
fusion/__xpu__fc_fuse_pass.cc
elimination/identity_scale_eliminate_pass.cc elimination/identity_scale_eliminate_pass.cc
elimination/identity_dropout_eliminate_pass.cc
elimination/elementwise_mul_constant_eliminate_pass.cc elimination/elementwise_mul_constant_eliminate_pass.cc
static_kernel_pick_pass.cc static_kernel_pick_pass.cc
variable_place_inference_pass.cc variable_place_inference_pass.cc
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/pass.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace {
// Matches the pattern  pre_op -> x -> dropout -> out  (with a side Mask
// output) and removes the identity dropout, rewiring pre_op to produce
// `out` directly. Only inference-mode ("is_test" == 1) dropout with the
// "upscale_in_train" implementation is matched, because only that variant
// is an exact identity at inference time.
class Eliminator : public FuseBase {
 public:
  void BuildPattern() override {
    // The previous op's output needs updating after elimination.
    // conditional_block is excluded: its outputs cannot be renamed safely.
    auto* pre_op = OpNode("preop")->assert_is_not_op_type("conditional_block");
    // TODO(Superjomn) check has only one output
    auto* x = VarNode("x")->assert_is_op_input("dropout", "X");
    auto* dropout_op = OpNode("dropout", "dropout")
                           ->assert_op_attr<int>("is_test", 1)
                           ->assert_op_attr<std::string>(
                               "dropout_implementation", "upscale_in_train");
    auto* out = VarNode("out")->assert_is_op_output("dropout", "Out");
    auto* mask = VarNode("mask")->assert_is_op_output("dropout", "Mask");
    // Topology: pre_op -> x -> dropout -> out, plus the auxiliary Mask.
    *pre_op >> *x >> *dropout_op >> *out;
    *dropout_op >> *mask;
    // The nodes marked AsIntermediate are deleted by the matcher; pre_op is
    // kept and re-pointed at `out` in InsertNewNode below.
    x->AsIntermediate();  // x is pre_op's output, need to update
    dropout_op->AsIntermediate();
    mask->AsIntermediate();
  }

 private:
  // Rewrites pre_op so every output formerly named `x` is renamed to `out`,
  // then links pre_op straight to `out` in the graph.
  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
    auto& pre_op = matched.at("preop")->AsStmt();
    auto op_info = *pre_op.op_info();
    op_info.UpdateAllOutputs(matched.at("x")->AsArg().name,
                             matched.at("out")->AsArg().name);
    // ResetOp re-creates the op/kernels from the updated desc.
    pre_op.ResetOp(op_info, graph->valid_places());
    IR_NODE_LINK_TO(matched.at("preop"), matched.at("out"));
  }
};
} // namespace
class IdentityDropoutEliminatePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
Eliminator eliminator;
eliminator(graph.get());
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(identity_dropout_eliminate_pass,
paddle::lite::mir::IdentityDropoutEliminatePass)
.BindTargets({TARGET(kXPU)});
...@@ -31,6 +31,9 @@ lite_cc_library(fuse_interpolate ...@@ -31,6 +31,9 @@ lite_cc_library(fuse_interpolate
lite_cc_library(fuse_sequence_pool_concat lite_cc_library(fuse_sequence_pool_concat
SRCS sequence_pool_concat_fuser.cc SRCS sequence_pool_concat_fuser.cc
DEPS pattern_matcher_high_api) DEPS pattern_matcher_high_api)
lite_cc_library(fuse_scale_activation
SRCS scale_activation_fuser.cc
DEPS pattern_matcher_high_api)
set(mir_fusers set(mir_fusers
fuse_fc fuse_fc
...@@ -44,6 +47,7 @@ set(mir_fusers ...@@ -44,6 +47,7 @@ set(mir_fusers
fuse_transpose_softmax_transpose fuse_transpose_softmax_transpose
fuse_interpolate fuse_interpolate
fuse_sequence_pool_concat fuse_sequence_pool_concat
fuse_scale_activation
CACHE INTERNAL "fusers") CACHE INTERNAL "fusers")
if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <vector>
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/xpu_pattern_matcher_high_api.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
// Fuses a chain of n_embedding lookup_table ops whose outputs are summed
// pairwise by elementwise_add ops into a single
// __xpu__embedding_with_eltwise_add op:
//
//   emb0 + emb1 -> ewadd01; ewadd01 + emb2 -> ewadd12; ... -> final out
//
// The first lookup_table node ("embedding0") is kept and converted in place
// into the fused op; all other matched ops/vars are marked intermediate and
// removed by the matcher.
class XPUEmbeddingWithEltwiseAddFuser : public FuseBase {
 public:
  // n_embedding: number of lookup_table ops in the chain to match.
  explicit XPUEmbeddingWithEltwiseAddFuser(int n_embedding)
      : n_embedding_(n_embedding) {}

  void BuildPattern() override {
    // First two embeddings and their elementwise_add are spelled out
    // explicitly; the remaining n_embedding_ - 2 are generated in the loop.
    auto* ids0 =
        VarNode("ids0")->assert_is_op_input("lookup_table", "Ids")->AsInput();
    auto* table0 =
        VarNode("table0")->assert_is_op_input("lookup_table", "W")->AsInput();
    auto* embedding0 = OpNode("embedding0", "lookup_table");
    auto* embedding_out0 = VarNode("embedding_out0")
                               ->assert_is_op_output("lookup_table", "Out")
                               ->assert_is_op_input("elementwise_add", "X")
                               ->AsIntermediate();

    auto* ids1 =
        VarNode("ids1")->assert_is_op_input("lookup_table", "Ids")->AsInput();
    auto* table1 =
        VarNode("table1")->assert_is_op_input("lookup_table", "W")->AsInput();
    auto* embedding1 = OpNode("embedding1", "lookup_table")->AsIntermediate();
    auto* embedding_out1 = VarNode("embedding_out1")
                               ->assert_is_op_output("lookup_table", "Out")
                               ->assert_is_op_input("elementwise_add", "Y")
                               ->AsIntermediate();

    auto* ewadd01 = OpNode("ewadd01", "elementwise_add")->AsIntermediate();
    auto* ewadd01_out = VarNode("ewadd01_out")
                            ->assert_is_op_output("elementwise_add", "Out")
                            ->AsIntermediate();

    embedding0->LinksFrom({ids0, table0});
    embedding0->LinksTo({embedding_out0});
    embedding1->LinksFrom({ids1, table1});
    embedding1->LinksTo({embedding_out1});
    ewadd01->LinksFrom({embedding_out0, embedding_out1});
    ewadd01->LinksTo({ewadd01_out});

    // Each further embedding i feeds elementwise_add "Y" while the running
    // sum feeds "X"; last_ewadd_out tracks the tail of the chain.
    auto* last_ewadd_out = ewadd01_out;
    for (int i = 2; i < n_embedding_; ++i) {
      auto ids_name = paddle::lite::string_format("ids%d", i);
      auto table_name = paddle::lite::string_format("table%d", i);
      auto embedding_name = paddle::lite::string_format("embedding%d", i);
      auto embedding_out_name =
          paddle::lite::string_format("embedding_out%d", i);

      auto* new_ids = VarNode(ids_name)
                          ->assert_is_op_input("lookup_table", "Ids")
                          ->AsInput();
      auto* new_table = VarNode(table_name)
                            ->assert_is_op_input("lookup_table", "W")
                            ->AsInput();
      auto* new_embedding =
          OpNode(embedding_name, "lookup_table")->AsIntermediate();
      auto* new_embedding_out = VarNode(embedding_out_name)
                                    ->assert_is_op_output("lookup_table", "Out")
                                    ->assert_is_op_input("elementwise_add", "Y")
                                    ->AsIntermediate();

      new_embedding->LinksFrom({new_ids, new_table});
      new_embedding->LinksTo({new_embedding_out});

      auto ewadd_name = paddle::lite::string_format("ewadd%d%d", i - 1, i);
      auto ewadd_out_name = ewadd_name + "_out";

      auto* new_ewadd = OpNode(ewadd_name, "elementwise_add")->AsIntermediate();
      auto* new_ewadd_out = VarNode(ewadd_out_name)
                                ->assert_is_op_output("elementwise_add", "Out")
                                ->AsIntermediate();

      new_ewadd->LinksFrom({last_ewadd_out, new_embedding_out});
      new_ewadd->LinksTo({new_ewadd_out});
      last_ewadd_out = new_ewadd_out;
    }
    // The final running sum is the fused op's output.
    last_ewadd_out->AsOutput();
  }

  // Builds the fused op desc and converts the kept "embedding0" statement
  // into the new __xpu__embedding_with_eltwise_add op.
  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
    cpp::OpDesc op_desc;
    op_desc.SetType("__xpu__embedding_with_eltwise_add");
    // Collect all Ids/W inputs of the matched chain, in chain order.
    std::vector<std::string> ids_names;
    std::vector<std::string> table_names;
    for (int i = 0; i < n_embedding_; ++i) {
      auto ids_name = paddle::lite::string_format("ids%d", i);
      ids_names.push_back(matched.at(ids_name)->arg()->name);
      auto table_name = paddle::lite::string_format("table%d", i);
      table_names.push_back(matched.at(table_name)->arg()->name);
    }
    op_desc.SetInput("Ids", ids_names);
    op_desc.SetInput("Tables", table_names);
    // The output is the last elementwise_add's Out var in the chain.
    auto output_name = paddle::lite::string_format(
        "ewadd%d%d_out", n_embedding_ - 2, n_embedding_ - 1);
    op_desc.SetOutput("Output", {matched.at(output_name)->arg()->name});
    op_desc.SetAttr<int>("n_embedding", n_embedding_);
    // padding_idx is taken from the first embedding; presumably all matched
    // lookup_tables share it — TODO confirm no per-op divergence.
    auto* embedding0_op_info = matched.at("embedding0")->stmt()->op_info();
    op_desc.SetAttr<int64_t>(
        "padding_idx", embedding0_op_info->GetAttr<int64_t>("padding_idx"));

    // Re-use the embedding0 statement node as the fused op's node: create
    // the op, attach scope/places, and swap op + kernels in place.
    auto* new_stmt = matched.at("embedding0")->stmt();
    auto new_op = LiteOpRegistry::Global().Create(op_desc.Type());
    new_op->Attach(op_desc, new_stmt->op()->scope());
    new_op->SetValidPlaces(new_stmt->op()->valid_places());
    auto kernels = new_op->CreateKernels(new_op->valid_places());
    new_stmt->SetOp(new_op);
    new_stmt->SetKernels(std::move(kernels));

    // Wire every Ids/W input and the final output to the re-used node.
    for (int i = 0; i < n_embedding_; ++i) {
      auto ids_name = paddle::lite::string_format("ids%d", i);
      auto table_name = paddle::lite::string_format("table%d", i);
      DirectedLink(matched.at(ids_name), matched.at("embedding0"));
      DirectedLink(matched.at(table_name), matched.at("embedding0"));
    }
    IR_OP_VAR_LINK(matched.at("embedding0"), matched.at(output_name));
  }

 private:
  // Number of lookup_table ops in the matched chain.
  int n_embedding_;
};
} // namespace fusion
// Program pass driving XPUEmbeddingWithEltwiseAddFuser over the graph.
class XPUEmbeddingWithEltwiseAddFusePass : public ProgramPass {
 public:
  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
    // When XTCL is enabled this manual fusion is skipped entirely.
    if (GetBoolFromEnv("XPU_ENABLE_XTCL")) {
      return;
    }
    // Chains of 4 embeddings are attempted before chains of 3, so the
    // longer pattern is consumed first where both would match.
    for (int chain_length : {4, 3}) {
      fusion::XPUEmbeddingWithEltwiseAddFuser fuser(chain_length);
      fuser(graph.get());
    }
  }
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass,
paddle::lite::mir::XPUEmbeddingWithEltwiseAddFusePass)
.BindTargets({TARGET(kXPU)})
.BindKernel("lookup_table");
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include "lite/backends/xpu/math.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class XPUFcFuser : public FuseBase {
public:
explicit XPUFcFuser(bool with_relu) : with_relu_(with_relu) {}
  // Matches mul(X, W) -> elementwise_add(+ persistable bias b) [-> relu]
  // so it can be replaced by a single fused __xpu__fc op.
  void BuildPattern() override {
    // create nodes.
    auto* x = VarNode("x")->assert_is_op_input("mul", "X");
    auto* W = VarNode("W")->assert_is_op_input("mul", "Y");
    auto* b = VarNode("b")->assert_is_persistable_var();
    auto* mul = OpNode("mul", "mul");
    auto* mul_out = VarNode("mul_out");
    auto* add = OpNode("add", "elementwise_add");
    auto* Out = VarNode("Out");

    // create topology.
    std::vector<PMNode*> mul_inputs{W, x};
    std::vector<PMNode*> add_inputs{mul_out, b};
    mul_inputs >> *mul >> *mul_out;

    // Some op specialities: all matched interior nodes are deleted after
    // fusion; only the external x / W / b / Out vars survive.
    mul_out->AsIntermediate();
    mul->AsIntermediate();
    add->AsIntermediate();

    if (with_relu_) {
      // Extended pattern: ... -> add -> add_out -> relu -> Out.
      auto* add_out = VarNode("add_out");
      auto* relu = OpNode("relu", "relu");
      std::vector<PMNode*> relu_inputs{add_out};
      add_inputs >> *add >> *add_out;
      relu_inputs >> *relu >> *Out;
      add_out->AsIntermediate();
      relu->AsIntermediate();
    } else {
      add_inputs >> *add >> *Out;
    }
  }
  // Quantizes the mul weight to int16 in place (recording its abs-max for
  // dequantization), transposes it, and inserts a fused __xpu__fc node
  // wired to the surviving x / W / b / Out variables.
  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
    auto mul = matched.at("mul")->stmt()->op();
    auto* scope = mul->scope();

    // convert W from float to int16, and transpose W
    auto weight_name = matched.at("W")->arg()->name;
    auto* weight_t = scope->FindMutableTensor(weight_name);
    auto weight_dims = weight_t->dims();
    int weight_len = weight_t->numel();
    float* weight_on_host = weight_t->mutable_data<float>();
    // abs-max scale used by ConvertFP32ToInt16 and stored as "w_max" attr.
    float max_f =
        paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len);
    std::unique_ptr<int16_t[]> weight_int16(new int16_t[weight_len]);
    std::unique_ptr<int16_t[]> weight_trans_int16(new int16_t[weight_len]);
    paddle::lite::xpu::math::ConvertFP32ToInt16(
        weight_on_host, weight_int16.get(), max_f, weight_len);
    paddle::lite::xpu::math::Transpose(weight_int16.get(),
                                       weight_trans_int16.get(),
                                       weight_dims[0],
                                       weight_dims[1]);
    // NOTE: the int16 data is written back into the float tensor's buffer
    // (weight_len * sizeof(int16_t) <= original float storage).
    memcpy(
        weight_on_host, weight_trans_int16.get(), weight_len * sizeof(int16_t));

    // Build the fused op (transpose_w = true to match the layout above),
    // create its graph node, and relink the surviving variables.
    auto op_desc = GenOpDesc(matched, max_f, true);
    auto fc_op = LiteOpRegistry::Global().Create("__xpu__fc");
    auto& valid_places = mul->valid_places();
    fc_op->Attach(op_desc, scope);
    auto* new_op_node = graph->GraphCreateInstructNode(fc_op, valid_places);

    IR_NODE_LINK_TO(matched.at("W"), new_op_node);
    IR_NODE_LINK_TO(matched.at("x"), new_op_node);
    IR_NODE_LINK_TO(matched.at("b"), new_op_node);
    IR_NODE_LINK_TO(new_op_node, matched.at("Out"));
  }
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched,
float w_max,
bool transpose_w) {
cpp::OpDesc op_desc = *matched.at("mul")->stmt()->op_info();
op_desc.mutable_inputs()->clear();
op_desc.mutable_outputs()->clear();
op_desc.SetType("__xpu__fc");
op_desc.SetInput("Input", {matched.at("x")->arg()->name});
op_desc.SetInput("W", {matched.at("W")->arg()->name});
op_desc.SetInput("Bias", {matched.at("b")->arg()->name});
op_desc.SetOutput("Out", {matched.at("Out")->arg()->name});
op_desc.SetAttr(
"in_num_col_dims",
matched.at("mul")->stmt()->op_info()->GetAttr<int>("x_num_col_dims"));
op_desc.SetAttr("w_max", w_max);
op_desc.SetAttr("transpose_w", transpose_w);
if (with_relu_) {
op_desc.SetAttr("activation_type", std::string{"relu"});
}
return op_desc;
}
bool with_relu_;
};
} // namespace fusion
// Program pass that runs XPUFcFuser over the graph, folding
// mul+elementwise_add(+relu) chains into __xpu__fc ops.
class XPUFcFusePass : public ProgramPass {
 public:
  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
    // When XTCL is enabled the subgraph engine handles optimization;
    // skip this pass entirely.
    if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;

    // Match the longer (with-relu) pattern first so it is not
    // shadowed by the plain mul+add pattern.
    for (bool with_relu : {true, false}) {
      fusion::XPUFcFuser fuser(with_relu);
      fuser(graph.get());
    }
  }
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Register the pass; it only targets XPU and binds to the "fc" kernel.
REGISTER_MIR_PASS(__xpu__fc_fuse_pass, paddle::lite::mir::XPUFcFusePass)
    .BindTargets({TARGET(kXPU)})
    .BindKernel("fc");
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <vector> #include <vector>
#include "lite/backends/xpu/math.h" #include "lite/backends/xpu/math.h"
#include "lite/core/mir/pass_registry.h" #include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/type_precision_cast_pass.h" // For UpdateInputs()
#include "lite/core/mir/xpu_pattern_matcher_high_api.h" #include "lite/core/mir/xpu_pattern_matcher_high_api.h"
#include "lite/operators/subgraph_op.h" #include "lite/operators/subgraph_op.h"
...@@ -588,8 +589,7 @@ class XPUMultiEncoderFuser { ...@@ -588,8 +589,7 @@ class XPUMultiEncoderFuser {
multi_encoder_stmt->SetOp(multi_encoder_op); multi_encoder_stmt->SetOp(multi_encoder_op);
multi_encoder_stmt->SetKernels(std::move(kernels)); multi_encoder_stmt->SetKernels(std::move(kernels));
// temp remove useless cast // remove dangling/useless cast
std::unordered_set<const Node*> to_remove2;
Node* stack = nullptr; Node* stack = nullptr;
for (auto* node : graph->StmtTopologicalOrder()) { for (auto* node : graph->StmtTopologicalOrder()) {
CHECK(node->IsStmt()); CHECK(node->IsStmt());
...@@ -597,16 +597,39 @@ class XPUMultiEncoderFuser { ...@@ -597,16 +597,39 @@ class XPUMultiEncoderFuser {
stack = node; stack = node;
} }
} }
Node* stack_out = stack->outlinks.front(); if (stack) {
for (Node* cast : stack_out->outlinks) { std::unordered_set<const Node*> to_remove2;
Node* cast_out = cast->outlinks.front(); Node* stack_out = stack->outlinks.front();
if (cast_out->outlinks.size() == 0) { // avoid modification while traversing
// remove auto stack_out_outlinks = stack_out->outlinks;
to_remove2.insert(cast_out); for (Node* cast : stack_out_outlinks) {
to_remove2.insert(cast); if (cast->stmt()->op_info()->Type() != "cast") {
continue;
}
Node* cast_out = cast->outlinks.front();
if (cast_out->outlinks.size() == 0) {
// dangling cast
to_remove2.insert(cast);
to_remove2.insert(cast_out);
VLOG(3) << "Remove dangling cast [" << cast_out->arg()->name << "]";
} else if (cast_out->outlinks.size() == 1) {
// useless cast
to_remove2.insert(cast);
to_remove2.insert(cast_out);
VLOG(3) << "Remove useless cast [" << cast_out->arg()->name << "]";
auto* multi_encoder = cast_out->outlinks.front();
DirectedLink(stack_out, multi_encoder);
UpdateInputs(multi_encoder->stmt()->op().get(),
cast_out->arg()->name,
stack_out->arg()->name);
auto update_op_info = *multi_encoder->stmt()->op_info();
multi_encoder->stmt()->ResetOp(update_op_info, graph->valid_places());
}
} }
GraphSafeRemoveNodes(graph, to_remove2);
} }
GraphSafeRemoveNodes(graph, to_remove2);
} }
}; };
......
...@@ -103,9 +103,12 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { ...@@ -103,9 +103,12 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
std::string conv_weight_name = matched.at("conv_weight")->arg()->name; std::string conv_weight_name = matched.at("conv_weight")->arg()->name;
auto conv_weight_t = auto conv_weight_t =
scope->FindVar(conv_weight_name)->GetMutable<lite::Tensor>(); scope->FindVar(conv_weight_name)->GetMutable<lite::Tensor>();
auto groups = conv_op_desc->GetAttr<int>("groups");
bool depthwise = false;
if (conv_type_ == "conv2d_transpose") { if (conv_type_ == "conv2d_transpose") {
depthwise = (conv_weight_t->dims()[0] == conv_weight_t->dims()[1] * groups);
CHECK_EQ(static_cast<size_t>(bn_scale_t->data_size()), CHECK_EQ(static_cast<size_t>(bn_scale_t->data_size()),
static_cast<size_t>(conv_weight_t->dims()[1])) static_cast<size_t>(conv_weight_t->dims()[1] * groups))
<< "The BN bias's size should be equal to the size of the first " << "The BN bias's size should be equal to the size of the first "
<< "dim size of the conv weights"; << "dim size of the conv weights";
} else { } else {
...@@ -159,7 +162,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { ...@@ -159,7 +162,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
// compute new conv_weight for int8 // compute new conv_weight for int8
auto weight_scale = auto weight_scale =
conv_op_desc->GetAttr<std::vector<float>>("weight_scale"); conv_op_desc->GetAttr<std::vector<float>>("weight_scale");
if (conv_type_ == "conv2d_transpose") { if (conv_type_ == "conv2d_transpose" && !depthwise) {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3]; conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
...@@ -199,7 +202,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { ...@@ -199,7 +202,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
} else { } else {
// compute new conv_weight // compute new conv_weight
auto conv_weight_d = conv_weight_t->mutable_data<float>(); auto conv_weight_d = conv_weight_t->mutable_data<float>();
if (conv_type_ == "conv2d_transpose") { if (conv_type_ == "conv2d_transpose" && !depthwise) {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3]; conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/scale_activation_fuse_pass.h"
#include <memory>
#include <vector>
#include "lite/core/mir/fusion/scale_activation_fuser.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
// Runs the scale+activation fuser once per supported activation kind,
// so each scale->act pair in the graph is folded into a single scale op.
void ScaleActivationFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
  static const char* kActTypes[] = {"relu", "relu6", "leaky_relu"};
  for (const auto* act_type : kActTypes) {
    fusion::ScaleActivationFuser fuser(act_type);
    fuser(graph.get());
  }
}
} // namespace mir
} // namespace lite
} // namespace paddle
// Register the pass; it only targets ARM and binds to the "scale" kernel.
REGISTER_MIR_PASS(lite_scale_activation_fuse_pass,
                  paddle::lite::mir::ScaleActivationFusePass)
    .BindTargets({TARGET(kARM)})
    .BindKernel("scale");
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
// MIR pass that fuses a scale op followed by an activation
// (relu / relu6 / leaky_relu) into a single scale op carrying the
// activation as attributes. See scale_activation_fuser.{h,cc}.
class ScaleActivationFusePass : public ProgramPass {
 public:
  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/scale_activation_fuser.h"
#include <memory>
#include <vector>
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
// Pattern: x -> scale -> scale_out -> act -> output, where `act` is the
// activation type this fuser was constructed with. The scale op, the
// activation op and the tensor between them are all folded away; only
// the graph input and the final output survive.
void ScaleActivationFuser::BuildPattern() {
  auto* scale_in = VarNode("x")->assert_is_op_input("scale", "X")->AsInput();
  auto* scale_op =
      OpNode("scale", "scale")->assert_is_op("scale")->AsIntermediate();
  auto* mid = VarNode("scale_out")
                  ->assert_is_op_output("scale", "Out")
                  ->assert_is_op_input(act_type_, "X")
                  ->AsIntermediate();
  auto* act_op =
      OpNode("act", act_type_)->assert_is_op(act_type_)->AsIntermediate();
  auto* act_out =
      VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput();

  // Wire the whole chain in one pass.
  *scale_in >> *scale_op >> *mid >> *act_op >> *act_out;
}
// Replaces the matched scale->act chain with one fused scale op: builds
// the fused desc, instantiates and attaches the op, then links the
// original input and output vars to the new instruction node.
void ScaleActivationFuser::InsertNewNode(SSAGraph* graph,
                                         const key2nodes_t& matched) {
  auto old_scale = matched.at("scale")->stmt()->op();
  auto fused_desc = GenOpDesc(matched);
  auto fused_op = LiteOpRegistry::Global().Create("scale");
  fused_op->Attach(fused_desc, old_scale->scope());

  auto* fused_node =
      graph->GraphCreateInstructNode(fused_op, old_scale->valid_places());
  IR_NODE_LINK_TO(matched.at("x"), fused_node);
  IR_NODE_LINK_TO(fused_node, matched.at("output"));
}
// Builds the fused scale op desc: starts from the original scale desc
// (so its scale/bias attributes survive), redirects Out past the
// removed activation, and records the activation as attributes.
cpp::OpDesc ScaleActivationFuser::GenOpDesc(const key2nodes_t& matched) {
  cpp::OpDesc desc = *matched.at("scale")->stmt()->op_info();
  const cpp::OpDesc act_desc = *matched.at("act")->stmt()->op_info();

  desc.SetOutput("Out", {matched.at("output")->arg()->name});
  desc.SetAttr("activation_type", act_type_);
  if (act_type_ == "relu") {
    desc.SetAttr("fuse_relu", true);
  } else if (act_type_ == "relu6") {
    // relu6 stores its clipping bound as "threshold".
    desc.SetAttr("alpha", act_desc.GetAttr<float>("threshold"));
  } else if (act_type_ == "leaky_relu") {
    desc.SetAttr("alpha", act_desc.GetAttr<float>("alpha"));
  }
  return desc;
}
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
// Fuses a scale op with its directly-following activation op
// (relu / relu6 / leaky_relu), folding the activation into the scale
// op's attributes so only one kernel runs at execution time.
class ScaleActivationFuser : public FuseBase {
 public:
  // act_type: the activation op type to match ("relu", "relu6" or
  // "leaky_relu"); it is passed in by ScaleActivationFusePass.
  explicit ScaleActivationFuser(const std::string& act_type) {
    act_type_ = act_type;
  }

  void BuildPattern() override;
  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;

 private:
  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
  // Activation op type this fuser matches and folds.
  std::string act_type_;
};
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
...@@ -25,16 +25,16 @@ namespace lite { ...@@ -25,16 +25,16 @@ namespace lite {
bool OpLite::InferShape() { bool OpLite::InferShape() {
// if input_tensor_ptrs and output_tensor_ptrs are overloaded in param_ // if input_tensor_ptrs and output_tensor_ptrs are overloaded in param_
// InferShapeByMemoryInternal will be applied. // InferShapeByMemoryInternal will be applied.
if (param_.input_tensor_ptrs() && param_.output_tensor_ptrs()) { if (op_param_ && op_param_->input_tensor_ptrs() &&
op_param_->output_tensor_ptrs()) {
return this->InferShapeWithCache(); return this->InferShapeWithCache();
} else { } else {
// otherwise, InferShapeImpl is applied directly.
return this->InferShapeImpl(); return this->InferShapeImpl();
} }
} }
bool OpLite::InferShapeWithCache() { bool OpLite::InferShapeWithCache() {
// 1. Get vector of current input tensors // 1. Get vector of current input tensors
auto *current_inputs = param_.input_tensor_ptrs(); auto *current_inputs = op_param_->input_tensor_ptrs();
// 2. Get hash value of current inputs shape and lod // 2. Get hash value of current inputs shape and lod
size_t new_hash = 0; size_t new_hash = 0;
for (auto iter = current_inputs->begin(); iter != current_inputs->end(); for (auto iter = current_inputs->begin(); iter != current_inputs->end();
...@@ -59,7 +59,7 @@ bool OpLite::InferShapeWithCache() { ...@@ -59,7 +59,7 @@ bool OpLite::InferShapeWithCache() {
if (new_hash == io_shape_lod_hash_ && new_hash != 0) { if (new_hash == io_shape_lod_hash_ && new_hash != 0) {
// if current hash value is consistent with io_shape_lod_hash_, // if current hash value is consistent with io_shape_lod_hash_,
// previous outputs shape and lod are reused. // previous outputs shape and lod are reused.
auto *current_outputs = param_.output_tensor_ptrs(); auto *current_outputs = op_param_->output_tensor_ptrs();
for (size_t i = 0; i < current_outputs->size(); i++) { for (size_t i = 0; i < current_outputs->size(); i++) {
current_outputs->at(i)->Resize(last_output_shapes[i]); current_outputs->at(i)->Resize(last_output_shapes[i]);
current_outputs->at(i)->set_lod(last_output_lods[i]); current_outputs->at(i)->set_lod(last_output_lods[i]);
...@@ -68,10 +68,12 @@ bool OpLite::InferShapeWithCache() { ...@@ -68,10 +68,12 @@ bool OpLite::InferShapeWithCache() {
// otherwise, current hash value is changed, InferShapeImpl will apply. // otherwise, current hash value is changed, InferShapeImpl will apply.
io_shape_lod_hash_ = new_hash; io_shape_lod_hash_ = new_hash;
this->InferShapeImpl(); this->InferShapeImpl();
auto *current_outputs = param_.output_tensor_ptrs(); auto *current_outputs = op_param_->output_tensor_ptrs();
last_output_shapes.clear();
last_output_lods.clear();
for (size_t i = 0; i < current_outputs->size(); i++) { for (size_t i = 0; i < current_outputs->size(); i++) {
last_output_shapes[i] = current_outputs->at(i)->dims(); last_output_shapes.push_back(current_outputs->at(i)->dims());
last_output_lods[i] = current_outputs->at(i)->lod(); last_output_lods.push_back(current_outputs->at(i)->lod());
} }
} }
return true; return true;
......
...@@ -77,6 +77,11 @@ class OpLite : public Registry { ...@@ -77,6 +77,11 @@ class OpLite : public Registry {
// Link the external execution environ to internal context. // Link the external execution environ to internal context.
bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope); bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope);
template <typename T>
inline void AttachParam(T *param) {
op_param_ = static_cast<T *>(param);
}
const OpInfo *op_info() const { return op_info_.get(); } const OpInfo *op_info() const { return op_info_.get(); }
OpInfo *mutable_op_info() { return op_info_.get(); } OpInfo *mutable_op_info() { return op_info_.get(); }
...@@ -167,11 +172,10 @@ class OpLite : public Registry { ...@@ -167,11 +172,10 @@ class OpLite : public Registry {
std::vector<Place> valid_places_; std::vector<Place> valid_places_;
Place kernel_place_{TARGET(kHost), PRECISION(kFloat)}; Place kernel_place_{TARGET(kHost), PRECISION(kFloat)};
std::unique_ptr<OpInfo> op_info_; std::unique_ptr<OpInfo> op_info_;
std::vector<DDimLite> last_output_shapes{}; std::vector<DDimLite> last_output_shapes{};
std::vector<std::vector<std::vector<uint64_t>>> last_output_lods{}; std::vector<std::vector<std::vector<uint64_t>>> last_output_lods{};
size_t io_shape_lod_hash_{}; size_t io_shape_lod_hash_{};
mutable operators::ParamBase param_; mutable operators::ParamBase *op_param_{nullptr};
private: private:
// Infer Shape according to memory, if current input shapes are consistent // Infer Shape according to memory, if current input shapes are consistent
......
...@@ -111,18 +111,23 @@ class KernelRegistry final { ...@@ -111,18 +111,23 @@ class KernelRegistry final {
KernelRegistryForTarget<TARGET(kCUDA), KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNHWC)> *, // DATALAYOUT(kNHWC)> *, //
KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kCUDA), KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kCUDA), KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNHWC)> *, // DATALAYOUT(kNHWC)> *, //
KernelRegistryForTarget<TARGET(kX86), KernelRegistryForTarget<TARGET(kX86),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kX86), KernelRegistryForTarget<TARGET(kX86),
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kHost), KernelRegistryForTarget<TARGET(kHost),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
...@@ -141,9 +146,7 @@ class KernelRegistry final { ...@@ -141,9 +146,7 @@ class KernelRegistry final {
KernelRegistryForTarget<TARGET(kHost), KernelRegistryForTarget<TARGET(kHost),
PRECISION(kInt64), PRECISION(kInt64),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kARM), KernelRegistryForTarget<TARGET(kARM),
PRECISION(kAny), PRECISION(kAny),
DATALAYOUT(kAny)> *, // DATALAYOUT(kAny)> *, //
......
...@@ -71,12 +71,17 @@ class Optimizer { ...@@ -71,12 +71,17 @@ class Optimizer {
"identity_scale_eliminate_pass", // "identity_scale_eliminate_pass", //
"elementwise_mul_constant_eliminate_pass", // "elementwise_mul_constant_eliminate_pass", //
"lite_sequence_pool_concat_fuse_pass", // "lite_sequence_pool_concat_fuse_pass", //
"lite_scale_activation_fuse_pass", //
#if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \ #if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \
(defined LITE_WITH_ARM) (defined LITE_WITH_ARM)
"lite_elementwise_add_activation_fuse_pass", // "lite_elementwise_add_activation_fuse_pass", //
#endif #endif
"__xpu__resnet_fuse_pass", "__xpu__resnet_fuse_pass",
"__xpu__multi_encoder_fuse_pass", "__xpu__multi_encoder_fuse_pass",
"__xpu__embedding_with_eltwise_add_fuse_pass",
"__xpu__fc_fuse_pass",
"identity_dropout_eliminate_pass", // should be placed after
// xpu fusion
"quantized_op_attributes_inference_pass", // Only for fully "quantized_op_attributes_inference_pass", // Only for fully
// quantized model, infer // quantized model, infer
// the output scale and // the output scale and
......
...@@ -60,6 +60,29 @@ Variable *Scope::FindLocalVar(const std::string &name) const { ...@@ -60,6 +60,29 @@ Variable *Scope::FindLocalVar(const std::string &name) const {
return nullptr; return nullptr;
} }
// AttributeVarNames will get persistive attribute names stored in parent scope
std::vector<std::string> Scope::AttributeVarNames() const {
std::vector<std::string> resulted_keys;
const Scope *cur_scope = this;
while (cur_scope->parent()) {
cur_scope = cur_scope->parent();
auto keys = cur_scope->LocalVarNames();
resulted_keys.insert(resulted_keys.end(), keys.begin(), keys.end());
}
// remove feed and fetch
std::vector<std::string> skiped_vars = {"feed", "fetch"};
for (int i = 0; i < skiped_vars.size(); i++) {
auto iter =
std::find(resulted_keys.begin(), resulted_keys.end(), skiped_vars[i]);
while (iter != resulted_keys.end()) {
resulted_keys.erase(iter);
iter =
std::find(resulted_keys.begin(), resulted_keys.end(), skiped_vars[i]);
}
}
return resulted_keys;
}
std::vector<std::string> Scope::LocalVarNames() const { std::vector<std::string> Scope::LocalVarNames() const {
std::vector<std::string> keys; std::vector<std::string> keys;
for (const auto &item : vars_) { for (const auto &item : vars_) {
......
...@@ -45,6 +45,8 @@ class Scope final { ...@@ -45,6 +45,8 @@ class Scope final {
const Scope* parent() const { return parent_; } const Scope* parent() const { return parent_; }
// Get attribute params stored in parent scopes.
std::vector<std::string> AttributeVarNames() const;
// Following the legacy scope interface. // Following the legacy scope interface.
std::vector<std::string> LocalVarNames() const; std::vector<std::string> LocalVarNames() const;
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
...@@ -56,7 +56,6 @@ add_kernel(negative_compute_arm ARM extra SRCS negative_compute.cc DEPS ${lite_k ...@@ -56,7 +56,6 @@ add_kernel(negative_compute_arm ARM extra SRCS negative_compute.cc DEPS ${lite_k
add_kernel(crop_compute_arm ARM extra SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(crop_compute_arm ARM extra SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(power_compute_arm ARM extra SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(power_compute_arm ARM extra SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(norm_compute_arm ARM extra SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(norm_compute_arm ARM extra SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(assign_compute_arm ARM extra SRCS assign_compute.cc DEPS ${lite_kernel_deps} math_arm)
## 3. extra kernels ## 3. extra kernels
add_kernel(lrn_compute_arm ARM extra SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lrn_compute_arm ARM extra SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm)
...@@ -88,13 +87,10 @@ add_kernel(gru_compute_arm ARM extra SRCS gru_compute.cc DEPS ${lite_kernel_deps ...@@ -88,13 +87,10 @@ add_kernel(gru_compute_arm ARM extra SRCS gru_compute.cc DEPS ${lite_kernel_deps
add_kernel(beam_search_decode_compute_arm ARM extra SRCS beam_search_decode_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(beam_search_decode_compute_arm ARM extra SRCS beam_search_decode_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(lookup_table_compute_arm ARM extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lookup_table_compute_arm ARM extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(lookup_table_dequant_compute_arm ARM extra SRCS lookup_table_dequant_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lookup_table_dequant_compute_arm ARM extra SRCS lookup_table_dequant_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(logical_compute_arm ARM extra SRCS logical_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(write_to_array_compute_arm ARM extra SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(read_from_array_compute_arm ARM extra SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(fill_constant_compute_arm ARM basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(fill_constant_compute_arm ARM basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(fill_constant_batch_size_like_compute_arm ARM basic SRCS fill_constant_batch_size_like_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(fill_constant_batch_size_like_compute_arm ARM basic SRCS fill_constant_batch_size_like_compute.cc DEPS ${lite_kernel_deps} math_arm)
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册