Commit 0cd75c62 authored by root

Merge branch 'new_dev' of https://github.com/PaddleLite-EB/Paddle-Lite into fpga_pr

# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
......@@ -34,6 +34,8 @@ Welcome to Paddle-Lite's documentation!
:caption: 使用指南
:name: sec-user-guides
user_guides/cuda
.. toctree::
:maxdepth: 1
:caption: 进阶使用指南
......
# CUDA-based model inference with Lite
Lite supports building and running with CUDA on x86_64 and arm64 architectures (e.g. TX2).
## Build
**NOTE:** When building on NVIDIA embedded hardware such as the TX2, please install the dependencies with the latest [JetPack](https://developer.nvidia.com/embedded/jetpack).
Step 1: download the code
```
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
```
Step 2: build
```
# Enter the source directory
cd Paddle-Lite
# Run the build script
# When the build finishes, a build_cuda directory is generated in the current directory
# If CUDA or CUDNN cannot be found during the build, set the environment variables CUDA_TOOLKIT_ROOT_DIR and CUDNN_ROOT
# CUDA_TOOLKIT_ROOT_DIR and CUDNN_ROOT are the root directories of CUDA and CUDNN respectively
./lite/tools/build.sh cuda
# To use the Python API, enable the build_python option
./lite/tools/build.sh --build_python=ON cuda
```
When the build finishes, `lite_core.so` is generated under `build_cuda/inference_lite_lib/python/lib/`.
## Run
The following uses the YOLOv3 model as an example to show how to run a model on NVIDIA GPU hardware.
Step 1: download the darknet_yolov3 model; for details about the model, see [here](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/yolov3)
```
# Download the model
wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/yolov3_infer.tar.gz
tar -zxf yolov3_infer.tar.gz
# Download a sample image
wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/kite.jpg
```
Step 2: run
**NOTE:** This example uses the Python API; the C++ API and examples will be released later.
``` python
#-*- coding: utf-8 -*-
from __future__ import print_function
import sys
import numpy as np
import cv2
sys.path.append('build_cuda/inference_lite_lib/python/lib')
from lite_core import *
def read_img(im_path, resize_h, resize_w):
im = cv2.imread(im_path).astype('float32')
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
h, w, _ = im.shape
im_scale_x = resize_w / float(w)
im_scale_y = resize_h / float(h)
out_img = cv2.resize(im, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=cv2.INTER_CUBIC)
mean = np.array([0.485, 0.456, 0.406]).reshape((1, 1, -1))
std = np.array([0.229, 0.224, 0.225]).reshape((1, 1, -1))
out_img = (out_img / 255.0 - mean) / std
out_img = out_img.transpose((2, 0, 1))
return out_img
# Configure the config
config = CxxConfig()
config.set_model_file('./yolov3_infer/__model__')  # path to the model file
config.set_param_file('./yolov3_infer/__params__')  # path to the params file
place_cuda = Place(TargetType.CUDA)
config.set_valid_places([place_cuda])
# Create the predictor
predictor = create_paddle_predictor(config)
# Set the inputs
input_tensor = predictor.get_input(0)
height, width = 608, 608
input_tensor.resize([1, 3, height, width])
data = read_img('./kite.jpg', height, width).flatten()
input_tensor.set_float_data(data, TargetType.CUDA)
in2 = predictor.get_input(1)
in2.resize([1, 2])
in2.set_int32_data([height, width], TargetType.CUDA)
# Run
predictor.run()
# Get the outputs
output_tensor = predictor.get_output(0)
print(output_tensor.shape())
# [100L, 6L]
print(output_tensor.target())
# TargetType.Host
print(output_tensor.float_data()[:6])
# [0.0, 0.9862784743309021, 98.51927185058594, 471.2381286621094, 120.73092651367188, 578.33251953125]
```
**NOTE:** CUDA support is still under active development.
......@@ -232,6 +232,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_cv" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mask_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mask_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mask_detection/Makefile"
)
add_dependencies(publish_inference_android_cxx_demos logging gflags)
add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)
......@@ -251,6 +253,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_cv" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mask_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mask_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mask_detection/Makefile"
)
add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos)
endif()
......
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
lite_cc_library(place SRCS paddle_place.cc DEPS logging)
else()
lite_cc_library(place SRCS paddle_place.cc DEPS glog)
......@@ -218,20 +218,11 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
--model_dir=${LITE_MODEL_DIR}/resnet50 SERIAL)
add_dependencies(test_resnet50 extern_lite_download_resnet50_tar_gz)
lite_cc_test(test_resnet50_fpga SRCS resnet50_test_fpga.cc
lite_cc_test(test_ssd_fpga SRCS test_ssd_fpga.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
lite_cc_test(test_inceptionv4 SRCS inceptionv4_test.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl
--model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL)
add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz)
# lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc
# DEPS ${lite_model_test_DEPS})
# lite_cc_test(model_run_test_image SRCS model_run_test_image.cc
# DEPS ${lite_model_test_DEPS}
# CL_DEPS ${opencl_kernels}
......@@ -296,10 +287,10 @@ if (LITE_ON_TINY_PUBLISH)
endif()
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
message(STATUS "Compiling model_optimize_tool")
lite_cc_binary(model_optimize_tool SRCS model_optimize_tool.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc
message(STATUS "Compiling opt")
lite_cc_binary(opt SRCS opt.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc
DEPS gflags kernel op optimizer mir_passes utils)
add_dependencies(model_optimize_tool op_list_h kernel_list_h all_kernel_faked_cc supported_kernel_op_info_h)
add_dependencies(opt op_list_h kernel_list_h all_kernel_faked_cc supported_kernel_op_info_h)
endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle_api_light
......
......@@ -181,6 +181,7 @@ inline MobileConfig jmobileconfig_to_cpp_mobileconfig(JNIEnv *env,
MobileConfig config;
// set model dir
// NOTE: This is a deprecated API and will be removed in a later release.
jmethodID model_dir_method = env->GetMethodID(
mobileconfig_jclazz, "getModelDir", "()Ljava/lang/String;");
jstring java_model_dir =
......@@ -190,6 +191,27 @@ inline MobileConfig jmobileconfig_to_cpp_mobileconfig(JNIEnv *env,
config.set_model_dir(cpp_model_dir);
}
// set model from file
jmethodID model_file_method = env->GetMethodID(
mobileconfig_jclazz, "getModelFromFile", "()Ljava/lang/String;");
jstring java_model_file =
(jstring)env->CallObjectMethod(jmobileconfig, model_file_method);
if (java_model_file != nullptr) {
std::string cpp_model_file = jstring_to_cpp_string(env, java_model_file);
config.set_model_from_file(cpp_model_file);
}
// set model from buffer
jmethodID model_buffer_method = env->GetMethodID(
mobileconfig_jclazz, "getModelFromBuffer", "()Ljava/lang/String;");
jstring java_model_buffer =
(jstring)env->CallObjectMethod(jmobileconfig, model_buffer_method);
if (java_model_buffer != nullptr) {
std::string cpp_model_buffer =
jstring_to_cpp_string(env, java_model_buffer);
config.set_model_from_buffer(cpp_model_buffer);
}
// set threads
jmethodID threads_method =
env->GetMethodID(mobileconfig_jclazz, "getThreads", "()I");
......
......@@ -64,6 +64,44 @@ public class MobileConfig extends ConfigBase {
return powerMode.value();
}
/**
* Set model from file.
*
* @param modelFile path of the model file
*/
public void setModelFromFile(String modelFile) {
this.liteModelFile = modelFile;
}
/**
* Returns the path of the model file.
*
* @return liteModelFile
*/
public String getModelFile() {
return liteModelFile;
}
/**
* Set model from buffer.
*
* @param modelBuffer model data held in a string buffer
*/
public void setModelFromBuffer(String modelBuffer) {
this.liteModelBuffer = modelBuffer;
}
/**
* Returns model buffer
*
* @return liteModelBuffer
*/
public String getModelBuffer() {
return liteModelBuffer;
}
private PowerMode powerMode = PowerMode.LITE_POWER_HIGH;
private int threads = 1;
private String liteModelFile;
private String liteModelBuffer;
}
......@@ -62,7 +62,7 @@ TEST(CXXApi_LightApi, optim_model) {
TEST(CXXApi_LightApi, save_and_load_model) {
lite::Predictor cxx_api;
lite::LightPredictor light_api(FLAGS_optimized_model);
lite::LightPredictor light_api(FLAGS_optimized_model + ".nb", false);
// CXXAPi
{
......
......@@ -116,7 +116,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
lite_api::MobileConfig config;
config.set_threads(FLAGS_threads);
config.set_power_mode(static_cast<PowerMode>(FLAGS_power_mode));
config.set_model_dir(model_dir);
config.set_model_from_file(model_dir + ".nb");
auto predictor = lite_api::CreatePaddlePredictor(config);
......
......@@ -121,6 +121,7 @@ void Predictor::SaveOpKernelInfo(const std::string &model_dir) {
<< kpf_path;
}
#ifndef LITE_WITH_FPGA
lite::Tensor *Predictor::GetInput(size_t offset) {
CHECK(input_names_.size() > offset)
<< "The network has " << input_names_.size() << " inputs"
......@@ -130,6 +131,17 @@ lite::Tensor *Predictor::GetInput(size_t offset) {
<< " in exec_scope";
return in_var->GetMutable<lite::Tensor>();
}
#else
lite::Tensor *Predictor::GetInput(size_t offset) {
auto *_feed_list = exec_scope_->FindVar("feed");
CHECK(_feed_list) << "no feed variable in exec_scope";
auto *feed_list = _feed_list->GetMutable<std::vector<lite::Tensor>>();
if (offset >= feed_list->size()) {
feed_list->resize(offset + 1);
}
return &feed_list->at(offset);
}
#endif
// get inputs names
std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
......@@ -167,6 +179,8 @@ void Predictor::PrepareFeedFetch() {
}
}
#ifndef LITE_WITH_FPGA
const lite::Tensor *Predictor::GetOutput(size_t offset) const {
CHECK(output_names_.size() > offset)
<< "The network has " << output_names_.size() << " outputs"
......@@ -186,6 +200,29 @@ std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
}
return outputs;
}
#else
const lite::Tensor *Predictor::GetOutput(size_t offset) const {
auto *_fetch_list = exec_scope_->FindVar("fetch");
CHECK(_fetch_list) << "no fatch variable in exec_scope";
auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
return &fetch_list.at(offset);
}
std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
auto *_fetch_list = exec_scope_->FindVar("fetch");
CHECK(_fetch_list) << "no fatch variable in exec_scope";
auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
std::vector<const lite::Tensor *> outputs;
for (auto& out : fetch_list) {
outputs.push_back(&out);
}
return outputs;
}
#endif
const cpp::ProgramDesc &Predictor::program_desc() const {
return program_desc_;
......@@ -239,7 +276,7 @@ void Predictor::Build(const std::string &model_path,
case lite_api::LiteModelType::kNaiveBuffer:
CHECK(!model_path.empty())
<< "NaiveBuffer backend only supported combined param";
LoadModelNaive(model_path, scope_.get(), &program_desc_);
LoadModelNaiveFromFile(model_path, scope_.get(), &program_desc_);
break;
default:
LOG(FATAL) << "Unknown model type";
......
......@@ -101,7 +101,7 @@ TEST(CXXApi, save_model) {
TEST(CXXApi, load_model_naive) {
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kARM), PRECISION(kFloat)}});
predictor.Build(FLAGS_optimized_model + ".naive",
predictor.Build(FLAGS_optimized_model + ".naive.nb",
"",
"",
valid_places,
......
......@@ -18,6 +18,17 @@
namespace paddle {
namespace lite {
void LightPredictor::Build(const std::string& lite_model_file,
bool model_from_memory) {
if (model_from_memory) {
LoadModelNaiveFromMemory(lite_model_file, scope_.get(), &cpp_program_desc_);
} else {
LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_);
}
BuildRuntimeProgram(cpp_program_desc_);
PrepareFeedFetch();
}
void LightPredictor::Build(const std::string& model_dir,
const std::string& model_buffer,
const std::string& param_buffer,
......
......@@ -18,6 +18,7 @@
*/
#pragma once
#include <algorithm>
#include <map>
#include <memory>
#include <string>
......@@ -39,12 +40,22 @@ namespace lite {
*/
class LITE_API LightPredictor {
public:
LightPredictor(
const std::string& model_dir,
const std::string& model_buffer = "",
const std::string& param_buffer = "",
bool model_from_memory = false,
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf) {
// Constructor of LightPredictor. `lite_model_file` is either the path of the
// model file or the model data buffer, and `model_from_memory` indicates
// whether the model is loaded from memory.
LightPredictor(const std::string& lite_model_file,
bool model_from_memory = false) {
scope_ = std::make_shared<Scope>();
Build(lite_model_file, model_from_memory);
}
// NOTE: This is a deprecated API and will be removed in a later release.
LightPredictor(const std::string& model_dir,
const std::string& model_buffer = "",
const std::string& param_buffer = "",
bool model_from_memory = false,
lite_api::LiteModelType model_type =
lite_api::LiteModelType::kNaiveBuffer) {
scope_ = std::make_shared<Scope>();
Build(model_dir, model_buffer, param_buffer, model_type, model_from_memory);
}
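For illustration, a minimal sketch of the two construction paths described above (this is an internal API; `model.nb` is a placeholder for a model already converted to the NaiveBuffer format, and the header path is assumed to match the source tree):

```cpp
#include <fstream>
#include <sstream>
#include <string>
#include "lite/api/light_api.h"

void light_predictor_demo() {
  // New path: build directly from a NaiveBuffer model file on disk.
  paddle::lite::LightPredictor from_file("model.nb",
                                         /*model_from_memory=*/false);

  // New path: build from model data that has already been read into memory.
  std::ifstream fin("model.nb", std::ios::binary);
  std::stringstream ss;
  ss << fin.rdbuf();
  paddle::lite::LightPredictor from_buffer(ss.str(),
                                           /*model_from_memory=*/true);
}
```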
......@@ -69,6 +80,10 @@ class LITE_API LightPredictor {
void PrepareFeedFetch();
private:
void Build(const std::string& lite_model_file,
bool model_from_memory = false);
// NOTE: This is a deprecated API and will be removed in a later release.
void Build(
const std::string& model_dir,
const std::string& model_buffer,
......
......@@ -23,13 +23,17 @@ namespace lite {
void LightPredictorImpl::Init(const lite_api::MobileConfig& config) {
// LightPredictor only supports the NaiveBuffer backend in the published lib
raw_predictor_.reset(
new LightPredictor(config.model_dir(),
config.model_buffer(),
config.param_buffer(),
config.model_from_memory(),
lite_api::LiteModelType::kNaiveBuffer));
if (config.lite_model_file().empty()) {
raw_predictor_.reset(
new LightPredictor(config.model_dir(),
config.model_buffer(),
config.param_buffer(),
config.model_from_memory(),
lite_api::LiteModelType::kNaiveBuffer));
} else {
raw_predictor_.reset(new LightPredictor(config.lite_model_file(),
config.model_from_memory()));
}
mode_ = config.power_mode();
threads_ = config.threads();
}
......
......@@ -73,7 +73,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
const int repeat,
const int warmup_times = 0) {
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
config.set_model_from_file(model_dir + ".nb");
config.set_power_mode(power_mode);
config.set_threads(thread_num);
......
......@@ -17,7 +17,7 @@
#include <gtest/gtest.h>
#endif
// "supported_kernel_op_info.h", "all_kernel_faked.cc" and "kernel_src_map.h"
// are created automatically during model_optimize_tool's compiling period
// are created automatically during opt's compiling period
#include <iomanip>
#include "all_kernel_faked.cc" // NOLINT
#include "kernel_src_map.h" // NOLINT
......
......@@ -190,5 +190,27 @@ void ConfigBase::set_threads(int threads) {
#endif
}
// Set model data in combined format. `set_model_from_file` loads the model
// from a file, while `set_model_from_buffer` loads the model from a memory
// buffer.
void MobileConfig::set_model_from_file(const std::string &x) {
lite_model_file_ = x;
}
void MobileConfig::set_model_from_buffer(const std::string &x) {
lite_model_file_ = x;
model_from_memory_ = true;
}
void MobileConfig::set_model_buffer(const char *model_buffer,
size_t model_buffer_size,
const char *param_buffer,
size_t param_buffer_size) {
LOG(WARNING) << "warning: `set_model_buffer` will be abandened in "
"release/v3.0.0, new method `set_model_from_buffer(const "
"std::string &x)` is recommended.";
model_buffer_ = std::string(model_buffer, model_buffer + model_buffer_size);
param_buffer_ = std::string(param_buffer, param_buffer + param_buffer_size);
model_from_memory_ = true;
}
} // namespace lite_api
} // namespace paddle
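For reference, a minimal usage sketch of the new MobileConfig interface (not part of this change; `mobilenet_v1.nb` is a placeholder for a model already converted to the NaiveBuffer format by the opt tool):

```cpp
#include <fstream>
#include <sstream>
#include <string>
#include "lite/api/paddle_api.h"

int main() {
  // Path 1: load the optimized .nb model directly from a file.
  paddle::lite_api::MobileConfig file_config;
  file_config.set_model_from_file("mobilenet_v1.nb");
  auto predictor = paddle::lite_api::CreatePaddlePredictor(file_config);

  // Path 2: read the same file into memory and load it from a buffer.
  std::ifstream fin("mobilenet_v1.nb", std::ios::binary);
  std::stringstream ss;
  ss << fin.rdbuf();
  paddle::lite_api::MobileConfig buffer_config;
  buffer_config.set_model_from_buffer(ss.str());
  auto predictor_from_buffer =
      paddle::lite_api::CreatePaddlePredictor(buffer_config);
  return 0;
}
```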
......@@ -168,22 +168,40 @@ class LITE_API CxxConfig : public ConfigBase {
/// MobileConfig is the config for the light weight predictor, it will skip
/// IR optimization or other unnecessary stages.
class LITE_API MobileConfig : public ConfigBase {
// whether to load data from memory. Model data will be loaded from memory
// buffer if model_from_memory_ is true.
bool model_from_memory_{false};
// model data read from file or memory buffer in combined format.
std::string lite_model_file_;
// NOTE: This is a deprecated variable and will be removed in a later release.
std::string model_buffer_;
std::string param_buffer_;
bool model_from_memory_{false};
public:
// Set model data in combined format. `set_model_from_file` loads the model
// from a file, while `set_model_from_buffer` loads the model from a memory
// buffer.
void set_model_from_file(const std::string& x);
void set_model_from_buffer(const std::string& x);
// return model data in lite_model_file_, which is in combined format.
const std::string& lite_model_file() const { return lite_model_file_; }
// return model_from_memory_, which indicates whether to load model from
// memory buffer.
bool model_from_memory() const { return model_from_memory_; }
// NOTE: This is a deprecated API and will be removed in a later release.
void set_model_buffer(const char* model_buffer,
size_t model_buffer_size,
const char* param_buffer,
size_t param_buffer_size) {
model_buffer_ = std::string(model_buffer, model_buffer + model_buffer_size);
param_buffer_ = std::string(param_buffer, param_buffer + param_buffer_size);
model_from_memory_ = true;
}
size_t param_buffer_size);
bool model_from_memory() const { return model_from_memory_; }
// NOTE: This is a deprecated API and will be removed in a later release.
const std::string& model_buffer() const { return model_buffer_; }
// NOTE: This is a deprecated API and will be removed in a later release.
const std::string& param_buffer() const { return param_buffer_; }
};
......
......@@ -72,7 +72,7 @@ TEST(CxxApi, run) {
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
TEST(LightApi, run) {
lite_api::MobileConfig config;
config.set_model_dir(FLAGS_model_dir + ".opt2.naive");
config.set_model_from_file(FLAGS_model_dir + ".opt2.naive.nb");
auto predictor = lite_api::CreatePaddlePredictor(config);
......@@ -109,16 +109,11 @@ TEST(LightApi, run) {
// Demo2 for Loading model from memory
TEST(MobileConfig, LoadfromMemory) {
// Get naive buffer
auto model_path = std::string(FLAGS_model_dir) + ".opt2.naive/__model__.nb";
auto params_path = std::string(FLAGS_model_dir) + ".opt2.naive/param.nb";
std::string model_buffer = lite::ReadFile(model_path);
size_t size_model = model_buffer.length();
std::string params_buffer = lite::ReadFile(params_path);
size_t size_params = params_buffer.length();
auto model_file = std::string(FLAGS_model_dir) + ".opt2.naive.nb";
std::string model_buffer = lite::ReadFile(model_file);
// set model buffer and run model
lite_api::MobileConfig config;
config.set_model_buffer(
model_buffer.c_str(), size_model, params_buffer.c_str(), size_params);
config.set_model_from_buffer(model_buffer);
auto predictor = lite_api::CreatePaddlePredictor(config);
auto input_tensor = predictor->GetInput(0);
......
......@@ -41,6 +41,7 @@ USE_MIR_PASS(lite_quant_dequant_fuse_pass);
USE_MIR_PASS(type_precision_cast_pass);
USE_MIR_PASS(type_layout_cast_pass);
USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(kernel_place_correct_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass);
USE_MIR_PASS(npu_subgraph_pass);
USE_MIR_PASS(xpu_subgraph_pass);
......
......@@ -116,6 +116,8 @@ void BindLiteMobileConfig(py::module *m) {
py::class_<MobileConfig> mobile_config(*m, "MobileConfig");
mobile_config.def(py::init<>())
.def("set_model_from_file", &MobileConfig::set_model_from_file)
.def("set_model_from_buffer", &MobileConfig::set_model_from_buffer)
.def("set_model_dir", &MobileConfig::set_model_dir)
.def("model_dir", &MobileConfig::model_dir)
.def("set_model_buffer", &MobileConfig::set_model_buffer)
......
......@@ -31,11 +31,7 @@ TEST(ResNet50, test) {
std::vector<Place> valid_places(
{Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)}});
predictor.Build(FLAGS_model_dir,
"",
"",
Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
valid_places);
predictor.Build(FLAGS_model_dir, "", "", valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <dirent.h>
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
DEFINE_string(input_file, "", "input_file");
namespace paddle {
namespace lite {
std::vector<std::string> GetDirectoryFiles(const std::string& dir) {
std::vector<std::string> files;
std::shared_ptr<DIR> directory_ptr(opendir(dir.c_str()),
[](DIR* dir) { if (dir) closedir(dir); });
struct dirent* dirent_ptr;
if (!directory_ptr) {
std::cout << "Error opening : " << std::strerror(errno) << dir << std::endl;
return files;
}
while ((dirent_ptr = readdir(directory_ptr.get())) != nullptr) {
files.push_back(std::string(dirent_ptr->d_name));
}
return files;
}
void readFromFile(int num, std::string path, float* data) {
std::ifstream file_stream(path);
// file_stream.open(path);
if (!file_stream.good()) {
std::cout << "file: " << path << " dones not exist!\n";
exit(-1);
return;
}
// float* data = mutableData<float>();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
data[i] = value;
}
file_stream.close();
}
// #ifdef LITE_WITH_FPGA
TEST(ResNet50, test) {
lite::Predictor predictor;
std::vector<Place> valid_places({
Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)},
});
// predictor.Build(FLAGS_model_dir, "", "", valid_places);
predictor.Build("",
FLAGS_model_dir + "/model",
FLAGS_model_dir + "/params",
valid_places);
auto* input_tensor = predictor.GetInput(0);
int width = 300;
int height = 300;
// std::ifstream file_stream(FLAGS_input_file);
// if (!file_stream.good()) {
// std::cout << "file: " << FLAGS_input_file << " dones not exist!\n";
// exit(-1);
// return;
// }
// file_stream >> height;
// file_stream >> width;
input_tensor->Resize(
DDim(std::vector<DDim::value_type>({1, 3, height, width})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
for (int i = 0; i < item_size; i++) {
data[i] = 1;
}
// readFromFile(item_size, "car.data", data);
int num = 3 * width * height;
// for (int i = 0; i < num; ++i) {
// float value = 0;
// file_stream >> value;
// data[i] = value;
// }
// file_stream.close();
for (int i = 0; i < 2; ++i) {
predictor.Run();
}
auto* out = predictor.GetOutput(0);
for (int i = 0; i < out->dims().production(); i++) {
std::cout << ":" << out->data<float>()[i] << std::endl;
}
std::string file = "output/" + FLAGS_input_file.substr(6);
std::cout << "file:::" << file << std::endl;
std::ofstream ofs;
ofs.open(file);
for (int i = 0; i < out->dims().production(); i++) {
float value = out->data<float>()[i];
ofs << value << std::endl;
}
ofs.close();
LOG(INFO) << "================== Speed Report ===================";
}
// #endif
} // namespace lite
} // namespace paddle
......@@ -109,7 +109,7 @@ void conv_depthwise_5x5s1_fp32(float* dout,
tmp_din + omp_get_thread_num() * (pre_in_size + pre_out_size);
float* pre_out = pre_din + pre_in_size;
#else
float pre_din = tmp_din;
float* pre_din = tmp_din;
float* pre_out = pre_din + pre_in_size;
#endif
prepack_input_nxwc4_dw(
......
......@@ -46,6 +46,7 @@ void fp32_to_int8(const float* din,
float inv_scale = 1.f / scale[j % axis_size];
float32x4_t vzero = vdupq_n_f32(0.f);
float32x4_t vscale = vdupq_n_f32(inv_scale);
float32x4_t vmax = vdupq_n_f32(-127.f);
float32x4_t vpoff = vdupq_n_f32(0.5f);
float32x4_t vnoff = vdupq_n_f32(-0.5f);
const float* din_c = din + j * inner_size;
......@@ -63,6 +64,14 @@ void fp32_to_int8(const float* din,
"fmul v5.4s, v1.4s, %[scale].4s \n"
"fmul v6.4s, v2.4s, %[scale].4s \n"
"fmul v7.4s, v3.4s, %[scale].4s \n"
"fcmge v8.4s, v4.4s, %[vmax].4s \n"
"fcmge v9.4s, v5.4s, %[vmax].4s \n"
"fcmge v10.4s, v6.4s, %[vmax].4s \n"
"fcmge v11.4s, v7.4s, %[vmax].4s \n"
"bif v4.16b, %[vmax].16b, v8.16b \n"
"bif v5.16b, %[vmax].16b, v9.16b \n"
"bif v6.16b, %[vmax].16b, v10.16b \n"
"bif v7.16b, %[vmax].16b, v11.16b \n"
"ldp q0, q1, [%[in]], #32 \n"
"subs %[cnt], %[cnt], #1 \n"
"FCVTAS v8.4s, v4.4s \n"
......@@ -79,7 +88,7 @@ void fp32_to_int8(const float* din,
"str q8, [%[out]], #16 \n"
"bne 0b \n"
: [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop)
: [scale] "w"(vscale)
: [scale] "w"(vscale), [vmax] "w"(vmax)
: "v0",
"v1",
"v2",
......@@ -104,15 +113,23 @@ void fp32_to_int8(const float* din,
"vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n"
"vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n"
"vcgt.f32 q10, q2, %q[vzero] @ get mask > 0, in2\n"
"vcgt.f32 q11, q3, %q[vzero] @ get mask > 0, in3\n"
"vbif.f32 q4, %q[vnoff], q8 @ get right offset\n"
"vcgt.f32 q8, q3, %q[vzero] @ get mask > 0, in3\n"
"vbif.f32 q5, %q[vnoff], q9 @ get right offset\n"
"vbif.f32 q6, %q[vnoff], q10 @ get right offset\n"
"vbif.f32 q7, %q[vnoff], q11 @ get right offset\n"
"vbif.f32 q7, %q[vnoff], q8 @ get right offset\n"
"vmla.f32 q4, q0, %q[vscale] @ mul scale\n"
"vmla.f32 q5, q1, %q[vscale] @ mul scale\n"
"vmla.f32 q6, q2, %q[vscale] @ mul scale\n"
"vmla.f32 q7, q3, %q[vscale] @ mul scale\n"
"vcge.f32 q8, q4, %q[vmax] @ q4 >= vmax \n"
"vcge.f32 q9, q5, %q[vmax] @ q4 >= vmax \n"
"vcge.f32 q10, q6, %q[vmax] @ q4 >= vmax \n"
"vbif q4, %q[vmax], q8 @ choose \n"
"vcge.f32 q8, q7, %q[vmax] @ q4 >= vmax \n"
"vbif q5, %q[vmax], q9 @ choose \n"
"vbif q6, %q[vmax], q10 @ choose \n"
"vbif q7, %q[vmax], q8 @ choose \n"
"vcvt.s32.f32 q0, q4 @ cvt to int32\n"
"vcvt.s32.f32 q1, q5 @ cvt to int32\n"
"vcvt.s32.f32 q2, q6 @ cvt to int32\n"
......@@ -133,25 +150,16 @@ void fp32_to_int8(const float* din,
: [vscale] "w"(vscale),
[vpoff] "w"(vpoff),
[vnoff] "w"(vnoff),
[vzero] "w"(vzero)
: "q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11");
[vzero] "w"(vzero),
[vmax] "w"(vmax)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10");
#endif
}
const float* din_r = din_c + 16 * cnt;
signed char* dout_r = dout_c + 16 * cnt;
for (int i = 0; i < remain; ++i) {
dout_r[i] = saturate_cast<int8_t>(roundf(inv_scale * din_r[i]));
dout_r[i] = dout_r[i] < -127 ? -127 : dout_r[i];
}
}
}
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/fpga/KD/fpga_cv.hpp"
using paddle::zynqmp::float16;
void fpga_resize(float* input,
int input_width,
int input_height,
int input_channel,
uint8_t* output,
int output_width,
int output_height) {
paddle::zynqmp::InplaceArgs inplace_args = {0, 0, 0};
paddle::zynqmp::config_inplace(inplace_args);
paddle::zynqmp::ImageInputArgs input_args = {nullptr};
input_args.address = nullptr;
input_args.scale_address = nullptr;
float16* input_image_address =
reinterpret_cast<float16*>(paddle::zynqmp::fpga_malloc(
input_width * input_height * input_channel * sizeof(float16)));
int index = 0;
for (int i = 0; i < input_width * input_height * input_channel; i++) {
input_image_address[i] = float16(1.0 * input[i]);
}
paddle::zynqmp::ResizeArgs resize_args = {0};
resize_args.input_width = input_width;
resize_args.input_height = input_height;
resize_args.image_channel = input_channel;
resize_args.output_width = output_width;
resize_args.output_height = output_height;
float height_ratio = static_cast<float>(input_height) /
static_cast<float>(resize_args.output_height);
float width_ratio = static_cast<float>(input_width) /
static_cast<float>(resize_args.output_width);
resize_args.height_ratio = *reinterpret_cast<uint32_t*>(&height_ratio);
resize_args.width_ratio = *reinterpret_cast<uint32_t*>(&width_ratio);
int output_size =
resize_args.output_width * resize_args.output_height * input_channel;
float16* fpga_output = reinterpret_cast<float16*>(
paddle::zynqmp::fpga_malloc(output_size * sizeof(float16)));
resize_args.input_image_address = input_image_address;
resize_args.output_image_address = fpga_output;
memset(fpga_output, 0, output_size * sizeof(float16));
paddle::zynqmp::fpga_flush(
input_image_address,
input_width * input_height * input_channel * sizeof(float16));
paddle::zynqmp::fpga_flush(resize_args.output_image_address,
output_size * sizeof(float16));
int ret = paddle::zynqmp::compute_fpga_resize(resize_args);
if (ret == 0) {
paddle::zynqmp::fpga_invalidate(resize_args.output_image_address,
output_size * sizeof(float16));
}
for (int i = 0; i < output_size; i++) {
output[i] = fpga_output[i];
}
}
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdlib.h>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
#include "lite/backends/fpga/KD/pe.hpp"
void fpga_resize(float* input,
int input_width,
int input_height,
int input_channel,
uint8_t* output,
int output_width,
int output_height);
......@@ -151,6 +151,10 @@ class TensorLite {
size_t offset() const { return offset_; }
bool IsInitialized() const { return buffer_->data(); }
void clear() {
buffer_->Free();
offset_ = 0;
}
// Other share data to this.
void ShareDataWith(const TensorLite &other);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void concat2(__global const CL_DTYPE* x_data0, __global const CL_DTYPE* x_data1, __global CL_DTYPE* out_data,
int size, int axis_size, int pre_size, int post_size, int total, int total0, int total1) {
const int index = get_global_id(0);
if (index < size){
for (int i = 0; i < pre_size; i++){
int offset_out = index * post_size + i * total;
int offset_in = index * post_size + i * total0;
// memcpy(out_data + offset_out, x_data0 + offset_in, post_size);
__global CL_DTYPE* dst = out_data + offset_out;
__global const CL_DTYPE* src = x_data0 + offset_in;
for (int k = 0; k < post_size; k++){
*dst++ = *src++;
}
}
}else if (index < axis_size){
for (int i = 0; i < pre_size; i++){
int offset_out = index * post_size + i * total;
int offset_in = index * post_size + i * total1;
// memcpy(out_data + offset_out, x_data1 + offset_in, post_size);
__global CL_DTYPE* dst = out_data + offset_out;
__global const CL_DTYPE* src = x_data1 + offset_in;
for (int k = 0; k < post_size; k++){
*dst++ = *src++;
}
}
}
}
__kernel void concat_mul(__global const CL_DTYPE* x_data, __global CL_DTYPE* out_data,
int axis_size, int pre_size, int post_size, int start, int total, int total0) {
const int index = get_global_id(0);
if (index < axis_size){
for (int i = 0; i < pre_size; i++){
int offset_out = (start + index) * post_size + i * total;
int offset_in = index * post_size + i * total0;
// memcpy(out_data + offset_out, x_data + offset_in, post_size);
__global CL_DTYPE* dst = out_data + offset_out;
__global const CL_DTYPE* src = x_data + offset_in;
for (int k = 0; k < post_size; k++){
*dst++ = *src++;
}
}
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void concat2(__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int axis_size, int flag, int width) {
const int x = get_global_id(0); // image_width cxw/4
const int y = get_global_id(1); // image_height nxh
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int xx = x / width;
if (flag == 0){
xx = y / width;
}
if (xx < axis_size){
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, (int2)(x, y));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
}else{
int new_val = xx - axis_size;
new_val *= width;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input1, sampler, (int2)(new_val, y));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
}
// WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
}
__kernel void concat_mul(__read_only image2d_t input0,
__write_only image2d_t output,
int axis_size, int flag, int width, int start) {
const int x = get_global_id(0); // image_width cxw/4
const int y = get_global_id(1); // image_height nxh
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int xx = x / width;
if (flag == 0){
xx = y / width;
}
if (xx < axis_size && xx >= start){
xx -= start;
xx *= width;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, (int2)(xx, y));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
}
}
......@@ -12,8 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cl_common.h>
#define PADDLE_LITE_ZU5
#define FPGA_PRINT_MODE
#define PADDLE_LITE_PROFILE
__kernel void scale(__read_only image2d_t input,
__write_only image2d_t output,
__private float scale,
__private float bias){
const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
in = convert_float(scale) * in + convert_float(bias);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
}
......@@ -110,11 +110,11 @@ void set_constant(const lite::Context<Target>& context,
lite::Tensor* tensor,
float value) {
TensorSetConstantWithTarget<Target> func(context, tensor, value);
//#ifdef PADDLE_WITH_CUDA
// #ifdef PADDLE_WITH_CUDA
// tensor->target().apply_visitor(func);
//#else
// #else
func();
//#endif
// #endif
}
template <typename T>
......@@ -128,12 +128,14 @@ struct RowwiseAdd<lite::TargetType::kX86, T> {
PADDLE_ENFORCE_EQ(vector.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
auto in = lite::fluid::EigenMatrix<T>::From(input);
auto vec = lite::fluid::EigenVector<T>::Flatten(vector);
auto out = lite::fluid::EigenMatrix<T>::From(*output);
const T* input_data = input.data<T>();
const T* vector_data = vector.data<T>();
T* output_data = output->mutable_data<T>();
for (int64_t i = 0; i < in_dims[0]; ++i) {
out.chip(i, 0) = in.chip(i, 0) + vec;
for (int64_t j = 0; j < size; ++j) {
output_data[i * size + j] =
input_data[i * size + j] + vector_data[j];
}
}
}
};
......
......@@ -25,6 +25,7 @@ lite_cc_library(mir_passes
elimination/elementwise_mul_constant_eliminate_pass.cc
static_kernel_pick_pass.cc
variable_place_inference_pass.cc
kernel_place_correct_pass.cc
type_target_cast_pass.cc
type_layout_cast_pass.cc
type_precision_cast_pass.cc
......
......@@ -29,6 +29,11 @@ void ConvActivationFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
act_types.push_back("leaky_relu");
break;
}
if (place.target == TARGET(kARM) && place.precision == PRECISION(kFloat)) {
act_types.push_back("relu6");
act_types.push_back("leaky_relu");
break;
}
}
for (auto conv_type : {"conv2d", "depthwise_conv2d", "conv2d_transpose"}) {
for (auto act_type : act_types) {
......
......@@ -27,10 +27,24 @@ namespace mir {
void QuantDequantFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// delete quant node
std::vector<std::string> quant_op_types = {
"fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"};
"fake_quantize_abs_max",
"fake_quantize_range_abs_max",
"fake_quantize_moving_average_abs_max"};
/*
for (auto& op_type : {"conv2d", "mul", "depthwise_conv2d"}) {
for (int i = 5; i >= 1; --i){
fusion::DynamicQuantDequantOpFuser fuser("fake_quantize_abs_max", op_type,
i);
fuser(graph.get());
}
}
*/
for (auto& op_type : quant_op_types) {
fusion::DeleteQuantOpFuser fuser(op_type);
fuser(graph.get());
fusion::DeleteDynamicQuantOpFuser dfuser(op_type);
dfuser(graph.get());
}
// fuse quantized node and dequant node
......
......@@ -77,6 +77,55 @@ cpp::OpDesc DeleteQuantOpFuser::GenOpDesc(const key2nodes_t& matched) {
return op_desc;
}
void DeleteDynamicQuantOpFuser::BuildPattern() {
auto* input_act_node =
VarNode("input_act_node")->assert_is_op_input(quant_op_type_, "X");
auto* quant_node =
OpNode("quant_node", quant_op_type_)->assert_is_op(quant_op_type_);
auto* output_scale_node =
VarNode("output_scale_node")
->assert_is_op_output(quant_op_type_, "OutScale");
auto* output_act_node =
VarNode("output_act_node")->assert_is_op_output(quant_op_type_, "Out");
quant_node->LinksFrom({input_act_node});
output_scale_node->LinksFrom({quant_node});
output_act_node->LinksFrom({quant_node});
VLOG(4) << "DeleteQuantOpFuser BuildPattern quant_op_type:" << quant_op_type_;
}
void DeleteDynamicQuantOpFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto* input_act_node = matched.at("input_act_node");
auto* quant_node = matched.at("quant_node");
auto* output_scale_node = matched.at("output_scale_node");
auto* output_act_node = matched.at("output_act_node");
// obtain values, save values and relink node
int bit_length = quant_node->stmt()->op_info()->GetAttr<int>("bit_length");
int range = ((1 << (bit_length - 1)) - 1);
auto* scope = quant_node->stmt()->op()->scope();
auto* scale_tensor = scope->FindVar(output_scale_node->arg()->name)
->GetMutable<lite::Tensor>();
float scale_value = scale_tensor->data<float>()[0] / range;
auto outlinks = output_act_node->outlinks;
for (auto* quantized_node : outlinks) {
auto* op_desc = quantized_node->stmt()->mutable_op_info();
op_desc->SetAttr<int>("bit_length", bit_length);
IR_NODE_LINK_TO(input_act_node, quantized_node)
}
// delete nodes and edges
std::unordered_set<const Node*> nodes2rm = {
quant_node, output_scale_node, output_act_node};
GraphSafeRemoveNodes(graph, nodes2rm);
}
cpp::OpDesc DeleteDynamicQuantOpFuser::GenOpDesc(const key2nodes_t& matched) {
cpp::OpDesc op_desc;
return op_desc;
}
void DequantOpFuser::BuildPattern() {
std::string weight_name = "";
if (quantized_op_type_ == "conv2d" ||
......@@ -130,8 +179,11 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
auto& valid_places = quantized_op->stmt()->op()->valid_places();
int bit_length = quantized_op->stmt()->op_info()->GetAttr<int>("bit_length");
int range = ((1 << (bit_length - 1)) - 1);
float input_scale =
quantized_op->stmt()->op_info()->GetAttr<float>("input_scale");
float input_scale = 0;
if (quantized_op->stmt()->op_info()->HasAttr("input_scale")) {
input_scale =
quantized_op->stmt()->op_info()->GetAttr<float>("input_scale");
}
float max_range = dequant_op->stmt()->op_info()->GetAttr<float>("max_range");
float whole_weight_scale =
static_cast<float>(range * range) / max_range / range;
......@@ -162,8 +214,12 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
for (int i = 0; i < weight_scale_size; i++) {
weight_scale.push_back(whole_weight_scale);
}
#ifndef LITE_WITH_FPGA
op_desc.SetAttr("enable_int8", true);
op_desc.SetAttr("input_scale", input_scale);
#endif
if (quantized_op->stmt()->op_info()->HasAttr("input_scale")) {
op_desc.SetAttr("input_scale", input_scale);
}
op_desc.SetAttr("weight_scale", weight_scale);
// change the weight from the float type to int8 type.
......@@ -171,12 +227,29 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
temp_tensor.CopyDataFrom(*quantized_weight_t);
float* temp_data = temp_tensor.mutable_data<float>();
size_t weight_num = quantized_weight_t->data_size();
#ifdef LITE_WITH_FPGA
float* quantized_weight_data = quantized_weight_t->mutable_data<float>();
for (size_t i = 0; i < weight_num; i++) {
quantized_weight_data[i] = temp_data[i] * whole_weight_scale;
}
quantized_weight_t->set_persistable(true);
quantized_weight_t->set_precision(PRECISION(kFloat));
#else
int8_t* quantized_weight_data = quantized_weight_t->mutable_data<int8_t>();
for (size_t i = 0; i < weight_num; i++) {
quantized_weight_data[i] = static_cast<int8_t>(temp_data[i]);
}
quantized_weight_t->set_persistable(true);
quantized_weight_t->set_precision(PRECISION(kInt8));
#endif
// int8_t* quantized_weight_data = quantized_weight_t->mutable_data<int8_t>();
// for (size_t i = 0; i < weight_num; i++) {
// quantized_weight_data[i] = static_cast<int8_t>(temp_data[i]);
// }
// quantized_weight_t->set_persistable(true);
// quantized_weight_t->set_precision(PRECISION(kInt8));
// new op and relink nodes
auto new_quantized_op = LiteOpRegistry::Global().Create(quantized_op_type_);
......@@ -464,6 +537,197 @@ cpp::OpDesc DeleteQuantDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
cpp::OpDesc op_desc;
return op_desc;
}
// ================dynamic quant fuse==============
// #define DYNAMIC_RANGE
void DynamicQuantDequantOpFuser::BuildPattern() {
const int kNumFields = 5;
const int kQuantizedWeightOffset = 0;
const int kQuantizedOpOffset = 1;
const int kQuantizedOpOutOffset = 2;
const int kDequantOpOffset = 3;
const int kDequantOpOutOffset = 4;
std::string weight_name = "";
if (op_type_ == "conv2d" || op_type_ == "depthwise_conv2d") {
weight_name = "Filter";
} else {
weight_name = "Y";
}
auto* quant_op_input = VarNode("quant_op_input")
->assert_is_op_input(quant_type_, "X")
->AsInput();
#ifdef DYNAMIC_RANGE
auto* quant_op_in_scale = VarNode("quant_op_in_scale")
->assert_is_op_input(quant_type_, "InScale")
->AsIntermediate();
#endif
auto* quant_op = OpNode("quant_op", quant_type_)
->assert_is_op(quant_type_)
->AsIntermediate();
auto* quant_op_out_scale =
VarNode("quant_op_out_scale")
->assert_is_op_output(quant_type_, "OutScale")
->assert_is_op_input("fake_dequantize_max_abs", "Scale")
->AsIntermediate();
auto* quant_op_out = VarNode("quant_op_out")
->assert_is_op_output(quant_type_, "Out")
->assert_is_op_input(op_type_)
->AsIntermediate();
std::vector<PMNode*> nodes;
for (int i = 0; i < times_; i++) {
nodes.push_back(VarNode(string_format("quantized_op_weight%d", i))
->assert_is_op_input(op_type_, weight_name)
->AsInput());
nodes.push_back(OpNode(string_format("quantized_op%d", i), op_type_)
->assert_is_op(op_type_)
->AsIntermediate());
nodes.push_back(VarNode(string_format("quantized_op_out%d", i))
->assert_is_op_output(op_type_)
->assert_is_op_input("fake_dequantize_max_abs", "X")
->AsIntermediate());
nodes.push_back(
OpNode(string_format("dequant_op%d", i), "fake_dequantize_max_abs")
->assert_is_op("fake_dequantize_max_abs")
->AsIntermediate());
nodes.push_back(VarNode(string_format("dequant_op_out%d", i))
->assert_is_op_output("fake_dequantize_max_abs", "Out")
->AsOutput());
}
#ifdef DYNAMIC_RANGE
quant_op->LinksFrom({quant_op_input, quant_op_in_scale});
#endif
quant_op->LinksFrom({quant_op_input});
quant_op_out->LinksFrom({quant_op});
quant_op_out_scale->LinksFrom({quant_op});
for (int i = 0; i < times_; i++) {
nodes[i * kNumFields + kQuantizedOpOffset]->LinksFrom(
{quant_op_out, nodes[i * kNumFields + kQuantizedWeightOffset]});
nodes[i * kNumFields + kQuantizedOpOutOffset]->LinksFrom(
{nodes[i * kNumFields + kQuantizedOpOffset]});
nodes[i * kNumFields + kDequantOpOffset]->LinksFrom(
{nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale});
nodes[i * kNumFields + kDequantOpOutOffset]->LinksFrom(
{nodes[i * kNumFields + kDequantOpOffset]});
}
}
void DynamicQuantDequantOpFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
const int kNumFields = 5;
const int kQuantizedWeightOffset = 0;
const int kQuantizedOpOffset = 1;
const int kDequantOpOffset = 3;
const int kDequantOpOutOffset = 4;
auto* quant_op_input = matched.at("quant_op_input");
#ifdef DYNAMIC_RANGE
auto* quant_op_in_scale = matched.at("quant_op_in_scale");
#endif
auto* quant_op = matched.at("quant_op");
std::vector<Node*> nodes;
for (int i = 0; i < times_; i++) {
nodes.push_back(matched.at(string_format("quantized_op_weight%d", i)));
nodes.push_back(matched.at(string_format("quantized_op%d", i)));
nodes.push_back(matched.at(string_format("quantized_op_out%d", i)));
nodes.push_back(matched.at(string_format("dequant_op%d", i)));
nodes.push_back(matched.at(string_format("dequant_op_out%d", i)));
}
int bit_length = quant_op->stmt()->op_info()->GetAttr<int>("bit_length");
auto* scope = quant_op->stmt()->op()->scope();
auto& valid_places = quant_op->stmt()->op()->valid_places();
int range = ((1 << (bit_length - 1)) - 1);
#ifdef DYNAMIC_RANGE
auto input_scale_t = scope->FindVar(quant_op_in_scale->arg()->name)
->GetMutable<lite::Tensor>();
float input_scale = input_scale_t->data<float>()[0] / range;
VLOG(4) << "range: " << range << " input_scale: " << input_scale;
#endif
for (int i = 0; i < times_; i++) {
float max_range = nodes[i * kNumFields + kDequantOpOffset]
->stmt()
->op_info()
->GetAttr<float>("max_range");
// weight_scale = max(abs(weight))
float whole_weight_scale =
static_cast<float>(range * range) / max_range / range;
cpp::OpDesc op_desc =
*nodes[i * kNumFields + kQuantizedOpOffset]->stmt()->op_info();
auto quantized_weight_var_name =
nodes[i * kNumFields + kQuantizedWeightOffset]->arg()->name;
auto quantized_weight_t =
scope->FindVar(quantized_weight_var_name)->GetMutable<lite::Tensor>();
std::vector<float> weight_scale;
int weight_scale_size;
if (op_type_ == "conv2d" || op_type_ == "depthwise_conv2d") {
op_desc.SetInput("Input", {matched.at("quant_op_input")->arg()->name});
op_desc.SetOutput(
"Output", {nodes[i * kNumFields + kDequantOpOutOffset]->arg()->name});
// Conv weight shape: Cout * Cin * kh * hw, the weight_scale_size should
// be Cout.
weight_scale_size = quantized_weight_t->dims()[0];
} else if (op_type_ == "mul") {
op_desc.SetInput("X", {matched.at("quant_op_input")->arg()->name});
op_desc.SetOutput(
"Out", {nodes[i * kNumFields + kDequantOpOutOffset]->arg()->name});
// Fc weight: Cin * Cout, the weight_scale_size should be Cout.
weight_scale_size = quantized_weight_t->dims()[1];
}
for (int i = 0; i < weight_scale_size; i++) {
weight_scale.push_back(whole_weight_scale);
}
// op_desc.SetAttr("enable_int8", true);
// op_desc.SetAttr("input_scale", input_scale);
op_desc.SetAttr("weight_scale", weight_scale);
Tensor temp_tensor;
temp_tensor.CopyDataFrom(*quantized_weight_t);
float* temp_data = temp_tensor.mutable_data<float>();
size_t weight_num = quantized_weight_t->data_size();
quantized_weight_t->set_persistable(true);
std::cout << "DynamicQuantDequantOpFuser::InsertNewNode===================="
"========================================"
<< std::endl;
#ifdef LITE_WITH_FPGA
float* quantized_weight_data = quantized_weight_t->mutable_data<float>();
for (size_t i = 0; i < weight_num; i++) {
quantized_weight_data[i] = temp_data[i] * whole_weight_scale;
std::cout << whole_weight_scale << "," << temp_data[i] << ","
<< quantized_weight_data[i] << std::endl;
}
quantized_weight_t->set_precision(PRECISION(kFloat));
#else
int8_t* quantized_weight_data = quantized_weight_t->mutable_data<int8_t>();
for (size_t i = 0; i < weight_num; i++) {
quantized_weight_data[i] = static_cast<int8_t>(temp_data[i]);
}
quantized_weight_t->set_precision(PRECISION(kInt8));
#endif
auto quantized_op = LiteOpRegistry::Global().Create(op_type_);
quantized_op->Attach(op_desc, scope);
auto* new_op_node =
graph->GraphCreateInstructNode(quantized_op, valid_places);
IR_NODE_LINK_TO(quant_op_input, new_op_node);
IR_NODE_LINK_TO(nodes[i * kNumFields + kQuantizedWeightOffset],
new_op_node);
IR_NODE_LINK_TO(new_op_node, nodes[i * kNumFields + kDequantOpOutOffset]);
}
}
cpp::OpDesc DynamicQuantDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
cpp::OpDesc op_desc;
return op_desc;
}
} // namespace fusion
} // namespace mir
......
......@@ -52,6 +52,19 @@ class DeleteQuantOpFuser : public FuseBase {
private:
std::string quant_op_type_{};
};
class DeleteDynamicQuantOpFuser : public FuseBase {
public:
explicit DeleteDynamicQuantOpFuser(const std::string& quant_op_type)
: quant_op_type_(quant_op_type) {}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
private:
std::string quant_op_type_{};
};
/* DequantOpFuser process conv2d/depthwise_conv2d/mul + fake_dequantize_max_abs.
*/
......@@ -106,6 +119,24 @@ class DeleteQuantDequantOpFuser : public FuseBase {
private:
std::string quantized_op_type_{};
};
// dynamic quantdequant op fuser
class DynamicQuantDequantOpFuser : public FuseBase {
public:
explicit DynamicQuantDequantOpFuser(const std::string& quantized_op_type,
const std::string& op_type,
int i)
: op_type_(op_type), quant_type_(quantized_op_type), times_(i) {}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
private:
std::string op_type_{};
std::string quant_type_{};
int times_{1};
};
} // namespace fusion
} // namespace mir
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/kernel_place_correct_pass.h"
#include <memory>
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void KernelPlaceCorrectPass::Apply(const std::unique_ptr<SSAGraph> &graph) {
CorrectArgumentPlace(graph.get());
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(kernel_place_correct_pass,
paddle::lite::mir::KernelPlaceCorrectPass)
.BindTargets({TARGET(kFPGA)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "lite/core/mir/pass.h"
#include "lite/core/target_wrapper.h"
namespace paddle {
namespace lite {
namespace mir {
/*
* Correct the place of the variables in the SSAGraph; it infers each
* variable's place from the kernels that output it.
*/
class KernelPlaceCorrectPass : public DebugPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
private:
void CorrectArgumentPlace(SSAGraph* graph) {
auto& valid_places = graph->valid_places();
auto valid_places_has_target = [&](TargetType t) -> bool {
for (auto& p : valid_places) {
if (p.target == t) {
return true;
}
}
return false;
};
std::map<std::string, bool> lite_with_targets{
{"kOpenCL", valid_places_has_target(TARGET(kOpenCL))},
{"kFPGA", valid_places_has_target(TARGET(kFPGA))}};
VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"];
VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"];
VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global();
for (auto& x : graph->StmtTopologicalOrder()) {
auto& inst = x->AsStmt();
// The IoCopyOp is a tool operator and does not support type inference.
// On FPGA we have io_copy + cali + layout tool ops, so type inference is
// needed for tool operators.
if ((!lite_with_targets["kFPGA"]) && (!lite_with_targets["kOpenCL"])) {
VLOG(3) << "inst.op_type() == 'io_copy', continue";
if (inst.op_type() == "io_copy") continue;
}
// deal with inputs
VLOG(4) << "checking op " << inst.op_info()->Repr();
auto get_argname = [&](
const std::string& node_name,
const std::map<std::string, std::vector<std::string>>& argname_map)
-> std::string {
for (auto& ele : argname_map) {
auto it =
std::find(ele.second.begin(), ele.second.end(), node_name);
if (it != ele.second.end()) return ele.first;
}
return "";
};
bool need_correct_place = true;
std::vector<TargetType> in_types;
std::vector<TargetType> out_types;
for (auto* x_in : x->inlinks) {
std::string node_name = x_in->AsArg().name;
std::string arg_name = get_argname(node_name, inst.op_info()->inputs());
CHECK(arg_name.size() > 0) << "can not found op arguments for node "
<< node_name;
VLOG(4) << "-- input arg_name:" << arg_name << " "
<< "-- node name:" << node_name;
auto type = inst.picked_kernel().GetInputDeclType(arg_name);
if (!x_in->AsArg().type) {
need_correct_place &= false;
} else {
if (in_types.empty()) {
in_types.push_back(x_in->AsArg().type->target());
} else {
if (in_types[0] != x_in->AsArg().type->target()) {
need_correct_place &= false;
}
}
}
}
for (auto* x_out : x->outlinks) {
std::string node_name = x_out->AsArg().name;
std::string arg_name =
get_argname(node_name, inst.op_info()->outputs());
        CHECK(arg_name.size() > 0) << "cannot find op arguments for node "
                                   << node_name << " in Inst "
                                   << inst.op_type();
VLOG(4) << "-- output arg_name " << arg_name;
auto type = inst.picked_kernel().GetOutputDeclType(arg_name);
if (!x_out->AsArg().type) {
need_correct_place &= false;
} else {
if (out_types.empty()) {
out_types.push_back(x_out->AsArg().type->target());
} else {
if (out_types[0] != x_out->AsArg().type->target()) {
need_correct_place &= false;
}
}
}
}
      if (in_types.empty() || out_types.empty()) {
        need_correct_place = false;
      } else {
        auto this_type = inst.picked_kernel().target();
        bool io_target_same = (in_types[0] == out_types[0]);
        need_correct_place &= (io_target_same && (in_types[0] != this_type));
      }
if (need_correct_place) {
// update this kernel's valid place;
UpdateTarget(inst, in_types[0]);
}
}
}
  // Rebind the statement's kernels using a place whose target is replaced by
  // new_target.
void UpdateTarget(mir::Node::Stmt& inst, TargetType new_target) { // NOLINT
auto new_place = inst.place();
new_place.target = new_target;
std::vector<Place> places;
places.push_back(new_place);
inst.ResetKernels(places);
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -53,6 +53,11 @@ void mir::Node::Stmt::ResetOp(const cpp::OpDesc &op_desc,
}
valid_kernels_ = op_->CreateKernels(valid_places);
}
void mir::Node::Stmt::ResetKernels(const std::vector<Place> &valid_places) {
CHECK(op_) << "change valid place failed, not created op";
valid_kernels_.clear();
valid_kernels_ = op_->CreateKernels(valid_places);
}
mir::Node::Arg &mir::Node::AsArg(const std::string &name, int id) {
auto &x = AsArg();
......
......@@ -53,6 +53,7 @@ class Node {
const std::vector<Place>& valid_places,
lite::Scope* scope = nullptr);
void ResetKernels(const std::vector<Place>& valid_places);
std::string op_type() const { return op_info()->Type(); }
const OpInfo* op_info() const;
OpInfo* mutable_op_info();
......
......@@ -140,10 +140,12 @@ void SSAGraph::Build(const Program &program,
arg_node->AsArg(name, node_storage_.size() - 1);
arg_update_node_map_[name] = arg_node;
}
/*
if (var_types.count(name) && !arg_node->arg()->type) {
arg_node->arg()->type = LiteType::GetTensorTy(
TARGET(kUnk), var_types[name], DATALAYOUT(kUnk));
}
*/
if (is_weights(name)) arg_node->AsArg().is_weight = true;
CHECK(arg_node->IsRoleSet());
DirectedLink(arg_node, op_node);
......@@ -153,10 +155,12 @@ void SSAGraph::Build(const Program &program,
auto *arg_node = &node_storage_.back();
arg_node->AsArg(name, node_storage_.size() - 1);
arg_update_node_map_[name] = arg_node;
/*
if (var_types.count(name) && !arg_node->arg()->type) {
arg_node->arg()->type = LiteType::GetTensorTy(
TARGET(kUnk), var_types[name], DATALAYOUT(kUnk));
}
*/
if (is_weights(name)) arg_node->AsArg().is_weight = true;
CHECK(arg_node->IsRoleSet());
......
......@@ -157,7 +157,7 @@ std::shared_ptr<lite_api::PaddlePredictor> TestModel(
lite_api::LiteModelType::kNaiveBuffer);
// Load optimized model
lite_api::MobileConfig mobile_config;
mobile_config.set_model_dir(optimized_model_dir);
mobile_config.set_model_from_file(optimized_model_dir + ".nb");
mobile_config.set_power_mode(lite_api::PowerMode::LITE_POWER_HIGH);
mobile_config.set_threads(1);
predictor = lite_api::CreatePaddlePredictor(mobile_config);
......
......@@ -101,7 +101,6 @@ void TypeTargetTransformPass::AddIoCopyInst(
auto io_copy_output_name =
string_format("%s/target_trans", in->AsArg().name.c_str());
// string_format("%s/target_trans/%d", in->AsArg().name.c_str(), node_id());
if (copied_nodes->count(in->AsArg().name)) {
// Remove the old link
RemoveDirectedLink(in, inst_node);
......@@ -116,12 +115,14 @@ void TypeTargetTransformPass::AddIoCopyInst(
} else {
// TODO(MyPandaShaoxiang) should set same place with input?
auto* io_copy_output_arg = graph->NewArgumentNode(io_copy_output_name);
// Set the place for io_copy_output_arg node, the target should be equal to
// to.target()
// The precision and layout should be equal to from.precision(),
// from.layout()
// Set the place for io_copy_output_arg node, the target should be equal to
// to.target()
// The precision and layout should be equal to from.precision(),
// from.layout()
#ifndef LITE_WITH_FPGA
io_copy_output_arg->AsArg().type =
LiteType::GetTensorTy(to.target(), from.precision(), from.layout());
#endif
auto* io_copy_inst = graph->NewInstructNode();
bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist;
......
......@@ -77,6 +77,7 @@ class Optimizer {
#endif
"static_kernel_pick_pass", // pick original kernel from graph
"variable_place_inference_pass", // inference arg/var's
"kernel_place_correct_pass",
// info(target/precision/layout/device)
// using kernel info
"argument_type_display_pass", // debug pass: show arg-type-node's
......@@ -108,7 +109,9 @@ class Optimizer {
"runtime_context_assign_pass",
"argument_type_display_pass",
#ifndef LITE_WITH_FPGA
"memory_optimize_pass",
#endif
"npu_subgraph_pass",
"xpu_subgraph_pass"}};
RunPasses(passes_local);
......
......@@ -137,11 +137,16 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
void RuntimeProgram::Run() {
for (auto& inst : instructions_) {
#ifndef LITE_WITH_FPGA
if (inst.is_feed_fetch_op()) continue;
std::string op_type = inst.op()->op_info()->Type();
#endif
inst.Run();
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
#ifndef LITE_WITH_FPGA
LITE_PRECISION_PROFILE(inst)
#endif
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE
}
......
File mode changed from 100644 to 100755
......@@ -42,7 +42,7 @@ static std::string version() {
std::string tag = paddlelite_tag();
if (tag.empty()) {
ss << paddlelite_branch() << "(" << paddlelite_commit() << ")";
ss << paddlelite_commit();
} else {
ss << tag;
}
......
# C++ Demo
> Welcome to join the official PaddleLite Baidu QQ group (696965088), where engineers will answer your questions.
1. Environment setup
- Make sure the Android NDK is under the /opt directory
- A computer that can build PaddleLite
- An Android phone with an armv7 or armv8 CPU
2. Face detection and mask-wearing classification demo
Prepare the build environment by following the [source compilation guide](https://paddlepaddle.github.io/Paddle-Lite/v2.2.0/source_compile/).
Run the following commands to download the PaddleLite code.
```shell
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
cd Paddle-Lite
```
Enter the PaddleLite root directory and build the inference library.
```shell
./lite/tools/build.sh \
--arm_os=android \
--arm_abi=armv8 \
--arm_lang=gcc \
--android_stl=c++_static \
--build_extra=ON \
--shutdown_log=OFF \
tiny_publish
```
Enter the build directory, download the archive containing the model and test image, and build the executable.
```shell
cd build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mask_detection
wget https://paddle-inference-dist.bj.bcebos.com/mask_detection.tar.gz
tar zxvf mask_detection.tar.gz
make
```
Alternatively, you can also download the face detection model and the mask-wearing classification model via PaddleHub.
```
# After installing paddlehub, run the following Python code
import paddlehub as hub
pyramidbox_lite_mask = hub.Module(name="pyramidbox_lite_mask")
# Save the models into the test_program folder
pyramidbox_lite_mask.processor.save_inference_model(dirname="test_program")
```
The commands above produce the face detection and mask-wearing classification models, stored in pyramidbox_lite and mask_detector respectively. In each folder, `__model__` is the model structure file and `__param__` is the weights file.
Connect the Android phone to the computer, then push the executable, test image, model files, and inference library to the phone.
```
adb push mask_detection /data/local/tmp/
adb push test.jpg /data/local/tmp/
adb push face_detection /data/local/tmp
adb push mask_classification /data/local/tmp
adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
adb shell chmod +x /data/local/tmp/mask_detection
```
Log into the phone and run the demo.
```
adb shell
cd /data/local/tmp
export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH
./mask_detection face_detection mask_classification test.jpg
```
Back on the computer, pull the result and check the output image shown below.
```
adb pull /data/local/tmp/test_mask_detection_result.jpg ./
```
![test_mask_detection_result](https://user-images.githubusercontent.com/7383104/74279176-6200cd00-4d55-11ea-9fc0-83cfc2b3b37d.jpg)
3. Build and run the full-API demo (note: this demo does not exist when the build mode is tiny_publish)
```shell
cd inference_lite_lib.android.armv8/demo/cxx/mobile_full
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
......@@ -17,7 +86,7 @@ adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
```
On success, the console prints the predicted probabilities of the top-10 classes.
4. Build and run the light-API demo
```shell
cd ../mobile_light
make
......@@ -29,7 +98,7 @@ adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
```
On success, the console prints the predicted probabilities of the top-10 classes.
5. Build and run the SSD object detection demo
```shell
cd ../ssd_detection
wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-ssd.tar.gz
......@@ -46,7 +115,7 @@ adb pull /data/local/tmp/test_ssd_detection_result.jpg ./
```
On success, the detection result image test_ssd_detection_result.jpg is generated in the ssd_detection directory.
6. Build and run the YOLOv3 object detection demo
```shell
cd ../yolov3_detection
wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-yolov3.tar.gz
......@@ -63,7 +132,7 @@ adb pull /data/local/tmp/test_yolov3_detection_result.jpg ./
```
On success, the detection result image test_yolov3_detection_result.jpg is generated in the yolov3_detection directory.
7. Build and run the image classification demo
```shell
cd ../mobile_classify
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
......@@ -71,41 +140,41 @@ tar zxvf mobilenet_v1.tar.gz
./model_optimize_tool optimize model
make
adb push mobile_classify /data/local/tmp/
adb push test.jpg /data/local/tmp/
adb push labels.txt /data/local/tmp/
adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
adb shell chmod +x /data/local/tmp/mobile_classify
adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenetv1opt2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt"
```
On success, the console prints the predicted probabilities of the top-5 classes.
- To see the top-10 class probabilities instead, pass the topk value on the command line, e.g.:
```shell
adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenetv1opt2/ /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10"
```
- To classify with another model, pass its model_dir and the model's input size on the command line, e.g.:
```shell
adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenetv2opt2/ /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10 224 224"
```
8. Build and run the model unit-test demo that uses the CV preprocessing library
```shell
cd ../test_cv
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
tar zxvf mobilenet_v1.tar.gz
./model_optimize_tool optimize model
make
adb push test_model_cv /data/local/tmp/
adb push test.jpg /data/local/tmp/
adb push labels.txt /data/local/tmp/
adb push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/
adb shell chmod +x /data/local/tmp/test_model_cv
adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/test_model_cv /data/local/tmp/mobilenetv1opt2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt"
```
On success, the console prints the predicted probabilities of the top-10 classes.
ARM_ABI = arm7
export ARM_ABI
include ../Makefile.def
LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party
OPENCV_VERSION=opencv4.1.0
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a
OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)
###############################################################
# How to use one of the static libraries:                     #
#  `libpaddle_api_full_bundled.a`                             #
#  `libpaddle_api_light_bundled.a`                            #
###############################################################
# Note: the shared library is used by default.                #
###############################################################
# 1. Comment out the line above that uses `libpaddle_light_api_shared.so`
# 2. Uncomment the line below that uses `libpaddle_api_light_bundled.a`
#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
mask_detection: fetch_opencv mask_detection.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mask_detection.o -o mask_detection $(CXX_LIBS) $(LDFLAGS)
mask_detection.o: mask_detection.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mask_detection.o -c mask_detection.cc
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
(echo "fetch opencv libs" && \
wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
.PHONY: clean
clean:
rm -f mask_detection.o
rm -f mask_detection
ARM_ABI = arm8
export ARM_ABI
include ../Makefile.def
LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party
OPENCV_VERSION=opencv4.1.0
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a
OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)
###############################################################
# How to use one of the static libraries:                     #
#  `libpaddle_api_full_bundled.a`                             #
#  `libpaddle_api_light_bundled.a`                            #
###############################################################
# Note: the shared library is used by default.                #
###############################################################
# 1. Comment out the line above that uses `libpaddle_light_api_shared.so`
# 2. Uncomment the line below that uses `libpaddle_api_light_bundled.a`
#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
mask_detection: fetch_opencv mask_detection.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mask_detection.o -o mask_detection $(CXX_LIBS) $(LDFLAGS)
mask_detection.o: mask_detection.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mask_detection.o -c mask_detection.cc
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
(echo "fetch opencv libs" && \
wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
.PHONY: clean
clean:
rm -f mask_detection.o
rm -f mask_detection
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <string>
#include <vector>
#include "opencv2/core.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"
#include "paddle_api.h" // NOLINT
using namespace paddle::lite_api; // NOLINT
struct Object {
int batch_id;
cv::Rect rec;
int class_id;
float prob;
};
int64_t ShapeProduction(const shape_t& shape) {
int64_t res = 1;
for (auto i : shape) res *= i;
return res;
}
// Fill the tensor with mean/scale normalization and transform layout
// NHWC -> NCHW, accelerated with NEON.
void neon_mean_scale(const float* din,
float* dout,
int size,
const std::vector<float> mean,
const std::vector<float> scale) {
if (mean.size() != 3 || scale.size() != 3) {
std::cerr << "[ERROR] mean or scale size must equal to 3\n";
exit(1);
}
float32x4_t vmean0 = vdupq_n_f32(mean[0]);
float32x4_t vmean1 = vdupq_n_f32(mean[1]);
float32x4_t vmean2 = vdupq_n_f32(mean[2]);
float32x4_t vscale0 = vdupq_n_f32(scale[0]);
float32x4_t vscale1 = vdupq_n_f32(scale[1]);
float32x4_t vscale2 = vdupq_n_f32(scale[2]);
float* dout_c0 = dout;
float* dout_c1 = dout + size;
float* dout_c2 = dout + size * 2;
int i = 0;
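  // NEON path: de-interleave 4 RGB pixels per iteration, subtract the mean and
  // multiply by the scale per channel, then store the results channel-planar.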
for (; i < size - 3; i += 4) {
float32x4x3_t vin3 = vld3q_f32(din);
float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
vst1q_f32(dout_c0, vs0);
vst1q_f32(dout_c1, vs1);
vst1q_f32(dout_c2, vs2);
din += 12;
dout_c0 += 4;
dout_c1 += 4;
dout_c2 += 4;
}
for (; i < size; i++) {
*(dout_c0++) = (*(din++) - mean[0]) * scale[0];
*(dout_c1++) = (*(din++) - mean[1]) * scale[1];
*(dout_c2++) = (*(din++) - mean[2]) * scale[2];
}
}
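// Resize the image to (width, height), optionally scale pixel values to
// [0, 1), then normalize with mean/scale and convert HWC -> CHW via
// neon_mean_scale.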
void pre_process(const cv::Mat& img,
int width,
int height,
const std::vector<float>& mean,
const std::vector<float>& scale,
float* data,
bool is_scale = false) {
cv::Mat resized_img;
cv::resize(
img, resized_img, cv::Size(width, height), 0.f, 0.f, cv::INTER_CUBIC);
cv::Mat imgf;
float scale_factor = is_scale ? 1.f / 256 : 1.f;
resized_img.convertTo(imgf, CV_32FC3, scale_factor);
const float* dimg = reinterpret_cast<const float*>(imgf.data);
neon_mean_scale(dimg, data, width * height, mean, scale);
}
void RunModel(std::string det_model_dir,
std::string class_model_dir,
std::string img_path) {
// Prepare
cv::Mat img = imread(img_path, cv::IMREAD_COLOR);
float shrink = 0.2;
int width = img.cols;
int height = img.rows;
int s_width = static_cast<int>(width * shrink);
int s_height = static_cast<int>(height * shrink);
// Detection
MobileConfig config;
config.set_model_dir(det_model_dir);
  // Create Predictor For Detection Model
std::shared_ptr<PaddlePredictor> predictor =
CreatePaddlePredictor<MobileConfig>(config);
// Get Input Tensor
std::unique_ptr<Tensor> input_tensor0(std::move(predictor->GetInput(0)));
input_tensor0->Resize({1, 3, s_height, s_width});
auto* data = input_tensor0->mutable_data<float>();
// Do PreProcess
std::vector<float> detect_mean = {104.f, 117.f, 123.f};
std::vector<float> detect_scale = {0.007843, 0.007843, 0.007843};
pre_process(img, s_width, s_height, detect_mean, detect_scale, data, false);
// Detection Model Run
predictor->Run();
// Get Output Tensor
std::unique_ptr<const Tensor> output_tensor0(
std::move(predictor->GetOutput(0)));
auto* outptr = output_tensor0->data<float>();
auto shape_out = output_tensor0->shape();
int64_t out_len = ShapeProduction(shape_out);
// Filter Out Detection Box
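  // Each detection occupies 6 floats: [label, score, xmin, ymin, xmax, ymax],
  // with coordinates normalized to [0, 1] relative to the input image.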
float detect_threshold = 0.3;
std::vector<Object> detect_result;
for (int i = 0; i < out_len / 6; ++i) {
if (outptr[1] >= detect_threshold) {
Object obj;
int xmin = static_cast<int>(width * outptr[2]);
int ymin = static_cast<int>(height * outptr[3]);
int xmax = static_cast<int>(width * outptr[4]);
int ymax = static_cast<int>(height * outptr[5]);
int w = xmax - xmin;
int h = ymax - ymin;
cv::Rect rec_clip =
cv::Rect(xmin, ymin, w, h) & cv::Rect(0, 0, width, height);
obj.rec = rec_clip;
detect_result.push_back(obj);
}
outptr += 6;
}
// Classification
config.set_model_dir(class_model_dir);
// Create Predictor For Classification Model
predictor = CreatePaddlePredictor<MobileConfig>(config);
// Get Input Tensor
std::unique_ptr<Tensor> input_tensor1(std::move(predictor->GetInput(0)));
int classify_w = 128;
int classify_h = 128;
input_tensor1->Resize({1, 3, classify_h, classify_w});
auto* input_data = input_tensor1->mutable_data<float>();
int detect_num = detect_result.size();
std::vector<float> classify_mean = {0.5f, 0.5f, 0.5f};
std::vector<float> classify_scale = {1.f, 1.f, 1.f};
float classify_threshold = 0.5;
for (int i = 0; i < detect_num; ++i) {
cv::Rect rec_clip = detect_result[i].rec;
cv::Mat roi = img(rec_clip);
// Do PreProcess
pre_process(roi,
classify_w,
classify_h,
classify_mean,
classify_scale,
input_data,
true);
// Classification Model Run
predictor->Run();
// Get Output Tensor
std::unique_ptr<const Tensor> output_tensor1(
std::move(predictor->GetOutput(1)));
auto* outptr = output_tensor1->data<float>();
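    // outptr[1] is the probability of the "wear mask" class; compare it with
    // classify_threshold to pick the label.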
// Draw Detection and Classification Results
cv::rectangle(img, rec_clip, cv::Scalar(0, 0, 255), 2, cv::LINE_AA);
std::string text = outptr[1] > classify_threshold ? "wear mask" : "no mask";
int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL;
double font_scale = 1.f;
int thickness = 1;
cv::Size text_size =
cv::getTextSize(text, font_face, font_scale, thickness, nullptr);
float new_font_scale = rec_clip.width * 0.7 * font_scale / text_size.width;
text_size =
cv::getTextSize(text, font_face, new_font_scale, thickness, nullptr);
cv::Point origin;
origin.x = rec_clip.x + 5;
origin.y = rec_clip.y + text_size.height + 5;
cv::putText(img,
text,
origin,
font_face,
new_font_scale,
cv::Scalar(0, 255, 255),
thickness,
cv::LINE_AA);
std::cout << "detect face, location: x=" << rec_clip.x
<< ", y=" << rec_clip.y << ", width=" << rec_clip.width
<< ", height=" << rec_clip.height
<< ", wear mask: " << (outptr[1] > classify_threshold)
<< std::endl;
}
// Write Result to Image File
int start = img_path.find_last_of("/");
int end = img_path.find_last_of(".");
std::string img_name = img_path.substr(start + 1, end - start - 1);
std::string result_name = img_name + "_mask_detection_result.jpg";
cv::imwrite(result_name, img);
}
int main(int argc, char** argv) {
  if (argc < 4) {
    std::cerr << "[ERROR] usage: " << argv[0]
              << " detection_model_dir classification_model_dir image_path\n";
exit(1);
}
std::string detect_model_dir = argv[1];
std::string classify_model_dir = argv[2];
std::string img_path = argv[3];
RunModel(detect_model_dir, classify_model_dir, img_path);
return 0;
}
......@@ -82,8 +82,8 @@ void neon_mean_scale(const float* din,
}
for (; i < size; i++) {
*(dout_c0++) = (*(din++) - mean[0]) * scale[0];
*(dout_c0++) = (*(din++) - mean[1]) * scale[1];
*(dout_c0++) = (*(din++) - mean[2]) * scale[2];
*(dout_c1++) = (*(din++) - mean[1]) * scale[1];
*(dout_c2++) = (*(din++) - mean[2]) * scale[2];
}
}
......@@ -188,13 +188,12 @@ void RunModel(std::string model_dir, std::string img_path) {
std::move(predictor->GetOutput(0)));
auto* outptr = output_tensor->data<float>();
auto shape_out = output_tensor->shape();
int64_t cnt = 1;
for (auto& i : shape_out) {
cnt *= i;
}
int64_t cnt = ShapeProduction(shape_out);
auto rec_out = detect_object(outptr, static_cast<int>(cnt / 6), 0.6f, img);
std::string result_name =
img_path.substr(0, img_path.find(".")) + "_ssd_detection_result.jpg";
int start = img_path.find_last_of("/");
int end = img_path.find_last_of(".");
std::string img_name = img_path.substr(start + 1, end - start - 1);
std::string result_name = img_name + "_ssd_detection_result.jpg";
cv::imwrite(result_name, img);
}
......
......@@ -46,7 +46,6 @@ class Tensor {
*/
class PaddlePredictor {
public:
void Init();
std::unique_ptr<Tensor> GetTensor(const std::string &id) const;
std::unique_ptr<Tensor> GetMutableTensor(const std::string &id);
......
......@@ -62,6 +62,10 @@ void CastCompute::Run() {
int32_t* out_data = param.Out->mutable_data<int32_t>();
std::transform(
x_data_begin, x_data_end, out_data, TransOp<int64_t, int32_t>);
} else if (param.in_dtype == 3 && param.out_dtype == 5) {
const auto* x_data = param.X->data<float>();
auto* o_data = param.Out->mutable_data<float>();
memcpy(o_data, x_data, sizeof(float) * param.X->numel());
} else {
LOG(FATAL) << "other has not been implemented";
}
......
......@@ -95,7 +95,7 @@ class FcCompute : public KernelLite<TARGET(kARM), PType> {
CHECK_GE(x_dims.size(), 2UL);
CHECK_EQ(w_dims.size(), 2UL);
CHECK_EQ(param.output->dims().size(), 2UL);
CHECK_GE(param.output->dims().size(), 2UL);
m_ = x_dims.Slice(0, param.in_num_col_dims).production();
k_ = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production();
......
......@@ -60,25 +60,10 @@ class FillConstantCompute : public KernelLite<TARGET(kARM), PRECISION(kAny)> {
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<ARMContext>();
if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
auto data = param.Out->template mutable_data<float>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT32)) {
auto data = param.Out->template mutable_data<int32_t>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT8)) {
auto data = param.Out->template mutable_data<int8_t>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else {
LOG(FATAL) << "not supported dtype " << param.dtype;
// auto data = param.Out->template mutable_data<T>();
auto data = param.Out->template mutable_data<float>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
}
......@@ -94,32 +79,38 @@ class FillConstantBatchLikeCompute
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<ARMContext>();
if (param.input->lod().size() && param.input_dim_idx == 0) {
auto odims = param.out->dims();
odims[param.output_dim_idx] = param.input->lod().back().size() - 1;
param.out->Resize(odims);
// auto data = param.out->template mutable_data<T>();
auto data = param.out->template mutable_data<float>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
auto data = param.out->template mutable_data<float>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT32)) {
auto data = param.out->template mutable_data<int32_t>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT8)) {
auto data = param.out->template mutable_data<int8_t>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
} else {
LOG(FATAL) << "not supported dtype " << param.dtype;
}
// if (param.input->lod().size() && param.input_dim_idx == 0) {
// auto odims = param.out->dims();
// odims[param.output_dim_idx] = param.input->lod().back().size() - 1;
// param.out->Resize(odims);
// }
// if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
// auto data = param.out->template mutable_data<float>();
// for (int i = 0; i < param.out->numel(); i++) {
// data[i] = param.value;
// }
// } else if (param.dtype ==
// static_cast<int32_t>(lite::core::FluidType::INT32)) {
// auto data = param.out->template mutable_data<int32_t>();
// for (int i = 0; i < param.out->numel(); i++) {
// data[i] = param.value;
// }
// } else if (param.dtype ==
// static_cast<int32_t>(lite::core::FluidType::INT8)) {
// auto data = param.out->template mutable_data<int8_t>();
// for (int i = 0; i < param.out->numel(); i++) {
// data[i] = param.value;
// }
// } else {
// LOG(FATAL) << "not supported dtype " << param.dtype;
// }
}
virtual ~FillConstantBatchLikeCompute() = default;
......@@ -142,8 +133,9 @@ REGISTER_LITE_KERNEL(fill_constant,
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("ShapeTensorList",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(fill_constant_batch_size_like,
kARM,
kAny,
......
......@@ -36,7 +36,7 @@ void LookupTableCompute::Run() {
auto table_dim = w->dims();
int64_t ids_numel = ids->numel();
auto ids_data = ids->data<int64_t>();
auto ids_data = ids->data<float>();
int64_t row_number = table_dim[0];
int64_t row_width = table_dim[1];
......@@ -75,7 +75,6 @@ REGISTER_LITE_KERNEL(lookup_table,
.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(lookup_table_v2,
kARM,
kFloat,
......
......@@ -15,7 +15,12 @@ lite_cc_library(subgraph_bridge_softmax_op_bm SRCS softmax_op.cc DEPS ${subgraph
lite_cc_library(subgraph_bridge_mul_op_bm SRCS mul_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_batch_norm_op_bm SRCS batch_norm_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_scale_op_bm SRCS scale_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_concat_op_bm SRCS concat_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_dropout_op_bm SRCS dropout_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_transpose_op_bm SRCS transpose_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_reshape_op_bm SRCS reshape_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_norm_op_bm SRCS norm_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_prior_box_op_bm SRCS prior_box_op.cc DEPS ${bm_subgraph_bridge_deps})
set(bm_subgraph_bridges
subgraph_bridge_registry
subgraph_bridge_engine
......@@ -28,4 +33,10 @@ set(bm_subgraph_bridges
subgraph_bridge_mul_op_bm
subgraph_bridge_batch_norm_op_bm
subgraph_bridge_scale_op_bm
subgraph_bridge_concat_op_bm
subgraph_bridge_dropout_op_bm
subgraph_bridge_transpose_op_bm
subgraph_bridge_reshape_op_bm
subgraph_bridge_norm_op_bm
subgraph_bridge_prior_box_op_bm
CACHE INTERNAL "bm_subgraph_bridges")
......@@ -45,7 +45,14 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
CHECK_EQ(op_type, "relu");
float alpha = 0.f;
if (op_type == "relu") {
} else if (op_type == "leaky_relu") {
alpha = op_info->GetAttr<float>("alpha");
} else {
LOG(FATAL) << "[BM] unsupport act type";
return FAILED;
}
add_relu_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
......@@ -53,7 +60,7 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
const_cast<const int*>(&i_output_shape_data[0]),
output_dims.size(),
static_cast<const char*>(output_var_name.c_str()),
0.f,
alpha,
-1.f);
graph->AddNode(output_var_name);
return SUCCESS;
......@@ -65,3 +72,6 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(relu, kBM, paddle::lite::subgraph::bm::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(leaky_relu,
kBM,
paddle::lite::subgraph::bm::ActConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_if.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
// input
auto x_names = op_info->Input("X");
auto x_type = kernel->GetInputDeclType("X");
CHECK(x_type->layout() == DATALAYOUT(kNCHW));
// output
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
const int64_t* output_shape_data =
const_cast<const int64_t*>(&output_dims.data()[0]);
std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
const int32_t input_num = x_names.size();
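  // Gather per-input shape/rank/name arrays in the raw form expected by
  // add_concat_layer; the temporaries are freed right after the layer is added.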
int32_t** shape = new int32_t*[input_num];
int32_t* dim = new int32_t[input_num];
const char** name = new const char*[input_num];
for (size_t i = 0; i < x_names.size(); i++) {
auto x = scope->FindMutableTensor(x_names[i]);
name[i] = x_names[i].c_str();
auto x_dims = x->dims();
dim[i] = x_dims.size();
const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
shape[i] = new int32_t[x_dims.size()];
for (size_t j = 0; j < x_dims.size(); j++) {
shape[i][j] = static_cast<int32_t>(x_shape_data[j]);
}
}
auto axis = op_info->GetAttr<int>("axis");
add_concat_layer(graph->GetCompilerHandle(),
input_num,
shape,
dim,
name,
const_cast<const int*>(&i_output_shape_data[0]),
output_dims.size(),
static_cast<const char*>(output_var_name.c_str()),
axis);
for (size_t i = 0; i < x_names.size(); i++) {
delete[] shape[i];
}
delete[] shape;
delete[] name;
delete[] dim;
graph->AddNode(output_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(concat,
kBM,
paddle::lite::subgraph::bm::ConcatConverter);
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/conv_op.h"
#include <bmcompiler_if.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
......@@ -58,10 +57,10 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < input_dims.size(); i++) {
i_input_shape_data[i] = static_cast<int>(input_shape_data[i]);
i_input_shape_data[i] = static_cast<int32_t>(input_shape_data[i]);
}
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
i_output_shape_data[i] = static_cast<int32_t>(output_shape_data[i]);
}
const float* filter_data =
const_cast<const float*>(filter->mutable_data<float>());
......@@ -69,7 +68,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
auto strides = op_info->GetAttr<std::vector<int>>("strides");
auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
add_conv_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_input_shape_data[0]),
input_dims.size(),
......@@ -104,3 +102,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
REGISTER_SUBGRAPH_BRIDGE(conv2d,
kBM,
paddle::lite::subgraph::bm::ConvConverter);
REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d,
kBM,
paddle::lite::subgraph::bm::ConvConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_if.h>
#include <bmcompiler_op_code.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
int DropoutConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
// input
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto x_dims = x->dims();
const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
std::vector<int32_t> i_x_shape_data(x_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int>(x_shape_data[i]);
}
// output
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
const int64_t* output_shape_data =
const_cast<const int64_t*>(&output_dims.data()[0]);
std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
auto dropout_prob = op_info->GetAttr<float>("dropout_prob");
auto dropout_implementation =
op_info->GetAttr<std::string>("dropout_implementation");
CHECK_EQ(dropout_implementation, "downgrade_in_infer");
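  // With "downgrade_in_infer", inference-time dropout reduces to an
  // element-wise multiply of the input by (1 - dropout_prob), mapped here to a
  // constant BINARY_MUL layer.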
add_const_binary_layer(graph->GetCompilerHandle(),
static_cast<const char*>(x_var_name.c_str()),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
1.f - dropout_prob,
static_cast<const char*>(output_var_name.c_str()),
BINARY_MUL,
0);
graph->AddNode(output_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(dropout,
kBM,
paddle::lite::subgraph::bm::DropoutConverter);
......@@ -14,6 +14,7 @@
#include <bmcompiler_defs.h>
#include <bmcompiler_if.h>
#include <bmcompiler_if_lite.h>
#include <bmcompiler_op_code.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
......@@ -68,42 +69,52 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
if (y_is_const) {
CHECK_EQ(op_type, "elementwise_add");
}
auto axis = op_info->GetAttr<int>("axis");
int op_code{-1};
int eltwise_if_code{-1};
float coeff[2] = {1.f, 1.f};
if (op_type == "elementwise_mul") {
op_code = 0;
op_code = BINARY_MUL;
eltwise_if_code = 0;
} else if (op_type == "elementwise_add") {
op_code = 1;
op_code = BINARY_ADD;
eltwise_if_code = 1;
} else if (op_type == "elementwise_sub") {
op_code = 1;
op_code = BINARY_SUB;
eltwise_if_code = 1;
coeff[1] = -1.f;
} else {
LOG(FATAL) << "UNSUPPORTED ELTWISE OPERATION: " << op_type;
}
if (!y_is_const) {
add_eltwise_layer(graph->GetCompilerHandle(),
input_num,
shape,
dim,
name,
const_cast<const int*>(&i_output_shape_data[0]),
output_dims.size(),
static_cast<const char*>(output_var_name.c_str()),
op_code,
coeff);
} else {
const float* y_data = const_cast<const float*>(y->mutable_data<float>());
const float* x_data = const_cast<const float*>(x->mutable_data<float>());
bm_add_const_tensor(graph->GetCompilerHandle(),
name[1],
shape[0],
dim[0],
static_cast<bm_data_type_t>(DTYPE_FP32),
static_cast<const void*>(y_data));
const float* y_data = const_cast<const float*>(y->mutable_data<float>());
const float* x_data = const_cast<const float*>(x->mutable_data<float>());
auto unique_op_name = lite::subgraph::bm::UniqueName("expand_ndims");
std::vector<int32_t> i_expand_shape_data(3);
if (y_is_const) {
if (dim[0] == dim[1] || 2 == dim[0]) {
bm_add_const_tensor(graph->GetCompilerHandle(),
name[1],
shape[1],
dim[1],
static_cast<bm_data_type_t>(DTYPE_FP32),
static_cast<const void*>(y_data));
} else if (1 == dim[1] && 1 == axis) {
add_expand_ndims_layer(graph->GetCompilerHandle(),
name[1],
shape[1],
dim[1],
static_cast<const float*>(y_data),
-1,
2,
static_cast<const char*>(unique_op_name.c_str()));
name[1] = static_cast<const char*>(unique_op_name.c_str());
dim[1] = 3;
i_expand_shape_data[0] = i_y_shape_data[0];
i_expand_shape_data[1] = 1;
i_expand_shape_data[2] = 1;
shape[1] = &i_expand_shape_data[0];
y_data = nullptr;
}
add_binary_layer_v2(graph->GetCompilerHandle(),
name[0],
shape[0],
......@@ -111,12 +122,23 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
0,
static_cast<const float*>(x_data),
name[1],
shape[0],
dim[0],
shape[1],
dim[1],
0,
static_cast<const float*>(y_data),
static_cast<const char*>(output_var_name.c_str()),
0);
op_code);
} else {
add_eltwise_layer(graph->GetCompilerHandle(),
input_num,
shape,
dim,
name,
const_cast<const int*>(&i_output_shape_data[0]),
output_dims.size(),
static_cast<const char*>(output_var_name.c_str()),
eltwise_if_code,
coeff);
}
delete[] shape;
delete[] name;
......@@ -133,3 +155,9 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
REGISTER_SUBGRAPH_BRIDGE(elementwise_add,
kBM,
paddle::lite::subgraph::bm::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(elementwise_mul,
kBM,
paddle::lite::subgraph::bm::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(elementwise_sub,
kBM,
paddle::lite::subgraph::bm::ElementwiseConverter);
......@@ -41,8 +41,10 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
}
// add reshape layer
int i_x_reshape_shape_data[2];
for (size_t i = 0; i < 2; i++) {
i_x_reshape_shape_data[i] = static_cast<int>(x_shape_data[i]);
i_x_reshape_shape_data[0] = static_cast<int>(x_shape_data[0]);
i_x_reshape_shape_data[1] = 1;
for (size_t i = 1; i < x_dims.size(); i++) {
i_x_reshape_shape_data[1] *= static_cast<int>(x_shape_data[i]);
}
int reshape_param[] = {0, -1};
auto unique_op_reshape_name =
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_if.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
int NormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto unique_op_name = lite::subgraph::bm::UniqueName(op_type);
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto x_dims = x->dims();
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
const int64_t* output_shape_data =
const_cast<const int64_t*>(&output_dims.data()[0]);
std::vector<int32_t> i_x_shape_data(x_dims.size());
std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int>(x_shape_data[i]);
}
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
float one = 1.f;
auto epsilon = op_info->GetAttr<float>("epsilon");
add_normalize_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
static_cast<const char*>(x_var_name.c_str()),
const_cast<const int*>(&i_output_shape_data[0]),
output_dims.size(),
static_cast<const char*>(output_var_name.c_str()),
static_cast<const char*>(unique_op_name.c_str()),
0,
1,
&one,
epsilon);
graph->AddNode(output_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(norm, kBM, paddle::lite::subgraph::bm::NormConverter);
......@@ -15,10 +15,24 @@
#pragma once
USE_SUBGRAPH_BRIDGE(relu, kBM);
USE_SUBGRAPH_BRIDGE(leaky_relu, kBM);
USE_SUBGRAPH_BRIDGE(conv2d, kBM);
USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kBM);
USE_SUBGRAPH_BRIDGE(elementwise_add, kBM);
USE_SUBGRAPH_BRIDGE(elementwise_mul, kBM);
USE_SUBGRAPH_BRIDGE(elementwise_sub, kBM);
USE_SUBGRAPH_BRIDGE(pool2d, kBM);
USE_SUBGRAPH_BRIDGE(softmax, kBM);
USE_SUBGRAPH_BRIDGE(mul, kBM);
USE_SUBGRAPH_BRIDGE(batch_norm, kBM);
USE_SUBGRAPH_BRIDGE(scale, kBM);
USE_SUBGRAPH_BRIDGE(concat, kBM);
USE_SUBGRAPH_BRIDGE(dropout, kBM);
USE_SUBGRAPH_BRIDGE(transpose, kBM);
USE_SUBGRAPH_BRIDGE(transpose2, kBM);
USE_SUBGRAPH_BRIDGE(reshape, kBM);
USE_SUBGRAPH_BRIDGE(reshape2, kBM);
USE_SUBGRAPH_BRIDGE(flatten, kBM);
USE_SUBGRAPH_BRIDGE(flatten2, kBM);
USE_SUBGRAPH_BRIDGE(norm, kBM);
USE_SUBGRAPH_BRIDGE(prior_box, kBM);
......@@ -65,6 +65,12 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (pooling_type == "avg") {
average_exclusive = op_info->GetAttr<bool>("exclusive");
}
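  // Global pooling ignores the paddings and uses the whole input feature map
  // (H x W) as the pooling window.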
if (global_pooling) {
paddings[0] = 0;
paddings[1] = 0;
ksize[0] = i_x_shape_data[2];
ksize[1] = i_x_shape_data[3];
}
add_pooling_layer(
graph->GetCompilerHandle(),
const_cast<const int*>(&i_x_shape_data[0]),
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_if.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
typedef struct __tag_st_priorbox_param {
std::vector<float> min_sizes;
std::vector<float> max_sizes;
std::vector<float> aspect_ratios;
std::vector<float> variances;
float step_w;
float step_h;
float offset;
int32_t img_w;
int32_t img_h;
int32_t prior_num;
bool min_max_aspect_ratios_order;
bool clip;
bool flip;
} st_priorbox_param;
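// Expand the raw aspect-ratio list: 1.0 is always included, near-duplicates
// (within epsilon) are dropped, and the reciprocal of each ratio is appended
// when flip is enabled.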
inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
bool flip,
std::vector<float>* output_aspect_ratior) {
constexpr float epsilon = 1e-6;
output_aspect_ratior->clear();
output_aspect_ratior->push_back(1.0f);
for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
float ar = input_aspect_ratior[i];
bool already_exist = false;
for (size_t j = 0; j < output_aspect_ratior->size(); ++j) {
if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) {
already_exist = true;
break;
}
}
if (!already_exist) {
output_aspect_ratior->push_back(ar);
if (flip) {
output_aspect_ratior->push_back(1.0f / ar);
}
}
}
}
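// Compute SSD prior boxes on the CPU. The returned malloc'ed buffer holds the
// box coordinates followed by their variances (2 * boxes->data_size() floats
// in total) and is handed to the BM priorbox layer as constant data.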
float* compute_priorbox_kernel(OpLite* op, st_priorbox_param* param) {
auto op_info = op->op_info();
auto scope = op->scope();
// inputs
auto in_var_name = op_info->Input("Input").front();
auto in = scope->FindVar(in_var_name)->GetMutable<lite::Tensor>();
auto in_dims = in->dims();
auto img_var_name = op_info->Input("Image").front();
auto img = scope->FindVar(img_var_name)->GetMutable<lite::Tensor>();
auto img_dims = img->dims();
// outputs
auto boxes_var_name = op_info->Output("Boxes").front();
auto boxes = scope->FindVar(boxes_var_name)->GetMutable<lite::Tensor>();
auto var_var_name = op_info->Output("Variances").front();
auto var = scope->FindVar(var_var_name)->GetMutable<lite::Tensor>();
std::vector<float> expand_aspect_ratios;
ExpandAspectRatios(param->aspect_ratios, param->flip, &expand_aspect_ratios);
param->aspect_ratios.clear();
for (size_t i = 0; i < expand_aspect_ratios.size(); i++) {
param->aspect_ratios.push_back(expand_aspect_ratios[i]);
}
param->prior_num = param->aspect_ratios.size() * param->min_sizes.size();
if (param->max_sizes.size() > 0) {
param->prior_num += param->max_sizes.size();
}
int32_t win1 = in_dims[3];
int32_t hin1 = in_dims[2];
DDim shape_out({hin1, win1, param->prior_num, 4});
boxes->Resize(shape_out);
var->Resize(shape_out);
// boxes->mutable_data<float>();
// var->mutable_data<float>();
float* cpu_data =
static_cast<float*>(malloc(sizeof(float) * boxes->data_size() * 2));
CHECK(cpu_data != nullptr);
const int32_t width = in_dims[3];
const int32_t height = in_dims[2];
int32_t img_width = param->img_w;
int32_t img_height = param->img_h;
if (img_width == 0 || img_height == 0) {
img_width = img_dims[3];
img_height = img_dims[2];
}
float step_w = param->step_w;
float step_h = param->step_h;
if (step_w == 0.f || step_h == 0.f) {
step_w = static_cast<float>(img_width) / width;
step_h = static_cast<float>(img_height) / height;
}
float offset = param->offset;
int32_t channel_size = height * width * param->prior_num * 4;
int32_t idx = 0;
///////////////////////////////////////////////////////////////////////
for (int32_t h = 0; h < height; ++h) {
for (int32_t w = 0; w < width; ++w) {
float center_x = (w + offset) * step_w;
float center_y = (h + offset) * step_h;
float box_width = 0.f;
float box_height = 0.f;
float* min_buf = reinterpret_cast<float*>(malloc(sizeof(float) * 4));
float* max_buf = reinterpret_cast<float*>(malloc(sizeof(float) * 4));
float* com_buf = reinterpret_cast<float*>(
malloc(sizeof(float) * expand_aspect_ratios.size() * 4));
CHECK(min_buf != nullptr);
CHECK(max_buf != nullptr);
CHECK(com_buf != nullptr);
// LOG(INFO) << "the number of min_size is " << min_sizes_.size();
for (size_t s = 0; s < param->min_sizes.size(); ++s) {
int32_t min_idx = 0;
int32_t max_idx = 0;
int32_t com_idx = 0;
int32_t min_size = param->min_sizes[s];
//! first prior: aspect_ratio = 1, size = min_size
box_width = box_height = min_size;
//! xmin
min_buf[min_idx++] = (center_x - box_width / 2.f) / img_width;
//! ymin
min_buf[min_idx++] = (center_y - box_height / 2.f) / img_height;
//! xmax
min_buf[min_idx++] = (center_x + box_width / 2.f) / img_width;
//! ymax
min_buf[min_idx++] = (center_y + box_height / 2.f) / img_height;
if (param->max_sizes.size() > 0) {
int max_size = param->max_sizes[s];
//! second prior: aspect_ratio = 1, size = sqrt(min_size * max_size)
box_width = box_height = sqrtf(min_size * max_size);
//! xmin
max_buf[max_idx++] = (center_x - box_width / 2.f) / img_width;
//! ymin
max_buf[max_idx++] = (center_y - box_height / 2.f) / img_height;
//! xmax
max_buf[max_idx++] = (center_x + box_width / 2.f) / img_width;
//! ymax
max_buf[max_idx++] = (center_y + box_height / 2.f) / img_height;
}
//! rest of priors
for (size_t r = 0; r < expand_aspect_ratios.size(); ++r) {
float ar = expand_aspect_ratios[r];
if (fabs(ar - 1.) < 1e-6) {
continue;
}
box_width = min_size * sqrt(ar);
box_height = min_size / sqrt(ar);
//! xmin
com_buf[com_idx++] = (center_x - box_width / 2.f) / img_width;
//! ymin
com_buf[com_idx++] = (center_y - box_height / 2.f) / img_height;
//! xmax
com_buf[com_idx++] = (center_x + box_width / 2.f) / img_width;
//! ymax
com_buf[com_idx++] = (center_y + box_height / 2.f) / img_height;
}
if (param->min_max_aspect_ratios_order) {
memcpy(cpu_data + idx, min_buf, sizeof(float) * min_idx);
idx += min_idx;
memcpy(cpu_data + idx, max_buf, sizeof(float) * max_idx);
idx += max_idx;
memcpy(cpu_data + idx, com_buf, sizeof(float) * com_idx);
idx += com_idx;
} else {
memcpy(cpu_data + idx, com_buf, sizeof(float) * com_idx);
idx += com_idx;
memcpy(cpu_data + idx, max_buf, sizeof(float) * max_idx);
idx += max_idx;
}
}
free(min_buf);
free(max_buf);
free(com_buf);
}
}
  //! clip the prior's coordinates such that they are within [0, 1]
if (param->clip) {
for (int32_t d = 0; d < channel_size; ++d) {
cpu_data[d] = std::min(std::max(cpu_data[d], 0.f), 1.f);
}
}
//! set the variance.
float* ptr = cpu_data + channel_size;
int count = 0;
for (int32_t h = 0; h < height; ++h) {
for (int32_t w = 0; w < width; ++w) {
for (int32_t i = 0; i < param->prior_num; ++i) {
for (int j = 0; j < 4; ++j) {
ptr[count] = param->variances[j];
++count;
}
}
}
}
return cpu_data;
}
int PriorBoxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
// inputs
auto in_var_name = op_info->Input("Input").front();
auto in = scope->FindVar(in_var_name)->GetMutable<lite::Tensor>();
auto in_dims = in->dims();
auto img_var_name = op_info->Input("Image").front();
auto img = scope->FindVar(img_var_name)->GetMutable<lite::Tensor>();
auto img_dims = img->dims();
std::vector<int32_t> i_input_shape_data(in_dims.size());
for (size_t i = 0; i < in_dims.size(); i++) {
i_input_shape_data[i] = static_cast<int32_t>(in_dims[i]);
}
// outputs
auto boxes_var_name = op_info->Output("Boxes").front();
auto boxes = scope->FindVar(boxes_var_name)->GetMutable<lite::Tensor>();
auto var_var_name = op_info->Output("Variances").front();
auto unique_op_name = lite::subgraph::bm::UniqueName(op_type);
// param
st_priorbox_param param;
param.clip = op_info->GetAttr<bool>("clip");
param.min_sizes = op_info->GetAttr<std::vector<float>>("min_sizes");
param.max_sizes = op_info->GetAttr<std::vector<float>>("max_sizes");
param.aspect_ratios = op_info->GetAttr<std::vector<float>>("aspect_ratios");
param.variances = op_info->GetAttr<std::vector<float>>("variances");
param.offset = op_info->GetAttr<float>("offset");
if (op_info->HasAttr("flip")) {
param.flip = op_info->GetAttr<bool>("flip");
}
if (op_info->HasAttr("img_w")) {
param.img_w = op_info->GetAttr<int32_t>("img_w");
}
if (op_info->HasAttr("img_h")) {
param.img_h = op_info->GetAttr<int32_t>("img_h");
}
if (op_info->HasAttr("step_w")) {
param.step_w = op_info->GetAttr<float>("step_w");
}
if (op_info->HasAttr("step_h")) {
param.step_h = op_info->GetAttr<float>("step_h");
}
if (op_info->HasAttr("prior_num")) {
param.prior_num = op_info->GetAttr<int32_t>("prior_num");
}
if (op_info->HasAttr("min_max_aspect_ratios_order")) {
param.min_max_aspect_ratios_order =
op_info->GetAttr<bool>("min_max_aspect_ratios_order");
}
  float* cpu_data = compute_priorbox_kernel(op, &param);
auto boxes_dims = boxes->dims();
std::vector<int32_t> i_pri_out_shape_data(boxes_dims.size());
for (size_t i = 0; i < boxes_dims.size(); i++) {
i_pri_out_shape_data[i] = static_cast<int32_t>(boxes_dims[i]);
}
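  // The precomputed buffer holds the boxes followed by the variances, so the
  // BM priorbox layer's output doubles the first dimension; it is split back
  // into Boxes and Variances further below.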
i_pri_out_shape_data[0] *= 2;
add_priorbox_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_input_shape_data[0]),
in_dims.size(),
static_cast<const char*>(in_var_name.c_str()),
const_cast<const int*>(&i_pri_out_shape_data[0]),
boxes_dims.size(),
static_cast<const char*>(unique_op_name.c_str()),
static_cast<const float*>(cpu_data),
param.min_sizes.size(),
const_cast<const float*>(&param.min_sizes[0]),
param.max_sizes.size(),
const_cast<const float*>(&param.max_sizes[0]),
param.aspect_ratios.size(),
const_cast<const float*>(&param.aspect_ratios[0]),
static_cast<int>(param.flip),
static_cast<int>(param.clip),
param.variances.size(),
const_cast<const float*>(&param.variances[0]),
param.img_h,
param.img_w,
param.step_h,
param.step_w,
param.offset);
std::vector<int32_t> i_output_shape_data(boxes_dims.size());
for (size_t i = 0; i < boxes_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int32_t>(boxes_dims[i]);
}
int32_t* shape[2];
int dim[2];
const char* name[2];
dim[0] = boxes_dims.size();
dim[1] = boxes_dims.size();
name[0] = static_cast<const char*>(boxes_var_name.c_str());
name[1] = static_cast<const char*>(var_var_name.c_str());
shape[0] = &i_output_shape_data[0];
shape[1] = &i_output_shape_data[0];
int split_size = 2;
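  // Split the stacked output evenly along the first dimension into the Boxes
  // and Variances graph outputs.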
add_tf_split_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_pri_out_shape_data[0]),
boxes_dims.size(),
static_cast<const char*>(unique_op_name.c_str()),
2,
shape,
dim,
name,
boxes_dims.size(),
0,
&split_size,
0);
graph->AddNode(boxes_var_name);
graph->AddNode(var_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(prior_box,
kBM,
paddle::lite::subgraph::bm::PriorBoxConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_if.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto x_dims = x->dims();
std::vector<int32_t> i_x_shape_data(x_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int32_t>(x_dims[i]);
}
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int32_t>(output_dims[i]);
}
// auto axis = op_info->GetAttr<int>("axis");
add_reshape_layer_v2(graph->GetCompilerHandle(),
static_cast<const char*>(x_var_name.c_str()),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
static_cast<const char*>(output_var_name.c_str()),
const_cast<const int*>(&i_output_shape_data[0]),
output_dims.size());
graph->AddNode(output_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(reshape,
kBM,
paddle::lite::subgraph::bm::ReshapeConverter);
REGISTER_SUBGRAPH_BRIDGE(reshape2,
kBM,
paddle::lite::subgraph::bm::ReshapeConverter);
REGISTER_SUBGRAPH_BRIDGE(flatten,
kBM,
paddle::lite::subgraph::bm::ReshapeConverter);
REGISTER_SUBGRAPH_BRIDGE(flatten2,
kBM,
paddle::lite::subgraph::bm::ReshapeConverter);
......@@ -48,7 +48,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
for (size_t i = 0; i < length; i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
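  // "axis" is optional; default to -1 (the last dimension) and convert
  // negative values to a real dimension index below.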
auto axis = op_info->GetAttr<int>("axis");
int32_t axis = -1;
if (op_info->HasAttr("axis")) {
axis = op_info->GetAttr<int>("axis");
}
if (axis < 0) {
axis += x_dims.size();
}
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_defs.h>
#include <bmcompiler_if.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto x_dims = x->dims();
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
const int64_t* output_shape_data =
const_cast<const int64_t*>(&output_dims.data()[0]);
std::vector<int32_t> i_x_shape_data(x_dims.size());
std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int>(x_shape_data[i]);
}
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
auto axis = op_info->GetAttr<std::vector<int>>("axis");
CHECK_EQ(axis.size(), x_dims.size());
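  // "axis" holds the full permutation of the input dimensions and is
  // forwarded directly to the BM transpose layer.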
add_transpose_layer_v2(graph->GetCompilerHandle(),
static_cast<const char*>(x_var_name.c_str()),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
DTYPE_FP32,
static_cast<const char*>(output_var_name.c_str()),
NULL,
const_cast<const int*>(&axis[0]));
graph->AddNode(output_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(transpose,
kBM,
paddle::lite::subgraph::bm::TransposeConverter);
REGISTER_SUBGRAPH_BRIDGE(transpose2,
kBM,
paddle::lite::subgraph::bm::TransposeConverter);
......@@ -54,7 +54,7 @@ int SubgraphEngine::BuildDeviceProgram() {
}
std::string net_name = "paddle_bitmain";
__bmcompile_opt(
graph.GetCompilerHandle(), const_cast<char*>(net_name.c_str()), 1);
void* bmodel_data = nullptr;
unsigned int data_size = 0;
bm_hd_ = static_cast<bm_handle_t>(ctx.GetHandle());
......@@ -109,7 +109,6 @@ int SubgraphEngine::BuildDeviceProgram() {
net_info_->output_dtypes[i],
stage.output_shapes[i]);
}
return status;
}
......
......@@ -4,6 +4,7 @@ add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_
add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op)
add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
add_kernel(one_hot_compute_host Host extra SRCS one_hot_compute.cc DEPS ${lite_kernel_deps})
#lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any)
#lite_cc_test(test_multiclass_nms_compute_host SRCS multiclass_nms_compute_test.cc DEPS multiclass_nms_compute_host any)
......@@ -426,8 +426,14 @@ REGISTER_LITE_KERNEL(multiclass_nms,
kNCHW,
paddle::lite::kernels::host::MulticlassNmsCompute,
def)
.BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("BBoxes",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindInput("Scores",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Index",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <map>
#include <utility>
#include <vector>
#include "lite/backends/fpga/KD/debugger.hpp"
#include "lite/kernels/host/one_hot_compute.h"
#include "lite/utils/paddle_enforce.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
void OneHotCompute::Run() {
auto& param = Param<operators::OneHotParam>();
param.Out->mutable_data<float>();
int depth = param.depth;
if (param.depth_tensor) {
auto* depth_tensor = param.depth_tensor;
auto* depth_data = depth_tensor->data<int32_t>();
depth = depth_data[0];
auto in_dims = param.X->dims();
DDim out_dims(in_dims);
out_dims[out_dims.size() - 1] = depth;
param.Out->Resize(out_dims);
}
auto* p_in_data = param.X->data<float>();
auto numel = param.X->numel();
auto* p_out_data = param.Out->mutable_data<float>();
for (int i = 0; i < param.Out->numel(); ++i) {
p_out_data[i] = 0;
}
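  // Scatter phase: flattened input element i writes a 1 at column
  // p_in_data[i] of output row i. With allow_out_of_range, indices outside
  // [0, depth) are silently skipped; otherwise they trigger an enforce error.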
  if (param.allow_out_of_range) {
    for (int i = 0; i < numel; ++i) {
      if (p_in_data[i] >= 0 && p_in_data[i] < depth) {
        *(p_out_data + i * depth + (int)(p_in_data[i])) = 1.0;  // NOLINT
      }
    }
  } else {
    for (int i = 0; i < numel; ++i) {
      PADDLE_ENFORCE_GE(
          p_in_data[i], 0, "Illegal index value, should be at least 0.");
      PADDLE_ENFORCE_LT(p_in_data[i],
                        depth,
                        "Illegal index value, should be less than depth (%d).",
                        depth);
      *(p_out_data + i * depth + (int)(p_in_data[i])) = 1.0;  // NOLINT
    }
  }
}
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(one_hot,
kHost,
kFloat,
kNCHW,
paddle::lite::kernels::host::OneHotCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
class OneHotCompute
: public KernelLite<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
void Run() override;
virtual ~OneHotCompute() = default;
};
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -46,17 +46,21 @@ REGISTER_LITE_KERNEL(reshape,
paddle::lite::kernels::host::ReshapeCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindInput("ShapeTensor",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindInput("Shape",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(reshape2,
......
......@@ -34,27 +34,29 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto input_type = kernel->GetInputDeclType("Input");
CHECK(input_type->precision() == PRECISION(kFloat));
CHECK(input_type->layout() == DATALAYOUT(kNCHW));
auto input = scope->FindTensor(input_name);
auto input_dims = input->dims();
CHECK_GE(input_dims.size(), 2UL);
auto w_name = op_info->Input("W").front();
auto w_type = kernel->GetInputDeclType("W");
CHECK(w_type->precision() == PRECISION(kFloat));
CHECK(w_type->layout() == DATALAYOUT(kNCHW));
auto w = scope->FindTensor(w_name);
auto w_dims = w->dims();
CHECK_EQ(w_dims.size(), 2UL);
auto out_name = op_info->Output("Out").front();
auto out_type = kernel->GetOutputDeclType("Out");
CHECK(out_type->precision() == PRECISION(kFloat));
CHECK(out_type->layout() == DATALAYOUT(kNCHW));
auto out = scope->FindTensor(out_name);
auto out_dims = out->dims();
int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims");
int m = input_dims.Slice(0, in_num_col_dims).production();
int k = input_dims.Slice(in_num_col_dims, input_dims.size()).production();
int n = w_dims[1];
CHECK_EQ(k * n, w_dims.production());
VLOG(3) << "[NPU] input dims: " << input_dims << " w dims: " << w_dims
<< " m: " << m << " k: " << k << " n: " << n;
// Create input node and reshape it to (m, k, 1, 1)
std::shared_ptr<Node> input_node = nullptr;
......@@ -76,7 +78,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
transpose_w.Resize({n, k, 1, 1});
transpose_w.set_persistable(true);
auto transpose_w_data = transpose_w.mutable_data<float>();
auto w_data = w->data<float>();
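  // Transpose W from (k, n) to (n, k) so it matches the (n, k, 1, 1) layout
  // that transpose_w was resized to above for the FullConnection weight input.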
for (int i = 0; i < k; i++) {
for (int j = 0; j < n; j++) {
transpose_w_data[j * k + i] = w_data[i * n + j];
......@@ -85,10 +87,11 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto trans_w_node = graph->Add(w_name, transpose_w);
// FC node
auto fc_node = graph->Add<ge::op::FullConnection>(out_name);
auto fc_op = fc_node->data<ge::op::FullConnection>();
fc_op->set_input_x(*reshaped_input_node->data());
fc_op->set_input_w(*trans_w_node->data());
// Add bias node if bias tensor exists
if (HasInputArg(op_info, scope, "Bias")) {
std::shared_ptr<Node> bias_node = nullptr;
......@@ -99,19 +102,23 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto bias_type = kernel->GetInputDeclType("Bias");
CHECK(bias_type->precision() == PRECISION(kFloat));
CHECK(bias_type->layout() == DATALAYOUT(kNCHW));
auto bias = scope->FindTensor(bias_name);
auto bias_dims = bias->dims();
CHECK_EQ(bias_dims.production(), n);
bias_node = graph->Add(bias_name, *bias, {1, n, 1, 1});
}
fc_op->set_input_b(*bias_node->data());
}
// Reshape output of FC node from (m, n, 1, 1) to out_shape
auto reshaped_fc_node = graph->Add<ge::op::Reshape>(out_name);
auto reshaped_fc_op = reshaped_fc_node->data<ge::op::Reshape>();
reshaped_fc_op->set_input_tensor(*fc_node->data());
auto out_shape = out_dims.Vectorize();
reshaped_fc_op->set_attr_shape(
ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end()));
reshaped_fc_op->set_attr_axis(0);
return REBUILD_WHEN_SHAPE_CHANGED;
}
......
......@@ -42,7 +42,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto out_type = kernel->GetOutputDeclType("Out");
CHECK(out_type->precision() == PRECISION(kFloat));
CHECK(out_type->layout() == DATALAYOUT(kNCHW));
auto axis = op_info->GetAttr<int>("axis");
int axis = op_info->HasAttr("axis") ? op_info->GetAttr<int>("axis") : -1;
if (axis < 0) {
axis += x_rank;
}
......
......@@ -20,7 +20,9 @@ add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc
add_kernel(reshape_opencl OPENCL basic SRCS reshape_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps} cl_image_converter)
add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps})
add_kernel(concat_opencl OPENCL basic SRCS concat_compute.cc DEPS ${cl_kernel_deps})
add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_compute.cc DEPS ${cl_kernel_deps})
add_kernel(scale_opencl OPENCL basic SRCS scale_compute.cc DEPS ${cl_kernel_deps})
lite_cc_test(test_elementwise_add_opencl SRCS elementwise_add_compute_test.cc
DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context
......@@ -83,6 +85,15 @@ lite_cc_test(test_conv_image2d_opencl SRCS conv_image2d_compute_test.cc
lite_cc_test(test_layout_opencl SRCS layout_compute_test.cc
DEPS layout_opencl op_registry program context cl_image_converter
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_concat_opencl SRCS concat_compute_test.cc
DEPS concat_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_nearest_interp_opencl SRCS nearest_interp_compute_test.cc
DEPS nearest_interp_opencl layout_opencl op_registry program context cl_image_converter
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_scale_opencl SRCS scale_compute_test.cc
DEPS scale_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/opencl/concat_compute.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
template <>
void ConcatCompute<PRECISION(kFloat),
DATALAYOUT(kImageDefault)>::PrepareForRun() {
auto& context = ctx_->As<OpenCLContext>();
concat_param_ = param_.get_mutable<param_t>();
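  // Two inputs use the specialized concat2 kernel; more inputs fall back to
  // concat_mul, which Run() launches once per input tensor.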
if (concat_param_->x.size() == 2) {
kernel_func_name_ = "concat2";
} else {
kernel_func_name_ = "concat_mul";
}
context.cl_context()->AddKernel(
kernel_func_name_, "image/concat_kernel.cl", build_options_);
// UpdateParams<kFloat, kImageDefault>();
auto axis = concat_param_->axis;
auto inputs = concat_param_->x;
auto out_dims = concat_param_->output->dims();
auto* axis_tensor = concat_param_->axis_tensor;
if (axis_tensor != nullptr) {
// auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
// axis = axis_tensor_data[0];
}
auto in_dims = inputs[0]->dims();
axis_size_ = out_dims[axis];
axis_ = axis;
for (int i = 0; i < axis; i++) {
pre_size_ *= in_dims[i];
}
for (int i = axis + 1; i < in_dims.size(); i++) {
post_size_ *= in_dims[i];
}
for (int i = 1; i < inputs.size(); i++) {
auto dims = inputs[i]->dims();
// auto flag = CHECK_EQ_OR_FALSE(in_dims.size(), dims.size());
if (in_dims.size() != dims.size()) {
printf("input shape must be same \n");
return;
}
for (int i = 0; i < dims.size(); i++) {
if (i != axis) {
if (in_dims[i] != dims[i]) {
printf("input shape must be same \n");
return;
}
}
}
}
}
template <>
void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kImageDefault)>::Run() {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.output->dims();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = param.output->mutable_data<float, cl::Image2D>(
image_shape["width"], image_shape["height"]);
const auto& y_dims = param.output->dims(); // useless: check dim only
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto inputs = param.x;
int arg_idx = 0;
  // last input dim; overwritten by the switch below for supported axes
  int width = inputs[0]->dims()[inputs[0]->dims().size() - 1];
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
static_cast<cl::size_type>(image_shape["height"])};
VLOG(4) << TargetToStr(param.output->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
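  // The switch below maps the concat axis onto the 2D image layout
  // (image width spans C/4 * W, image height spans N * H): `width` is the
  // extent used to locate elements along that direction and `flag` selects
  // whether the kernel walks the image's width (1) or height (0) axis.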
int flag = 1; // cxw
switch (axis_) {
case 0:
width = x_dims[2]; // n
flag = 0;
break;
case 1:
width = x_dims[3]; // c
break;
case 2:
width = x_dims[0]; // h
flag = 0;
break;
case 3:
case -1:
width = x_dims[1]; // w
break;
default:
printf("this axis: %d does not support \n", axis_);
}
if (inputs.size() == 2) {
auto* x_buf0 = inputs[0]->data<float, cl::Image2D>();
auto* x_buf1 = inputs[1]->data<float, cl::Image2D>();
cl_int status = kernel.setArg(arg_idx, *x_buf0);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *x_buf1);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status =
kernel.setArg(++arg_idx, static_cast<int>(inputs[0]->dims()[axis_]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, flag);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, width);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_context()->GetCommandQueue().finish();
} else {
auto start = 0;
for (int i = 0; i < inputs.size(); i++) {
arg_idx = 0;
auto* x_buf = inputs[i]->data<float, cl::Image2D>();
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, axis_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, start);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, flag);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, width);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_context()->GetCommandQueue().finish();
start += inputs[i]->dims()[axis_];
}
}
}
template <>
std::string ConcatCompute<PRECISION(kFloat), DATALAYOUT(kImageDefault)>::doc() {
return "Concat using cl::Image, kFloat";
}
template <>
void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::PrepareForRun() {
auto& context = ctx_->As<OpenCLContext>();
concat_param_ = param_.get_mutable<param_t>();
if (concat_param_->x.size() == 2) {
kernel_func_name_ = "concat2";
} else {
kernel_func_name_ = "concat_mul";
}
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/concat_kernel.cl", build_options_);
// UpdateParams<kFloat, kImageDefault>();
auto axis = concat_param_->axis;
auto inputs = concat_param_->x;
auto out_dims = concat_param_->output->dims();
auto* axis_tensor = concat_param_->axis_tensor;
if (axis_tensor != nullptr) {
// auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
// axis = axis_tensor_data[0];
}
auto in_dims = inputs[0]->dims();
axis_size_ = out_dims[axis];
axis_ = axis;
for (int i = 0; i < axis; i++) {
pre_size_ *= in_dims[i];
}
for (int i = axis + 1; i < in_dims.size(); i++) {
post_size_ *= in_dims[i];
}
for (int i = 1; i < inputs.size(); i++) {
auto dims = inputs[i]->dims();
if (in_dims.size() != dims.size()) {
printf("input shape must be same \n");
return;
}
for (int i = 0; i < dims.size(); i++) {
if (i != axis) {
if (in_dims[i] != dims[i]) {
printf("input shape must be same \n");
return;
}
}
}
}
}
template <>
void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::Run() {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.output->dims();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf =
param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
const auto& y_dims = param.output->dims(); // useless: check dim only
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto inputs = param.x;
int arg_idx = 0;
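  // pre_size_ / post_size_ (set in PrepareForRun) are the element counts
  // before / after the concat axis; total is the number of output elements
  // spanned by one pre-axis slice.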
auto global_work_size = cl::NDRange{axis_size_};
int total = axis_size_ * post_size_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
if (inputs.size() == 2) {
auto* x_buf0 = inputs[0]->data<float, cl::Buffer>();
auto* x_buf1 = inputs[1]->data<float, cl::Buffer>();
auto axis0 = inputs[0]->dims()[axis_];
int total0 = axis0 * post_size_;
int total1 = (axis_size_ - axis0) * post_size_;
cl_int status = kernel.setArg(arg_idx, *x_buf0);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *x_buf1);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<int>(axis0));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, axis_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, pre_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, post_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, total);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, total0);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, total1);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_buf, event_);
} else {
auto start = 0;
for (int i = 0; i < inputs.size(); i++) {
arg_idx = 0;
int size = inputs[i]->dims()[axis_];
auto* x_buf = inputs[i]->data<float, cl::Buffer>();
global_work_size = cl::NDRange{static_cast<size_t>(size)};
int total0 = size * post_size_;
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<int>(size));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, pre_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, post_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, start);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, total);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, total0);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_buf, event_);
start += size;
}
}
}
template <>
std::string ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::doc() {
return "Concat using cl::Buffer, kFloat";
}
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
typedef paddle::lite::kernels::opencl::ConcatCompute<PRECISION(kFloat),
DATALAYOUT(kNCHW)>
Concat_buffer;
typedef paddle::lite::kernels::opencl::ConcatCompute<PRECISION(kFloat),
DATALAYOUT(kImageDefault)>
Concat_image;
REGISTER_LITE_KERNEL(
concat, kOpenCL, kFloat, kImageDefault, Concat_image, ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindInput("AxisTensor",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kInt32),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize();
REGISTER_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, Concat_buffer, def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindInput("AxisTensor",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kInt32),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/kernel.h"
#include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
template <PrecisionType Ptype, DataLayoutType layout>
class ConcatCompute : public KernelLite<TARGET(kOpenCL), Ptype, layout> {
public:
using param_t = operators::ConcatParam;
void PrepareForRun() override;
void Run() override;
std::string doc(); // override;
// protected:
// void UpdateParams();
int axis_size_ = 1;
int post_size_ = 1;
int pre_size_ = 1;
int axis_ = 1;
param_t* concat_param_{nullptr};
std::string kernel_func_name_{};
std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -362,6 +362,20 @@ void ConvImageCompute::PrepareForRun() {
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d1x1;
  } else if (kernel_h == 3 && kernel_w == 3) {
// conv2d_3x3
kernel_func_names_.push_back("conv2d_3x3");
kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl");
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
std::vector<float> filter_image_v(filter_image_dims[0] *
filter_image_dims[1] * 4); // 4 : RGBA
converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
filter_gpu_image_.mutable_data<float, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d3x3;
} else if (kernel_h == 5 && kernel_w == 5) {
// conv2d_5x5
kernel_func_names_.push_back("conv2d_5x5");
......@@ -582,6 +596,184 @@ void ConvImageCompute::Conv2d1x1() {
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d3x3() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto* input_image = param.x->data<float, cl::Image2D>();
auto* filter_image = filter_gpu_image_.data<float, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int input_channel = input_dims[1];
int output_width = output_dims[3];
int output_height = output_dims[2];
int output_channel = output_dims[1];
int filter_width = filter_dims[3];
int filter_height = filter_dims[2];
int filter_channel = filter_dims[1];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<float, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
static_cast<int>(paddings[0]);
// calc input_c_block
auto input_image_shape = InitImageDimInfoWith(input_dims);
int input_c_block = input_image_shape["width"] / input_dims[3];
int input_c = input_dims[1];
auto dilations = *param.dilations;
// re-calc group
int new_groups{param.groups};
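  // A standard convolution (filter Cout == output channels, filter Cin ==
  // input channels) runs as a single group; a depthwise filter (filter Cin
  // == 1) keeps param.groups; other grouped convs derive the group count
  // from Cin / filter_Cin.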
if (filter_dims[0] == output_dims[1] && filter_dims[1] == input_dims[1]) {
new_groups = 1;
} else if (!(filter_dims[0] == input_dims[1] && filter_dims[1] == 1)) {
new_groups = input_channel / filter_channel;
}
/* TODO(ysh329): mobile has no case below
else {
LOG(FATAL) << "Not support conv3x3 case with"
<< " input_dims:" << input_dims << " output_dims:" <<
output_dims
<< " filter_dims:" << filter_dims;
}
*/
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
VLOG(4) << "============ conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
VLOG(4) << "input_c_block: " << input_c_block;
VLOG(4) << "input_c: " << input_c;
VLOG(4) << "input_image: " << input_image;
VLOG(4) << "input_dims: " << input_dims;
VLOG(4) << "filter_dims: " << filter_dims;
VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "param.groups(groups):" << param.groups;
VLOG(4) << "new_groups:" << new_groups;
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_.data<float, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
VLOG(4) << "set bias_image: ";
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, offset);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_channel);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_channel);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, new_groups);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d5x5() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
......@@ -726,6 +918,7 @@ void ConvImageCompute::Conv2d5x5() {
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d7x7() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
......
......@@ -71,6 +71,7 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
private:
void Conv2d1x1();
void Conv2d3x3();
void Conv2d5x5();
void Conv2d7x7();
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ScaleParam;
std::string doc() const override { return "Scale using cl::Image2D, kFloat"; }
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/scale_kernel.cl", build_options_);
}
void Run() override {
const auto& param = *param_.get_mutable<param_t>();
const auto& in_dims = param.x->dims();
auto* x_img = param.x->data<float, cl::Image2D>();
const float scale = param.scale;
const float bias = param.bias;
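    // scale and bias are passed straight to image/scale_kernel.cl, which
    // applies them element-wise over the whole image.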
LOG(INFO) << "x_image" << x_img;
auto out_image_shape = InitImageDimInfoWith(in_dims);
LOG(INFO) << "out_image_shape = " << out_image_shape["width"] << " "
<< out_image_shape["height"];
auto* out_img = param.output->mutable_data<float, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
LOG(INFO) << "out_image" << out_img;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(out_image_shape["width"]),
static_cast<cl::size_type>(out_image_shape["height"])};
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, scale);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, bias);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
}
private:
std::string kernel_func_name_{"scale"};
std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(scale,
kOpenCL,
kFloat,
kImageDefault,
paddle::lite::kernels::opencl::ScaleComputeImage2D,
image2d)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize();
......@@ -41,7 +41,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto out_type = kernel->GetOutputDeclType("Out");
CHECK(out_type->precision() == PRECISION(kFloat));
CHECK(out_type->layout() == DATALAYOUT(kNCHW));
int axis = op_info->HasAttr("axis") ? op_info->GetAttr<int>("axis") : -1;
// X node
std::shared_ptr<Node> x_node = nullptr;
......
......@@ -135,6 +135,8 @@ add_operator(search_seq_fc_op extra SRCS search_seq_fc_op.cc DEPS ${op_DEPS})
add_operator(sequence_topk_avg_pooling_op basic SRCS sequence_topk_avg_pooling_op.cc DEPS ${op_DEPS})
add_operator(search_fc_op basic SRCS search_fc_op.cc DEPS ${op_DEPS})
add_operator(one_hot basic SRCS one_hot_op.cc DEPS ${op_DEPS})
if (NOT LITE_WITH_X86)
lite_cc_test(test_fc_op SRCS fc_op_test.cc
DEPS fc_op memory
......