Commit ef2616b6 authored by Liu Yiqun

Merge branch 'develop' into step_rnn/opt_ddim_lite

@@ -35,6 +35,7 @@
build/
build_fpga/
docs/_build/
.idea/
......
Please refer to the [PaddleLite documentation style guide](http://agroup.baidu.com/paddle-infer/md/article/2561104).
# How to add a new OP
The following walks through adding a new op, using argmax as the example.
## 1. Add an OpParam struct to carry the op's inputs and outputs
- Here it is named `ArgmaxParam`.
- Add the `ArgmaxParam` struct to `paddlelite/lite/operators/op_params.h`:
```c++
struct ArgmaxParam {
  lite::Tensor* X{};    // input tensor
  lite::Tensor* Out{};  // output tensor
  int Axis{0};          // axis along which the argmax is taken
};
```
## 2. Add and register the Argmax op
- Create argmax_op.h under paddlelite/lite/operators/; the core code is:
```c++
class ArgmaxOpLite : public OpLite {
 public:
  ArgmaxOpLite() {}
  explicit ArgmaxOpLite(const std::string &op_type) : OpLite(op_type) {}
  bool CheckShape() const override;
  bool InferShape() const override;
  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
  std::string DebugString() const override { return "argmax"; }

 private:
  mutable ArgmaxParam param_;
};
```
`ArgmaxOpLite` inherits from `OpLite` and holds an `ArgmaxParam` struct as a member. The interfaces to implement are `CheckShape()`, `InferShape()`, `AttachImpl()`, `AttachKernel()` and `DebugString()`. `AttachKernel()` and `DebugString()` are simple, so they are implemented inline here.
- Create argmax_op.cc under the `paddlelite/lite/operators/` directory and implement `CheckShape()`, `InferShape()` and `AttachImpl()`. `CheckShape()` checks that the inputs are valid, `InferShape()` derives the output dimensions from the inputs, and `AttachImpl()` binds the op's inputs and outputs. Then register argmax in argmax_op.cc; the core code is:
```c++
bool ArgmaxOpLite::CheckShape() const {
  CHECK_OR_FALSE(param_.X);
  CHECK_OR_FALSE(param_.Out);
  CHECK_OR_FALSE(param_.Axis < (param_.X)->dims().size());
  return true;
}

bool ArgmaxOpLite::InferShape() const {
  auto x_dims = param_.X->dims();
  int x_rank = x_dims.size();
  int axis = param_.Axis;
  if (axis < 0) axis += x_rank;

  // The output keeps every input dimension except the one along `axis`.
  std::vector<int64_t> out_dims;
  for (int64_t i = 0; i < axis; i++) {
    out_dims.push_back(x_dims[i]);
  }
  for (int64_t i = axis + 1; i < x_rank; i++) {
    out_dims.push_back(x_dims[i]);
  }

  // Set output dims
  param_.Out->Resize(lite::DDim(out_dims));
  return true;
}

bool ArgmaxOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
  auto x = op_desc.Input("X").front();
  auto out = op_desc.Output("Out").front();

  param_.X = scope->FindVar(x)->GetMutable<lite::Tensor>();
  param_.Out = scope->FindVar(out)->GetMutable<lite::Tensor>();
  param_.Axis = op_desc.GetAttr<int>("Axis");
  return true;
}

REGISTER_LITE_OP(argmax, paddle::lite::operators::ArgmaxOpLite);
```
- In paddlelite/lite/operators/CMakeLists.txt, add ```add_operator(argmax_op basic SRCS argmax_op.cc DEPS ${op_DEPS})```
## 3. Add the Argmax kernel and bind it
The ARM implementation of argmax is used as the example below.
- Create argmax_compute.h under paddlelite/lite/kernels/arm/, declare the ArgmaxCompute class, and inherit from KernelLite; the core code is:
```c++
class ArgmaxCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
 public:
  using param_t = operators::ArgmaxParam;
  void Run() override;
  virtual ~ArgmaxCompute() = default;
};
```
- Create argmax_compute.cc under paddlelite/lite/kernels/arm/ and implement the Run function. `Run()` calls `argmax_func()` from paddlelite/lite/backends/arm/math/argmax.h to compute the output from the input. Finally, bind argmax's inputs and outputs in argmax_compute.cc (every input parameter that is a tensor must be bound):
```c++
void ArgmaxCompute::Run() {
  auto& param = Param<operators::ArgmaxParam>();
  lite::Tensor* input = param.X;
  lite::Tensor* output = param.Out;
  int axis = param.Axis;
  lite::arm::math::argmax_func(input, axis, output);
  return;
}

REGISTER_LITE_KERNEL(
    argmax, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ArgmaxCompute, def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();
```
- In paddlelite/lite/kernels/arm/CMakeLists.txt, add
```cmake
add_kernel(argmax_compute_arm ARM basic SRCS argmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
```
## 4. Add the Argmax implementation
- Create argmax.h under paddlelite/lite/backends/arm/math/ and declare `argmax_func()`:
```c++
void argmax_func(const lite::Tensor* input, const int axis, lite::Tensor* output);
```
- Create argmax.cc under paddlelite/lite/backends/arm/math/ and implement `argmax_func()`:
```c++
void argmax_func(const lite::Tensor *input,
                 const int axis,
                 lite::Tensor *output) {
  auto input_ddim = input->dims();
  auto output_ddim = output->dims();
  const int size = input_ddim[axis];
  const int in_channel = input_ddim.count(axis, input_ddim.size());
  const int out_channel = output_ddim.count(axis, output_ddim.size());
  const int in_stride = input_ddim.count(axis + 1, input_ddim.size());
  const int out_stride = input_ddim.count(0, axis);

  for (int n = 0; n < out_stride; n++) {
    for (int k = 0; k < in_stride; k++) {
      const float *in_ptr = input->data<float>() + n * in_channel + k;
      std::vector<std::pair<float, int>> vec;
      vec.resize(size);
      for (int i = 0; i < size; i++) {
        vec[i] = std::make_pair(in_ptr[i * in_stride], i);
      }
      // Partially sort so the largest (value, index) pair comes first.
      std::partial_sort(vec.begin(),
                        vec.begin() + 1,
                        vec.end(),
                        std::greater<std::pair<float, int>>());
      // Write the index of the maximum to the output.
      float *out_ptr = output->mutable_data<float>() + n * out_channel + k;
      *out_ptr = vec[0].second;
    }
  }
}
```
- Add argmax.cc to the ```math_arm library``` in paddlelite/lite/backends/arm/math/CMakeLists.txt, and add ```#include "lite/backends/arm/math/argmax.h"``` to paddlelite/lite/backends/arm/math/funcs.h.
## 5. Add an Argmax unit test
- Create argmax_compute_test.cc under paddlelite/lite/tests/kernels, and declare and implement the ArgmaxComputeTester class;
- ArgmaxComputeTester mainly consists of the PrepareOpDesc, PrepareData and RunBaseline functions. PrepareOpDesc sets the op type and input/output parameters for the test, PrepareData initializes the input tensors, and RunBaseline computes a reference output from the inputs to compare against the framework's output; a minimal sketch is shown below.
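For orientation, here is a minimal sketch of such a tester built on `arena::TestCase` (the member names, test data and the abridged baseline body are illustrative; see the real test under paddlelite/lite/tests/kernels for the full version):
```c++
class ArgmaxComputeTester : public arena::TestCase {
 protected:
  std::string input_ = "X";
  std::string output_ = "Out";
  int axis_ = 0;
  DDim dims_;

 public:
  ArgmaxComputeTester(const Place& place, const std::string& alias,
                      int axis, int n, int c, int h, int w)
      : TestCase(place, alias),
        axis_(axis),
        dims_(std::vector<int64_t>({n, c, h, w})) {}

  // Describe the op under test: type, inputs, outputs and attributes.
  void PrepareOpDesc(cpp::OpDesc* op_desc) {
    op_desc->SetType("argmax");
    op_desc->SetInput("X", {input_});
    op_desc->SetOutput("Out", {output_});
    op_desc->SetAttr("Axis", axis_);
  }

  // Initialize the input tensor with deterministic test data.
  void PrepareData() override {
    std::vector<float> data(dims_.production());
    for (int64_t i = 0; i < dims_.production(); ++i) data[i] = i * 1.1f;
    SetCommonTensor(input_, dims_, data.data());
  }

  // Reference computation that the kernel output is compared against.
  void RunBaseline(Scope* scope) override {
    auto* out = scope->NewTensor(output_);
    // The output shape is the input shape with dimension `axis_` removed.
    std::vector<int64_t> out_shape;
    for (size_t i = 0; i < dims_.size(); ++i) {
      if (static_cast<int>(i) != axis_) out_shape.push_back(dims_[i]);
    }
    out->Resize(DDim(out_shape));
    // ... fill `out` with the index of the max value along `axis_` ...
  }
};
```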
- Add the unit test with gtest:
```c++
TEST(Argmax, precision) {
#ifdef LITE_WITH_ARM
LOG(INFO) << "test argmax arm";
Place place(TARGET(kARM));
for (int axis : {0, 1, 2, 3}) {
for (int n : {1, 3}) {
for (int c : {3, 6}) {
for (int h : {9, 18}) {
for (int w : {9, 18}) {
std::unique_ptr<arena::TestCase> tester(
new ArgmaxComputeTester(place, "def", axis, n, c, h, w));
arena::Arena arena(std::move(tester), place, 2e-5);
arena.TestPrecision();
}
}
}
}
}
#endif
}
```
- In paddlelite/lite/tests/kernels/CMakeLists.txt, add
```cmake
lite_cc_test(test_kernel_argmax_compute SRCS argmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
```
## 6. Build and run
- From the paddlelite directory, run ```./lite/tools/ci_build.sh build_test_arm```. The script creates a phone emulator and builds and runs all unit tests (this takes a long time). If everything runs cleanly, argmax has been added successfully.
# Model quantization
This document describes how to load a quantized model produced by PaddlePaddle into Paddle-Lite and run inference with it. MobileNetV1 is used as the example: we first cover preparing the quantized model, then deployment and execution.
## Preparing the quantized model
PaddlePaddle quantizes FP32 models to Int8 with two methods, quantization-aware training and post-training quantization. The following describes how each method produces a quantized model.
### Quantization-aware training
Currently, PaddlePaddle's quantization-aware training mainly targets convolution layers (2D and depthwise convolutions) and fully connected layers, i.e. the conv2d, depthwise_conv2d and mul operators; see the [documentation](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/docs/tutorial.md#1-quantization-aware-training%E9%87%8F%E5%8C%96%E4%BB%8B%E7%BB%8D) for more on how it works. Paddle-Lite can run models produced by PaddlePaddle's quantization-aware training, further speeding up model execution on mobile devices.
Tip: if this is your first contact with the PaddlePaddle framework, start with the [beginner's guide](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/index_cn.html) and the [user guide](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/index_cn.html).
You can either download a trained quantized model or train one with the PaddleSlim model compression toolkit.
#### Download a quantized model
An official [MobileNetV1 quantized model](https://paddle-inference-dist.bj.bcebos.com/int8%2Fpretrain%2Fmobilenet_v1_quant%2Ffloat.zip) is available for direct download:
```bash
wget https://paddle-inference-dist.bj.bcebos.com/int8%2Fpretrain%2Fmobilenet_v1_quant%2Ffloat.zip
```
#### Train a quantized model with the PaddleSlim toolkit
##### Install PaddlePaddle
Install PaddlePaddle per the [official instructions](https://paddlepaddle.org.cn/start) for your OS, installation method, Python version and CUDA version. For example:
GPU install on Ubuntu 16.04.4 LTS with CUDA 9 and cuDNN 7:
```bash
pip install paddlepaddle-gpu==1.6.0.post97 -i https://mirrors.aliyun.com/pypi/simple/
```
CPU install on Ubuntu 16.04.4 LTS:
```bash
pip install paddlepaddle==1.6.0 -i https://mirrors.aliyun.com/pypi/simple/
```
##### Clone the code needed for quantization training
Clone [PaddlePaddle/models](https://github.com/PaddlePaddle/models) and enter the models/PaddleSlim directory:
```bash
git clone https://github.com/PaddlePaddle/models.git
cd models/PaddleSlim
```
##### Prepare the data
###### Training data
Follow the data preparation tutorial in [models/PaddleCV/image_classification](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification#data-preparation) to download the training data and save it under PaddleSlim/data.
###### Pretrained model
Following the /models/PaddleSlim/run.sh script, download the MobileNetV1 pretrained model from [models/PaddleCV/image_classification](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification#supported-models-and-performances) and save it under PaddleSlim/pretrain.
After the steps above, the PaddleSlim directory layout looks like:
```bash
.
├── compress.py          # main script for compression tasks; defines the model-side information a task needs
├── configs              # config files for compression tasks: distillation, int8 quantization, filter pruning and combined strategies
├── data                 # training data (created by the user)
│   └── ILSVRC2012
├── pretrain             # pretrained model parameters, generated by running run.sh
│   ├── MobileNetV1_pretrained
│   ├── MobileNetV1_pretrained.tar
│   ├── ResNet50_pretrained
│   └── ResNet50_pretrained.tar
├── docs                 # documentation
├── light_nas
├── models               # network definitions, e.g. MobileNetV1
├── quant_low_level_api  # low-level quantization-training APIs, for advanced users who need to customize the process
├── reader.py            # data-processing logic
├── README.md
├── run.sh               # launch script for compression tasks
└── utility.py           # common utility functions
```
##### The compression script
`compress.py` defines all of the model-side information that a compression task needs. The key steps are briefly introduced below:
###### Defining the target network
The following snippet from compress.py defines the train program; here the train program contains only forward computation:
```python
out = model.net(input=image, class_dim=args.class_dim)
cost = fluid.layers.cross_entropy(input=out, label=label)
avg_cost = fluid.layers.mean(x=cost)
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
```
Then eval_program is obtained via the clone method, to evaluate model accuracy during compression:
```python
val_program = fluid.default_main_program().clone()
```
After the target network is defined, initialize it and load a pretrained model as needed; a minimal sketch follows.
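For illustration, initialization and pretrained-model loading in fluid 1.x typically look like the sketch below (the paths and the `args` object are assumptions; compress.py's actual logic may differ in details):
```python
import os
import paddle.fluid as fluid

place = fluid.CUDAPlace(0)  # or fluid.CPUPlace() for CPU training
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())  # initialize all parameters

if args.pretrained_model:  # load only the variables the pretrain dir provides
    def if_exist(var):
        return os.path.exists(os.path.join(args.pretrained_model, var.name))
    fluid.io.load_vars(exe, args.pretrained_model, predicate=if_exist)
```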
###### Defining feed_list and fetch_list
For the train program, train_feed_list specifies which variables receive the data taken from the train data reader, and train_fetch_list specifies which results are shown in the log during training. To print accuracy information in the training log, just add ('acc_top1', acc_top1.name) to train_fetch_list.
```python
train_feed_list = [('image', image.name), ('label', label.name)]
train_fetch_list = [('loss', avg_cost.name)]
```
> Note: train_fetch_list must contain a loss entry.

For the eval program, define val_feed_list and val_fetch_list in the same way:
```python
val_feed_list = [('image', image.name), ('label', label.name)]
val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5', acc_top5.name)]
```
###### Compressor and the quantization config file
`compress.py` drives quantization training mainly through the Compressor class and a yaml file. The Compressor class is defined as:
```python
class Compressor(object):
def __init__(self,
place,
scope,
train_program,
train_reader=None,
train_feed_list=None,
train_fetch_list=None,
eval_program=None,
eval_reader=None,
eval_feed_list=None,
eval_fetch_list=None,
teacher_programs=[],
checkpoint_path='./checkpoints',
train_optimizer=None,
distiller_optimizer=None):
```
When constructing a Compressor object, note the following:
* If the train program contains backward operators and optimizer-update operators, the train_optimizer argument must be set to None.
* Parameter names in eval_program must match the parameter names in train_program exactly.
* The final quantized model is saved by pruning the eval_program network, so if you want the saved model to be usable for inference, the eval program must contain all operators the inference stage needs.
* Checkpoints store the model in float data type.

An example `configs/quantization.yaml` quantization config file:
```yaml
version: 1.0
strategies:
quantization_strategy:
class: 'QuantizationStrategy'
start_epoch: 0
end_epoch: 9
float_model_save_path: './output/float'
mobile_model_save_path: './output/mobile'
int8_model_save_path: './output/int8'
weight_bits: 8
activation_bits: 8
weight_quantize_type: 'abs_max'
activation_quantize_type: 'moving_average_abs_max'
save_in_nodes: ['image']
save_out_nodes: ['fc_0.tmp_2']
compressor:
epoch: 10
checkpoint_path: './checkpoints_quan/'
strategies:
- quantization_strategy
```
The configurable parameters are:
- **class:** name of the quantization strategy class; currently only `QuantizationStrategy` is supported.
- **start_epoch:** before start_epoch begins, the quantization training strategy inserts quantize and dequantize operators into train_program and eval_program. Quantization training starts from start_epoch.
- **end_epoch:** after end_epoch finishes, models are saved in the formats you specified. Note that quantization training does not stop after end_epoch; it continues until the epoch count reaches compressor.epoch. For example, with start_epoch=0, end_epoch=0 and compressor.epoch=2, quantization training starts at epoch 0 and finishes at epoch 1, but the saved model holds the parameter state at the end of epoch 0.
- **float_model_save_path:** path for saving the float-format model, i.e. the parameter values under this path are in the int8 range but the parameter data type is float32. If set to None, no float-format model is saved; defaults to None. **Note: Paddle-Lite uses the model in this directory for quantized-model inference optimization; see the [Running quantized-model inference with Paddle-Lite](#二使用Paddle-Lite运行量化模型推理) section below.**
- **int8_model_save_path:** path for saving the int8-format model, i.e. the parameter values under this path are in the int8 range and the parameter data type is int8. If set to None, no int8-format model is saved; defaults to None.
- **mobile_model_save_path:** path for saving a model compatible with the paddle-mobile framework. If set to None, no paddle-mobile-format model is saved; defaults to None. paddle-mobile has since been upgraded to Paddle-Lite.
- **weight_bits:** number of bits for quantizing weights; note that bias parameters are not quantized.
- **activation_bits:** number of bits for quantizing activations.
- **weight_quantize_type:** weight quantization method; quantization training currently supports `abs_max` and `channel_wise_abs_max`.
- **activation_quantize_type:** activation quantization method; quantization training currently supports `range_abs_max` and `moving_average_abs_max`. PaddlePaddle also supports the `abs_max` method for activations, but it computes the input's quantization scale dynamically, which adds computation and slows down inference, so Lite does not support `abs_max` activation quantization.
- **save_in_nodes:** a list of variable names. When saving the quantized model, the eval program network is pruned by forward traversal from save_in_nodes. Defaults to the list of variable names specified in eval_feed_list.
- **save_out_nodes:** a list of variable names. When saving the quantized model, the eval program network is pruned by backward traversal from save_out_nodes. Defaults to the list of variable names specified in eval_fetch_list.
> **Notes:**
>
> 1) `abs_max` computes the quantization scale dynamically at every training step and at inference time. `channel_wise_abs_max` is similar, except that it computes a separate scale per channel of the convolution weights. In other words, `abs_max` is tensor-wise quantization while `channel_wise_abs_max` is channel-wise quantization; see [here](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/quantization/training_quantization_model_format.md) for details.
>
> 2) `moving_average_abs_max` and `range_abs_max` compute a static quantization scale during training and use it at inference time. `moving_average_abs_max` computes the scale with a moving average over a window, while `range_abs_max` uses the absolute maximum within a window.
>
> 3) **Currently, Paddle-Lite only runs quantized models produced with `abs_max` weight quantization and `moving_average_abs_max` or `range_abs_max` activation quantization.**
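To make the scale computations in the notes concrete, here is a hedged sketch in informal notation (the notation is mine, not from the PaddlePaddle docs, and the actual implementations differ in bookkeeping details):
```latex
% abs_max (dynamic, recomputed at every step and at inference):
%   s_t = \max_i \lvert x_i^{(t)} \rvert
% range_abs_max (maximum over a sliding window of recent steps):
%   s_t = \max_{t-W < \tau \le t} \max_i \lvert x_i^{(\tau)} \rvert
% moving_average_abs_max (exponential moving average with decay \rho):
%   s_t = \rho\, s_{t-1} + (1-\rho) \max_i \lvert x_i^{(t)} \rvert
% k-bit quantization of x_i with scale s:
%   q_i = \mathrm{round}\!\left( x_i \cdot (2^{k-1}-1) / s \right)
```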
##### Run int8 quantization training
Edit run.sh: comment out everything between `# enable GC strategy` and `# for sensitivity filter pruning`, and uncomment the `# for quantization` commands (the commands to uncomment are shown below).
```bash
# for quantization
#---------------------------
export CUDA_VISIBLE_DEVICES=0
python compress.py \
--batch_size 64 \
--model "MobileNet" \
--pretrained_model ./pretrain/MobileNetV1_pretrained \
--compress_config ./configs/quantization.yaml \
--quant_only True
```
Finally, run `sh run.sh` to start int8 quantization training.
After the quantization training completes, if the model output paths are configured as in the `configs/quantization.yaml` shown above, the models/PaddleSlim/output directory contains three subdirectories, `float`, `int8` and `mobile`:
* float: a quantized model whose parameter values are in the int8 range but whose parameter data type is float32. Paddle-Lite uses the model files and parameters in this directory to deploy quantized models.
* int8: a quantized model whose parameter values are in the int8 range and whose parameter data type is int8.
* mobile: same parameter characteristics as int8, but compatible with paddle-mobile (paddle-mobile has since been upgraded to Paddle-Lite).
### Post-training quantization
The following uses MobileNetV1 as an example to show how post-training quantization produces a quantized model. For the theory and detailed usage of post-training quantization, see the [documentation](https://github.com/PaddlePaddle/models/tree/develop/PaddleSlim/quant_low_level_api).
> The example code lives under [models/PaddleSlim/quant_low_level_api/](https://github.com/PaddlePaddle/models/tree/develop/PaddleSlim/quant_low_level_api). To run it, first clone [models](https://github.com/PaddlePaddle/models.git) and install a PaddlePaddle build with post-training quantization support. Because Lite currently supports quantizing conv2d, depthwise_conv2d and mul, edit the [run_post_training_quanzation.sh](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/quant_low_level_api/run_post_training_quanzation.sh) script to set is_full_quantize=False, then run it; when it finishes, the quantized model is saved under the `mobilenetv1_int8_model` directory. The detailed steps follow.

1) **Prepare the model and calibration data**
Install the whl package built from PaddlePaddle's develop branch, and prepare a trained FP32 inference model.
Prepare the calibration data with the layout below; the val folder contains 100 images, and the val_list.txt file holds each image's label.
```bash
samples_100
└──val
└──val_list.txt
```
2) **Configure the calibration data generator**
MobileNetV1 takes an image and a label as input, so configure a sample_generator that reads the calibration data and returns one image and one label at a time. The full code is in [models/PaddleSlim/reader.py](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/reader.py); a rough sketch follows.
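For illustration, a calibration sample generator might look like the sketch below (the file layout matches the tree above, but the preprocessing constants are assumptions; the real logic lives in reader.py):
```python
import os
import numpy as np
from PIL import Image

def val(data_dir):
    """Return a sample_generator yielding (image, label) pairs."""
    def sample_generator():
        with open(os.path.join(data_dir, 'val_list.txt')) as f:
            for line in f:
                img_path, label = line.strip().split()
                img = Image.open(os.path.join(data_dir, img_path)).convert('RGB')
                img = img.resize((224, 224))
                # HWC uint8 -> CHW float32, scaled to [0, 1]
                # (normalization constants are illustrative).
                img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255.0
                yield img, int(label)
    return sample_generator
```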
3) **Invoke post-training quantization**
The core code for invoking post-training quantization is below; the full code is in [post_training_quantization.py](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/quant_low_level_api/post_training_quantization.py).
``` python
place = fluid.CUDAPlace(0) if args.use_gpu == "True" else fluid.CPUPlace()
exe = fluid.Executor(place)
sample_generator = reader.val(data_dir=args.data_path)
ptq = PostTrainingQuantization(
executor=exe,
sample_generator=sample_generator,
model_dir=args.model_dir,
model_filename=args.model_filename,
params_filename=args.params_filename,
batch_size=args.batch_size,
batch_nums=args.batch_nums,
algo=args.algo,
is_full_quantize=args.is_full_quantize == "True")
quantized_program = ptq.quantize()
ptq.save_quantized_model(args.save_model_path)
```
## Running quantized-model inference with Paddle-Lite
#### Optimize the quantized model with the model optimization tool
Next, use the original quantized model to generate a model suitable for direct deployment on mobile devices.
Set up the build environment per [source compilation](../source_compile) and make sure it builds successfully. Following [model conversion](../model_optimize_tool), first build the model_optimize_tool, then run the command below to optimize the quantization-trained model (note: adjust model_file, param_file and optimize_out yourself):
```bash
./model_optimize_tool \
--model_file=mobilenet_v1_quant/float/model \
--param_file=mobilenet_v1_quant/float/weights \
--optimize_out_type=naive_buffer \
--optimize_out=mobilenet_v1_quant_opt \
--valid_targets=arm \
--prefer_int8_kernel=true
```
As noted above, after quantization training the parameters of the model in the float directory are in the int8 range but still stored as float32, which by itself does not compress the model. After optimization with model\_optimize\_tool, however, the quantized parameters are re-stored as int8, achieving the compression, and the model structure is also optimized (e.g. various operator fuse passes are applied).
#### Push the quantized model files to the phone
Push the quantized model files under mobilenet_v1_quant_opt to the phone:
```bash
adb push mobilenet_v1_quant_opt /data/local/tmp
```
#### Run the optimized quantized model with mobilenetv1\_light\_api
After setting up the build environment per [source compilation](../source_compile), run the following in Paddle-Lite to build the light-API demo:
```bash
cd /Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mobile_light
make clean && make -j
```
After the commands above finish, the `mobilenetv1_light_api` executable is available under `Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mobile_light/`. Push `mobilenetv1_light_api` to the phone and run inference on the quantized model:
```bash
adb push Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mobile_light/mobilenetv1_light_api /data/local/tmp
adb shell chmod +x /data/local/tmp/mobilenetv1_light_api
adb shell /data/local/tmp/mobilenetv1_light_api \
--model_dir=/data/local/tmp/mobilenet_v1_quant_opt
```
**The program output looks like this:**
```bash
Output dim: 1000
Output[0]: 0.000228
Output[100]: 0.000260
Output[200]: 0.000250
Output[300]: 0.000560
Output[400]: 0.000950
Output[500]: 0.000275
Output[600]: 0.005143
Output[700]: 0.002509
Output[800]: 0.000538
Output[900]: 0.000969
```
For how to use the Paddle-Lite API from C++, see [here](../cpp_demo); you can also refer to the [mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc) code example.
### FAQ
**Q**: Compiled with WITH_GPU, but no GPU found in runtime
**A**: Check whether the machine supports GPU training; if it does not, train on CPU. If training on GPU inside docker, start the container with nvidia_docker.
**Q**: Insufficient GPU memory to allocation. at [/paddle/paddle/fluid/platform/gpu_info.cc:262]
**A**: Set `CUDA_VISIBLE_DEVICES` in run.sh correctly, and make sure the GPU has more free memory than the job needs.
# Supported OP list
## Ops
- affine_channel
- anchor_generator
- arg_max
- assign
- assign_value
- attention_padding_mask
- axpy
- batch_norm
- beam_search
- beam_search_decode
- bilinear_interp
- box_clip
- box_coder
- calib
- calib_once
- cast
- collect_fpn_proposals
- concat
- conditional_block
- conv2d
- conv2d_transpose
- crop
- decode_bboxes
- density_prior_box
- depthwise_conv2d
- distribute_fpn_proposals
- dropout
- elementwise_add
- elementwise_div
- elementwise_max
- elementwise_mul
- elementwise_sub
- equal
- exp
- expand
- fake_channel_wise_dequantize_max_abs
- fake_dequantize_max_abs
- fake_quantize_dequantize_moving_average_abs_max
- fake_quantize_moving_average_abs_max
- fake_quantize_range_abs_max
- fc
- feed
- fetch
- fill_constant
- fill_constant_batch_size_like
- flatten
- flatten2
- floor
- fusion_elementwise_add_activation
- fusion_elementwise_div_activation
- fusion_elementwise_max_activation
- fusion_elementwise_mul_activation
- fusion_elementwise_sub_activation
- gather
- generate_proposals
- graph_op
- greater_equal
- greater_than
- gru
- gru_unit
- hard_sigmoid
- im2sequence
- increment
- instance_norm
- io_copy
- io_copy_once
- is_empty
- layer_norm
- layout
- layout_once
- leaky_relu
- less_equal
- less_than
- lod_reset
- log
- logical_and
- logical_not
- logical_or
- logical_xor
- lookup_table
- lookup_table_v2
- lrn
- match_matrix_tensor
- matmul
- mean
- merge_lod_tensor
- mul
- multiclass_nms
- nearest_interp
- negative
- norm
- notequal
- pad2d
- pool2d
- power
- prelu
- prior_box
- range
- read_from_array
- reduce_max
- reduce_mean
- reduce_prod
- reduce_sum
- relu
- relu6
- relu_clipped
- reshape
- reshape2
- roi_align
- rsqrt
- scale
- search_aligned_mat_mul
- search_attention_padding_mask
- search_fc
- search_grnn
- search_group_padding
- search_seq_arithmetic
- search_seq_depadding
- search_seq_fc
- search_seq_softmax
- sequence_arithmetic
- sequence_concat
- sequence_expand
- sequence_expand_as
- sequence_pool
- sequence_reshape
- sequence_reverse
- sequence_softmax
- sequence_topk_avg_pooling
- shape
- shuffle_channel
- sigmoid
- slice
- softmax
- softsign
- split
- split_lod_tensor
- sqrt
- square
- squeeze
- squeeze2
- stack
- swish
- tanh
- top_k
- transpose
- transpose2
- uniform_random
- unsqueeze
- unsqueeze2
- var_conv_2d
- while
- write_to_array
- yolo_box
## Kernels
### Host kernels
- feed
- fetch
- flatten
- flatten2
- multiclass_nms
- reshape
- reshape2
### ARM kernels
- affine_channel
- anchor_generator
- arg_max
- assign
- assign_value
- axpy
- batch_norm
- beam_search
- beam_search_decode
- bilinear_interp
- box_clip
- box_coder
- cast
- collect_fpn_proposals
- concat
- conditional_block
- conv2d
- conv2d_transpose
- crop
- decode_bboxes
- density_prior_box
- depthwise_conv2d
- distribute_fpn_proposals
- dropout
- elementwise_add
- elementwise_div
- elementwise_max
- elementwise_mul
- elementwise_sub
- equal
- exp
- expand
- fc
- fill_constant
- fill_constant_batch_size_like
- floor
- fusion_elementwise_add_activation
- fusion_elementwise_div_activation
- fusion_elementwise_max_activation
- fusion_elementwise_mul_activation
- fusion_elementwise_sub_activation
- gather
- generate_proposals
- greater_equal
- greater_than
- gru
- gru_unit
- hard_sigmoid
- im2sequence
- increment
- instance_norm
- is_empty
- layer_norm
- layout
- layout_once
- leaky_relu
- less_equal
- less_than
- lod_reset
- log
- logical_and
- logical_not
- logical_or
- logical_xor
- lookup_table
- lookup_table_v2
- lrn
- matmul
- merge_lod_tensor
- mul
- nearest_interp
- negative
- norm
- not_equal
- pad2d
- pool2d
- power
- prelu
- prior_box
- range
- read_from_array
- reduce_max
- reduce_mean
- reduce_prod
- relu
- relu6
- relu_clipped
- roi_align
- rsqrt
- scale
- sequence_expand
- sequence_pool
- sequence_softmax
- shape
- shuffle_channel
- sigmoid
- slice
- softmax
- split
- split_lod_tensor
- squeeze
- squeeze2
- stack
- swish
- tanh
- top_k
- transpose
- transpose2
- unsqueeze
- unsqueeze2
- while
- write_to_array
- yolo_box
### X86 kernels
- batch_norm
- cast
- concat
- conv2d
- depthwise_conv2d
- dropout
- elementwise_add
- elementwise_sub
- fc
- fill_constant_batch_size_like
- gather
- gelu
- gru
- layer_norm
- match_matrix_tensor
- matmul
- mul
- pool2d
- reduce_sum
- relu
- reshape
- reshape2
- scale
- search_aligned_mat_mul
- search_attention_padding_mask
- search_fc
- search_grnn
- search_group_padding
- search_seq_arithmetic
- search_seq_depadding
- search_seq_fc
- search_seq_softmax
- sequence_arithmetic
- sequence_concat
- sequence_expand_as
- sequence_pool
- sequence_reverse
- sequence_topk_avg_pooling
- shape
- slice
- softmax
- softsign
- square
- squeeze
- squeeze2
- stack
- tanh
- transpose
- transpose2
- var_conv_2d
### CUDA kernels
- attention_padding_mask
- bilinear_interp
- calib
- concat
- conv
- dropout
- elementwise_add
- fusion_elementwise_add_activation
- fusion_elementwise_mul_activation
- elementwise_mul
- feed
- io_copy
- layout
- layout_once
- leaky_relu
- lookup_table
- match_matrix_tensor
- mul
- nearest_interp
- pool2d
- relu
- scale
- search_aligned_mat_mul
- search_fc
- search_grnn
- search_group_padding
- search_seq_depadding
- search_seq_fc
- sequence_arithmetic
- sequence_concat
- sequence_pool
- sequence_reverse
- sequence_topk_avg_pooling
- softmax
- transpose
- var_conv_2d
- yolo_box
### OpenCL kernels
- conv2d
- depthwise_conv2d
- elementwise_add
- fc
- fusion_elementwise_add_activation
- layout
- layout_once
- io_copy
- io_copy_once
- mul
- pool2d
- relu
# Benchmark data
See [benchmark_tools](benchmark_tools); the **one-click benchmark** is recommended.
## Test environment
* Test models
  * fp32 models
    * mobilenet_v1
    * mobilenet_v2
    * squeezenet_v1.1
    * mnasnet
    * shufflenet_v2
  * int8 models
    * mobilenet_v1
    * mobilenet_v2
    * resnet50
* Test devices (android ndk ndk-r17c)
  * Snapdragon 855
    * xiaomi mi9, snapdragon 855
    * 4xA76(1@2.84GHz + 3@2.4GHz) + 4xA55@1.78GHz
  * Snapdragon 845
    * xiaomi mi8, 845
    * 2.8GHz (4 big cores), 1.7GHz (4 little cores)
  * Snapdragon 835
    * xiaomi mix2, snapdragon 835
    * 2.45GHz (4 big cores), 1.9GHz (4 little cores)
  * Snapdragon 625
    * oppo R9s, snapdragon 625
    * A53 x 8, big cores @2.0GHz
  * Snapdragon 653
    * 360 N5, snapdragon 653
    * 4 x A73@2.0GHz + 4 x A53@1.4GHz
  * Kirin 970
    * HUAWEI Mate10
* Test notes
  * branch: release/2.0.0
  * warmup=10, repeats=30; the average time is reported, in ms
  * with 1 thread, ```DeviceInfo::Global().SetRunMode``` is set to LITE_POWER_HIGH; otherwise LITE_POWER_NO_BIND (a usage sketch follows this list)
  * the model input dimensions are {1, 3, 224, 224}, and every input value is 1
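For reference, the run-mode setup can be expressed against the light-API config roughly as follows (a hedged sketch; the exact header and setter names should be checked against the Paddle-Lite release in use):
```c++
#include "paddle_api.h"  // light-API config and predictor factory

using namespace paddle::lite_api;

int main() {
  MobileConfig config;
  config.set_model_dir("./mobilenet_v1_opt");  // illustrative model path
  int threads = 1;
  // 1 thread: bind to the big cores at full clock; otherwise leave unbound.
  config.set_power_mode(threads == 1 ? LITE_POWER_HIGH : LITE_POWER_NO_BIND);
  config.set_threads(threads);
  auto predictor = CreatePaddlePredictor<MobileConfig>(config);
  return 0;
}
```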
## Test data
### fp32 model results
#### paddlepaddle model
Snapdragon 855|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4
mobilenet_v1 |32.19 |18.81 |10.90 |30.92 |18.31 |10.15
mobilenet_v2 |22.91 |13.75 |8.64 |21.15 |12.79 |7.84
shufflenet_v2 |4.67 |3.37 |2.65 |4.43 |3.15 |2.66
squeezenet_v1.1 |25.10 |15.93 |9.68 |23.28 |14.61 |8.71
mnasnet |21.84 |13.14 |7.96 |19.61 |11.88 |7.55

Snapdragon 835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4
mobilenet_v1 |94.13 |52.17 |30.68 |88.28 |47.58 |26.64
mobilenet_v2 |61.24 |34.64 |22.36 |56.66 |32.19 |19.63
shufflenet_v2 |10.87 |6.92 |5.12 |10.41 |6.76 |4.97
squeezenet_v1.1 |73.61 |42.25 |24.44 |64.87 |38.43 |23.06
mnasnet |58.22 |33.43 |20.44 |53.43 |30.20 |18.09

Kirin 980|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4
mobilenet_v1 |55.11 |28.24 |13.27 |34.24 |17.74 |12.41
mobilenet_v2 |37.03 |19.80 |51.94 |23.64 |12.98 |9.38
shufflenet_v2 |7.26 |4.94 |15.06 |5.32 |3.33 |2.82
squeezenet_v1.1 |42.73 |23.66 |57.39 |26.03 |14.53 |13.66
mnasnet |36.87 |20.15 |46.04 |21.85 |12.06 |8.68

Kirin 970|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4
mobilenet_v1 |97.80 |52.64 |34.46 |94.51 |49.36 |28.43
mobilenet_v2 |66.55 |38.52 |23.19 |62.89 |34.93 |21.53
shufflenet_v2 |13.78 |8.11 |5.93 |11.95 |7.90 |5.91
squeezenet_v1.1 |77.64 |43.67 |25.72 |69.91 |40.66 |24.62
mnasnet |61.86 |34.62 |22.68 |59.61 |32.79 |19.56
#### caffe model
Snapdragon 855|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |32.42 |18.68 |10.86 |30.92 |18.35 |10.07 |
mobilenet_v2 |29.53 |17.76 |10.89 |27.19 |16.53 |9.75 |
shufflenet_v2 |4.61 |3.29 |2.61 |4.36 |3.11 |2.51 |

Snapdragon 835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |92.52 |52.34 |30.37 |88.31 |49.75 |27.29 |
mobilenet_v2 |79.50 |45.67 |28.79 |76.13 |44.01 |26.13 |
shufflenet_v2 |10.94 |7.08 |5.16 |10.64 |6.83 |5.01 |

Kirin 980|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |55.36 |28.18 |13.31 |34.42 |17.93 |12.52 |
mobilenet_v2 |49.17 |26.10 |65.49 |30.50 |16.66 |11.72 |
shufflenet_v2 |8.45 |5.00 |15.65 |4.58 |3.14 |2.83 |

Kirin 970|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |97.85 |53.38 |33.85 |94.29 |49.42 |28.29 |
mobilenet_v2 |87.40 |50.25 |31.85 |85.55 |48.11 |28.24 |
shufflenet_v2 |12.16 |8.39 |6.21 |12.21 |8.33 |6.32 |
### int8 quantized model results
Snapdragon 855|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |36.80 |21.58 |11.12 | 14.01 |8.13 |4.32 |
mobilenet_v2 |28.72 |19.08 |12.49 | 17.24 |11.55 |7.82 |

Snapdragon 835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |60.76 |32.25 |16.66 |56.57 |29.84 |15.24 |
mobilenet_v2 |49.38 |31.10 |22.07 |47.52 |28.18 |19.24 |

Kirin 970|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |65.95 |34.39 |18.68 |60.86 |30.98 |16.31 |
mobilenet_v2 |68.87 |39.39 |24.43 |65.57 |37.31 |20.87 |
# Benchmark method
This document describes how to benchmark Paddle-Lite on an Android phone from a terminal, in an **Ubuntu 16.04 cross-compilation environment**, and covers two approaches:
1. **One-click benchmark**: for users who want quick numbers on common models, using pre-built benchmark executables;
2. **Step-by-step benchmark**: the one-click flow broken down step by step.
## Environment setup
1. Install [adb](https://developer.android.com/studio/command-line/adb) and other required tools:
```shell
sudo apt update
sudo apt install -y wget adb
```
2. Check the phone-to-computer connection: connect the Android phone over USB, then open Settings -> enable developer mode -> enable USB debugging -> allow (authorize) this computer to debug the phone;
3. Run the `adb devices` command in a terminal to list the connected devices:
```shell
adb devices
```
If the command succeeds, the output looks like the following (the serial number will differ):
```shell
List of devices attached
712QSDSEMMS7C device
```
## 1. One-click benchmark
Run the following commands to complete the benchmark:
```shell
wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/run_benchmark.sh
sh run_benchmark.sh
```
The `run_benchmark.sh` script will:
1. download the models and push them to the phone: mobilenetv1/v2, shufflenetv2, squeezenetv1.1, mnasnet;
2. download the pre-built android-armv7 and android-armv8 executables and push them to the phone: `benchmark_bin_v7` and `benchmark_bin_v8`;
3. automatically run another script, `benchmark.sh` (if several phones are connected over USB, append the test phone's `serial number` to the `adb` commands in `benchmark.sh`);
4. pull the benchmark results `result_armv7.txt` and `result_armv8.txt` from the phone to the current directory and display them.
## 2. Step-by-step benchmark
### 1. Get the benchmark executable
The benchmark_bin executable measures PaddleLite performance; it can be obtained in either of two ways.
#### Option 1: download the benchmark_bin executable
```shell
# Download benchmark_bin for android-armv7
wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/benchmark_bin_v7
# Download benchmark_bin for android-armv8
wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/benchmark_bin_v8
```
#### Option 2: build benchmark_bin from source
Set up the build environment per [source compilation](../source_compile), pull the latest PaddleLite release code, and run the following from the repository root:
```shell
###########################################
# Build benchmark_bin for android-armv7 #
###########################################
./lite/tools/ci_build.sh \
--arm_os="android" \
--arm_abi="armv7" \
--arm_lang="gcc " \
build_arm
# `benchmark_bin` is at: <paddle-lite-repo>/build.lite.android.armv7.gcc/lite/api/benchmark_bin
###########################################
# Build benchmark_bin for android-armv8 #
###########################################
./lite/tools/ci_build.sh \
--arm_os="android" \
--arm_abi="armv8" \
--arm_lang="gcc " \
build_arm
# `benchmark_bin` is at: <paddle-lite-repo>/build.lite.android.armv8.gcc/lite/api/benchmark_bin
```
> **Note**: to avoid problems reaching the phone from inside docker, it is recommended that after building benchmark_bin you exit docker and copy the benchmark_bin file to a temporary directory; then, in that temporary directory, follow the steps below to download the models, copy the script, and test.
### 2. Prepare the models
PaddleLite provides a set of [common benchmark models](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/benchmark_models.tgz).
Download and extract them:
```shell
wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/benchmark_models.tgz
tar zxvf benchmark_models.tgz
```
To test other models, put the model files into the `benchmark_models` folder.
### 3. The benchmark.sh script
The benchmark driver script `benchmark.sh` lives at `/PaddleLite/lite/tools/benchmark.sh` in the source tree; for testing, copy `benchmark.sh`, `benchmark_bin` and `benchmark_models` into the same directory.
### 4. Run the tests
In a terminal, enter the folder containing benchmark.sh, the executables (benchmark_bin_v7, benchmark_bin_v8) and the models (benchmark_models).
If all the model files in `benchmark_models` have already been converted with `model_optimize_tool`, run the tests with the benchmark.sh script as follows:
```shell
# Benchmark for android-armv7
sh benchmark.sh ./benchmark_bin_v7 ./benchmark_models result_armv7.txt
# Benchmark for android-armv8
sh benchmark.sh ./benchmark_bin_v8 ./benchmark_models result_armv8.txt
```
If none of the model files in `benchmark_models` have been converted with `model_optimize_tool`, run the commands below instead; `benchmark_bin` will first convert each model and then load it for testing.
```shell
# Benchmark for android-armv7
sh benchmark.sh ./benchmark_bin_v7 ./benchmark_models result_armv7.txt true
# Benchmark for android-armv8
sh benchmark.sh ./benchmark_bin_v8 ./benchmark_models result_armv8.txt true
```
When the tests finish, the armv7 and armv8 results are saved in the `result_armv7.txt` and `result_armv8.txt` files in the current directory.
**Viewing the results**
Open the `result_armv7.txt` and `result_armv8.txt` files in the current directory to view the results.
> Performance numbers differ across phones and versions.
```shell
run benchmark armv7
--------------------------------------
PaddleLite Benchmark
Threads=1 Warmup=10 Repeats=30
-- mnasnet avg = 159.8427 ms
-- mobilenet_v1 avg = 235.0072 ms
-- mobilenet_v2 avg = 173.0387 ms
-- shufflenet_v2 avg = 76.0040 ms
-- squeezenet_v11 avg = 164.2957 ms
Threads=2 Warmup=10 Repeats=30
-- mnasnet avg = 83.1287 ms
-- mobilenet_v1 avg = 121.6029 ms
-- mobilenet_v2 avg = 86.6175 ms
-- shufflenet_v2 avg = 41.5761 ms
-- squeezenet_v11 avg = 87.8678 ms
Threads=4 Warmup=10 Repeats=30
-- mnasnet avg = 73.3880 ms
-- mobilenet_v1 avg = 119.0739 ms
-- mobilenet_v2 avg = 85.3050 ms
-- shufflenet_v2 avg = 38.0762 ms
-- squeezenet_v11 avg = 64.2201 ms
--------------------------------------
run benchmark armv8
--------------------------------------
PaddleLite Benchmark
Threads=1 Warmup=10 Repeats=30
-- mnasnet avg = 165.3073 ms
-- mobilenet_v1 avg = 306.0188 ms
-- mobilenet_v2 avg = 195.1884 ms
-- shufflenet_v2 avg = 99.3692 ms
-- squeezenet_v11 avg = 156.6971 ms
Threads=2 Warmup=10 Repeats=30
-- mnasnet avg = 90.2290 ms
-- mobilenet_v1 avg = 157.0007 ms
-- mobilenet_v2 avg = 118.1607 ms
-- shufflenet_v2 avg = 68.6804 ms
-- squeezenet_v11 avg = 91.3090 ms
Threads=4 Warmup=10 Repeats=30
-- mnasnet avg = 179.9730 ms
-- mobilenet_v1 avg = 204.0684 ms
-- mobilenet_v2 avg = 181.6486 ms
-- shufflenet_v2 avg = 123.2728 ms
-- squeezenet_v11 avg = 412.9046 ms
--------------------------------------
```
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/master/config
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
#sys.path.insert(0, os.path.abspath('.'))
import sphinx_rtd_theme
from recommonmark.parser import CommonMarkParser
from recommonmark.transform import AutoStructify
# -- Project information -----------------------------------------------------
project = u'Paddle-Lite'
copyright = u'2020, Paddle-Lite Developer'
author = u'Paddle-Lite Developer'
# The short X.Y version
version = u'latest'
# The full version, including alpha/beta/rc tags
release = u''
# -- General configuration ---------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ['recommonmark', 'sphinx_markdown_tables']
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
source_suffix = ['.rst', '.md']
# The master toctree document.
master_doc = 'index'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store']
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = None
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# The default sidebars (for documents that don't match any pattern) are
# defined by theme itself. Builtin themes are using these templates by
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
# 'searchbox.html']``.
#
# html_sidebars = {}
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.
htmlhelp_basename = 'Paddle-Litedoc'
# -- Options for LaTeX output ------------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',
# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'Paddle-Lite.tex', u'Paddle-Lite Documentation',
u'Paddle-Lite Developer', 'manual'),
]
# -- Options for manual page output ------------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
(master_doc, 'paddle-lite', u'Paddle-Lite Documentation',
[author], 1)
]
# -- Options for Texinfo output ----------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(master_doc, 'Paddle-Lite', u'Paddle-Lite Documentation',
author, 'Paddle-Lite', 'One line description of project.',
'Miscellaneous'),
]
# -- Options for Epub output -------------------------------------------------
# Bibliographic Dublin Core info.
epub_title = project
# The unique identifier of the text. This can be a ISBN number
# or the project homepage.
#
# epub_identifier = ''
# A unique identification for the text.
#
# epub_uid = ''
# A list of files that should not be packed into the epub file.
epub_exclude_files = ['search.html']
.. Paddle-Lite documentation master file, created by
   sphinx-quickstart on Thu Feb 6 14:11:30 2020.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Welcome to Paddle-Lite's documentation!
=======================================

.. toctree::
   :maxdepth: 1
   :caption: Introduction
   :name: sec-introduction

   introduction/tech_highlights
   introduction/architecture

.. toctree::
   :maxdepth: 1
   :caption: Benchmark data and methods
   :name: sec-benchmark

   benchmark/benchmark
   benchmark/benchmark_tools

.. toctree::
   :maxdepth: 1
   :caption: Installation
   :name: sec-install

   installation/source_compile

.. toctree::
   :maxdepth: 1
   :caption: User guides
   :name: sec-user-guides

.. toctree::
   :maxdepth: 1
   :caption: Advanced user guides

   advanced_user_guides/support_operation_list
   advanced_user_guides/add_operation
   advanced_user_guides/model_quantization

.. toctree::
   :maxdepth: 1
   :caption: Developer documentation

.. toctree::
   :maxdepth: 1
   :caption: API documentation

.. toctree::
   :maxdepth: 1
   :caption: FAQ

.. toctree::
   :maxdepth: 1
   :caption: paddle-mobile
# Source compilation
Three build environments are currently supported:
1. a Docker container;
2. Linux (Ubuntu 16.04 recommended);
3. macOS.
# Architecture design
With this upgrade, Mobile becomes the Lite architecture, which focuses on multi-hardware and high-performance support. Its main design ideas are:
- introduce a Type system to strengthen mixed scheduling across hardware, quantization methods and data layouts
- isolate hardware details, so support for any hardware can be freely plugged in or out through build switches
- introduce the concept of MIR (Machine IR) to strengthen optimization with execution-environment awareness
- strictly separate the optimization phase from the execution phase, guaranteeing lightweight and efficient inference
The architecture diagram is shown below.
![Paddle Inference Refactor1.0](https://user-images.githubusercontent.com/52520497/64949619-26e49580-d8ac-11e9-855a-514feb9b75af.png)
## Strict separation of compile time and execution time
- once compile-time optimization completes, the optimization information can be stored into the model; at execution time it is loaded and executed
- two API sets, with corresponding inference libs, cover different scenarios (a hedged usage sketch follows this list):
  - `CxxPredictor` packages both `Compile Time` and `Execution Time`; it can analyze and optimize at runtime on the concrete hardware to get the best result
  - `MobilePredictor` packages only `Execution Time`, keeping deployment and execution lightweight
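A hedged sketch of the two entry points (API and setter names as in the lite_api headers referenced elsewhere in this document; verify against the release in use):
```c++
#include "paddle_api.h"

using namespace paddle::lite_api;

int main() {
  // Full API: loads a raw model and runs compile-time analysis/optimization.
  CxxConfig cxx_config;
  cxx_config.set_model_dir("./mobilenet_v1");  // illustrative path
  cxx_config.set_valid_places({Place{TARGET(kARM), PRECISION(kFloat)}});
  auto full_predictor = CreatePaddlePredictor<CxxConfig>(cxx_config);

  // Light API: only executes an already-optimized (naive_buffer) model.
  MobileConfig mobile_config;
  mobile_config.set_model_dir("./mobilenet_v1_opt");  // illustrative path
  auto light_predictor = CreatePaddlePredictor<MobileConfig>(mobile_config);
  return 0;
}
```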
## Lightweight `Execution Time` design and implementation
- executing each batch involves only two steps:
  - `Op.InferShape`
  - `Kernel.Run`; all kernel parameters are resolved to pointers ahead of time, with no later lookup or argument-passing cost
- design goal: at execution time, the only cost is the kernel computation itself
- lightweight `Op` and `Kernel` design avoids extra framework overhead:
  - an `Op` has only two important responsibilities, `CreateKernels` and `InferShape`
  - a `Kernel` has only the `Run` responsibility
## Multi-hardware backend support
- common hardware behavior is adapted through the `TargetWrapper` adapter module, which presents a uniform interface to the upper framework; a toy sketch of the idea follows this list
- upper-level framework strategies stay hardware-agnostic, such as memory optimize and computation prune, so any newly added hardware can reuse them directly
- the framework covers common hardware behavior and does not over-constrain hardware specifics; each backend implements its own details and plugs into the framework
- two mainstream compute models are currently supported: non-heterogeneous devices such as X86 and ARM CPUs, and heterogeneous devices such as GPUs or FPGAs (supporting stream/event asynchronous execution and cross-device copies)
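To make the adapter idea concrete, here is a toy sketch (names are illustrative, not Lite's real `TargetWrapper` interface):
```c++
#include <cstddef>
#include <cstdlib>
#include <cstring>

// Common hardware behaviors (alloc/free/copy) behind one interface,
// specialized per target; backends for GPUs etc. would add streams/events.
template <typename Target>
struct TargetWrapper;

struct Host {};

template <>
struct TargetWrapper<Host> {
  static void* Malloc(size_t size) { return std::malloc(size); }
  static void Free(void* ptr) { std::free(ptr); }
  static void MemcpySync(void* dst, const void* src, size_t size) {
    std::memcpy(dst, src, size);
  }
};

int main() {
  void* buf = TargetWrapper<Host>::Malloc(64);
  TargetWrapper<Host>::MemcpySync(buf, "hello", 6);
  TargetWrapper<Host>::Free(buf);
  return 0;
}
```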
---
## Mixed scheduling across hardware and algorithms
`TensorTy` is used to represent a Tensor's type:
```c++
struct TensorTy {
TargetType target;
PrecisionType precision;
DataLayout layout;
int deviceid;
};
```
```c++
enum class TargetType { kARM, kX86, kCUDA, kOpenCL };
enum class PrecisionType { kFP32, kFP16, kInt8, kInt16 };
enum class DataLayout { kNCHW, kNHWC };
```
---
Kernel registration pins down a specific kernel's input and output characteristics:
```c++
REGISTER_LITE_KERNEL(
mul, kARM, kFloat, kNCHW, arm::MulCompute, def)
.BindInput("X", {LiteType::GetTensorTy(kARM, kFloat, kNCHW)})
.BindInput("Y", {LiteType::GetTensorTy(kARM, kFloat, kNCHW))})
.BindOutput("Out", {LiteType::GetTensorTy(kARM, kFloat, kNCHW)})
.Finalize();
```
---
Different kernels of the same op are analogous to function overloads.
To support arbitrary mixed scheduling:
1. mark the Type of every tensor in the model
2. mark each kernel's hardware, execution precision, data layout and other information
Type inference is then performed globally; when a type conflict is found along a tensor's path, a type cast operation is applied by inserting a special-purpose op, so that values are propagated correctly. A toy sketch of the conflict check follows the figure below.
![lite-7](https://user-images.githubusercontent.com/52520497/64949642-395ecf00-d8ac-11e9-8b69-ced1996abc3b.png)
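To make the mechanism concrete, here is a toy sketch of the conflict check (not Lite's real pass code; the real pass works on the MIR graph):
```c++
#include <iostream>
#include <string>
#include <vector>

enum class Target { kARM, kOpenCL };

// A node annotated with the target it produces on and the one it expects.
struct Node {
  std::string name;
  Target out;
  Target expected_in;
};

int main() {
  // conv2d produces on OpenCL; softmax expects ARM input -> cast needed.
  std::vector<Node> chain = {{"conv2d", Target::kOpenCL, Target::kOpenCL},
                             {"softmax", Target::kARM, Target::kARM}};
  for (size_t i = 0; i + 1 < chain.size(); ++i) {
    if (chain[i].out != chain[i + 1].expected_in) {
      std::cout << "insert io_copy between " << chain[i].name << " and "
                << chain[i + 1].name << "\n";
    }
  }
  return 0;
}
```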
---
## MIR for graph analysis and optimization
MIR is an SSA representation based on the Type System; IR passes analyze and optimize the compute graph:
- type inference over the whole graph, finding type conflicts and inserting type cast ops to support generic mixed scheduling
- compute prune, e.g. removing scale(1) and assign ops
- memory optimize
- operator fuse (six fusion strategies already supported, e.g. fc, conv_bn, ele_add+act)
- quantization handling (Int8 inference already supported)
# Technical highlights
Unlike ordinary mobile inference engines built on Caffe-like architectures, the Lite architecture was designed from the start for the requirements of both the Paddle Server and Mobile scenarios: the server side needs full graph analysis and optimization capability, while the mobile side needs lightweight deployment. What both demand is high performance and multi-hardware support.
Based on these requirements, the Lite architecture implements the corresponding capabilities in full; the highlights are described below.
## Multi-hardware support
The Lite architecture has been validated to fully support hardware needs from Mobile to Server, including ARM CPU, ARM GPU, Huawei NPU, Intel X86 CPU and NV GPU. Thanks to a measured level of abstraction over the different hardware, Lite keeps the framework itself clear while supporting hardware-specific scheduling needs, striking a good balance between framework clarity and per-hardware scheduling optimization; for example, the intricate stream and event allocation on Nvidia GPUs can be expressed cleanly in Lite.
Kernels for different hardware do not interfere with one another at either the code level or the execution level, so users can freely plug support for any hardware in or out.
## High performance
Performance comes from two sources: kernel optimization and framework execution.
On the kernel side, the kernels for each hardware target have been deeply optimized via instruction sets, operator fusion, algorithm rewrites and other techniques.
On the framework side, simplifying the responsibilities of Op and Kernel keeps the execution-time framework overhead extremely low; in addition, the framework's flexibility supports hardware-specific scheduling optimizations that raise overall efficiency.
## Quantization support
Lite runs models produced by Paddle Slim's powerful quantization training, fully preserving both the high performance of quantized computation and the high accuracy of quantization-aware training.
## Strong graph analysis and optimization
For graph analysis and optimization, unlike conventional mobile inference engines that convert models with Python scripting tools, the Lite architecture has a complete C++ IR and a corresponding collection of passes, supporting operator fusion, computation pruning, memory optimization, quantitative computation and other classes of compute-graph optimizations.
Further optimization strategies can be supported modularly simply by adding passes.
## Lightweight deployment
Although the graph-optimization strategies are sophisticated, they do not weigh down mobile deployment: the graph-analysis module and the final execution engine can be used separately, so the final deployment is just a thin layer of kernels.
## Mixed scheduling across any available hardware
Lite supports mixed scheduling across any hardware visible to the system; automatic mixed scheduling of ARM CPU and ARM GPU kernels is already supported, and mixed scheduling between an X86 CPU and an Nvidia GPU has been validated.
There are two motivations for supporting mixed scheduling:
1. when several kinds of hardware are available in one system, mixed scheduling makes full use of all the hardware resources
2. as the set of supported models grows, kernel coverage inevitably differs across hardware, and mixed scheduling is often needed just to run a model end to end
By supporting the `Type system` at the bottom layer, the Lite architecture models all kinds of mixed execution generically, giving fairly complete support for mixed scheduling.
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
:end
popd
sphinx
recommonmark
sphinx_markdown_tables
sphinx_rtd_theme
@@ -41,6 +41,8 @@ void LightPredictor::Build(const std::string& model_dir,
    default:
      LOG(FATAL) << "Unknown model type";
  }
  DequantizeWeight();
  BuildRuntimeProgram(cpp_program_desc_);
  PrepareFeedFetch();
}
@@ -144,5 +146,69 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
  program_->set_exec_scope(program.exec_scope());
}
void LightPredictor::DequantizeWeight() {
#define PROCESS_CONV2D_DATA() \
for (int64_t i = 0; i < h; ++i) { \
for (int64_t j = 0; j < w; ++j) { \
fp_data[i * w + j] = scale_list[i] * int_data[i * w + j]; \
} \
}
#define PROCESS_FC_DATA() \
for (int i = 0; i < input_tensor->numel(); i++) { \
*fp_data = scale_list[0] * (*int_data); \
++fp_data; \
++int_data; \
}
Tensor tmp_tensor;
CHECK(cpp_program_desc_.BlocksSize());
auto* main_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0);
for (size_t k = 0; k < main_block->OpsSize(); ++k) {
auto* op_desc = main_block->GetOp<cpp::OpDesc>(k);
if (op_desc->HasAttr("quantize_weight_bits")) { // weight quantized op
auto input_names = op_desc->input_vars();
for (auto& input_name : input_names) {
std::string input_scale_name = input_name + "_quant_scale";
if (op_desc->HasAttr(input_scale_name)) { // the input is quantized
auto input_tensor =
scope_->FindVar(input_name)->GetMutable<lite::Tensor>();
tmp_tensor.CopyDataFrom(*input_tensor);
auto scale_list =
op_desc->GetAttr<std::vector<float>>(input_scale_name);
int quantize_weight_bits =
op_desc->GetAttr<int>("quantize_weight_bits");
float* fp_data = input_tensor->mutable_data<float>();
std::string op_type = op_desc->Type();
if (op_type == "conv2d" || op_type == "depthwise_conv2d") {
int64_t h = input_tensor->dims()[0];
int64_t w = input_tensor->numel() / h;
CHECK_EQ(scale_list.size(), h);
if (quantize_weight_bits == 8) {
const int8_t* int_data = tmp_tensor.data<int8_t>();
PROCESS_CONV2D_DATA()
} else {
const int16_t* int_data = tmp_tensor.data<int16_t>();
PROCESS_CONV2D_DATA()
}
} else if (op_type == "fc" || op_type == "mul") {
if (quantize_weight_bits == 8) {
const int8_t* int_data = tmp_tensor.data<int8_t>();
PROCESS_FC_DATA()
} else {
const int16_t* int_data = tmp_tensor.data<int16_t>();
PROCESS_FC_DATA()
}
}
}
}
}
}
#undef PROCESS_CONV2D_DATA
#undef PROCESS_FC_DATA
}
}  // namespace lite
}  // namespace paddle
@@ -78,6 +78,8 @@ class LITE_API LightPredictor {
  void BuildRuntimeProgram(const cpp::ProgramDesc& prog);
  void DequantizeWeight();

 private:
  std::shared_ptr<Scope> scope_;
  std::unique_ptr<RuntimeProgram> program_;
......
@@ -116,6 +116,34 @@ static size_t PrecisionTypeLength(PrecisionType type) {
  }
}
template <typename T>
struct PrecisionTypeTrait {
constexpr static PrecisionType Type() { return PrecisionType::kUnk; }
};
#define _ForEachPrecisionTypeHelper(callback, cpp_type, precision_type) \
callback(cpp_type, ::paddle::lite_api::PrecisionType::precision_type);
#define _ForEachPrecisionType(callback) \
_ForEachPrecisionTypeHelper(callback, bool, kBool); \
_ForEachPrecisionTypeHelper(callback, float, kFloat); \
_ForEachPrecisionTypeHelper(callback, int8_t, kInt8); \
_ForEachPrecisionTypeHelper(callback, int16_t, kInt16); \
_ForEachPrecisionTypeHelper(callback, int, kInt32); \
_ForEachPrecisionTypeHelper(callback, int64_t, kInt64);
#define DefinePrecisionTypeTrait(cpp_type, precision_type) \
template <> \
struct PrecisionTypeTrait<cpp_type> { \
constexpr static PrecisionType Type() { return precision_type; } \
}
_ForEachPrecisionType(DefinePrecisionTypeTrait);
#undef _ForEachPrecisionTypeHelper
#undef _ForEachPrecisionType
#undef DefinePrecisionTypeTrait
#define TARGET(item__) paddle::lite_api::TargetType::item__
#define PRECISION(item__) paddle::lite_api::PrecisionType::item__
#define DATALAYOUT(item__) paddle::lite_api::DataLayoutType::item__
......
@@ -44,3 +44,4 @@ USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
USE_MIR_PASS(npu_subgraph_pass);
USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass);
@@ -617,7 +617,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
"fcmge v18.4s, v12.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
"fcmge v19.4s, v13.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
"fmul v20.4s, v12.4s, %[vscale].4s \n" /* mul */ \
"fmul v21.4s, v13.4s, %[vscale].4s \n" /* mul */ \
"ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
\
"fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \
@@ -1627,7 +1627,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
\
"vbif q4, q6, q15 @ choose \n" \
"vcge.f32 q7, q5, %q[vzero] @ q0 > 0 \n" \
"vmul.f32 q6, q5, q14 \n" \
"vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \
"vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \
\
@@ -1815,8 +1815,8 @@ void conv_depthwise_3x3s1_fp32(const float *din,
"vmul.f32 q12, q14, q9 \n" \
"vmul.f32 q13, q15, q9 \n" \
\
"vbif q14, q12, q10 \n" \
"vbif q15, q13, q11 \n" \
\
"vst1.32 {d28-d29}, [%[out1]]\n" \
"vst1.32 {d30-d31}, [%[out2]]\n"
......
@@ -209,9 +209,9 @@ namespace math {
"fcmge v7.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ \
"fmul v8.4s, v22.4s, %[vscale].4s \n" /* mul */ \
"bif v19.16b, v2.16b, v1.16b \n" /* choose*/ \
"bif v20.16b, v4.16b, v3.16b \n" /* choose*/ \
"bif v21.16b, v6.16b, v5.16b \n" /* choose*/ \
"bif v22.16b, v8.16b, v7.16b \n" /* choose*/
#define STORE /* save result */ \
"str q19, [%[outc0]], #16\n" \
"str q20, [%[outc1]], #16\n" \
......
@@ -360,12 +360,12 @@ __read_only image2d_t new_scale,
      READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
#endif

  output0 = activation_type4(output0);
  output1 = activation_type4(output1);
  output2 = activation_type4(output2);
  output3 = activation_type4(output3);

  if (out_w0 < old_w) {
    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0);
......
#include <cl_common.h>
__kernel void conv2d_5x5(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const int batch_index = out_nh / output_height;
const int out_nh_in_one_batch = out_nh % output_height;
const int filter_n0 = 4 * out_c + 0;
const int filter_n1 = 4 * out_c + 1;
const int filter_n2 = 4 * out_c + 2;
const int filter_n3 = 4 * out_c + 3;
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh_in_one_batch;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
#ifdef BIASE_CH
CL_DTYPE4 output =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
#else
CL_DTYPE4 output = 0.0f;
#endif
CL_DTYPE4 input;
CL_DTYPE4 filter[4];
int2 filter_pos0;
int2 filter_pos1;
int2 filter_pos2;
int2 filter_pos3;
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x,
in_pos_in_one_block.y + batch_index * input_height);
for (int j = 0; j < 5; j++) {
for (int k = 0; k < 5; k++) {
input = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x + (j - 2) * dilation,
pos_in.y + (k - 2) * dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)(
(in_pos_in_one_block.x + (j - 2) * dilation < 0 ||
in_pos_in_one_block.y + (k - 2) * dilation < 0 ||
in_pos_in_one_block.x + (j - 2) * dilation >= input_width ||
in_pos_in_one_block.y + (k - 2) * dilation >= input_height)
<< 15));
int filter_h = k;
int filter_w = j;
int filter_c = i;
filter_pos0.x = filter_c * 5 + filter_w;
filter_pos0.y = filter_n0 * 5 + filter_h;
filter_pos1.x = filter_c * 5 + filter_w;
filter_pos1.y = filter_n1 * 5 + filter_h;
filter_pos2.x = filter_c * 5 + filter_w;
filter_pos2.y = filter_n2 * 5 + filter_h;
filter_pos3.x = filter_c * 5 + filter_w;
filter_pos3.y = filter_n3 * 5 + filter_h;
filter[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos0);
filter[1] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos1);
filter[2] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos2);
filter[3] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos3);
output.x += dot(input, filter[0]);
output.y += dot(input, filter[1]);
output.z += dot(input, filter[2]);
output.w += dot(input, filter[3]);
}
}
}
#ifdef BATCH_NORM
output =
output * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
#endif
output = activation_type4(output);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
}
#include <cl_common.h>
__kernel void conv2d_7x7(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const int batch_index = out_nh / output_height;
const int out_nh_in_one_batch = out_nh % output_height;
const int filter_n0 = 4 * out_c + 0;
const int filter_n1 = 4 * out_c + 1;
const int filter_n2 = 4 * out_c + 2;
const int filter_n3 = 4 * out_c + 3;
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
int2 output_pos_in_one_block;
output_pos_in_one_block.x = out_w;
output_pos_in_one_block.y = out_nh_in_one_batch;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = output_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = output_pos_in_one_block.y * stride + offset;
#ifdef BIASE_CH
CL_DTYPE4 output =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
#else
CL_DTYPE4 output = 0.0f;
#endif
CL_DTYPE4 input;
CL_DTYPE4 filter[4];
int2 filter_pos0;
int2 filter_pos1;
int2 filter_pos2;
int2 filter_pos3;
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x,
in_pos_in_one_block.y + batch_index * input_height);
for (int j = 0; j < 7; j++) {
for (int k = 0; k < 7; k++) {
input = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x + (j - 3) * dilation,
pos_in.y + (k - 3) * dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)(
(in_pos_in_one_block.x + (j - 3) * dilation < 0 ||
in_pos_in_one_block.y + (k - 3) * dilation < 0 ||
in_pos_in_one_block.x + (j - 3) * dilation >= input_width ||
in_pos_in_one_block.y + (k - 3) * dilation >= input_height)
<< 15));
int filter_h = k;
int filter_w = j;
int filter_c = i;
filter_pos0.x = filter_c * 7 + filter_w;
filter_pos0.y = filter_n0 * 7 + filter_h;
filter_pos1.x = filter_c * 7 + filter_w;
filter_pos1.y = filter_n1 * 7 + filter_h;
filter_pos2.x = filter_c * 7 + filter_w;
filter_pos2.y = filter_n2 * 7 + filter_h;
filter_pos3.x = filter_c * 7 + filter_w;
filter_pos3.y = filter_n3 * 7 + filter_h;
filter[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos0);
filter[1] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos1);
filter[2] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos2);
filter[3] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos3);
output.x += dot(input, filter[0]);
output.y += dot(input, filter[1]);
output.z += dot(input, filter[2]);
output.w += dot(input, filter[3]);
}
}
}
#ifdef BATCH_NORM
output = output * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
#endif
output = activation_type4(output);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
}
...@@ -14,15 +14,72 @@ limitations under the License. */
#include <cl_common.h>
__kernel void elementwise_add(__read_only image2d_t input,
__read_only image2d_t bias,
__write_only image2d_t outputImage) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords);
CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords);
CL_DTYPE4 output = activation_type4(in + biase);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
}
__kernel void channel_add(__read_only image2d_t input,
__read_only image2d_t bias,
__write_only image2d_t outputImage,
int w) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
int2 coords_bias;
coords_bias.x = x % w;
coords_bias.y = 0;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords);
CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias);
CL_DTYPE4 output = in + (CL_DTYPE4)(biase.x);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
}
__kernel void width_add(__read_only image2d_t input,
__read_only image2d_t bias,
__write_only image2d_t outputImage,
int w) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
int2 coords_bias;
coords_bias.x = x % w;
coords_bias.y = 0;
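// The image x coordinate encodes c_block * w + w_pos, so x % w recovers the
// width position; the bias image is a single row indexed by that position.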
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords);
CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias);
CL_DTYPE4 output;
output.x = in.x + biase.x;
output.y = in.y + biase.x;
output.z = in.z + biase.x;
output.w = in.w + biase.x;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void elementwise_mul(__read_only image2d_t input, __read_only image2d_t bias,
__write_only image2d_t outputImage) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords);
CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords);
CL_DTYPE4 output = in * biase;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
}
__kernel void channel_mul_d1(__read_only image2d_t input, __read_only image2d_t bias,
__write_only image2d_t outputImage, int w) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
int2 coords_bias;
coords_bias.x = x % w;
coords_bias.y = 0;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords);
CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias);
CL_DTYPE4 output = in * (CL_DTYPE4)(biase.x);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
}
__kernel void channel_mul_d2(__read_only image2d_t input, __read_only image2d_t bias,
__write_only image2d_t outputImage, int w, int h) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
int2 coords_bias;
coords_bias.x = x % w;
coords_bias.y = y % h;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords);
CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias);
CL_DTYPE4 output = in * (CL_DTYPE4)(biase.x);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
}
__kernel void channel_mul_d4(__read_only image2d_t input, __read_only image2d_t bias,
__write_only image2d_t outputImage, int w) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
int2 coords_bias;
coords_bias.x = x / w;
coords_bias.y = 0;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords);
CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias);
CL_DTYPE4 output = in * biase;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void nearest_interp(__read_only image2d_t input, __write_only image2d_t output,
__private const float scale_h, __private const float scale_w,
__private const int in_dims_h, __private const int out_dims_h,
__private const int in_dims_w, __private const int out_dims_w) {
const int c = get_global_id(0);
const int w = get_global_id(1);
const int nh = get_global_id(2);
int2 output_pos;
output_pos.x = c * out_dims_w + w;
output_pos.y = nh;
int out_n = nh / out_dims_h;
int out_h = nh % out_dims_h;
int2 input_pos;
input_pos.x = c * in_dims_w + w / scale_w;
input_pos.y = out_n * in_dims_h + out_h / scale_h;
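// Nearest-neighbor lookup: output (h, w) maps back to input
// (h / scale_h, w / scale_w); scale_* is assumed to be the
// output-to-input size ratio supplied by the host side.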
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half4 input_data = read_imageh(input, sampler, (int2)(input_pos.x, input_pos.y));
write_imageh(output, (int2)(output_pos.x , output_pos.y), input_data);
}
...@@ -35,6 +35,7 @@ lite_cc_library(mir_passes
demo_pass.cc
runtime_context_assign_pass.cc
memory_optimize_pass.cc
weight_quantization_preprocess_pass.cc
DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs})
# lite_cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS
......
...@@ -100,14 +100,17 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
auto eps = matched.at("bn")->stmt()->op_info()->GetAttr<float>("epsilon");
// conv
std::string conv_weight_name = matched.at("conv_weight")->arg()->name;
auto conv_weight_t =
scope->FindVar(conv_weight_name)->GetMutable<lite::Tensor>();
CHECK_EQ(static_cast<size_t>(bn_scale_t->data_size()),
static_cast<size_t>(conv_weight_t->dims()[0]))
<< "The BN bias's size should be equal to the size of the first "
<< "dim size of the conv weights";
size_t weight_num = conv_weight_t->data_size();
bool enable_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false;
bool is_weight_quantization =
conv_op_desc->HasAttr("quantize_weight_bits") ? true : false;
// compute BN alpha and beta
Tensor alpha_tensor, beta_tensor;
...@@ -160,6 +163,16 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
}
}
conv_op_desc->SetAttr("weight_scale", weight_scale);
} else if (is_weight_quantization) {
std::string scale_name = conv_weight_name + "_quant_scale";
if (conv_op_desc->HasAttr(scale_name)) {
auto scale = conv_op_desc->GetAttr<std::vector<float>>(scale_name);
CHECK_EQ(scale.size(), alpha_tensor.numel());
for (size_t i = 0; i < scale.size(); i++) {
scale[i] *= alpha_data[i];
}
conv_op_desc->SetAttr(scale_name, scale);
}
} else {
// compute new conv_weight
auto conv_weight_d = conv_weight_t->mutable_data<float>();
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/weight_quantization_preprocess_pass.h"
#include <memory>
#include <string>
#include <vector>
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void WeightQuantizationPreprocessPass::Apply(
const std::unique_ptr<SSAGraph>& graph) {
std::vector<std::string> weight_quantized_op = {"conv2d", "depthwise_conv2d"};
for (auto& node : graph->StmtTopologicalOrder()) {
if (node->IsStmt() &&
std::find(weight_quantized_op.begin(),
weight_quantized_op.end(),
node->AsStmt().op_type()) != weight_quantized_op.end()) {
auto* scope = node->stmt()->op()->scope();
auto* op_desc = node->stmt()->mutable_op_info();
if (op_desc->HasAttr("quantize_weight_bits")) {
for (auto& input_name : op_desc->input_vars()) {
std::string scale_name = input_name + "_quant_scale";
if (op_desc->HasAttr(scale_name)) {
VLOG(5) << "op:" << op_desc->Type() << " input_name:" << input_name;
auto input_tensor =
scope->FindVar(input_name)->GetMutable<lite::Tensor>();
int weight_out_channel = static_cast<int>(input_tensor->dims()[0]);
auto input_scale = op_desc->GetAttr<std::vector<float>>(scale_name);
// scale length is equal to weight out channel
std::vector<float> scale_list(weight_out_channel, input_scale[0]);
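// e.g. a conv2d with 32 output channels and a single saved per-tensor
// scale {s} now carries the 32-element list {s, s, ...}, so downstream
// passes can treat the weight scale as channel-wise.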
op_desc->SetAttr(scale_name, scale_list);
}
}
}
}
}
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(weight_quantization_preprocess_pass,
paddle::lite::mir::WeightQuantizationPreprocessPass)
.BindTargets({TARGET(kAny)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "lite/core/mir/pass.h"
#include "lite/core/op_registry.h"
#include "lite/core/target_wrapper.h"
namespace paddle {
namespace lite {
namespace mir {
/*
* If the model is quantized by WeightQuantization in PostTrainingQuantization,
* the data type of the weight in quantized ops (conv2d, depthwise_conv2d) is
* int, and the scale is saved in the quantized ops.
* WeightQuantizationPreprocessPass obtains the scale value, expands the
* scale value to a list, and saves the list in the quantized ops.
*/
class WeightQuantizationPreprocessPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
...@@ -55,10 +55,11 @@ class Optimizer {
if (passes.empty()) {
std::vector<std::string> passes_local{
{"lite_quant_dequant_fuse_pass",         //
"weight_quantization_preprocess_pass",   //
"lite_conv_elementwise_fuse_pass",       // conv-elemwise-bn
"lite_conv_bn_fuse_pass",                //
"lite_conv_elementwise_fuse_pass",       // conv-bn-elemwise
// TODO(Superjomn) Refine the fusion related design to select fusion
// kernels for devices automatically.
"lite_conv_activation_fuse_pass",  //
......
...@@ -37,22 +37,21 @@ value_type DDimLite::count(int start, int end) const {
if (end < start) {
return 0;
}
value_type sum = 1;
for (auto i = start; i < end; ++i) {
sum *= data_[i];
}
return sum;
}
DDimLite DDimLite::Slice(int start, int end) const {
start = std::max(start, 0);
end = std::min(end, static_cast<int>(data_.size()));
std::vector<value_type> new_dim(end - start);
for (int i = start; i < end; i++) {
new_dim[i - start] = data_[i];
}
return DDim(new_dim);
}
std::string DDimLite::repr() const {
......
...@@ -204,7 +204,7 @@ class TensorLite {
// LOG(INFO) << "Set dims: " << dims_ << " for tensor " << this;
}
void Resize(const std::vector<int64_t> &x) {
dims_.ConstructFrom(x);
// LOG(INFO) << "Set dims: " << dims_ << " for tensor " << this;
}
...@@ -230,22 +230,7 @@ class TensorLite {
// For other devices, T and R may be the same type.
template <typename T, typename R = T>
R *mutable_data() {
precision_ = lite_api::PrecisionTypeTrait<T>::Type();
memory_size_ = dims_.production() * sizeof(T);
buffer_->ResetLazy(target_, memory_size_);
// char *ptr = static_cast<char *>(buffer_->data()) + offset_;
......
# get the name of the current operating system: Linux or Darwin
SYSTEM=$(shell "uname")
CXX_DEFINES = -DARM_WITH_OMP -DHPPL_STUB_FUNC -DLITE_WITH_ARM -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK \
-DLITE_WITH_LINUX -DPADDLE_DISABLE_PROFILER -DPADDLE_NO_PYTHON -DPADDLE_WITH_TESTING
LDFLAGS = -latomic -pthread -ldl -llog -lz
SYSROOT_COMPLILE = --sysroot=$(NDK_ROOT)/sysroot
SYSTEM_INCLUDES = -I$(NDK_ROOT)/sources/cxx-stl/llvm-libc++/include \
-I$(NDK_ROOT)/sources/cxx-stl/llvm-libc++abi/include \
-I$(NDK_ROOT)/sources/android/support/include \
-I$(NDK_ROOT)/sysroot/usr/include \
ifeq ($(ARM_ABI), arm8)
ifeq ($(SYSTEM), Linux)
CC = $(NDK_ROOT)/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-g++
else
CC = $(NDK_ROOT)/toolchains/aarch64-linux-android-4.9/prebuilt/darwin-x86_64/bin/aarch64-linux-android-g++
endif
CXX_FLAGS = -funwind-tables -no-canonical-prefixes -D__ANDROID_API__=23 -fexceptions -frtti -std=c++11 -fopenmp -O3 -DNDEBUG -fPIE
CXXFLAGS_LINK = $(CXX_FLAGS) -pie -Wl,--gc-sections
SYSROOT_LINK = --sysroot=$(NDK_ROOT)/platforms/android-24/arch-arm64
SYSTEM_LIBS = $(NDK_ROOT)/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++_static.a \
$(NDK_ROOT)/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++abi.a
INCLUDES = $(SYSTEM_INCLUDES) -I$(NDK_ROOT)/sysroot/usr/include/aarch64-linux-android
else
ifeq ($(SYSTEM), Linux)
CC = $(NDK_ROOT)/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-g++
else
CC = $(NDK_ROOT)/toolchains/arm-linux-androideabi-4.9/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-g++
endif
CXX_FLAGS = -march=armv7-a -mthumb -mfpu=neon -mfloat-abi=softfp -funwind-tables -no-canonical-prefixes \
-D__ANDROID_API__=23 -fexceptions -frtti -std=c++11 -fopenmp -O3 -DNDEBUG -fPIE
CXXFLAGS_LINK = $(CXX_FLAGS) -pie -Wl,--fix-cortex-a8 -Wl,--gc-sections -Wl,-z,nocopyreloc
SYSROOT_LINK = --sysroot=$(NDK_ROOT)/platforms/android-23/arch-arm
SYSTEM_LIBS = $(NDK_ROOT)/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libc++_static.a \
$(NDK_ROOT)/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libc++abi.a \
$(NDK_ROOT)/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libandroid_support.a \
$(NDK_ROOT)/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libunwind.a
INCLUDES = $(SYSTEM_INCLUDES) -I$(NDK_ROOT)/sysroot/usr/include/arm-linux-androideabi
endif
...@@ -2,34 +2,40 @@ if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_OPENCL))
return ()
endif()
set(cl_kernel_deps op_params cl_runtime cl_context cl_wrapper cl_target_wrapper cl_image_converter)
add_kernel(fc_opencl OPENCL basic SRCS fc_compute.cc DEPS ${cl_kernel_deps})
add_kernel(mul_opencl OPENCL basic SRCS mul_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_mul_opencl OPENCL basic SRCS elementwise_mul_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fusion_elementwise_add_activation_opencl
OPENCL basic SRCS fusion_elementwise_add_activation_compute.cc
DEPS elementwise_add_opencl ${cl_kernel_deps})
add_kernel(pool_opencl OPENCL basic SRCS pool_compute.cc DEPS ${cl_kernel_deps})
add_kernel(io_copy_compute_opencl OPENCL basic SRCS io_copy_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps})
add_kernel(relu_opencl OPENCL basic SRCS relu_compute.cc DEPS ${cl_kernel_deps})
add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(conv2d_1x1_opencl OPENCL basic SRCS conv2d_1x1_compute.cc DEPS ${cl_kernel_deps})
add_kernel(reshape_opencl OPENCL basic SRCS reshape_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps} cl_image_converter)
add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps})
add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_compute.cc DEPS ${cl_kernel_deps})
lite_cc_test(test_elementwise_add_opencl SRCS elementwise_add_compute_test.cc
DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_elementwise_mul_opencl SRCS elementwise_mul_compute_test.cc
DEPS elementwise_mul_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_pool_opencl SRCS pool_compute_test.cc
DEPS pool_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_fc_opencl SRCS fc_compute_test.cc
DEPS fc_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
# TODO(ysh329): comment for buffer-impl mul
#lite_cc_test(test_mul_opencl SRCS mul_compute_test.cc
...@@ -37,34 +43,41 @@ lite_cc_test(test_fc_opencl SRCS fc_compute_test.cc
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_io_copy_compute_opencl SRCS io_copy_compute_test.cc
DEPS io_copy_compute_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#TODO(ysh329): comment buffer-impl relu
lite_cc_test(test_relu_opencl SRCS relu_compute_test.cc
DEPS relu_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_depthwise_conv2d_opencl SRCS depthwise_conv2d_compute_test.cc
DEPS depthwise_conv2d_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_depthwise_conv2d_basic_opencl SRCS depthwise_conv2d_basic_compute_test.cc
DEPS depthwise_conv2d_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#lite_cc_test(test_conv2d_1x1_opencl SRCS conv2d_1x1_compute_test.cc
# DEPS conv2d_1x1_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_reshape_opencl SRCS reshape_compute_test.cc
DEPS reshape_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_conv_opencl SRCS conv_compute_test.cc
DEPS conv_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_conv_image2d_opencl SRCS conv_image2d_compute_test.cc
DEPS conv_opencl op_registry program context cl_image_converter
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_layout_opencl SRCS layout_compute_test.cc
DEPS layout_opencl op_registry program context cl_image_converter
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_nearest_interp_opencl SRCS nearest_interp_compute_test.cc
DEPS nearest_interp_opencl layout_opencl op_registry program context cl_image_converter
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
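# NOTE: cl_image_converter now ships in cl_kernel_deps, so kernel targets above
# no longer list it explicitly; only the image-layout tests still add it.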
...@@ -13,9 +13,13 @@
// limitations under the License.
#include "lite/kernels/opencl/conv_compute.h"
#include <sstream>
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
namespace paddle {
...@@ -242,7 +246,6 @@ void ConvCompute::Conv2d1x1() {
GemmBatched(kernel, x_d, filter_d, bias_d, output_d, batch_size, m, n, k);
}
// a: filter_d ==> <m, k> <=> <oc, ic>
// b: x_d ==> <k, n> <=> <ic, ih*iw>
// c: output_d ==> <m, n> <=> <oc, ih*iw>
...@@ -294,6 +297,582 @@ void ConvCompute::GemmBatched(cl::Kernel& kernel,
void ConvCompute::Run() { (this->*impl_)(); }
/* image kernel*/
void ConvImageCompute::PrepareForRun() {
const auto& param = this->Param<param_t>();
auto x_dims = param.x->dims();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
float* filter_cpu = param.filter->mutable_data<float>();
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
int bs = x_dims[0];
int c_in = x_dims[1];
int h_out = output_dims[2];
int w_out = output_dims[3];
int kernel_h = filter_dims[2]; // oihw
int kernel_w = filter_dims[3];
auto paddings = *param.paddings;
auto dilations = *param.dilations;
int stride_h = param.strides[0];
int stride_w = param.strides[1];
int pad_h = paddings[0];
int pad_w = paddings[2];
int groups = param.groups;
bool relu_fused = param.fuse_relu;
bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1);
bool zero_pad = (pad_h == 0) && (pad_w == 0);
bool pad_equal =
((paddings[0] == paddings[1]) && (paddings[1] == paddings[2]) &&
(paddings[2] == paddings[3]));
bool stride_equal = stride_h == stride_w;
bool dilation_equal = dilations[0] == dilations[1];
CHECK(pad_equal && stride_equal && dilation_equal);
VLOG(3) << "Is relu fused? / " << (relu_fused ? "Yes" : "No");
VLOG(3) << "groups:" << groups << " stride_h:" << stride_h
<< " stride_w:" << stride_w << " pad_h:" << pad_h
<< " pad_w:" << pad_w << " kernel_h:" << kernel_h
<< " kernel_w:" << kernel_w;
VLOG(3) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2]
<< " " << x_dims[3];
VLOG(3) << "output_dims:" << output_dims[0] << " " << output_dims[1] << " "
<< output_dims[2] << " " << output_dims[3];
VLOG(3) << "filter_dims:" << filter_dims[0] << " " << filter_dims[1] << " "
<< filter_dims[2] << " " << filter_dims[3];
if (kernel_h == 1 && kernel_w == 1) {
// conv2d_1x1
if (param.x->dims()[1] % 4 == 0) {
kernel_func_names_.push_back("conv2d_1x1_simple");
} else {
kernel_func_names_.push_back("conv2d_1x1");
}
kernel_func_paths_.push_back("image/conv2d_1x1_kernel.cl");
CLImageConverterNWBlock converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
std::vector<float> filter_image_v(filter_image_dims[0] *
filter_image_dims[1] * 4); // 4 : RGBA
converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
filter_gpu_image_.mutable_data<float, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
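// The OIHW filter has been repacked on the host into the 2D image layout
// the kernel samples (4 values per texel) and uploaded once here, so
// Run() only needs to set kernel arguments.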
impl_ = &ConvImageCompute::Conv2d1x1;
} else if (kernel_h == 5 && kernel_w == 5) {
// conv2d_5x5
kernel_func_names_.push_back("conv2d_5x5");
kernel_func_paths_.push_back("image/conv2d_5x5_kernel.cl");
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
std::vector<float> filter_image_v(filter_image_dims[0] *
filter_image_dims[1] * 4); // 4 : RGBA
converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
filter_gpu_image_.mutable_data<float, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d5x5;
} else if (kernel_h == 7 && kernel_w == 7) {
// conv2d_7x7
kernel_func_names_.push_back("conv2d_7x7");
kernel_func_paths_.push_back("image/conv2d_7x7_kernel.cl");
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
std::vector<float> filter_image_v(filter_image_dims[0] *
filter_image_dims[1] * 4); // 4 : RGBA
converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
this->filter_gpu_image_.mutable_data<float, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d7x7;
} else {
LOG(FATAL) << "conv image compute does not support this condition yet!";
}
std::string build_options_single(" -DCL_DTYPE_float");
// relu options
if (relu_fused) {
build_options_single += " -DRELU";
} else if (param.activation_param.active_type ==
lite_api::ActivationType::kRelu6) {
build_options_single += " -DRELU6";
} else {
// do nothing
}
// bias options
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
if (has_bias) {
build_options_single +=
is_element_wise_bias ? " -DBIASE_ELE" : " -DBIASE_CH";
// convert cpu buffer bias --> gpu image
CLImageConverterFolder bias_converter;
const DDim& bias_image_dims =
bias_converter.InitImageDimInfoWith(param.bias->dims());
std::vector<float> bias_image_v(bias_image_dims[0] * bias_image_dims[1] *
4);
float* bias_cpu_data = param.bias->mutable_data<float>();
bias_converter.NCHWToImage(
bias_cpu_data, bias_image_v.data(), param.bias->dims());
this->bias_gpu_image_.mutable_data<float, cl::Image2D>(
bias_image_dims[0], bias_image_dims[1], bias_image_v.data());
// convert cpu buffer bias --> gpu image --- end ----
}
build_options_.push_back(build_options_single);
for (size_t i = 0; i < kernel_func_names_.size(); i++) {
context.cl_context()->AddKernel(
kernel_func_names_[i], kernel_func_paths_[i], build_options_[i]);
}
}
void ConvImageCompute::Conv2d1x1() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto* input_image = param.x->data<float, cl::Image2D>();
auto* filter_image = filter_gpu_image_.data<float, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int output_width = output_dims[3];
int output_height = output_dims[2];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<float, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
static_cast<int>(paddings[0]);
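// offset = kernel_h / 2 - pad aligns the sampling window with the padded
// input; for this 1x1 path it reduces to -pad.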
// calc input_c_block
auto input_image_shape = InitImageDimInfoWith(input_dims);
int input_c_block = input_image_shape["width"] / input_dims[3];
int input_c = input_dims[1];
auto dilations = *param.dilations;
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
VLOG(4) << "============ conv2d_1x1 params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
VLOG(4) << "input_c_block: " << input_c_block;
VLOG(4) << "input_c: " << input_c;
VLOG(4) << "input_image: " << input_image;
VLOG(4) << "filter_dims: " << filter_dims;
VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
// handle bias: use a buffer for channel-wise bias, an image for element-wise
const cl::Buffer* bias_buf = nullptr;
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_.data<float, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
std::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int maped_w = maptofactor(w, 4);
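// Each conv2d_1x1 work-item covers 4 output pixels along W, so the W
// dimension of the launch grid is rounded up (assumption:
// maptofactor(w, 4) == (w + 3) / 4, judging from its use here).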
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "maped_w: " << maped_w;
VLOG(4) << "hasbias: " << has_bias;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, maped_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, offset);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_c);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(maped_w),
static_cast<size_t>(default_work_size.data()[2])};
VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
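// Register the output image on the wait list so downstream kernels or the
// host-side fetch can synchronize on this event before reading the result.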
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d5x5() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto* input_image = param.x->data<float, cl::Image2D>();
auto* filter_image = filter_gpu_image_.data<float, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int output_width = output_dims[3];
int output_height = output_dims[2];
int filter_width = filter_dims[3];
int filter_height = filter_dims[2];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<float, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
static_cast<int>(paddings[0]);
// calc input_c_block
auto input_image_shape = InitImageDimInfoWith(input_dims);
int input_c_block = input_image_shape["width"] / input_dims[3];
int input_c = input_dims[1];
auto dilations = *param.dilations;
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
VLOG(4) << "============ conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
VLOG(4) << "input_c_block: " << input_c_block;
VLOG(4) << "input_c: " << input_c;
VLOG(4) << "input_image: " << input_image;
VLOG(4) << "input_dims: " << input_dims;
VLOG(4) << "filter_dims: " << filter_dims;
VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_.data<float, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
VLOG(4) << "set bias_image: ";
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, offset);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d7x7() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto* input_image = param.x->data<float, cl::Image2D>();
auto* filter_image = filter_gpu_image_.data<float, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int output_width = output_dims[3];
int output_height = output_dims[2];
int filter_width = filter_dims[3];
int filter_height = filter_dims[2];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<float, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
static_cast<int>(paddings[0]);
// calc input_c_block
auto input_image_shape = InitImageDimInfoWith(input_dims);
int input_c_block = input_image_shape["width"] / input_dims[3];
int input_c = input_dims[1];
auto dilations = *param.dilations;
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
VLOG(4) << "============ conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
VLOG(4) << "input_c_block: " << input_c_block;
VLOG(4) << "input_c: " << input_c;
VLOG(4) << "input_image: " << input_image;
VLOG(4) << "input_dims: " << input_dims;
VLOG(4) << "filter_dims: " << filter_dims;
VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_.data<float, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
VLOG(4) << "set bias_image: ";
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, offset);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Run() { (this->*impl_)(); }
} // namespace opencl
} // namespace kernels
} // namespace lite
...@@ -310,3 +889,21 @@ REGISTER_LITE_KERNEL(conv2d,
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.Finalize();
REGISTER_LITE_KERNEL(conv2d,
kOpenCL,
kFloat,
kImageDefault,
paddle::lite::kernels::opencl::ConvImageCompute,
image2d)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize();
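// conv2d now has two OpenCL kernel registrations: the buffer/NCHW ConvCompute
// above and this image2d variant; the framework picks one according to the
// data layout selected for the graph.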
...@@ -17,6 +17,7 @@
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/tensor.h"
...@@ -57,6 +58,30 @@ class ConvCompute
std::shared_ptr<cl::Event> event_{new cl::Event};
};
class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ConvParam;
using kernel_t = void (ConvImageCompute::*)();
void PrepareForRun() override;
void Run() override;
private:
void Conv2d1x1();
void Conv2d5x5();
void Conv2d7x7();
kernel_t impl_;
std::vector<std::string> kernel_func_names_{};
std::vector<std::string> kernel_func_paths_{};
std::vector<std::string> build_options_{};
std::shared_ptr<cl::Event> event_{new cl::Event};
Tensor filter_gpu_image_;
Tensor bias_gpu_image_;
};
} // namespace opencl
} // namespace kernels
} // namespace lite
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <cmath>  // std::abs on floating-point values, used in the checks below
#include <random>
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
// #define SHADOW_LOG LOG(INFO)
#define SHADOW_LOG VLOG(4)
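// conv_basic: naive grouped-convolution reference on NCHW buffers, used by
// the tests below to validate the OpenCL image kernels; supports an optional
// bias and "relu"/"relu6" activation fusing.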
template <typename Dtype1, typename Dtype2>
static void conv_basic(const Dtype1* din,
Dtype2* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
const Dtype1* weights,
const Dtype2* bias,
int group,
int kernel_w,
int kernel_h,
int stride_w,
int stride_h,
int dila_w,
int dila_h,
int pad_w,
int pad_h,
bool flag_bias,
std::string flag_relu) {
Dtype2 beta = 0;
auto src_data = din;
auto dst_data_ref = dout;
auto weights_data = weights;
auto with_bias = flag_bias;
auto bias_data = bias;
int in_num = num;
int out_channels = chout;
int out_h = hout;
int out_w = wout;
int in_channel = chin;
int in_h = hin;
int in_w = win;
int out_c_group = out_channels / group;
int in_c_group = in_channel / group;
for (int n = 0; n < in_num; ++n) {
for (int g = 0; g < group; ++g) {
for (int oc = 0; oc < out_c_group; ++oc) {
for (int oh = 0; oh < out_h; ++oh) {
for (int ow = 0; ow < out_w; ++ow) {
int out_idx = n * group * out_c_group * out_h * out_w +
g * out_c_group * out_h * out_w + oc * out_h * out_w +
oh * out_w + ow;
Dtype2 bias_d =
with_bias ? (bias_data[g * out_c_group + oc]) : (Dtype2)0;
dst_data_ref[out_idx] = bias_d; // + dst_data_ref[out_idx] * beta;
for (int ic = 0; ic < in_c_group; ++ic) {
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
int iw = ow * stride_w - pad_w + kw * (dila_w);
int ih = oh * stride_h - pad_h + kh * (dila_h);
if (iw < 0 || iw >= in_w) continue;
if (ih < 0 || ih >= in_h) continue;
int iidx = n * in_channel * in_h * in_w +
g * in_c_group * in_h * in_w + ic * in_h * in_w +
ih * in_w + iw;
int widx =
g * out_c_group * in_c_group * kernel_h * kernel_w +
oc * in_c_group * kernel_h * kernel_w +
ic * kernel_h * kernel_w + kh * kernel_w + kw;
dst_data_ref[out_idx] += src_data[iidx] * weights_data[widx];
}
}
}
if (flag_relu == "relu") {
dst_data_ref[out_idx] = dst_data_ref[out_idx] > (Dtype2)0
? dst_data_ref[out_idx]
: (Dtype2)0;
} else if (flag_relu == "relu6") {
auto dst_tmp = (dst_data_ref[out_idx] > (Dtype2)0)
? dst_data_ref[out_idx]
: (Dtype2)0;
dst_data_ref[out_idx] = (dst_tmp < 6.f) ? dst_tmp : 6.f;
}
}
}
}
}
}
}
int ConvOutputSize(int input_size,
int filter_size,
int dilation,
int pad_left,
int pad_right,
int stride) {
const int dkernel = dilation * (filter_size - 1) + 1;
int output_size =
(input_size + (pad_left + pad_right) - dkernel) / stride + 1;
return output_size;
}
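// Worked example (illustration only, matching the 5x5 test below):
// input_size=5, filter_size=5, dilation=1, pad_left=pad_right=2, stride=1:
//   dkernel     = 1 * (5 - 1) + 1 = 5
//   output_size = (5 + 4 - 5) / 1 + 1 = 5, i.e. the spatial size is kept.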
// #define PRINT_RESULT
// #define LOOP_TEST
TEST(conv2d, compute_image2d_1x1) {
// conv infos
const int ksize = 1;
const int stride = 1;
const int pad = 0;
const int group = 1;
const int dilation = 0;
// int loop_cnt = 0;
#ifdef LOOP_TEST
for (int batch_size = 1; batch_size < 4; ++batch_size) {
for (int oc = 4; oc < 10; oc += 1) { // oc
for (int ih = 4; ih < 9; ih += 1) { // ih
for (int iw = 4; iw < 10; iw += 1) { // iw
for (int ic = 4; ic < 10; ic += 1) { // ic
for (bool bias_flag : {true, false}) {
for (std::string relu_flag : {"relu"}) {
#else
const int batch_size = 1;
const int oc = 4;
const int ih = 8;
const int iw = 8;
const int ic = 4;
const bool bias_flag = true;
const std::string relu_flag = "relu";
#endif
const int oh = ih;
const int ow = iw;
SHADOW_LOG << "to get kernel ...";
auto kernels =
KernelRegistry::Global().Create("conv2d",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
SHADOW_LOG << "created conv2d_1x1 kernel";
SHADOW_LOG << "prepare kernel ------";
lite::Tensor input, filter, bias, output;
operators::ConvParam param;
param.x = &input;
param.filter = &filter;
param.output = &output;
if (bias_flag) {
param.bias = &bias;
}
if (relu_flag == "relu") {
param.fuse_relu = true;
} else if (relu_flag == "None") {
param.fuse_relu = false;
} else if (relu_flag == "relu6") {
param.activation_param.Relu_clipped_coef = 6.f;
param.activation_param.has_active = true;
param.activation_param.active_type =
lite_api::ActivationType::kRelu6;
}
std::vector<int> paddings = {pad, pad, pad, pad};
std::vector<int> dilations = {dilation, dilation};
param.paddings = std::make_shared<std::vector<int>>(paddings);
param.dilations = std::make_shared<std::vector<int>>(dilations);
param.strides = std::vector<int>{stride, stride};
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
std::unique_ptr<KernelContext> conv_1x1_context(
new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(conv_1x1_context->As<OpenCLContext>()));
kernel->SetContext(std::move(conv_1x1_context));
const DDim& input_dim =
lite::DDim{std::vector<int64_t>({batch_size, ic, ih, iw})};
const DDim& filter_dim =
lite::DDim{std::vector<int64_t>({oc, ic, ksize, ksize})};
const DDim& out_dim =
lite::DDim{std::vector<int64_t>({batch_size, oc, ih, iw})};
// element wise bias
const DDim& bias_dim = lite::DDim{std::vector<int64_t>({oc})};
param.x->Resize(input_dim);
param.filter->Resize(filter_dim);
param.output->Resize(out_dim);
if (bias_flag) {
param.bias->Resize(bias_dim);
}
kernel->SetParam(param);
size_t input_image_width = iw * ((ic + 3) / 4);
size_t input_image_height = ih * batch_size;
size_t out_image_width = ow * ((oc + 3) / 4);
size_t out_image_height = oh * batch_size;
size_t bias_image_width = ow * ((oc + 3) / 4);
size_t bias_image_height = oh * batch_size;
size_t filter_image_width = ksize * ((oc + 3) / 4);
size_t filter_image_height = ic * ksize;
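// Note: each cl::Image2D texel packs 4 values (RGBA), so an NCHW tensor
// maps to an image of width w * ceil(c / 4), with the batch folded into
// the image height.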
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
std::default_random_engine engine;
std::uniform_real_distribution<float> gen(-5, 5);
std::vector<float> input_v(batch_size * ic * ih * iw);
std::vector<float> filter_v(oc * ic * ksize * ksize);
std::vector<float> output_v(batch_size * oc * ih * iw);
std::vector<float> bias_v(oc);
SHADOW_LOG << "gen input and filter ...";
for (auto& i : input_v) {
i = gen(engine);
}
for (auto& f : filter_v) {
f = gen(engine);
}
SHADOW_LOG << "after gen input and filter ...";
SHADOW_LOG << "input_v.size(): " << input_v.size();
SHADOW_LOG << "filter_v.size(): " << filter_v.size();
SHADOW_LOG << "output_v.size(): " << output_v.size();
SHADOW_LOG << "bias_v.size(): " << bias_v.size();
SHADOW_LOG << "input_dim.production(): "
<< input_dim.production();
SHADOW_LOG << "filter_dim.production(): "
<< filter_dim.production();
SHADOW_LOG << "out_dim.production(): " << out_dim.production();
SHADOW_LOG << "bias_dim.production(): "
<< bias_dim.production();
SHADOW_LOG << "4 * input_image_height * input_image_width: "
<< 4 * input_image_height * input_image_width;
SHADOW_LOG << "4 * filter_image_width * filter_image_height: "
<< 4 * filter_image_width * filter_image_height;
CHECK(input_dim.production() == input_v.size());
CHECK_LE(input_dim.production(),
4 * input_image_height * input_image_width);
CHECK(filter_dim.production() == filter_v.size());
CHECK_LE(filter_dim.production(),
4 * filter_image_width * filter_image_height);
paddle::lite::CLImageConverterDefault default_convertor;
SHADOW_LOG << "set mapped input ...";
std::vector<float> x_image_v(
input_image_width * input_image_height * 4); // 4 : RGBA
std::vector<float> filter_image_v(
filter_image_width * filter_image_height * 4); // 4 :RGBA
std::vector<float> bias_image_v(
bias_image_width * bias_image_height * 4); // 4 : RGBA
std::vector<float> out_image_v(
out_image_width * out_image_height * 4); // 4 : RGBA
default_convertor.NCHWToImage(
input_v.data(), x_image_v.data(), input_dim);
SHADOW_LOG << "set mapped filter ...";
paddle::lite::CLImageConverterNWBlock nw_convertor;
nw_convertor.NCHWToImage(
filter_v.data(), filter_image_v.data(), filter_dim);
auto* input_image2d = input.mutable_data<float, cl::Image2D>(
input_image_width, input_image_height, x_image_v.data());
// assign filter as target arm
filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(),
filter_dim);
// auto* filter_image2d =
// filter.mutable_data<float, cl::Image2D>(
// filter_image_width,
// filter_image_height,
// filter_image_v.data());
SHADOW_LOG << "卷积核: ---- ";
for (int i = 0; i < filter_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << filter_v[i];
}
SHADOW_LOG << "卷积核1: ---- ";
const float* filter_p = filter.data<float>();
for (int i = 0; i < filter_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << *filter_p;
filter_p++;
}
SHADOW_LOG << "卷积核2: ---- ";
const float* filter_p2 = filter.mutable_data<float>();
for (int i = 0; i < filter_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << *filter_p2;
filter_p2++;
}
if (bias_flag) {
for (int i = 0; i < bias_dim.production(); ++i) {
bias_v[i] = static_cast<int>(gen(engine));
}
bias.Assign<float, lite::DDim, TARGET(kARM)>(bias_v.data(),
bias_dim);
// CLImageConverterFolder folder_convertor;
// folder_convertor.NCHWToImage(
// bias_v.data(), bias_image_v.data(),
// bias_dim);
//
// auto* bias_data = bias.mutable_data<float,
// cl::Image2D>(
// bias_image_width, bias_image_height,
// bias_image_v.data());
}
SHADOW_LOG << "resize output ...";
output.Resize(out_dim);
// cpu conv basic calc
lite::Tensor out_ref;
out_ref.Resize(out_dim);
SHADOW_LOG << "prepare kernel ready";
SHADOW_LOG << "kernel launch ...";
kernel->Launch();
SHADOW_LOG << "mutable output ...";
auto* output_image2d = output.mutable_data<float, cl::Image2D>(
out_image_width, out_image_height);
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
SHADOW_LOG << "--- Find the sync event for the target cl "
"tensor. ---";
auto& event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target"
"cl tensor.";
}
TargetWrapperCL::ImgcpySync(out_image_v.data(),
output.data<float, cl::Image2D>(),
out_image_width,
out_image_height,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
DDim out_image_shape =
default_convertor.InitImageDimInfoWith(output.dims());
default_convertor.ImageToNCHW(out_image_v.data(),
output_v.data(),
out_image_shape,
output.dims());
SHADOW_LOG << "mutable_data out_ref_data: ";
// run cpu ref
auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
SHADOW_LOG << " conv_basic beigin ..... ";
conv_basic<float, float>(input_v.data(),
out_ref_data,
batch_size,
oc,
oh,
ow,
ic,
ih,
iw,
filter_v.data(),
bias_v.data(), // mapped_bias,
group,
ksize,
ksize,
stride,
stride,
dilation,
dilation,
pad,
pad,
bias_flag,
relu_flag);
SHADOW_LOG << " conv_basic end ..... ";
SHADOW_LOG << " out_dim: " << out_dim;
const DDim& out_image_dims = lite::DDim{std::vector<int64_t>(
{static_cast<int64_t>(out_image_width),
static_cast<int64_t>(out_image_height)})};
for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2);
if (std::abs(output_v[i] - out_ref_data[i]) > 1e-2) {
LOG(FATAL) << "error idx:" << i;
}
}
#ifdef LOOP_TEST
}
}
}
}
}
}
}
#else
// nothing to do.
#endif
}
#undef LOOP_TEST
#undef PRINT_RESULT
// #define PRINT_RESULT
// #define LOOP_TEST
TEST(conv2d, compute_image2d_5x5) {
// conv infos
const int ksize = 5;
const int stride = 1;
const int pad = 2;
const int group = 1;
const int dilation = 1;
// int loop_cnt = 0;
#ifdef LOOP_TEST
for (int batch_size = 2; batch_size < 4; ++batch_size) {
for (int oc = 1; oc < 10; oc += 1) { // oc
for (int ih = 5; ih < 9; ih += 1) { // ih
int iw = ih;
for (int ic = 1; ic < 10; ic += 1) { // ic
for (bool bias_flag : {true, false}) {
for (std::string relu_flag : {/*true,*/ "relu"}) {
#else
const int batch_size = 2;
const int oc = 1;
const int ih = 5;
const int iw = 5;
const int ic = 1;
const bool bias_flag = true;
const std::string relu_flag = "relu";
#endif
const int oh =
ConvOutputSize(ih, ksize, dilation, pad, pad, stride);
const int ow =
ConvOutputSize(iw, ksize, dilation, pad, pad, stride);
SHADOW_LOG << "to get kernel ...";
auto kernels =
KernelRegistry::Global().Create("conv2d",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
SHADOW_LOG << "created conv2d kernel";
SHADOW_LOG << "prepare kernel ------";
lite::Tensor input, filter, bias, output;
operators::ConvParam param;
param.x = &input;
param.filter = &filter;
param.output = &output;
if (bias_flag) {
param.bias = &bias;
}
if (relu_flag == "relu") {
param.fuse_relu = true;
} else if (relu_flag == "None") {
param.fuse_relu = false;
} else if (relu_flag == "relu6") {
param.activation_param.Relu_clipped_coef = 6.f;
param.activation_param.has_active = true;
param.activation_param.active_type =
lite_api::ActivationType::kRelu6;
}
std::vector<int> paddings = {pad, pad, pad, pad};
std::vector<int> dilations = {dilation, dilation};
param.paddings = std::make_shared<std::vector<int>>(paddings);
param.dilations = std::make_shared<std::vector<int>>(dilations);
param.strides = std::vector<int>{stride, stride};
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
std::unique_ptr<KernelContext> conv_context(
    new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
    &(conv_context->As<OpenCLContext>()));
kernel->SetContext(std::move(conv_context));
const DDim& input_dim =
lite::DDim{std::vector<int64_t>({batch_size, ic, ih, iw})};
const DDim& filter_dim =
lite::DDim{std::vector<int64_t>({oc, ic, ksize, ksize})};
const DDim& out_dim =
lite::DDim{std::vector<int64_t>({batch_size, oc, oh, ow})};
// element wise bias
const DDim& bias_dim = lite::DDim{std::vector<int64_t>({oc})};
param.x->Resize(input_dim);
param.filter->Resize(filter_dim);
param.output->Resize(out_dim);
if (bias_flag) {
param.bias->Resize(bias_dim);
}
kernel->SetParam(param);
size_t input_image_width = iw * ((ic + 3) / 4);
size_t input_image_height = ih * batch_size;
size_t out_image_width = ow * ((oc + 3) / 4);
size_t out_image_height = oh * batch_size;
size_t bias_image_width = ow * ((oc + 3) / 4);
size_t bias_image_height = oh * batch_size;
size_t filter_image_width = ksize * ((ic + 3) / 4);
size_t filter_image_height = oc * ksize;
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
std::default_random_engine engine;
std::uniform_real_distribution<float> gen(-5, 5);
std::vector<float> input_v(batch_size * ic * ih * iw);
std::vector<float> filter_v(oc * ic * ksize * ksize);
std::vector<float> output_v(batch_size * oc * oh * ow);
std::vector<float> bias_v(oc);
SHADOW_LOG << "gen input and filter ...";
for (auto& i : input_v) {
i = gen(engine);
}
for (auto& f : filter_v) {
f = gen(engine);
}
SHADOW_LOG << "after gen input and filter ...";
SHADOW_LOG << "input_v.size(): " << input_v.size();
SHADOW_LOG << "filter_v.size(): " << filter_v.size();
SHADOW_LOG << "output_v.size(): " << output_v.size();
SHADOW_LOG << "bias_v.size(): " << bias_v.size();
SHADOW_LOG << "input_dim.production(): "
<< input_dim.production();
SHADOW_LOG << "filter_dim.production(): "
<< filter_dim.production();
SHADOW_LOG << "out_dim.production(): " << out_dim.production();
SHADOW_LOG << "bias_dim.production(): " << bias_dim.production();
SHADOW_LOG << "4 * input_image_height *input_image_width: "
<< 4 * input_image_height * input_image_width;
SHADOW_LOG << "4 * filter_image_width * filter_image_height: "
<< 4 * filter_image_width * filter_image_height;
CHECK(input_dim.production() == input_v.size());
CHECK_LE(input_dim.production(),
4 * input_image_height * input_image_width);
CHECK(filter_dim.production() == filter_v.size());
CHECK_LE(filter_dim.production(),
4 * filter_image_width * filter_image_height);
paddle::lite::CLImageConverterDefault default_convertor;
SHADOW_LOG << "set mapped input ...";
std::vector<float> x_image_v(input_image_width *
input_image_height * 4); // 4 :RGBA
std::vector<float> filter_image_v(
filter_image_width * filter_image_height * 4); // 4 : RGBA
std::vector<float> bias_image_v(
bias_image_width * bias_image_height * 4); // 4 : RGBA
std::vector<float> out_image_v(out_image_width *
out_image_height * 4); // 4 :RGBA
default_convertor.NCHWToImage(
input_v.data(), x_image_v.data(), input_dim);
SHADOW_LOG << "输入: ---- ";
for (int i = 0; i < input_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << input_v[i];
}
SHADOW_LOG << "输入image : ---- ";
for (int i = 0; i < x_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << x_image_v[i];
}
SHADOW_LOG << "set mapped filter ...";
CLImageConverterFolder folder_convertor;
folder_convertor.NCHWToImage(
filter_v.data(), filter_image_v.data(), filter_dim);
SHADOW_LOG << "卷积核: ---- ";
for (int i = 0; i < filter_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << filter_v[i];
}
SHADOW_LOG << "卷积核image: ---- ";
for (int i = 0; i < filter_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << filter_image_v[i];
}
auto* input_image2d = input.mutable_data<float, cl::Image2D>(
input_image_width, input_image_height, x_image_v.data());
// assign filter as target arm
filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(),
filter_dim);
// filter kernel
// auto* filter_image2d = filter.mutable_data<float,
// cl::Image2D>(
// filter_image_width,
// filter_image_height,
// filter_image_v.data());
if (bias_flag) {
for (int i = 0; i < bias_dim.production(); ++i) {
bias_v[i] = static_cast<int>(gen(engine));
}
bias.Assign<float, lite::DDim, TARGET(kARM)>(bias_v.data(),
bias_dim);
// CLImageConverterFolder folder_convertor;
// folder_convertor.NCHWToImage(
// bias_v.data(), bias_image_v.data(),
// bias_dim);
//
// auto* bias_data = bias.mutable_data<float,
// cl::Image2D>(
// bias_image_width, bias_image_height,
// bias_image_v.data());
}
SHADOW_LOG << "resize output ...";
output.Resize(out_dim);
// cpu conv basic calc
lite::Tensor out_ref;
out_ref.Resize(out_dim);
SHADOW_LOG << "prepare kernel ready";
SHADOW_LOG << "kernel launch ...";
kernel->Launch();
SHADOW_LOG << "mutable output ...";
auto* output_image2d = output.mutable_data<float, cl::Image2D>(
out_image_width, out_image_height);
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
SHADOW_LOG << "--- Find the sync event for the target cl "
"tensor. ---";
auto& event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target "
"cl tensor.";
}
TargetWrapperCL::ImgcpySync(out_image_v.data(),
output.data<float, cl::Image2D>(),
out_image_width,
out_image_height,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
DDim out_image_shape =
default_convertor.InitImageDimInfoWith(output.dims());
default_convertor.ImageToNCHW(out_image_v.data(),
output_v.data(),
out_image_shape,
output.dims());
SHADOW_LOG << "输出: ---- ";
for (int i = 0; i < output_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << output_v[i];
}
SHADOW_LOG << "输出image: ---- ";
for (int i = 0; i < out_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << out_image_v[i];
}
SHADOW_LOG << "mutable_data out_ref_data: ";
// run cpu ref
auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
SHADOW_LOG << " conv_basic beigin ..... ";
conv_basic<float, float>(input_v.data(),
out_ref_data,
batch_size,
oc,
oh,
ow,
ic,
ih,
iw,
filter_v.data(),
bias_v.data(), // mapped_bias,
group,
ksize,
ksize,
stride,
stride,
dilation,
dilation,
pad,
pad,
bias_flag,
relu_flag);
SHADOW_LOG << " conv_basic end ..... ";
SHADOW_LOG << " out_dim: " << out_dim;
const DDim& out_image_dims = lite::DDim{std::vector<int64_t>(
{static_cast<int64_t>(out_image_width),
static_cast<int64_t>(out_image_height)})};
for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2);
if (std::abs(output_v[i] - out_ref_data[i]) > 1e-2) {
LOG(FATAL) << "error idx:" << i;
}
}
#ifdef LOOP_TEST
}
}
}
}
}
}
#else
// nothing to do.
#endif
}
#undef LOOP_TEST
#undef PRINT_RESULT
// #define LOOP_TEST
TEST(conv2d, compute_image2d_7x7) {
// conv infos
const int ksize = 7;
const int stride = 1;
const int pad = 2;
const int group = 1;
const int dilation = 1;
// int loop_cnt = 0;
#ifdef LOOP_TEST
for (int batch_size = 2; batch_size < 4; ++batch_size) {
for (int oc = 1; oc < 10; oc += 1) { // oc
for (int ih = 7; ih < 15; ih += 1) { // ih
int iw = ih;
for (int ic = 1; ic < 10; ic += 1) { // ic
for (bool bias_flag : {true, false}) {
for (std::string relu_flag : {"relu"}) {
#else
const int batch_size = 2;
const int oc = 1;
const int ih = 7;
const int iw = 7;
const int ic = 1;
const bool bias_flag = false;
const std::string relu_flag = "";
#endif
const int oh =
ConvOutputSize(ih, ksize, dilation, pad, pad, stride);
const int ow =
ConvOutputSize(iw, ksize, dilation, pad, pad, stride);
SHADOW_LOG << "to get kernel ...";
auto kernels =
KernelRegistry::Global().Create("conv2d",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
SHADOW_LOG << "created conv2d kernel";
SHADOW_LOG << "prepare kernel ------";
lite::Tensor input, filter, bias, output;
operators::ConvParam param;
param.x = &input;
param.filter = &filter;
param.output = &output;
if (bias_flag) {
param.bias = &bias;
}
if (relu_flag == "relu") {
param.fuse_relu = true;
} else if (relu_flag == "None") {
param.fuse_relu = false;
} else if (relu_flag == "relu6") {
param.activation_param.Relu_clipped_coef = 6.f;
param.activation_param.has_active = true;
param.activation_param.active_type =
lite_api::ActivationType::kRelu6;
}
std::vector<int> paddings = {pad, pad, pad, pad};
std::vector<int> dilations = {dilation, dilation};
param.paddings = std::make_shared<std::vector<int>>(paddings);
param.dilations = std::make_shared<std::vector<int>>(dilations);
param.strides = std::vector<int>{stride, stride};
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
std::unique_ptr<KernelContext> conv_context(
    new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
    &(conv_context->As<OpenCLContext>()));
kernel->SetContext(std::move(conv_context));
const DDim& input_dim =
lite::DDim{std::vector<int64_t>({batch_size, ic, ih, iw})};
const DDim& filter_dim =
lite::DDim{std::vector<int64_t>({oc, ic, ksize, ksize})};
const DDim& out_dim =
lite::DDim{std::vector<int64_t>({batch_size, oc, oh, ow})};
// element wise bias
const DDim& bias_dim = lite::DDim{std::vector<int64_t>({oc})};
param.x->Resize(input_dim);
param.filter->Resize(filter_dim);
param.output->Resize(out_dim);
if (bias_flag) {
param.bias->Resize(bias_dim);
}
kernel->SetParam(param);
size_t input_image_width = iw * ((ic + 3) / 4);
size_t input_image_height = ih * batch_size;
size_t out_image_width = ow * ((oc + 3) / 4);
size_t out_image_height = oh * batch_size;
size_t bias_image_width = ow * ((oc + 3) / 4);
size_t bias_image_height = oh * batch_size;
size_t filter_image_width = ksize * ((ic + 3) / 4);
size_t filter_image_height = oc * ksize;
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
std::default_random_engine engine;
std::uniform_real_distribution<float> gen(-5, 5);
std::vector<float> input_v(batch_size * ic * ih * iw);
std::vector<float> filter_v(oc * ic * ksize * ksize);
std::vector<float> output_v(batch_size * oc * oh * ow);
std::vector<float> bias_v(oc);
SHADOW_LOG << "gen input and filter ...";
for (auto& i : input_v) {
i = gen(engine);
// i = 1;
}
for (auto& f : filter_v) {
f = gen(engine);
// f = 1;
}
LOG(INFO) << "bias: " << bias_flag;
LOG(INFO) << "relu: " << relu_flag;
LOG(INFO) << "inputdims : " << input_dim;
LOG(INFO) << "filterdims: " << filter.dims();
LOG(INFO) << "outputdims : " << output.dims();
SHADOW_LOG << "after gen input and filter ...";
SHADOW_LOG << "input_v.size(): " << input_v.size();
SHADOW_LOG << "filter_v.size(): " << filter_v.size();
SHADOW_LOG << "output_v.size(): " << output_v.size();
SHADOW_LOG << "bias_v.size(): " << bias_v.size();
SHADOW_LOG << "input_dim.production(): "
<< input_dim.production();
SHADOW_LOG << "filter_dim.production(): "
<< filter_dim.production();
SHADOW_LOG << "out_dim.production(): " << out_dim.production();
SHADOW_LOG << "bias_dim.production(): " << bias_dim.production();
SHADOW_LOG << "4 * input_image_height * input_image_width: "
<< 4 * input_image_height * input_image_width;
SHADOW_LOG << "4 * filter_image_width * filter_image_height: "
<< 4 * filter_image_width * filter_image_height;
CHECK(input_dim.production() == input_v.size());
CHECK_LE(input_dim.production(),
4 * input_image_height * input_image_width);
CHECK(filter_dim.production() == filter_v.size());
CHECK_LE(filter_dim.production(),
4 * filter_image_width * filter_image_height);
paddle::lite::CLImageConverterDefault default_convertor;
SHADOW_LOG << "set mapped input ...";
std::vector<float> x_image_v(input_image_width *
input_image_height * 4); // 4 : RGBA
std::vector<float> filter_image_v(
filter_image_width * filter_image_height * 4); // 4 : RGBA
std::vector<float> bias_image_v(
bias_image_width * bias_image_height * 4); // 4 : RGBA
std::vector<float> out_image_v(out_image_width *
out_image_height * 4); // 4 : RGBA
default_convertor.NCHWToImage(
input_v.data(), x_image_v.data(), input_dim);
SHADOW_LOG << "输入: ---- ";
for (int i = 0; i < input_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << input_v[i];
}
SHADOW_LOG << "输入image : ---- ";
for (int i = 0; i < x_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << x_image_v[i];
}
SHADOW_LOG << "set mapped filter ...";
CLImageConverterFolder folder_convertor;
folder_convertor.NCHWToImage(
filter_v.data(), filter_image_v.data(), filter_dim);
SHADOW_LOG << "卷积核: ---- ";
for (int i = 0; i < filter_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << filter_v[i];
}
SHADOW_LOG << "卷积核image: ---- ";
for (int i = 0; i < filter_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << filter_image_v[i];
}
auto* input_image2d = input.mutable_data<float, cl::Image2D>(
input_image_width, input_image_height, x_image_v.data());
// assign filter as target arm
filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(),
filter_dim);
// auto* filter_image2d = filter.mutable_data<float,
// cl::Image2D>(
// filter_image_width,
// filter_image_height,
// filter_image_v.data());
if (bias_flag) {
for (int i = 0; i < bias_dim.production(); ++i) {
bias_v[i] = static_cast<int>(gen(engine));
}
bias.Assign<float, lite::DDim, TARGET(kARM)>(bias_v.data(),
bias_dim);
// CLImageConverterFolder folder_convertor;
// folder_convertor.NCHWToImage(
// bias_v.data(), bias_image_v.data(),
// bias_dim);
//
// auto* bias_data = bias.mutable_data<float,
// cl::Image2D>(
// bias_image_width, bias_image_height,
// bias_image_v.data());
}
SHADOW_LOG << "resize output ...";
output.Resize(out_dim);
// cpu conv basic calc
lite::Tensor out_ref;
out_ref.Resize(out_dim);
SHADOW_LOG << "prepare kernel ready";
SHADOW_LOG << "kernel launch ...";
kernel->Launch();
SHADOW_LOG << "mutable output ...";
auto* output_image2d = output.mutable_data<float, cl::Image2D>(
out_image_width, out_image_height);
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
SHADOW_LOG << "--- Find the sync event for the target cl "
"tensor. ---";
auto& event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target "
"cl tensor.";
}
TargetWrapperCL::ImgcpySync(out_image_v.data(),
output.data<float, cl::Image2D>(),
out_image_width,
out_image_height,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
DDim out_image_shape =
default_convertor.InitImageDimInfoWith(output.dims());
default_convertor.ImageToNCHW(out_image_v.data(),
output_v.data(),
out_image_shape,
output.dims());
SHADOW_LOG << "输出: ---- ";
for (int i = 0; i < output_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << output_v[i];
}
SHADOW_LOG << "输出image: ---- ";
for (int i = 0; i < out_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << out_image_v[i];
}
SHADOW_LOG << "mutable_data out_ref_data: ";
// run cpu ref
auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
SHADOW_LOG << " conv_basic beigin ..... ";
conv_basic<float, float>(input_v.data(),
out_ref_data,
batch_size,
oc,
oh,
ow,
ic,
ih,
iw,
filter_v.data(),
bias_v.data(), // mapped_bias,
group,
ksize,
ksize,
stride,
stride,
dilation,
dilation,
pad,
pad,
bias_flag,
relu_flag);
SHADOW_LOG << " conv_basic end ..... ";
SHADOW_LOG << " out_dim: " << out_dim;
const DDim& out_image_dims = lite::DDim{std::vector<int64_t>(
{static_cast<int64_t>(out_image_width),
static_cast<int64_t>(out_image_height)})};
for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2);
if (std::abs(output_v[i] - out_ref_data[i]) > 1e-2) {
LOG(FATAL) << "error idx:" << i;
}
}
#ifdef LOOP_TEST
}
}
}
}
}
}
#else
// nothing to do.
#endif
}
#undef LOOP_TEST
#undef PRINT_RESULT
#undef SHADOW_LOG
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(conv2d, kOpenCL, kFloat, kImageDefault, image2d);
...@@ -23,6 +23,8 @@ namespace lite {
namespace kernels {
namespace opencl {
/* Buffer */
#if 0
void ElementwiseAddCompute::PrepareForRun() {
  auto& context = ctx_->As<OpenCLContext>();
  context.cl_context()->AddKernel(
...@@ -92,6 +94,124 @@ void ElementwiseAddCompute::UpdateParams() {
  VLOG(4) << "channels: " << channels_;
  VLOG(4) << "num: " << num_;
}
#endif
/* Image2D */
void ElementwiseAddImageCompute::PrepareForRun() {
ele_param_ = param_.get_mutable<param_t>();
auto* x = ele_param_->X;
auto* y = ele_param_->Y;
auto axis = ele_param_->axis;
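  // Route to a kernel variant by the shape of Y: a 4-D Y does a full
  // element-by-element add, while a 1-D Y is broadcast along the width
  // (axis == rank - 1) or the channel (axis == rank - 3).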
if (y->dims().size() == 4) {
kernel_func_name_ = "elementwise_add"; // y: ImageDefault
} else if (y->dims().size() == 1) {
if (axis == x->dims().size() - 1) {
kernel_func_name_ = "width_add"; // y: ImageDefault
} else if (axis == x->dims().size() - 3) {
kernel_func_name_ = "channel_add"; // y: ImageFolder
} else {
LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
<< ", x->dims().size():" << x->dims().size()
<< ", y->dims.size():" << y->dims().size();
}
} else {
LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
<< ", x->dims().size():" << x->dims().size()
<< ", y->dims.size():" << y->dims().size();
}
VLOG(4) << "kernel_func_name_:" << kernel_func_name_;
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/elementwise_add_kernel.cl", build_options_);
}
void ElementwiseAddImageCompute::Run() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* x = ele_param_->X;
auto* y = ele_param_->Y;
auto* out = ele_param_->Out;
auto axis = ele_param_->axis;
VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "y->target():" << TargetToStr(y->target());
VLOG(4) << "out->target():" << TargetToStr(out->target());
VLOG(4) << "x->dims():" << x->dims();
VLOG(4) << "y->dims():" << y->dims();
VLOG(4) << "out->dims():" << out->dims();
VLOG(4) << "axis:" << axis;
paddle::lite::CLImageConverterDefault default_convertor;
auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims()); // w, h
auto x_img_width = x_img_shape[0];
auto x_img_height = x_img_shape[1];
auto out_img_shape =
default_convertor.InitImageDimInfoWith(out->dims()); // w, h
auto y_img_shape = default_convertor.InitImageDimInfoWith(y->dims());
auto* x_img = x->data<float, cl::Image2D>();
auto* y_img = y->data<float, cl::Image2D>();
auto* out_img =
out->mutable_data<float, cl::Image2D>(out_img_shape[0], out_img_shape[1]);
VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " "
<< out_img_shape[1];
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
auto y_dims = y->dims();
if (y_dims.size() == 4) {
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
} else if (y_dims.size() == 1) {
if (axis == x->dims().size() - 1 || axis == x->dims().size() - 3) {
int tensor_w = x->dims()[x->dims().size() - 1];
VLOG(4) << "tensor_w:" << tensor_w;
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(tensor_w));
CL_CHECK_FATAL(status);
} else {
LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
<< ", x->dims().size():" << x->dims().size()
<< ", y->dims.size():" << y->dims().size();
}
} else {
LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
<< ", x->dims().size():" << x->dims().size()
<< ", y->dims.size():" << y->dims().size();
}
auto global_work_size = cl::NDRange{static_cast<cl::size_type>(x_img_width),
static_cast<cl::size_type>(x_img_height)};
VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
}
} // namespace opencl
} // namespace kernels
...@@ -99,9 +219,36 @@ void ElementwiseAddCompute::UpdateParams() {
} // namespace paddle
namespace ocl = paddle::lite::kernels::opencl;
// REGISTER_LITE_KERNEL(
//     elementwise_add, kOpenCL, kFloat, kNCHW, ocl::ElementwiseAddCompute, def)
//     .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
//     .BindInput("Y", {LiteType::GetTensorTy(TARGET(kOpenCL))})
//     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
//     .Finalize();
// TODO(ysh329): Not fixed yet.
// "Y" may come from a constant value such as a conv bias (kARM, needs the
// cl_image_converter to run on the CPU), or from another branch like "X"
// (kOpenCL, nothing to do). The two situations need different actions when
// the pass runs (kernel picking), so set the target of "Y" to kOpenCL
// temporarily.
REGISTER_LITE_KERNEL(elementwise_add,
kOpenCL,
kFloat,
kImageDefault,
ocl::ElementwiseAddImageCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindInput("Y",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
    .Finalize();
...@@ -33,6 +33,10 @@ class ElementwiseAddCompute
  void Run() override;
  std::string doc() const override {
    return "ElementwiseAdd using cl::Buffer, kFloat";
  }
 protected:
  void UpdateParams();
...@@ -45,6 +49,28 @@ class ElementwiseAddCompute
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
class ElementwiseAddImageCompute
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ElementwiseParam;
void PrepareForRun() override;
void Run() override;
std::string doc() const override {
return "ElementwiseAdd using cl::Image2D, kFloat";
}
protected:
param_t* ele_param_{nullptr};
std::string kernel_func_name_{"elementwise_add"};
std::string build_options_{" -DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
......
...@@ -22,6 +22,19 @@
namespace paddle {
namespace lite {
template <typename dtype>
void fill_data(dtype *x, const int length, int set_value = -1) {
  if (set_value == -1) {
    // default: fill with the index value
    for (int idx = 0; idx < length; ++idx) {
      x[idx] = idx;
    }
  } else {
    for (int idx = 0; idx < length; ++idx) {
      x[idx] = set_value;
    }
  }
}
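// e.g. fill_data(buf, 8) writes 0,1,...,7 into buf, while
// fill_data(buf, 8, 0) fills buf with zeros.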
template <typename dtype>
void elementwise_compute_ref(const dtype *x_data,
                             const dtype *y_data,
...@@ -46,25 +59,17 @@ void elementwise_compute_ref(const dtype *x_data,
  for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) {
    num *= x_dims[i];
  }
VLOG(4) << "axis:" << axis;
VLOG(4) << "batch:" << batch;
VLOG(4) << "cahnnels:" << channels;
VLOG(4) << "num:" << num;
  // do elementwise add/sub/max/...
  if (elt_type == "add" && axis == 1 && y_dims.size() == 1) {
    for (int i = 0; i < x_dims.production(); ++i) {
      auto w = i % y_dims.production();
      out_data[i] = x_data[i] + y_data[w];
    }
  } else if (elt_type == "add") {
    for (int i = 0; i < batch; ++i) {
      for (int j = 0; j < channels; ++j) {
        int offset = (i * channels + j) * num;
...@@ -72,7 +77,7 @@ void elementwise_compute_ref(const dtype *x_data,
        const dtype diny_data = y_data[j];
        dtype *dout_ptr = out_data + offset;
        for (int k = 0; k < num; ++k) {
          *dout_ptr = *din_ptr + diny_data;
          if (use_relu) {
            *dout_ptr = std::max(*dout_ptr, static_cast<dtype>(0));
          }
...@@ -86,7 +91,9 @@ void elementwise_compute_ref(const dtype *x_data,
  }
}
// buffer
#if 0
TEST(elementwise_add_buffer, compute) {
LOG(INFO) << "to get kernel ..."; LOG(INFO) << "to get kernel ...";
auto kernels = KernelRegistry::Global().Create( auto kernels = KernelRegistry::Global().Create(
"elementwise_add", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)); "elementwise_add", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
...@@ -163,7 +170,7 @@ TEST(elementwise_add, compute) {
  TargetWrapperCL::Unmap(out_data, mapped_out);
}
TEST(fusion_elementwise_add_activation_buffer, compute) {
LOG(INFO) << "to get kernel ..."; LOG(INFO) << "to get kernel ...";
auto kernels = auto kernels =
KernelRegistry::Global().Create("fusion_elementwise_add_activation", KernelRegistry::Global().Create("fusion_elementwise_add_activation",
...@@ -243,9 +250,204 @@ TEST(fusion_elementwise_add_activation, compute) {
  }
  TargetWrapperCL::Unmap(out_data, mapped_out);
}
#endif
// image
TEST(elementwise_add_image2d_fp32, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img on cpu) -> "
"elementwise_add(img) -> "
"layout(img2buf on cpu) "
"-> host";
  // Routing strategy across elementwise_add's 3 image kernels:
  // --------------------------------------------------------
  // 1. elementwise_add: needs y_dim.size() == 4
  //    (also picked by the fuse_elementwise_activation op, which further
  //     needs act_type == "relu")
  // 2. width_add:   needs y_dim.size() == 1 && x_dim.size() == 4 && axis == 3
  // 3. channel_add: needs y_dim.size() == 1 && x_dim.size() == 4 && axis == 1
// dims
const int n = 1;
const int c = 3;
const int h = 2;
const int w = 2;
const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
auto out_dim = x_dim;
// y_dim / axis / relu_flag
std::vector<DDim> y_dim_v{DDim(std::vector<DDim::value_type>{n, c, h, w}),
DDim(std::vector<DDim::value_type>{n, c, h, w}),
DDim(std::vector<DDim::value_type>{w}),
DDim(std::vector<DDim::value_type>{w})};
std::vector<int> axis_v{-1, -1, 3, 1};
std::vector<bool> relu_flag_v{false, true, false, false};
  CHECK(y_dim_v.size() == axis_v.size() && axis_v.size() == relu_flag_v.size())
      << "y_dim_v, axis_v and relu_flag_v must have the same size and "
         "correspond to each other element by element";
// start loop
for (size_t case_idx = 0; case_idx < y_dim_v.size(); ++case_idx) {
auto y_dim = y_dim_v[case_idx];
auto axis = axis_v[case_idx];
auto relu_flag = relu_flag_v[case_idx];
LOG(INFO) << "================== elementwise_add, case_idx:" << case_idx + 1
<< "/" << y_dim_v.size() << " ===================";
LOG(INFO) << "x_dim:" << x_dim;
LOG(INFO) << "y_dim:" << y_dim;
LOG(INFO) << "out_dim:" << out_dim;
LOG(INFO) << "axis:" << axis;
LOG(INFO) << "relu_flag:" << relu_flag;
// tensor
VLOG(4) << "set tensors about op param";
lite::Tensor eleadd_x, eleadd_y, eleadd_out;
eleadd_x.Resize(x_dim);
eleadd_y.Resize(y_dim);
eleadd_out.Resize(out_dim);
// initialize tensors
VLOG(4) << "initialize tensors";
paddle::lite::CLImageConverterDefault default_convertor;
// x
std::vector<float> x_v(x_dim.production());
fill_data<float>(x_v.data(), x_v.size()); // fill with index value
auto x_img_shape = default_convertor.InitImageDimInfoWith(x_dim); // w, h
auto x_img_w = x_img_shape[0];
auto x_img_h = x_img_shape[1];
std::vector<float> x_img_v(x_img_w * x_img_h * 4); // 4: RGBA
default_convertor.NCHWToImage(x_v.data(), x_img_v.data(), x_dim);
eleadd_x.mutable_data<float, cl::Image2D>(x_img_w, x_img_h, x_img_v.data());
// y
std::vector<float> y_v(y_dim.production());
fill_data<float>(y_v.data(), y_v.size()); // fill with index value
auto y_img_shape = default_convertor.InitImageDimInfoWith(y_dim); // w, h
auto y_img_w = y_img_shape[0];
auto y_img_h = y_img_shape[1];
std::vector<float> y_img_v(y_img_shape[0] * y_img_shape[1] * 4); // 4: RGBA
default_convertor.NCHWToImage(y_v.data(), y_img_v.data(), y_dim);
eleadd_y.mutable_data<float, cl::Image2D>(y_img_w, y_img_h, y_img_v.data());
// out
auto out_img_shape =
default_convertor.InitImageDimInfoWith(out_dim); // w, h
auto out_img_w = out_img_shape[0];
auto out_img_h = out_img_shape[1];
eleadd_out.mutable_data<float, cl::Image2D>(out_img_w, out_img_h);
std::vector<float> out_img_v(out_img_w * out_img_h * 4);
fill_data<float>(
out_img_v.data(), out_img_v.size(), 0); // fill with zero value
std::vector<float> out_v(out_dim.production());
// operator param
operators::FusionElementwiseActivationParam
fuseEleaddParam; // enabled if relu_flag is true
fuseEleaddParam.X = &eleadd_x;
fuseEleaddParam.Y = &eleadd_y;
fuseEleaddParam.Out = &eleadd_out;
fuseEleaddParam.axis = axis;
fuseEleaddParam.act_type = relu_flag ? "relu" : "";
operators::ElementwiseParam eleaddParam;
eleaddParam.X = &eleadd_x;
eleaddParam.Y = &eleadd_y;
eleaddParam.Out = &eleadd_out;
eleaddParam.axis = axis;
auto op_param = relu_flag ? fuseEleaddParam : eleaddParam;
// set kernel
auto eleadd_img_kernels =
KernelRegistry::Global().Create("elementwise_add",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(eleadd_img_kernels.empty());
auto eleadd_img_kernel = std::move(eleadd_img_kernels.front());
VLOG(4) << "get eleadd kernel: " << eleadd_img_kernel->doc();
// set context and kernel args
VLOG(4) << "set context and kernel args";
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
eleadd_img_kernel->SetParam(op_param);
std::unique_ptr<KernelContext> eleadd_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(eleadd_img_context->As<OpenCLContext>()));
eleadd_img_kernel->SetContext(std::move(eleadd_img_context));
// run kernel
VLOG(4) << "run kernel";
eleadd_img_kernel->Launch();
// download gpu result to cpu
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
TargetWrapperCL::ImgcpySync(out_img_v.data(),
eleadd_out.data<float, cl::Image2D>(),
out_img_w,
out_img_h,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
default_convertor.ImageToNCHW(
out_img_v.data(), out_v.data(), out_img_shape, out_dim);
// compute cpu reference
std::unique_ptr<float[]> out_ref(new float[out_dim.production()]);
elementwise_compute_ref<float>(x_v.data(),
y_v.data(),
out_ref.get(),
x_dim,
y_dim,
op_param.axis,
"add",
relu_flag);
#if 0 // enable to check value of x and y
for (int eidx = 0; eidx < out_dim.production(); eidx++) {
auto value = out_v[eidx];
auto ref_value = out_ref.get()[eidx];
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx << " / "
<< out_dim.production() << ", x_v[" << eidx << "]:"
<< x_v[eidx] << ", value[" << eidx << "]:" << value
<< ", ref_value[" << eidx << "]:" << ref_value;
}
for (int i = 0; i < y_v.size(); i++) {
LOG(INFO) << "y_v[" << i << "]:" << y_v[i];
}
#endif
for (int eidx = 0; eidx < out_dim.production(); eidx++) {
auto value = out_v[eidx];
auto ref_value = out_ref.get()[eidx];
EXPECT_NEAR(value, ref_value, 1e-6);
      if (std::abs(value - ref_value) > 1e-6) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx << " / "
<< out_dim.production() << ", value[" << eidx << "]:" << value
<< ", ref_value[" << eidx << "]:" << ref_value;
break;
}
}
}
}
} // namespace lite
} // namespace paddle
// USE_LITE_KERNEL(elementwise_add, kOpenCL, kFloat, kNCHW, def);
// USE_LITE_KERNEL(fusion_elementwise_add_activation, kOpenCL, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_add, kOpenCL, kFloat, kImageDefault, def);
USE_LITE_KERNEL(
fusion_elementwise_add_activation, kOpenCL, kFloat, kImageDefault, def);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/opencl/elementwise_mul_compute.h"
#include <memory>
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
void ElementwiseMulFloatImageCompute::PrepareForRun() {
ele_param_ = param_.get_mutable<param_t>();
auto* y = ele_param_->Y;
auto y_dims = y->dims();
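  // Same routing idea as elementwise_add: pick the kernel variant from the
  // rank of Y (full elementwise when the shapes match, otherwise a
  // 1-D / 2-D / 4-D channel broadcast via channel_mul_d1/d2/d4).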
if (y_dims == ele_param_->X->dims()) {
kernel_func_name_ = "elementwise_mul";
} else if (y_dims.size() == 1) {
kernel_func_name_ = "channel_mul_d1";
} else if (y_dims.size() == 2) {
kernel_func_name_ = "channel_mul_d2";
} else if (y_dims.size() == 4) {
kernel_func_name_ = "channel_mul_d4";
} else {
LOG(FATAL) << "ElementwiseMul not supported y_dims.size():" << y_dims.size()
<< ", x_dims.size():" << ele_param_->X->dims().size();
}
VLOG(4) << "kernel_func_name_:" << kernel_func_name_;
VLOG(4) << "y_dims:" << y_dims;
VLOG(4) << "y_dims.size():" << y_dims.size();
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/elementwise_mul_kernel.cl", build_options_);
}
void ElementwiseMulFloatImageCompute::Run() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* x = ele_param_->X;
auto* y = ele_param_->Y;
auto* out = ele_param_->Out;
VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "y->target():" << TargetToStr(y->target());
VLOG(4) << "out->target():" << TargetToStr(out->target());
VLOG(4) << "x->dims():" << x->dims();
VLOG(4) << "y->dims():" << y->dims();
VLOG(4) << "out->dims():" << out->dims();
paddle::lite::CLImageConverterDefault default_convertor;
auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims()); // w, h
auto x_img_width = x_img_shape[0];
auto x_img_height = x_img_shape[1];
auto out_img_shape =
default_convertor.InitImageDimInfoWith(out->dims()); // w, h
auto y_img_shape = default_convertor.InitImageDimInfoWith(y->dims());
auto* x_img = x->data<float, cl::Image2D>();
auto* y_img = y->data<float, cl::Image2D>();
auto* out_img =
out->mutable_data<float, cl::Image2D>(out_img_shape[0], out_img_shape[1]);
VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " "
<< out_img_shape[1];
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
auto y_dims = y->dims();
if (y_dims == ele_param_->X->dims()) {
// kernel: elementwise_mul(channel_mul_d4)
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
} else if (y_dims.size() == 1 || y_dims.size() == 4) {
auto tensor_w = x->dims()[x->dims().size() - 1];
VLOG(4) << "tensor_w:" << tensor_w;
// kernel: channel_mul_d1 / channel_mul_d4
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(tensor_w));
CL_CHECK_FATAL(status);
} else if (y_dims.size() == 2) {
auto y_tensor_h = y->dims()[0];
auto y_tensor_w = y->dims()[1];
VLOG(4) << "y_tensor_w:" << y_tensor_w << " y_tensor_h:" << y_tensor_h;
// kernel: channel_mul_d2
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(y_tensor_w));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(y_tensor_h));
CL_CHECK_FATAL(status);
} else {
LOG(FATAL) << "ElementwiseMul not supported y_dims.size():"
<< y_dims.size();
}
auto global_work_size = cl::NDRange{static_cast<cl::size_type>(x_img_width),
static_cast<cl::size_type>(x_img_height)};
auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
}
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
namespace ocl = paddle::lite::kernels::opencl;
REGISTER_LITE_KERNEL(elementwise_mul,
kOpenCL,
kFloat,
kImageDefault,
ocl::ElementwiseMulFloatImageCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindInput("Y",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/core/kernel.h"
#include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
class ElementwiseMulFloatImageCompute
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ElementwiseParam;
std::string doc() const override {
return "ElementwiseMul using cl::Image2D(ImageDefault/RGBA), kFP32";
}
void PrepareForRun() override;
void Run() override;
protected:
param_t* ele_param_{nullptr};
std::string kernel_func_name_{"elementwise_mul"};
std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <algorithm>
#include <random>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
template <typename dtype>
void fill_data(dtype *x, const int length, int set_value = -1) {
  if (set_value == -1) {
    // fill with index values
    for (int idx = 0; idx < length; ++idx) {
      x[idx] = idx;
    }
  } else {
    // fill with the given constant
    for (int idx = 0; idx < length; ++idx) {
      x[idx] = set_value;
    }
  }
}
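// Host-side reference implementation. It covers three broadcast layouts:
// y with the same shape as x, a rank-1/rank-2 y broadcast over the trailing
// axes, and a rank-4 per-channel y such as [1, C, 1, 1].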
template <typename dtype>
void elementwise_compute_ref(const dtype *x_data,
const dtype *y_data,
dtype *out_data,
const DDim &x_dims,
const DDim &y_dims,
int axis,
const std::string elt_type,
bool use_relu = false) {
if (axis < 0) {
axis = x_dims.size() - y_dims.size();
}
int batch = 1;
int channels = 1;
int num = 1;
for (int i = 0; i < axis; ++i) {
batch *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
channels *= y_dims[i];
}
for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) {
num *= x_dims[i];
}
if (x_dims == y_dims || y_dims.size() == 2 || y_dims.size() == 1) {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const dtype *din_ptr = x_data + offset;
const dtype diny_data = y_data[j];
dtype *dout_ptr = out_data + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *din_ptr * diny_data;
if (use_relu) {
*dout_ptr = std::max(*dout_ptr, static_cast<dtype>(0));
}
dout_ptr++;
din_ptr++;
}
}
}
} else if (y_dims.size() == 4) {
// eg: x_dims: [1, 3, 2, 2]
// y_dims: [1, 3, 1, 1]
ASSERT_EQ(y_dims[2], y_dims[3]);
ASSERT_EQ(y_dims[2], 1);
ASSERT_EQ(y_dims[0], 1);
auto y_offset = y_dims.production();
auto x_offset = x_dims.production() / y_offset;
for (auto x = 0; x < x_dims.production(); ++x) {
auto y = x / x_offset;
out_data[x] = x_data[x] * y_data[y];
}
} else {
LOG(FATAL) << "unsupported Elementwise type: " << elt_type << std::endl;
}
}
// #define PRINT_RESULT
TEST(elemul_image2d_fp32, compute_kernel_elementwise_mul) {
LOG(INFO)
<< "main steps of test: host -> layout(buf2img on cpu) -> elemul(img) -> "
"layout(img2buf on cpu) "
"-> host";
// dims
const int n = 1;
const int c = 3;
const int h = 2;
const int w = 2;
const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
auto out_dim = x_dim;
std::vector<DDim> y_dim_v{DDim(std::vector<DDim::value_type>{n, c, 1, 1}),
DDim(std::vector<DDim::value_type>{n, c, h, w}),
DDim(std::vector<DDim::value_type>{h, w}),
DDim(std::vector<DDim::value_type>{w})};
for (auto y_dim : y_dim_v) {
LOG(INFO) << "================== elementwise_mul ===================";
LOG(INFO) << "x_dim:" << x_dim << "\ty_dim:" << y_dim
<< "\tout_dim:" << out_dim;
// tensor
LOG(INFO) << "set tensors about op param";
lite::Tensor elemul_x, elemul_y, elemul_out;
elemul_x.Resize(x_dim);
elemul_y.Resize(y_dim);
elemul_out.Resize(out_dim);
// initialize tensors
VLOG(4) << "initialize tensors";
paddle::lite::CLImageConverterDefault default_convertor;
// x
std::vector<float> x_v(x_dim.production());
fill_data<float>(x_v.data(), x_v.size()); // fill with index value
auto x_img_shape = default_convertor.InitImageDimInfoWith(x_dim); // w, h
auto x_img_w = x_img_shape[0];
auto x_img_h = x_img_shape[1];
std::vector<float> x_img_v(x_img_w * x_img_h * 4); // 4: RGBA
default_convertor.NCHWToImage(x_v.data(), x_img_v.data(), x_dim);
elemul_x.mutable_data<float, cl::Image2D>(x_img_w, x_img_h, x_img_v.data());
// y
std::vector<float> y_v(y_dim.production());
fill_data<float>(y_v.data(), y_v.size()); // fill with index value
auto y_img_shape = default_convertor.InitImageDimInfoWith(y_dim); // w, h
auto y_img_w = y_img_shape[0];
auto y_img_h = y_img_shape[1];
std::vector<float> y_img_v(y_img_shape[0] * y_img_shape[1] * 4); // 4: RGBA
default_convertor.NCHWToImage(y_v.data(), y_img_v.data(), y_dim);
elemul_y.mutable_data<float, cl::Image2D>(y_img_w, y_img_h, y_img_v.data());
// out
auto out_img_shape =
default_convertor.InitImageDimInfoWith(out_dim); // w, h
auto out_img_w = out_img_shape[0];
auto out_img_h = out_img_shape[1];
elemul_out.mutable_data<float, cl::Image2D>(out_img_w, out_img_h);
std::vector<float> out_img_v(out_img_w * out_img_h * 4);
fill_data<float>(
out_img_v.data(), out_img_v.size(), 0); // fill with zero value
std::vector<float> out_v(out_dim.production());
// operator param
operators::ElementwiseParam elemulParam;
elemulParam.X = &elemul_x;
elemulParam.Y = &elemul_y;
elemulParam.Out = &elemul_out;
elemulParam.axis = -1;
// set kernel
auto elemul_img_kernels =
KernelRegistry::Global().Create("elementwise_mul",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(elemul_img_kernels.empty());
auto elemul_img_kernel = std::move(elemul_img_kernels.front());
VLOG(4) << "get elemul kernel: " << elemul_img_kernel->doc();
// set context and kernel args
VLOG(4) << "set context and kernel args";
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
elemul_img_kernel->SetParam(elemulParam);
std::unique_ptr<KernelContext> elemul_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(elemul_img_context->As<OpenCLContext>()));
elemul_img_kernel->SetContext(std::move(elemul_img_context));
// run kernel
VLOG(4) << "run kernel";
elemul_img_kernel->Launch();
// download gpu result to cpu
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
TargetWrapperCL::ImgcpySync(out_img_v.data(),
elemul_out.data<float, cl::Image2D>(),
out_img_w,
out_img_h,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
default_convertor.ImageToNCHW(
out_img_v.data(), out_v.data(), out_img_shape, out_dim);
// compute cpu reference
std::unique_ptr<float[]> out_ref(new float[out_dim.production()]);
elementwise_compute_ref<float>(x_v.data(),
y_v.data(),
out_ref.get(),
x_dim,
y_dim,
elemulParam.axis,
"mul");
#if 0 // enable to check value of x and y
for (int eidx = 0; eidx < out_dim.production(); eidx++) {
auto value = out_v[eidx];
auto ref_value = out_ref.get()[eidx];
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx << " / "
<< out_dim.production() << ", x_v[" << eidx << "]:"
<< x_v[eidx] << ", value[" << eidx << "]:" << value
<< ", ref_value[" << eidx << "]:" << ref_value;
}
for (int i = 0; i < y_v.size(); i++) {
LOG(INFO) << "y_v[" << i << "]:" << y_v[i];
}
#endif
for (int eidx = 0; eidx < out_dim.production(); eidx++) {
auto value = out_v[eidx];
auto ref_value = out_ref.get()[eidx];
EXPECT_NEAR(value, ref_value, 1e-6);
    if (std::fabs(value - ref_value) > 1e-6) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx << " / "
<< out_dim.production() << ", value[" << eidx << "]:" << value
<< ", ref_value[" << eidx << "]:" << ref_value;
break;
}
}
}
}
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(elementwise_mul, kOpenCL, kFloat, kImageDefault, def);
...@@ -20,6 +20,9 @@ namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
/* Buffer */
#if 0
class FusionElementwiseAddActivationCompute : public ElementwiseAddCompute {
 public:
  using param_t = operators::FusionElementwiseActivationParam;
...@@ -38,19 +41,60 @@ class FusionElementwiseAddActivationCompute : public ElementwiseAddCompute {
    }
  }
};
#endif
class FusionElementwiseAddActivationImageCompute
: public ElementwiseAddImageCompute {
public:
using param_t = operators::FusionElementwiseActivationParam;
void PrepareForRun() override {
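    // Appending -DRELU makes the elementwise_add kernel apply ReLU to its
    // result, fusing the activation into the add instead of launching a
    // separate activation kernel.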
build_options_ += " -DRELU";
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/elementwise_add_kernel.cl", build_options_);
ele_param_ = param_.get_mutable<param_t>();
auto act_t = static_cast<param_t*>(ele_param_)->act_type;
VLOG(4) << "act: " << act_t;
if (act_t != "relu") {
LOG(FATAL) << "Unsupported Activation type: " << act_t;
}
}
};
}  // namespace opencl
}  // namespace kernels
}  // namespace lite
}  // namespace paddle
namespace ocl = paddle::lite::kernels::opencl;
// REGISTER_LITE_KERNEL(fusion_elementwise_add_activation,
// kOpenCL,
// kFloat,
// kNCHW,
// ocl::FusionElementwiseAddActivationCompute,
// def)
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindInput("Y", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .Finalize();
REGISTER_LITE_KERNEL(fusion_elementwise_add_activation,
                     kOpenCL,
                     kFloat,
-                    kNCHW,
-                    ocl::FusionElementwiseAddActivationCompute,
+                    kImageDefault,
+                    ocl::FusionElementwiseAddActivationImageCompute,
                     def)
-    .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
-    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kOpenCL))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFloat),
+                                       DATALAYOUT(kImageDefault))})
    .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
class NearestInterpComputeFloatImageDefault
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::InterpolateParam;
std::string doc() const override {
return "NearestInterp using cl::Image2D(ImageDefault/RGBA), kFloat";
}
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/nearest_interp_kernel.cl", build_options_);
}
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims();
auto* x_buf = param.X->data<float, cl::Image2D>();
auto* out_buf =
param.Out->mutable_data<float, cl::Image2D>(param.out_w, param.out_h);
    const auto& y_dims = param.Out->dims();  // used to compute scales and dims
    float scale_h = static_cast<float>(y_dims[2]) / x_dims[2];
    float scale_w = static_cast<float>(y_dims[3]) / x_dims[3];
int in_dims_h = x_dims[2];
int out_dims_h = y_dims[2];
int in_dims_w = x_dims[3];
int out_dims_w = y_dims[3];
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const float>(scale_h));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const float>(scale_w));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims_h));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims_h));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims_w));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims_w));
CL_CHECK_FATAL(status);
paddle::lite::CLImageConverterDefault default_convertor;
auto y_img_shape = default_convertor.InitImageDimInfoWith(y_dims); // w, h
auto y_img_width = y_img_shape[0];
LOG(INFO) << "y_img_width:" << y_img_width;
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(y_img_width / y_dims[3]),
static_cast<cl::size_type>(y_dims[3]),
static_cast<cl::size_type>(y_dims[0] * y_dims[2])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
// context.cl_wait_list()->emplace(out_buf, event_);
context.cl_context()->GetCommandQueue().finish();
}
private:
std::string kernel_func_name_{"nearest_interp"};
std::string build_options_{"-DCL_DTYPE_float "};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
class NearestInterpComputeFP16ImageDefault
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::InterpolateParam;
std::string doc() const override {
return "NearestInterp using cl::Image2D(ImageDefault/RGBA), kFP16";
}
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/nearest_interp_kernel.cl", build_options_);
}
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims();
    // int16_t is used to represent half-precision (FP16) values.
    auto* x_buf = param.X->data<int16_t, cl::Image2D>();
    auto image_shape = InitImageDimInfoWith(x_dims);
    auto* out_buf = param.Out->mutable_data<int16_t, cl::Image2D>(
        image_shape["width"], image_shape["height"]);
    const auto& y_dims = param.Out->dims();  // used to compute scales and dims
    float scale_h = static_cast<float>(y_dims[2]) / x_dims[2];
    float scale_w = static_cast<float>(y_dims[3]) / x_dims[3];
int in_dims_h = x_dims[2];
int out_dims_h = y_dims[2];
int in_dims_w = x_dims[3];
int out_dims_w = y_dims[3];
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const float>(scale_h));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const float>(scale_w));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims_h));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims_h));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims_w));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims_w));
CL_CHECK_FATAL(status);
VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
static_cast<cl::size_type>(image_shape["height"])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
// context.cl_wait_list()->emplace(out_buf, event_);
context.cl_context()->GetCommandQueue().finish();
}
private:
std::string kernel_func_name_{"nearest_interp"};
std::string build_options_{"-DCL_DTYPE_half"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
nearest_interp,
kOpenCL,
kFloat,
kImageDefault,
paddle::lite::kernels::opencl::NearestInterpComputeFloatImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize();
REGISTER_LITE_KERNEL(
nearest_interp,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::NearestInterpComputeFP16ImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <cmath>
#include <random>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
namespace paddle {
namespace lite {
template <typename dtype>
void nearest_interp_compute_ref(const dtype *src,
int w_in,
int h_in,
dtype *dst,
int w_out,
int h_out,
float scale_x,
float scale_y,
bool with_align = false) {
float scale_w_new = (with_align)
? (static_cast<float>(w_in - 1) / (w_out - 1))
: (static_cast<float>(w_in) / (w_out));
float scale_h_new = (with_align)
? (static_cast<float>(h_in - 1) / (h_out - 1))
: (static_cast<float>(h_in) / (h_out));
if (with_align) {
for (int h = 0; h < h_out; ++h) {
dtype *dst_p = dst + h * w_out;
int near_y = static_cast<int>(scale_h_new * h + 0.5);
for (int w = 0; w < w_out; ++w) {
int near_x = static_cast<int>(scale_w_new * w + 0.5);
*dst_p++ = src[near_y * w_in + near_x];
}
}
} else {
for (int h = 0; h < h_out; ++h) {
dtype *dst_p = dst + h * w_out;
int near_y = static_cast<int>(scale_h_new * h);
for (int w = 0; w < w_out; ++w) {
int near_x = static_cast<int>(scale_w_new * w);
*dst_p++ = src[near_y * w_in + near_x];
}
}
}
}
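// e.g. with_align = false and a 3x4 -> 6x8 upscale: scale_h_new = 0.5 and
// scale_w_new = 0.5, so dst(h=1, w=3) samples src(0, 1).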
// #define LOOP_TEST
// #define PRINT_RESULT
TEST(nearest_interp_image2d_fp32, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> "
"nearest_interp(img) -> "
"layout(img2buf) "
"-> host";
#ifdef LOOP_TEST
for (int n : {1, 3}) {
for (auto c : {1, 3}) {
for (int h : {12, 20, 50, 112}) {
for (int w : {12, 20, 50, 112}) {
for (int out_h : {36, 60, 90, 224}) {
for (int out_w : {36, 60, 90, 224}) {
if (out_w < w || out_h < h) {
continue;
}
#else
const int n = 1;
const int c = 2;
const int h = 3;
const int w = 4;
const int out_h = 6;
const int out_w = 8;
#endif // LOOP_TEST
  float scale_x = static_cast<float>(out_w) / w;
  float scale_y = static_cast<float>(out_h) / h;
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c
<< " " << h << " " << w << " ========" << out_h << " "
<< out_w;
// set layout kernels
auto buf_to_img_kernels =
KernelRegistry::Global().Create("layout",
TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageDefault));
auto img_to_buf_kernels =
KernelRegistry::Global().Create("layout",
TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kNCHW));
auto nearest_interp_img_kernels =
KernelRegistry::Global().Create("nearest_interp",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(buf_to_img_kernels.empty());
                ASSERT_FALSE(img_to_buf_kernels.empty());
ASSERT_FALSE(nearest_interp_img_kernels.empty());
auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
auto nearest_interp_img_kernel =
std::move(nearest_interp_img_kernels.front());
LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
LOG(INFO) << "get 3rd kernel: "
<< nearest_interp_img_kernel->doc();
// set tensors about op param
LOG(INFO) << "set tensors about op param";
// layout(buf->img): x -> nearest_interp_in
// nearest_interp(img): nearest_interp_in -> nearest_interp_out
// layout(img->buf): nearest_interp_out -> y
lite::Tensor x, y, nearest_interp_in, nearest_interp_out, y_ref;
operators::LayoutParam BufferToImageParam;
operators::LayoutParam ImageToBufferParam;
BufferToImageParam.x = &x;
BufferToImageParam.y = &nearest_interp_in;
ImageToBufferParam.x = &nearest_interp_out;
ImageToBufferParam.y = &y;
operators::InterpolateParam NearestInterpParam;
NearestInterpParam.X = &nearest_interp_in;
NearestInterpParam.Out = &nearest_interp_out;
NearestInterpParam.out_h = out_h;
NearestInterpParam.out_w = out_w;
const DDim x_dim =
DDim(std::vector<DDim::value_type>{n, c, h, w});
const DDim y_dim =
DDim(std::vector<DDim::value_type>{n, c, out_h, out_w});
x.Resize(x_dim);
y.Resize(y_dim);
nearest_interp_in.Resize(x_dim);
nearest_interp_out.Resize(y_dim);
y_ref.Resize(y_dim);
auto nearest_interp_image2d_shape =
paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);
// initialize tensors
LOG(INFO) << "initialize tensors";
auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *y_data_ref = y_ref.mutable_data<float>(TARGET(kARM));
auto *mapped_x = static_cast<float *>(TargetWrapperCL::Map(
x_data, 0, sizeof(float) * x_dim.production()));
auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
y_data, 0, sizeof(float) * y_dim.production()));
for (int i = 0; i < x_dim.production(); ++i) {
mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2;
}
for (int i = 0; i < y_dim.production(); ++i) {
mapped_y[i] = static_cast<int>(0);
}
auto *nearest_interp_in_data =
nearest_interp_in.mutable_data<float, cl::Image2D>(
nearest_interp_image2d_shape["width"],
nearest_interp_image2d_shape["height"]);
auto *nearest_interp_out_data =
nearest_interp_out.mutable_data<float, cl::Image2D>(y_dim[3],
y_dim[2]);
// set context and kernel args
LOG(INFO) << "set context and kernel args";
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
buf_to_img_kernel->SetParam(BufferToImageParam);
std::unique_ptr<KernelContext> buf_to_img_context(
new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(buf_to_img_context->As<OpenCLContext>()));
buf_to_img_kernel->SetContext(std::move(buf_to_img_context));
img_to_buf_kernel->SetParam(ImageToBufferParam);
std::unique_ptr<KernelContext> img_to_buf_context(
new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(img_to_buf_context->As<OpenCLContext>()));
img_to_buf_kernel->SetContext(std::move(img_to_buf_context));
nearest_interp_img_kernel->SetParam(NearestInterpParam);
std::unique_ptr<KernelContext> nearest_interp_img_context(
new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(nearest_interp_img_context->As<OpenCLContext>()));
nearest_interp_img_kernel->SetContext(
std::move(nearest_interp_img_context));
// run kernels
LOG(INFO) << "run kernel: buf_to_img_kernel";
buf_to_img_kernel->Launch();
LOG(INFO) << "run kernel: nearest_interp_img_kernel";
nearest_interp_img_kernel->Launch();
LOG(INFO) << "run kernel: img_to_buf_kernel";
img_to_buf_kernel->Launch();
// compute ref cpu
for (int nid = 0; nid < x_dim[0]; ++nid) {
for (int cid = 0; cid < x_dim[1]; ++cid) {
float *x_nc =
mapped_x + (nid * x_dim[1] + cid) * x_dim[3] * x_dim[2];
float *y_nc =
y_data_ref + (nid * x_dim[1] + cid) * y_dim[3] * y_dim[2];
nearest_interp_compute_ref<float>(x_nc,
x_dim[3],
x_dim[2],
y_nc,
y_dim[3],
y_dim[2],
1 / scale_x,
1 / scale_y);
}
}
// result
#ifdef PRINT_RESULT
LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " ";
}
std::cout << std::endl;
for (int eidx = 0; eidx < y_dim.production(); ++eidx) {
std::cout << mapped_y[eidx] << " ";
}
std::cout << std::endl;
for (int eidx = 0; eidx < y_dim.production(); ++eidx) {
std::cout << y_data_ref[eidx] << " ";
}
std::cout << std::endl;
#endif // PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < y_dim.production(); eidx++) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6);
                  if (std::fabs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) {
LOG(FATAL) << "1st diff in this case at eidx[from 0]:" << eidx
<< " / " << x_dim.production() << ", y_data_ref["
<< eidx << "]:" << y_data_ref[eidx]
<< ", mapped_y[" << eidx << "]:" << mapped_y[eidx];
break;
}
}
// free
LOG(INFO) << "free: unmap x, y";
TargetWrapperCL::Unmap(x_data, mapped_x);
TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef LOOP_TEST
}
}
} // w
} // h
} // c
} // n
#else
// nothing to do.
#endif
}
} // namespace lite
} // namespace paddle
// nearest_interp buffer
// USE_LITE_KERNEL(nearest_interp, kOpenCL, kFloat, kNCHW, def);
// nearest_interp image2d fp32
USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
USE_LITE_KERNEL(nearest_interp, kOpenCL, kFloat, kImageDefault, ImageDefault);
// nearest_interp image2d fp16
USE_LITE_KERNEL(nearest_interp, kOpenCL, kFP16, kImageDefault, ImageDefault);
...@@ -100,3 +100,4 @@ lite_cc_test(test_sequence_concat_compute_x86 SRCS sequence_concat_compute_test.
lite_cc_test(test_var_conv_2d_compute_x86 SRCS var_conv_2d_compute_test.cc DEPS var_conv_2d_compute_x86)
#lite_cc_test(test_attention_padding_mask_compute_x86 SRCS attention_padding_mask_compute_test.cc DEPS attention_padding_mask_compute_x86)
lite_cc_test(test_sequence_arithmetic_compute_x86 SRCS sequence_arithmetic_compute_test.cc DEPS sequence_arithmetic_compute_x86)
+lite_cc_test(test_leaky_relu_compute_x86 SRCS leaky_relu_compute_test.cc DEPS activation_compute_x86)
...@@ -36,6 +36,17 @@ REGISTER_LITE_KERNEL(relu,
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
    .Finalize();
+// float
+REGISTER_LITE_KERNEL(leaky_relu,
+                     kX86,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::x86::LeakyReluCompute<float>,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
+    .Finalize();
// float
REGISTER_LITE_KERNEL(tanh,
                     kX86,
...
...@@ -117,6 +117,40 @@ class ReluCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
  virtual ~ReluCompute() = default;
};
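// leaky_relu(x) = x > 0 ? x : alpha * x, computed below as max(x, alpha * x)
// (the two forms agree whenever alpha < 1)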
template <typename T>
struct LeakyReluFunctor {
float alpha;
explicit LeakyReluFunctor(float alpha_) : alpha(alpha_) {}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
}
};
template <typename T>
class LeakyReluCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
void Run() override {
auto& param = *param_.get_mutable<operators::ActivationParam>();
param.Out->template mutable_data<T>();
auto X = param.X;
auto Out = param.Out;
auto place = lite::fluid::EigenDeviceType<TARGET(kX86)>();
CHECK(X);
CHECK(Out);
auto x = lite::fluid::EigenVector<T>::Flatten(*X);
auto out = lite::fluid::EigenVector<T>::Flatten(*Out);
LeakyReluFunctor<T> functor(param.Leaky_relu_alpha);
functor(place, x, out);
}
virtual ~LeakyReluCompute() = default;
};
// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T>
struct TanhFunctor : public BaseActivationFunctor<T> {
...
...@@ -131,7 +131,6 @@ class FcCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
    auto* w = param.w;
    auto* bias = param.bias;
    auto* output = param.output;
-   int in_num_col_dims = param.in_num_col_dims;
    bool with_relu = (param.activation_type == "relu") ? true : false;
    bool padding_weights = param.padding_weights;
...@@ -139,17 +138,7 @@ class FcCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
    auto w_dims0 = padding_weights ? w_dims[0] - 4 : w_dims[0];
    auto w_dims1 = padding_weights ? w_dims[1] - 4 : w_dims[1];
-   DDim out_dims;
-   out_dims.resize(static_cast<size_t>(in_num_col_dims + 1));
-   const auto& in_dims = input->dims();
-   for (int i = 0; i < in_num_col_dims; ++i) {
-     out_dims[i] = in_dims[i];
-   }
-   out_dims[in_num_col_dims] = w_dims1;
-   output->Resize(out_dims);
-   output->set_lod(input->lod());
-   int M = out_dims.production() / w_dims1;
+   int M = output->dims().production() / w_dims1;
    const T* input_data = input->data<T>();
    const T* w_data = w->data<T>();
...
...@@ -155,7 +155,6 @@ TEST(layer_norm_x86, run_test) {
  ref(&x, &Scale, &Bias, &out, &Mean, &Var, begin_norm_axis, epsilon);
  for (int j = 0; j < out.dims().production(); ++j) {
    EXPECT_NEAR(out_data[j], ref_data[j], 1e-5);
-   // LOG(INFO) << out_data[j];
  }
  LOG(INFO) << *mean_data;
  LOG(INFO) << *var_data;
...
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <iostream>
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/kernels/x86/activation_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
TEST(leaky_relu_x86, retrieve_op) {
auto leaky_relu =
KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>(
"leaky_relu");
ASSERT_FALSE(leaky_relu.empty());
ASSERT_TRUE(leaky_relu.front());
}
TEST(leaky_relu_x86, init) {
LeakyReluCompute<float> leaky_relu;
ASSERT_EQ(leaky_relu.precision(), PRECISION(kFloat));
ASSERT_EQ(leaky_relu.target(), TARGET(kX86));
}
TEST(leaky_relu_x86, run_test) {
lite::Tensor x, out;
constexpr int batch_size = 1;
std::vector<int64_t> x_shape{batch_size, 3, 2, 2};
x.Resize(lite::DDim(x_shape));
std::vector<int64_t> out_shape{batch_size, 3, 2, 2};
out.Resize(lite::DDim(out_shape));
auto x_data = x.mutable_data<float>();
auto out_data = out.mutable_data<float>();
for (int64_t i = 0; i < x.dims().production(); i++) {
x_data[i] = static_cast<float>(i) / 12.0 - 0.5;
}
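  // x holds i / 12 - 0.5 for i in [0, 12); with alpha = 0.05 every negative
  // entry is scaled by 0.05, e.g. x[0] = -0.5 -> out[0] = -0.025.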
LeakyReluCompute<float> leaky_relu;
operators::ActivationParam param;
param.X = &x;
param.Out = &out;
param.Leaky_relu_alpha = 0.05;
leaky_relu.SetParam(param);
leaky_relu.Run();
std::vector<float> ref_data({-0.025,
-0.02083333,
-0.01666667,
-0.0125,
-0.00833333,
-0.00416667,
0.,
0.08333334,
0.16666667,
0.25,
0.33333334,
0.41666666});
for (int i = 0; i < out.dims().production(); i++) {
EXPECT_NEAR(out_data[i], ref_data[i], 1e-05);
}
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(leaky_relu, kX86, kFloat, kNCHW, def);
...@@ -28,6 +28,8 @@ namespace x86 {
template <typename T>
void Compute(const lite::Tensor* in, lite::Tensor* out) {
+ // In CopyDataFrom, the target tensor's dims will be set to the source
+ // tensor's dims.
  auto out_dims = out->dims();
  out->CopyDataFrom(*in);
  out->Resize(out_dims);
...
...@@ -78,7 +78,11 @@ int CastConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  if (graph->Has(x_name)) {
    x_node = graph->Get(x_name);
  } else {
-   x_node = graph->Add(x_name, *x, in_ptype);
+   CHECK(x->precision() == in_ptype)
+       << "The data type of input tensor X should be "
+       << PrecisionToStr(in_ptype) << ", but received "
+       << PrecisionToStr(x->precision());
+   x_node = graph->Add(x_name, *x);
  }
  // Cast node
...
...@@ -45,6 +45,7 @@ int SizeOfType(framework::proto::VarType::Type type) {
  DO(FP16, float);
  DO(FP32, float);
  DO(INT8, int8_t);
+ DO(INT16, int16_t);
  DO(INT32, int);
  DO(INT64, int64_t);
#undef DO
...
...@@ -152,10 +152,8 @@ void ParamDesc::SetDim(const std::vector<int64_t>& dim) {
  auto& data_builder = desc_->GetField<PrimaryListBuilder<char>>("data"); \
  auto data = data_builder.data(); \
  size_t size = data_builder.size() / sizeof(T); \
- auto* data_ptr = reinterpret_cast<const T*>(data); \
- for (size_t i = 0; i < size; ++i) { \
-   res.push_back(data_ptr[i]); \
- } \
+ res.resize(size); \
+ memcpy(&res[0], data, data_builder.size()); \
  return res; \
}
...
...@@ -27,7 +27,7 @@ bool ConcatOpLite::CheckShape() const {
}
bool ConcatOpLite::InferShape() const {
- std::vector<lite::Tensor *> &inputs = param_.x;
+ const std::vector<Tensor *> &inputs = param_.x;
  const size_t n = inputs.size();
  CHECK_GT_OR_FALSE(n, 0);
...@@ -45,7 +45,7 @@ bool ConcatOpLite::InferShape() const {
  auto out_dims = inputs[0]->dims();
  size_t in_zero_dims_size = out_dims.size();
  for (size_t i = 1; i < n; i++) {
-   auto &input_dims_i = inputs[i]->dims();
+   const auto &input_dims_i = inputs[i]->dims();
    for (size_t j = 0; j < in_zero_dims_size; j++) {
      if (j == static_cast<size_t>(axis)) {
        out_dims[axis] += input_dims_i[j];
...
...@@ -49,17 +49,17 @@ bool FcOpLite::CheckShape() const {
}
bool FcOpLite::InferShape() const {
- const auto &input_dims = param_.input->dims();
- const auto &w_dims = param_.w->dims();
+ const auto& input_dims = param_.input->dims();
+ const auto& w_dims = param_.w->dims();
  int in_num_col_dims = param_.in_num_col_dims;
+ int64_t w_dims_1 = param_.padding_weights ? w_dims[1] - 4 : w_dims[1];
  // Set output dims
- DDim output_dims;
- output_dims.resize(in_num_col_dims + 1);
+ std::vector<DDim::value_type> output_dims(in_num_col_dims + 1);
  for (int i = 0; i < in_num_col_dims; ++i) {
    output_dims[i] = input_dims[i];
  }
- output_dims[in_num_col_dims] = w_dims[1];
+ output_dims[in_num_col_dims] = w_dims_1;
  param_.output->Resize(output_dims);
  // share LoD
...@@ -67,7 +67,7 @@ bool FcOpLite::InferShape() const {
  return true;
}
- bool FcOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
+ bool FcOpLite::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) {
  auto input = op_desc.Input("Input").front();
  auto W = op_desc.Input("W").front();
  auto out = op_desc.Output("Out").front();
...
...@@ -28,8 +28,8 @@ bool GRUOpLite::CheckShape() const {
  CHECK_OR_FALSE(param_.batch_hidden)
  CHECK_OR_FALSE(param_.hidden)
- const auto &input_dims = param_.input->dims();
- const auto &weight_dims = param_.weight->dims();
+ const auto& input_dims = param_.input->dims();
+ const auto& weight_dims = param_.weight->dims();
  int input_size = input_dims[1];
  int frame_size = weight_dims[0];
  CHECK_EQ_OR_FALSE(input_size, frame_size * 3)
...@@ -52,8 +52,8 @@ bool GRUOpLite::CheckShape() const {
}
bool GRUOpLite::InferShape() const {
- const auto &input_dims = param_.input->dims();
- const auto &weight_dims = param_.weight->dims();
+ const auto& input_dims = param_.input->dims();
+ const auto& weight_dims = param_.weight->dims();
  int frame_size = weight_dims[0];
  auto batch_size = input_dims[0];
...@@ -68,7 +68,7 @@ bool GRUOpLite::InferShape() const {
  return true;
}
- bool GRUOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
+ bool GRUOpLite::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) {
  auto input = op_desc.Input("Input").front();
  auto weight = op_desc.Input("Weight").front();
  auto batch_gate = op_desc.Output("BatchGate").front();
...
...@@ -25,8 +25,8 @@ bool LookupTableOpLite::CheckShape() const {
  CHECK_OR_FALSE(param_.Ids)
  CHECK_OR_FALSE(param_.Out)
- const auto &table_dims = param_.W->dims();
- const auto &ids_dims = param_.Ids->dims();
+ const auto& table_dims = param_.W->dims();
+ const auto& ids_dims = param_.Ids->dims();
  int ids_rank = ids_dims.size();
...@@ -37,23 +37,20 @@ bool LookupTableOpLite::CheckShape() const {
}
bool LookupTableOpLite::InferShape() const {
- const auto &table_dims = param_.W->dims();
- // LOG(INFO) << "table_dims: " << table_dims;
- const auto &ids_dims = param_.Ids->dims();
- // LOG(INFO) << "ids_dims: " << ids_dims;
+ const auto& table_dims = param_.W->dims();
+ const auto& ids_dims = param_.Ids->dims();
  auto out_dims = ids_dims;
  int ids_rank = ids_dims.size();
  out_dims[ids_rank - 1] = table_dims[1];
- // LOG(INFO) << "out_dims: " << out_dims;
  param_.Out->Resize(out_dims);
  param_.Out->set_lod(param_.Ids->lod());
  return true;
}
- bool LookupTableOpLite::AttachImpl(const cpp::OpDesc &op_desc,
-                                    lite::Scope *scope) {
+ bool LookupTableOpLite::AttachImpl(const cpp::OpDesc& op_desc,
+                                    lite::Scope* scope) {
  auto input = op_desc.Input("W").front();
  auto ids = op_desc.Input("Ids").front();
  auto out = op_desc.Output("Out").front();
...
...@@ -49,8 +49,7 @@ bool ReduceOp::InferShape() const {
    param_.output->Resize(std::vector<int64_t>{1});
  } else {
    size_t out_rank = keep_dim ? x_rank : x_rank - dims.size();
-   DDim out_dims;
-   out_dims.resize(out_rank);
+   std::vector<DDim::value_type> out_dims(out_rank);
    if (keep_dim) {
      for (size_t i = 0; i < dims.size(); ++i) {
        out_dims[dims[i]] = 1;
...
...@@ -34,7 +34,7 @@ bool ReshapeOp::InferShape() const {
  std::vector<int> final_shape;
  if (shape_tensor_vct.size() > 0) {
    final_shape.resize(shape_tensor_vct.size());
-   for (int i = 0; i < shape_tensor_vct.size(); i++) {
+   for (size_t i = 0; i < shape_tensor_vct.size(); i++) {
      final_shape[i] = shape_tensor_vct[i]->data<int>()[0];
    }
  } else if (shape_tensor != nullptr) {
...@@ -100,8 +100,7 @@ bool Reshape2Op::CheckShape() const {
bool Reshape2Op::InferShape() const {
  ReshapeOp::InferShape();
  const auto &x_dims = param_.x->dims();
- DDim xshape_dims;
- xshape_dims.resize(x_dims.size() + 1);
+ std::vector<DDim::value_type> xshape_dims(x_dims.size() + 1);
  xshape_dims[0] = 0;
  for (size_t i = 0; i < x_dims.size(); i++) {
    xshape_dims[i + 1] = x_dims[i];
...@@ -119,16 +118,24 @@ bool Reshape2Op::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
  return true;
}
- DDim ValidateShape(const std::vector<int> &shape, const DDim &input_dims) {
+ static bool CheckPositive(const DDim &dims) {
+   for (size_t i = 0; i < dims.size(); ++i) {
+     if (dims[i] <= 0) {
+       return false;
+     }
+   }
+   return true;
+ }
+ std::vector<DDim::value_type> ValidateShape(const std::vector<int> &shape,
+                                             const DDim &input_dims) {
  const DDim::value_type input_size = input_dims.production();
- // Only one dimension can be set to -1, whose size will be automatically
+ // only one dimension can be set to -1, whose size will be automatically
  // infered.
  const int unk_dim_val = -1;
  const int copy_dim_val = 0;
- DDim output_dims;
- output_dims.resize(shape.size());
  DDim::value_type capacity = 1;
  int unk_dim_idx = -1;
  for (size_t i = 0; i < shape.size(); ++i) {
...@@ -152,7 +159,7 @@ DDim ValidateShape(const std::vector<int> &shape, const DDim &input_dims) {
  }
  if (unk_dim_idx != -1) {
-   if (input_dims.CheckPositive()) {
+   if (CheckPositive(input_dims)) {
      // input_size < 0 and is un-determinate in compile time, skip the check,
      // for example, input_dims = [-1, 8, 1, 1], shape = [-1, 3, 8],
      // capacity = -24, input_size = -8, output_dims[0] = 0
...
...@@ -56,7 +56,8 @@ class Reshape2Op : public ReshapeOp {
  std::string DebugString() const override { return "reshape2"; }
};
- DDim ValidateShape(const std::vector<int> &shape, const DDim &input_dims);
+ std::vector<DDim::value_type> ValidateShape(const std::vector<int> &shape,
+                                             const DDim &input_dims);
}  // namespace operators
}  // namespace lite
...
...@@ -306,8 +306,8 @@ void test_conv_fp32(const std::vector<DDim>& input_dims,
                    const float leakey_relu_scale) {}
#endif  // LITE_WITH_ARM
- // TODO(chenjiaoAngel): fix me, diff: 3x3 depthwise conv
- #if 0  /// 3x3dw
+ // TODO(chenjiaoAngel): fix multi-threads, diff: 3x3 depthwise conv
+ #if 1  /// 3x3dw
TEST(TestConv3x3DW, test_conv3x3_depthwise) {
  if (FLAGS_basic_test) {
    for (auto& stride : {1, 2}) {
...@@ -334,7 +334,7 @@ TEST(TestConv3x3DW, test_conv3x3_depthwise) {
                    {1, 1},
                    flag_bias,
                    flag_act,
-                   {1, 2, 4},
+                   {1},
                    {FLAGS_power_mode},
                    leakey_relu_scale);
          }
...@@ -352,12 +352,7 @@ TEST(TestConv3x3DW, test_conv3x3_depthwise) {
#if 1  /// 5x5dw
TEST(TestConv5x5DW, test_conv5x5_depthwise) {
  if (FLAGS_basic_test) {
- #ifdef __aarch64__
- // TODO(chenjiaoAngel): fix me, diff: arm64 5x5s2 depthwise conv
- for (auto& stride : {1}) {
- #else
    for (auto& stride : {1, 2}) {
- #endif
      for (auto& pad_left : {0, 1, 2}) {
        for (auto& pad_right : {0, 1, 2}) {
          for (auto& pad_top : {0, 1, 2}) {
...