Commit d784eb9e authored by zhangwen31

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle-Lite into pp_yolo_support

......@@ -16,12 +16,6 @@ cmake_minimum_required(VERSION 3.0)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
include(lite_utils)
lite_option(WITH_PADDLE_MOBILE "Use the paddle-mobile legacy build" OFF)
if (WITH_PADDLE_MOBILE)
add_subdirectory(mobile)
return()
endif(WITH_PADDLE_MOBILE)
set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
set(CMAKE_CXX_STANDARD 11)
......
......@@ -43,7 +43,6 @@ Paddle Lite提供了C++、Java、Python三种API,并且提供了相应API的
- [iOS demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/ios_app_demo.html)
- [ARM Linux demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/linux_arm_demo.html)
- [X86 demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/x86.html)
- [CUDA demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/cuda.html)
- [OpenCL demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/opencl.html)
- [FPGA demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/fpga.html)
- [Huawei Kirin NPU demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/huawei_kirin_npu.html)
......@@ -77,7 +76,6 @@ Paddle Lite提供了C++、Java、Python三种API,并且提供了相应API的
| CPU(32bit) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) |
| CPU(64bit) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) |
| OpenCL | - | - | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | - |
| CUDA | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | - | - |
| FPGA | - | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | - | - |
| Huawei NPU | - | - | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | - |
| Baidu XPU | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | - | - |
......
......@@ -199,13 +199,10 @@ if (LITE_WITH_EXCEPTION)
add_definitions("-DLITE_WITH_EXCEPTION")
endif()
if (LITE_ON_FLATBUFFERS_DESC_VIEW)
add_definitions("-DLITE_ON_FLATBUFFERS_DESC_VIEW")
message(STATUS "Flatbuffers will be used as cpp default program description.")
endif()
if (LITE_ON_TINY_PUBLISH)
add_definitions("-DLITE_ON_TINY_PUBLISH")
add_definitions("-DLITE_ON_FLATBUFFERS_DESC_VIEW")
message(STATUS "Flatbuffers will be used as cpp default program description.")
else()
add_definitions("-DLITE_WITH_FLATBUFFERS_DESC")
endif()
......
......@@ -16,6 +16,11 @@ if(NOT LITE_WITH_HUAWEI_ASCEND_NPU)
return()
endif()
# require -D_GLIBCXX_USE_CXX11_ABI=0 if GCC 7.3.0
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
endif()
# 1. path to Huawei Ascend Install Path
if(NOT DEFINED HUAWEI_ASCEND_NPU_DDK_ROOT)
set(HUAWEI_ASCEND_NPU_DDK_ROOT $ENV{HUAWEI_ASCEND_NPU_DDK_ROOT})
......
......@@ -27,7 +27,7 @@ SET(FLATBUFFERS_SOURCES_DIR ${CMAKE_SOURCE_DIR}/third-party/flatbuffers)
SET(FLATBUFFERS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/flatbuffers)
SET(FLATBUFFERS_INCLUDE_DIR "${FLATBUFFERS_SOURCES_DIR}/include" CACHE PATH "flatbuffers include directory." FORCE)
IF(WIN32)
set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE)
set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/flatbuffers.lib" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE)
ELSE(WIN32)
set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.a" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE)
ENDIF(WIN32)
......@@ -64,13 +64,6 @@ ExternalProject_Add(
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
IF(WIN32)
IF(NOT EXISTS "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib")
add_custom_command(TARGET extern_flatbuffers POST_BUILD
COMMAND cmake -E copy ${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/flatbuffers_static.lib ${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib
)
ENDIF()
ENDIF(WIN32)
ADD_LIBRARY(flatbuffers STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET flatbuffers PROPERTY IMPORTED_LOCATION ${FLATBUFFERS_LIBRARIES})
ADD_DEPENDENCIES(flatbuffers extern_flatbuffers)
......
......@@ -217,6 +217,10 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64")
ENDIF()
IF(LITE_WITH_HUAWEI_ASCEND_NPU)
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}")
ENDIF()
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
ExternalProject_Add(
${TARGET_NAME}
......
......@@ -267,6 +267,10 @@ function(cc_library TARGET_NAME)
list(REMOVE_ITEM cc_library_DEPS warpctc)
add_dependencies(${TARGET_NAME} warpctc)
endif()
if("${cc_library_DEPS};" MATCHES "fbs_headers;")
list(REMOVE_ITEM cc_library_DEPS fbs_headers)
add_dependencies(${TARGET_NAME} fbs_headers)
endif()
# Only deps libmklml.so, not link
if("${cc_library_DEPS};" MATCHES "mklml;")
list(REMOVE_ITEM cc_library_DEPS mklml)
......
......@@ -91,13 +91,23 @@ ImagePreprocess::ImagePreprocess(ImageFormat srcFormat, ImageFormat dstFormat, T
// Method 2
void ImagePreprocess::imageCovert(const uint8_t* src,
uint8_t* dst, ImageFormat srcFormat, ImageFormat dstFormat);
// Method 3
void ImagePreprocess::imageCovert(const uint8_t* src,
uint8_t* dst, ImageFormat srcFormat, ImageFormat dstFormat,
int srcw, int srch);
```
+ For the first `imageCovert` interface, the default arguments come from member variables of the `ImagePreprocess` class, so the following members must be assigned when the `ImagePreprocess` object is initialized:
- param srcFormat: the `srcFormat_` member of the `ImagePreprocess` class
- param dstFormat: the `dstFormat_` member of the `ImagePreprocess` class
- param srcw: the `iw` field of the `transParam_` member struct of the `ImagePreprocess` class
- param srch: the `ih` field of the `transParam_` member struct of the `ImagePreprocess` class
- For the second `imageCovert` interface, the default arguments also come from member variables of the `ImagePreprocess` class, so the following members must be assigned when the `ImagePreprocess` object is initialized:
- param srcw: the `iw` field of the `transParam_` member struct of the `ImagePreprocess` class
- param srch: the `ih` field of the `transParam_` member struct of the `ImagePreprocess` class
- The second `imageCovert` interface can be used directly
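A minimal usage sketch of the three variants (assuming the `paddle::lite::utils::cv` namespace, the header path below, and the `TransParam` fields `iw`/`ih` described above; adjust to your actual build):
```cpp
#include "lite/utils/cv/paddle_image_preprocess.h"  // assumed header location

using namespace paddle::lite::utils::cv;

void convert_demo(const uint8_t* src, uint8_t* dst) {
  TransParam tp;
  tp.iw = 640;  // source width, used as the default srcw
  tp.ih = 480;  // source height, used as the default srch
  ImagePreprocess preprocess(ImageFormat::NV12, ImageFormat::BGR, tp);

  // Variant 1: formats and sizes all come from the members set above.
  preprocess.imageCovert(src, dst);
  // Variant 2: formats passed explicitly, sizes still taken from transParam_.
  preprocess.imageCovert(src, dst, ImageFormat::NV12, ImageFormat::BGR);
  // Variant 3: everything passed explicitly, no member state required.
  preprocess.imageCovert(src, dst, ImageFormat::NV12, ImageFormat::BGR, 640, 480);
}
```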
### Resize
......
......@@ -16,69 +16,12 @@ Paddle Lite已支持百度XPU在x86和arm服务器(例如飞腾 FT-2000+/64)
### Supported Paddle models
- [ResNet50](https://paddlelite-demo.bj.bcebos.com/models/resnet50_fp32_224_fluid.tar.gz)
- [BERT](https://paddlelite-demo.bj.bcebos.com/models/bert_fp32_fluid.tar.gz)
- [ERNIE](https://paddlelite-demo.bj.bcebos.com/models/ernie_fp32_fluid.tar.gz)
- YOLOv3
- Mask R-CNN
- Faster R-CNN
- UNet
- SENet
- SSD
- [Supported open-source model list](../introduction/support_model_list)
- Baidu internal production models (details cannot be disclosed for confidentiality reasons)
### Supported (or partially supported) Paddle operators (Kernel-based integration)
- scale
- relu
- tanh
- sigmoid
- stack
- matmul
- pool2d
- slice
- lookup_table
- elementwise_add
- elementwise_sub
- cast
- batch_norm
- mul
- layer_norm
- softmax
- conv2d
- io_copy
- io_copy_once
- __xpu__fc
- __xpu__multi_encoder
- __xpu__resnet50
- __xpu__embedding_with_eltwise_add
### Supported (or partially supported) Paddle operators (subgraph/XTCL-based integration)
- relu
- tanh
- conv2d
- depthwise_conv2d
- elementwise_add
- pool2d
- softmax
- mul
- batch_norm
- stack
- gather
- scale
- lookup_table
- slice
- transpose
- transpose2
- reshape
- reshape2
- layer_norm
- gelu
- dropout
- matmul
- cast
- yolo_box
- [Operator support list](../introduction/support_operation_list)
## Reference demo
......@@ -233,7 +176,7 @@ $ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv8 --arm_lang=gcc --build
```
- Replace the PaddleLite-linux-demo/libs/PaddleLite/amd64/include directory with the generated build.lite.x86/inference_lite_lib/cxx/include;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/amd64/lib/libpaddle_full_api_shared.so file with the generated build.lite.x86/inference_lite_lib/cxx/include/lib/libpaddle_full_api_shared.so;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/amd64/lib/libpaddle_full_api_shared.so file with the generated build.lite.x86/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/include directory with the generated build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.xpu/cxx/include;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/lib/libpaddle_full_api_shared.so file with the generated build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.xpu/cxx/lib/libpaddle_full_api_shared.so.
......
# Deploying PaddleLite inference with CUDA
**Note**: Lite CUDA is only an acceleration library for Nvidia GPUs and supports a limited set of models; if you need broader coverage, please use [PaddleInference](https://paddle-inference.readthedocs.io/en/latest).
Lite supports building and running with CUDA on x86_64 and arm64 architectures (e.g. TX2).
## Build
......
docs/images/architecture.png (binary image updated: 149.8 KB → 227.6 KB)
......@@ -57,7 +57,6 @@ Welcome to Paddle-Lite's documentation!
demo_guides/ios_app_demo
demo_guides/linux_arm_demo
demo_guides/x86
demo_guides/cuda
demo_guides/opencl
demo_guides/fpga
demo_guides/huawei_kirin_npu
......
......@@ -5,23 +5,25 @@ Mobile 在这次升级为 Lite 架构, 侧重多硬件、高性能的支持,
- Introduces a Type system, strengthening mixed scheduling across multiple hardware targets, quantization methods and data layouts
- Isolates hardware details; any supported hardware can be freely plugged in or removed via build switches
- Introduces MIR (Machine IR), strengthening optimization with the execution environment taken into account
- Strictly separates the optimization phase from the execution phase, keeping inference lightweight and efficient
- Cleanly decouples the graph-optimization module from the execution engine, keeping the inference-execution phase lightweight and efficient
The architecture diagram is shown below
![Paddle Inference Refactor1.0](https://user-images.githubusercontent.com/52520497/64949619-26e49580-d8ac-11e9-855a-514feb9b75af.png)
<p align="center"><img width="500" src="https://raw.githubusercontent.com/PaddlePaddle/Paddle-Lite/develop/docs/images/architecture.png"/></p>
## Strict separation of compile time and execution time
## Separation of the model-optimization phase and the inference-execution phase
- Optimization information produced at compile time can be stored into the model; at execution time the model is simply loaded and executed
- Two sets of APIs and matching inference libraries cover different scenarios
  - `CxxPredictor` bundles `Compile Time` and `Execution Time`, so analysis and optimization can run at runtime on the target hardware for the best result
  - `MobilePredictor` bundles only `Execution Time`, keeping deployment and execution lightweight
- The Analysis Phase is the model-optimization stage: it takes a Paddle inference model as input and applies Lite's acceleration and optimization strategies to the computation graph, including operator fusion, computation pruning, memory/storage optimization, quantization precision conversion and kernel selection. The optimized model is more lightweight, consumes fewer resources on the target hardware and runs faster.
- The Execution Phase is the inference-execution stage: it takes the optimized Lite model as input and performs only model loading and inference execution, enabling extremely lightweight deployment with no third-party dependencies.
## Lightweight design and implementation of `Execution Time`
Lite provides two sets of APIs and matching inference libraries for different scenarios:
- `CxxPredictor` contains both the `Analysis Phase` and the `Execution Phase`, supporting one-stop inference where model analysis/optimization and execution run together; suitable when the size of the inference library is not a concern.
- `MobilePredictor` contains only the `Execution Phase`, keeping deployment and execution lightweight and fast; it loads an optimized model from memory or from a file and runs inference.
- Each batch actually executes only two steps
  - `Op.InferShape`
## Lightweight design and implementation of the Execution Phase
- During inference execution, each batch runs only two steps
  - `OpLite.InferShape`, which infers the output dimensions from the inputs
  - `Kernel.Run`; all kernel parameters are resolved in advance via pointers, so there is no lookup or argument-passing overhead afterwards
- The design goal is that at execution time the only cost is the kernel computation itself
- Lightweight `Op` and `Kernel` design avoids extra framework overhead
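A minimal sketch of the two-phase workflow using the public C++ API (model paths and the `Place` list are illustrative, not taken from this document):
```cpp
#include "paddle_api.h"  // public Paddle Lite C++ API
using namespace paddle::lite_api;

void two_phase_demo() {
  // Analysis Phase: CxxPredictor loads the Paddle model, runs graph
  // optimizations and saves a lightweight NaiveBuffer model.
  CxxConfig cxx_config;
  cxx_config.set_model_dir("./mobilenet_v1");  // illustrative path
  cxx_config.set_valid_places({Place{TARGET(kARM), PRECISION(kFloat)}});
  auto cxx_predictor = CreatePaddlePredictor<CxxConfig>(cxx_config);
  cxx_predictor->SaveOptimizedModel("./mobilenet_v1_opt",
                                    LiteModelType::kNaiveBuffer);

  // Execution Phase: MobilePredictor only loads and runs the optimized model.
  MobileConfig mobile_config;
  mobile_config.set_model_from_file("./mobilenet_v1_opt.nb");
  auto mobile_predictor = CreatePaddlePredictor<MobileConfig>(mobile_config);
  // ... set inputs, call Run(), and read outputs as usual.
}
```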
......
......@@ -29,7 +29,8 @@ Paddle Lite支持[ARM Cortex-A系列处理器](https://en.wikipedia.org/wiki/ARM
Paddle Lite supports mobile GPUs and Nvidia embedded GPU devices; the supported list is as follows:
- ARM Mali G series
- Qualcomm Adreno series
- Nvidia Tegra series: tx1, tx2, nano, xavier
Nvidia Tegra series: tx1, tx2, nano, xavier
## NPU
Paddle Lite supports NPUs; the supported list is as follows:
......
# Supported models
The accuracy and performance of 24 models have been rigorously verified, with fairly complete coverage of vision models across classification, detection and localization, including the distinctive OCR models, and the list keeps growing.
The accuracy and performance of 28 models have been rigorously verified, with fairly complete coverage of vision models across classification, detection and localization, including the distinctive OCR models, and the list keeps growing.
| Category | Subcategory | Model | Int8 support | Supported platforms |
|-|-|:-:|:-:|-:|
| CV | Classification | mobilenetv1 | Y | ARM,X86,NPU,RKNPU,APU |
| CV | Classification | mobilenetv2 | Y | ARM,X86,NPU |
| CV | Classification | resnet18 | Y | ARM,NPU |
| CV | Classification | resnet50 | Y | ARM,X86,NPU,XPU |
| CV | Classification | mnasnet | | ARM,NPU |
| CV | Classification | efficientnet | | ARM |
| CV | Classification | squeezenetv1.1 | | ARM,NPU |
| CV | Classification | ShufflenetV2 | Y | ARM |
| CV | Classification | shufflenet | Y | ARM |
| CV | Classification | inceptionv4 | Y | ARM,X86,NPU |
| CV | Classification | vgg16 | Y | ARM |
| CV | Classification | googlenet | Y | ARM,X86 |
| CV | Detection | mobilenet_ssd | Y | ARM,NPU* |
| CV | Detection | mobilenet_yolov3 | Y | ARM,NPU* |
| CV | Detection | Faster RCNN | | ARM |
| CV | Detection | Mask RCNN | | ARM |
| CV | Segmentation | Deeplabv3 | Y | ARM |
| CV | Segmentation | unet | | ARM |
| CV | Face | facedetection | | ARM |
| CV | Face | facebox | | ARM |
| CV | Face | blazeface | Y | ARM |
| CV | Face | mtcnn | | ARM |
| CV | OCR | ocr_attention | | ARM |
| NLP | Machine translation | transformer | | ARM,NPU* |
| Category | Subcategory | Model | Supported platforms |
|-|-|:-|:-|
| CV | Classification | [MobileNetV1](https://paddlelite-demo.bj.bcebos.com/models/mobilenet_v1_fp32_224_fluid.tar.gz) | ARM,X86,NPU,RKNPU,APU |
| CV | Classification | [MobileNetV2](https://paddlelite-demo.bj.bcebos.com/models/mobilenet_v2_fp32_224_fluid.tar.gz) | ARM,X86,NPU |
| CV | Classification | [ResNet18](https://paddlelite-demo.bj.bcebos.com/models/resnet18_fp32_224_fluid.tar.gz) | ARM,NPU |
| CV | Classification | [ResNet50](https://paddlelite-demo.bj.bcebos.com/models/resnet50_fp32_224_fluid.tar.gz) | ARM,X86,NPU,XPU |
| CV | Classification | [MnasNet](https://paddlelite-demo.bj.bcebos.com/models/mnasnet_fp32_224_fluid.tar.gz) | ARM,NPU |
| CV | Classification | [EfficientNet*](https://github.com/PaddlePaddle/PaddleClas) | ARM |
| CV | Classification | [SqueezeNet](https://paddlelite-demo.bj.bcebos.com/models/squeezenet_fp32_224_fluid.tar.gz) | ARM,NPU |
| CV | Classification | [ShufflenetV2*](https://github.com/PaddlePaddle/PaddleClas) | ARM |
| CV | Classification | [ShuffleNet](https://paddlepaddle-inference-banchmark.bj.bcebos.com/shufflenet_inference.tar.gz) | ARM |
| CV | Classification | [InceptionV4](https://paddle-inference-dist.bj.bcebos.com/inception_v4_simple.tar.gz) | ARM,X86,NPU |
| CV | Classification | [VGG16](https://paddlepaddle-inference-banchmark.bj.bcebos.com/VGG16_inference.tar) | ARM |
| CV | Classification | [VGG19](https://paddlepaddle-inference-banchmark.bj.bcebos.com/VGG19_inference.tar) | XPU|
| CV | Classification | [GoogleNet](https://paddlepaddle-inference-banchmark.bj.bcebos.com/GoogleNet_inference.tar) | ARM,X86,XPU |
| CV | Detection | [MobileNet-SSD](https://paddlelite-demo.bj.bcebos.com/models/ssd_mobilenet_v1_pascalvoc_fp32_300_fluid.tar.gz) | ARM,NPU* |
| CV | Detection | [YOLOv3-MobileNetV3](https://paddlelite-demo.bj.bcebos.com/models/yolov3_mobilenet_v3_prune86_FPGM_320_fp32_fluid.tar.gz) | ARM,NPU* |
| CV | Detection | [Faster RCNN](https://paddlepaddle-inference-banchmark.bj.bcebos.com/faster_rcnn.tar) | ARM |
| CV | Detection | [Mask RCNN*](https://github.com/PaddlePaddle/PaddleDetection/blob/release/0.4/docs/MODEL_ZOO_cn.md) | ARM |
| CV | Segmentation | [Deeplabv3](https://paddlelite-demo.bj.bcebos.com/models/deeplab_mobilenet_fp32_fluid.tar.gz) | ARM |
| CV | Segmentation | [UNet](https://paddlelite-demo.bj.bcebos.com/models/Unet.zip) | ARM |
| CV | Face | [FaceDetection](https://paddlelite-demo.bj.bcebos.com/models/facedetection_fp32_240_430_fluid.tar.gz) | ARM |
| CV | Face | [FaceBoxes*](https://github.com/PaddlePaddle/PaddleDetection/blob/release/0.4/docs/featured_model/FACE_DETECTION.md#FaceBoxes) | ARM |
| CV | Face | [BlazeFace*](https://github.com/PaddlePaddle/PaddleDetection/blob/release/0.4/docs/featured_model/FACE_DETECTION.md#BlazeFace) | ARM |
| CV | Face | [MTCNN](https://paddlelite-demo.bj.bcebos.com/models/mtcnn.zip) | ARM |
| CV | OCR | [OCR-Attention](https://paddle-inference-dist.bj.bcebos.com/ocr_attention.tar.gz) | ARM |
| CV | GAN | [CycleGAN*](https://github.com/PaddlePaddle/models/tree/release/1.7/PaddleCV/gan/cycle_gan) | NPU |
| NLP | Machine translation | [Transformer*](https://github.com/PaddlePaddle/models/tree/release/1.8/PaddleNLP/machine_translation/transformer) | ARM,NPU* |
| NLP | Machine translation | [BERT](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/models_and_data_for_unittests/bert.tar.gz) | XPU |
| NLP | Semantic representation | [ERNIE](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/models_and_data_for_unittests/ernie.tar.gz) | XPU |
> **Note:** NPU* means heterogeneous ARM+NPU computation
**Note:**
1. In the model list, * means the model link comes from [PaddlePaddle/models](https://github.com/PaddlePaddle/models); otherwise the link is a direct download of the inference model
2. In the supported-platform column, NPU* means heterogeneous ARM+NPU computation; otherwise it is pure NPU computation
......@@ -76,7 +76,6 @@ pip install paddlelite
- [ARM Linux build from source](../source_compile/compile_linux)
- [x86 build from source](../demo_guides/x86)
- [OpenCL build from source](../demo_guides/opencl)
- [CUDA build from source](../demo_guides/cuda)
- [FPGA build from source](../demo_guides/fpga)
- [Huawei NPU build from source](../demo_guides/huawei_kirin_npu)
- [Baidu XPU build from source](../demo_guides/baidu_xpu)
......
......@@ -2,51 +2,63 @@
Lite is a lightweight, flexible, easily extensible and high-performance deep learning inference framework. It supports many targets such as ARM, OpenCL and NPU, and offers powerful graph optimization and inference acceleration. If you want to integrate the Lite framework into your own project, only the following simple steps are needed.
## 1. Prepare the model
The model format currently supported by the Lite framework is the one produced by the [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) deep learning framework. Before you start using Lite, you therefore need a model saved by PaddlePaddle.
If your model was produced by a framework such as Caffe2 or TensorFlow, we recommend converting it with the [X2Paddle](https://github.com/PaddlePaddle/X2Paddle) tool.
![workflow](https://raw.githubusercontent.com/PaddlePaddle/Paddle-Lite/develop/docs/images/workflow.png)
## 2. Optimize the model
**1. Prepare the model**
The Lite framework ships powerful acceleration and optimization strategies, including quantization, subgraph fusion and kernel selection. To make them easy to use, we provide the [opt](../user_guides/model_optimize_tool) tool for model optimization. The optimized model is more lightweight, consumes fewer resources and runs faster.
Paddle Lite directly supports the model format produced by the [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) deep learning framework. PaddlePaddle inference models are currently saved with the [save_inference_model](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/io_cn/save_inference_model_cn.html#save-inference-model) API.
If your model was produced by a framework such as Caffe, TensorFlow or PyTorch, you can use the [X2Paddle](https://github.com/PaddlePaddle/X2Paddle) tool to convert it to the PaddlePaddle format.
For a detailed introduction to opt, please refer to [Model optimization](../user_guides/model_optimize_tool)
**2. Optimize the model**
After downloading the opt tool, run the following:
Paddle Lite ships excellent acceleration and optimization strategies, including quantization, subgraph fusion and kernel selection. The optimized model is more lightweight, consumes fewer resources and runs faster.
These optimizations are performed with the opt tool provided by Paddle Lite. The opt tool can also collect and print the operator information of a model and report whether Paddle Lite supports it on different hardware platforms. After obtaining a model in PaddlePaddle format, you normally need to optimize it with this opt tool. For downloading and using opt, see [Model optimization](https://paddle-lite.readthedocs.io/zh/latest/user_guides/model_optimize_tool.html)
``` shell
$ ./opt \
--model_dir=<model_param_dir> \
--model_file=<model_path> \
--param_file=<param_path> \
--optimize_out_type=(protobuf|naive_buffer) \
--optimize_out=<output_optimize_model_dir> \
--valid_targets=(arm|opencl|x86)
```
**Note**: To reduce third-party dependencies and improve the portability of the Lite inference framework, the Lite API on mobile requires a model stored in the Naive Buffer format.
Here, optimize_out is the output path of the optimized model. optimize_out_type selects the serialization format of the output model; Protobuf and Naive Buffer are currently supported, and Naive Buffer is the more lightweight serialization/deserialization implementation. If you want to run inference with Lite on mobile, set optimize_out_type=naive_buffer.
**3. Download or build the library**
## 3. Run inference with the Lite framework
Paddle Lite provides official release inference libraries for the Android/iOS/X86 platforms; we recommend downloading the [prebuilt Paddle Lite libraries](https://paddle-lite.readthedocs.io/zh/latest/quick_start/release_lib.html) directly.
You can also pick the [build-from-source instructions](https://paddle-lite.readthedocs.io/zh/latest/quick_start/release_lib.html#id2) for your target platform. Paddle Lite ships build scripts under the `lite/tools/` folder; with only two steps, [preparing the environment](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html) and [invoking the build script](https://paddle-lite.readthedocs.io/zh/latest/quick_start/release_lib.html#id2), you can build the Paddle Lite inference library for the target platform.
In the previous section we obtained an optimized model through `opt`; running inference with it is also very simple. Lite offers a carefully designed API that hides the details you do not need to study. Only five simple steps are needed to run inference on mobile with Lite (illustrated with the C++ API):
**4. Develop the application**
Paddle Lite provides C++, Java and Python APIs; only five simple steps are needed to run inference (using the C++ API as an example; a minimal sketch follows the list below):
1. Declare a MobileConfig. The config can be set to **load the model from a file** or **load the model from memory**. Loading from a file requires the model file path, e.g. `config.set_model_from_file(FLAGS_model_file)`; loading from memory currently only supports the naive buffer of an optimized model, via:
`void set_model_from_buffer(model_buffer) `
1. Declare a `MobileConfig` and set the path of the model file optimized in step 2, or choose to load the model from memory
2. Create the `Predictor` by calling the `CreatePaddlePredictor` interface; one line of code initializes the engine
3. Prepare the input: obtain an input variable via `predictor->GetInput(i)`, then set its size and values
4. Run inference: a single call to `predictor->Run()` performs the prediction with the Lite framework
5. Fetch the output: obtain an output variable via `predictor->GetOutput(i)` and read its values with `data<T>`
2. Create the Predictor. The Predictor is Lite's inference engine; for convenience we provide the `CreatePaddlePredictor` interface, so one line of code initializes it: `std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor(config)`
3. Prepare the input. Calling predictor->GetInput(0) returns the 0th input field; likewise, if your model has multiple inputs, call `predictor->GetInput(i)` for the corresponding one. Once you have the input variable, use Resize to set its shape and then fill in the input values.
4. Run inference. Simply call `predictor->Run()` to perform prediction with the Lite framework.
5. Fetch the output. Similar to inputs, use `predictor->GetOutput(i)` to get the i-th output variable. Its shape() method returns the dimensions, and the `data<T>()` template method returns the output values.
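The five steps above as a minimal sketch in C++ (the model path and input shape are illustrative; link against the Paddle Lite inference library obtained in step 3):
```cpp
#include <iostream>
#include <memory>
#include "paddle_api.h"  // from the Paddle Lite release package
using namespace paddle::lite_api;

int main() {
  // 1. Declare MobileConfig and load the optimized .nb model from a file.
  MobileConfig config;
  config.set_model_from_file("./mobilenet_v1_opt.nb");  // illustrative path

  // 2. Create the Predictor; one call initializes the execution engine.
  std::shared_ptr<PaddlePredictor> predictor =
      CreatePaddlePredictor<MobileConfig>(config);

  // 3. Prepare the input: resize the first input tensor and fill its values.
  std::unique_ptr<Tensor> input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});
  float* in_data = input->mutable_data<float>();
  for (int i = 0; i < 1 * 3 * 224 * 224; ++i) in_data[i] = 1.0f;

  // 4. Run inference.
  predictor->Run();

  // 5. Fetch the output, then read its shape and values.
  std::unique_ptr<const Tensor> output = predictor->GetOutput(0);
  const float* out_data = output->data<float>();
  std::cout << "first output value: " << out_data[0]
            << ", output rank: " << output->shape().size() << std::endl;
  return 0;
}
```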
Paddle Lite provides complete usage examples and developer documentation for the C++, Java and Python APIs; you can follow them to quickly learn the usage and integrate Lite into your own project.
- [Complete C++ example](cpp_demo.html)
- [Complete Java example](java_demo.html)
- [Complete Python example](python_demo.html)
For the different hardware platforms, Paddle Lite provides a complete example per platform:
- [Android demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/android_app_demo.html)
- [iOS demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/ios_app_demo.html)
- [ARM Linux demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/linux_arm_demo.html)
- [X86 demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/x86.html)
- [OpenCL demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/opencl.html)
- [FPGA demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/fpga.html)
- [Huawei Kirin NPU demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/huawei_kirin_npu.html)
- [Baidu XPU demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/baidu_xpu.html)
- [Rockchip NPU demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/rockchip_npu.html)
- [MediaTek APU demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/mediatek_apu.html)
## 4. Lite API
You can also download the following inference APKs built with Paddle-Lite and install them on an Android device for a quick preview:
For your convenience we provide three APIs, C++, Java and Python, together with complete usage examples for each: [Complete C++ example](cpp_demo), [Complete Java example](java_demo), [Complete Python example](python_demo). You can follow the examples to quickly learn the C++/Java/Python API usage and integrate it into your own project. Note that, to reduce third-party dependencies and improve the portability of the Lite inference framework, using the Lite API on mobile requires a model in the Naive Buffer storage format; see section 2, `Model optimization`, for details.
- [Image classification](https://paddlelite-demo.bj.bcebos.com/apps/android/mobilenet_classification_demo.apk)
- [Object detection](https://paddlelite-demo.bj.bcebos.com/apps/android/yolo_detection_demo.apk)
- [Mask detection](https://paddlelite-demo.bj.bcebos.com/apps/android/mask_detection_demo.apk)
- [Face keypoint detection](https://paddlelite-demo.bj.bcebos.com/apps/android/face_keypoints_detection_demo.apk)
- [Portrait segmentation](https://paddlelite-demo.bj.bcebos.com/apps/android/human_segmentation_demo.apk)
## 5. Testing tools
## More testing tools
To help you understand and use the Lite framework better, we provide a [Debug tool](../user_guides/debug) and a [Profile tool](../user_guides/debug) for users with further needs. The Lite Model Debug Tool checks whether corresponding variable values differ between the Lite framework and the PaddlePaddle framework during inference, helping you quickly locate a problematic Op and reproduce and troubleshoot the issue. The Profile Monitor Tool reports how much time each Op takes, automatically collecting the number of executions and the longest, shortest and average execution times as a basic reference for performance tuning. See the [related topics](../user_guides/debug) for more.
......@@ -19,7 +19,6 @@ Paddle Lite提供了Android/iOS/X86平台的官方Release预测库下载,如
- [ARM Linux build from source](../source_compile/compile_linux)
- [X86 build from source](../demo_guides/x86)
- [OpenCL build from source](../demo_guides/opencl)
- [CUDA build from source](../demo_guides/cuda)
- [FPGA build from source](../demo_guides/fpga)
- [Huawei NPU build from source](../demo_guides/huawei_kirin_npu)
- [Baidu XPU build from source](../demo_guides/baidu_xpu)
......
......@@ -41,6 +41,7 @@ if (WITH_TESTING)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "lite_naive_model.tar.gz")
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1_int16.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz")
......@@ -51,11 +52,19 @@ if (WITH_TESTING)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "GoogleNet_inference.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "step_rnn.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "bert.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "ernie.tar.gz")
set(LITE_URL_FOR_UNITTESTS "http://paddle-inference-dist.bj.bcebos.com/PaddleLite/models_and_data_for_unittests")
# models
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "resnet50.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "bert.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "ernie.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "GoogLeNet.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "VGG19.tar.gz")
# data
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "ILSVRC2012_small.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "bert_data.tar.gz")
endif()
endif()
......
......@@ -15,7 +15,6 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
#full api dynamic library
lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
DEPS paddle_api paddle_api_light paddle_api_full)
target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files})
add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto op_registry fbs_headers)
target_link_libraries(paddle_full_api_shared framework_proto op_registry)
if(LITE_WITH_X86)
......@@ -70,6 +69,10 @@ else()
set(TARGET_COMIPILE_FLAGS "-fdata-sections")
if (NOT (ARM_TARGET_LANG STREQUAL "clang")) #gcc
set(TARGET_COMIPILE_FLAGS "${TARGET_COMIPILE_FLAGS} -flto")
# TODO (hong19860320): Disable LTO temporarily since it causes failures to catch exceptions on Android when the toolchain is gcc.
if (ARM_TARGET_OS STREQUAL "android" AND LITE_WITH_EXCEPTION)
set(TARGET_COMIPILE_FLAGS "")
endif()
endif()
set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "${TARGET_COMIPILE_FLAGS}")
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h fbs_headers)
......@@ -289,6 +292,14 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
set_target_properties(test_mobilenetv1 PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
endif()
lite_cc_test(test_mobilenetv1_int16 SRCS mobilenetv1_int16_test.cc
DEPS ${lite_model_test_DEPS} ${light_lib_DEPS}
CL_DEPS ${opencl_kernels}
NPU_DEPS ${npu_kernels} ${npu_bridges}
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl
--model_dir=${LITE_MODEL_DIR}/mobilenet_v1_int16 SERIAL)
add_dependencies(test_mobilenetv1_int16 extern_lite_download_mobilenet_v1_int16_tar_gz)
lite_cc_test(test_mobilenetv2 SRCS mobilenetv2_test.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
......
......@@ -17,7 +17,6 @@ if (NOT LITE_ON_TINY_PUBLISH)
# Unlike static library, module library has to link target to be able to work
# as a single .so lib.
target_link_libraries(paddle_lite_jni ${lib_DEPS} ${arm_kernels} ${npu_kernels})
add_dependencies(paddle_lite_jni fbs_headers)
if (LITE_WITH_NPU)
# Strips the symbols of our protobuf functions to fix the conflicts during
# loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so)
......
......@@ -30,8 +30,6 @@
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/device_info.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
......
......@@ -58,6 +58,16 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
config.mlu_input_layout(),
config.mlu_firstconv_param());
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_BM
Env<TARGET(kBM)>::Init();
int device_id = 0;
if (const char *c_id = getenv("BM_VISIBLE_DEVICES")) {
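// The env var holds an ASCII digit string; convert its first character to an int (48 == '0').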
device_id = static_cast<int>(*c_id) - 48;
}
TargetWrapper<TARGET(kBM)>::SetDevice(device_id);
#endif // LITE_WITH_BM
auto use_layout_preprocess_pass =
config.model_dir().find("OPENCL_PRE_PRECESS");
VLOG(1) << "use_layout_preprocess_pass:" << use_layout_preprocess_pass;
......@@ -86,7 +96,7 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
config.subgraph_model_cache_dir());
#endif
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL)
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__)
int num_threads = config.x86_math_library_num_threads();
int real_num_threads = num_threads > 1 ? num_threads : 1;
paddle::lite::x86::MKL_Set_Num_Threads(real_num_threads);
......
......@@ -131,7 +131,8 @@ TEST(CXXApi, save_model) {
predictor.Build(FLAGS_model_dir, "", "", valid_places);
LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model;
predictor.SaveModel(FLAGS_optimized_model);
predictor.SaveModel(FLAGS_optimized_model,
lite_api::LiteModelType::kProtobuf);
predictor.SaveModel(FLAGS_optimized_model + ".naive",
lite_api::LiteModelType::kNaiveBuffer);
}
......
......@@ -46,7 +46,6 @@ void LightPredictor::Build(const std::string& model_dir,
case lite_api::LiteModelType::kProtobuf:
LoadModelPb(model_dir, "", "", scope_.get(), program_desc_.get());
break;
#endif
case lite_api::LiteModelType::kNaiveBuffer: {
if (model_from_memory) {
LoadModelNaiveFromMemory(
......@@ -56,6 +55,7 @@ void LightPredictor::Build(const std::string& model_dir,
}
break;
}
#endif
default:
LOG(FATAL) << "Unknown model type";
}
......
......@@ -17,6 +17,10 @@
#include "lite/api/paddle_api.h"
#include "lite/core/version.h"
#include "lite/model_parser/model_parser.h"
#ifndef LITE_ON_TINY_PUBLISH
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#endif
namespace paddle {
namespace lite {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/light_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
DEFINE_string(optimized_model,
"/data/local/tmp/int16_model",
"optimized_model");
DEFINE_int32(N, 1, "input_batch");
DEFINE_int32(C, 3, "input_channel");
DEFINE_int32(H, 224, "input_height");
DEFINE_int32(W, 224, "input_width");
namespace paddle {
namespace lite {
void TestModel(const std::vector<Place>& valid_places,
const std::string& model_dir) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_NO_BIND, FLAGS_threads);
LOG(INFO) << "Optimize model.";
lite::Predictor cxx_predictor;
cxx_predictor.Build(model_dir, "", "", valid_places);
cxx_predictor.SaveModel(FLAGS_optimized_model,
paddle::lite_api::LiteModelType::kNaiveBuffer);
LOG(INFO) << "Load optimized model.";
lite::LightPredictor predictor(FLAGS_optimized_model + ".nb", false);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(
std::vector<DDim::value_type>({FLAGS_N, FLAGS_C, FLAGS_H, FLAGS_W})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = FLAGS_N * FLAGS_C * FLAGS_H * FLAGS_W;
for (int i = 0; i < item_size; i++) {
data[i] = 1.;
}
LOG(INFO) << "Predictor run.";
predictor.Run();
auto* out = predictor.GetOutput(0);
const auto* pdata = out->data<float>();
std::vector<float> ref = {
0.000191383, 0.000592063, 0.000112282, 6.27426e-05, 0.000127522};
double eps = 1e-5;
for (int i = 0; i < ref.size(); ++i) {
EXPECT_NEAR(pdata[i], ref[i], eps);
}
}
TEST(MobileNetV1_Int16, test_arm) {
std::vector<Place> valid_places({
Place{TARGET(kARM), PRECISION(kFloat)},
});
std::string model_dir = FLAGS_model_dir;
TestModel(valid_places, model_dir);
}
} // namespace lite
} // namespace paddle
......@@ -25,8 +25,6 @@
#include "lite/core/profile/basic_profiler.h"
#endif // LITE_WITH_PROFILE
#include <gflags/gflags.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
using paddle::lite::profile::Timer;
......
......@@ -356,5 +356,13 @@ void MobileConfig::set_model_buffer(const char *model_buffer,
model_from_memory_ = true;
}
// This is the method for allocating workspace_size according to L3Cache size
void MobileConfig::SetArmL3CacheSize(L3CacheSetMethod method,
int absolute_val) {
#ifdef LITE_WITH_ARM
lite::DeviceInfo::Global().SetArmL3CacheSize(method, absolute_val);
#endif
}
} // namespace lite_api
} // namespace paddle
......@@ -32,6 +32,14 @@ using shape_t = std::vector<int64_t>;
using lod_t = std::vector<std::vector<uint64_t>>;
enum class LiteModelType { kProtobuf = 0, kNaiveBuffer, UNK };
// Methods for allocating L3Cache on Arm platform
enum class L3CacheSetMethod {
kDeviceL3Cache = 0, // Use the system L3 Cache size, best performance.
kDeviceL2Cache = 1, // Use the system L2 Cache size, trade off performance
// with less memory consumption.
kAbsolute = 2, // Use the external setting.
// kAutoGrow = 3, // Not supported yet, least memory consumption.
};
// return true if current device supports OpenCL model
LITE_API bool IsOpenCLBackendValid();
......@@ -294,6 +302,11 @@ class LITE_API MobileConfig : public ConfigBase {
// NOTE: This is a deprecated API and will be removed in latter release.
const std::string& param_buffer() const { return param_buffer_; }
// This is the method for allocating workspace_size according to L3Cache size
void SetArmL3CacheSize(
L3CacheSetMethod method = L3CacheSetMethod::kDeviceL3Cache,
int absolute_val = -1);
};
template <typename ConfigT>
......
......@@ -15,8 +15,6 @@
#include "lite/api/paddle_api.h"
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/io.h"
......@@ -109,7 +107,8 @@ TEST(CxxApi, share_external_data) {
TEST(LightApi, run) {
lite_api::MobileConfig config;
config.set_model_from_file(FLAGS_model_dir + ".opt2.naive.nb");
// size the workspace_ from the device L2 cache instead of the full L3 cache
config.SetArmL3CacheSize(L3CacheSetMethod::kDeviceL2Cache);
auto predictor = lite_api::CreatePaddlePredictor(config);
auto inputs = predictor->GetInputNames();
......@@ -150,6 +149,8 @@ TEST(MobileConfig, LoadfromMemory) {
// set model buffer and run model
lite_api::MobileConfig config;
config.set_model_from_buffer(model_buffer);
// allocate 1M initial space for workspace_
config.SetArmL3CacheSize(L3CacheSetMethod::kAbsolute, 1024 * 1024);
auto predictor = lite_api::CreatePaddlePredictor(config);
auto input_tensor = predictor->GetInput(0);
......
......@@ -62,6 +62,7 @@ USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(control_flow_op_unused_inputs_and_outputs_eliminate_pass)
USE_MIR_PASS(lite_scale_activation_fuse_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__resnet_d_fuse_pass);
USE_MIR_PASS(__xpu__resnet_cbam_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass);
......
......@@ -9,7 +9,7 @@ if(WIN32)
target_link_libraries(lite_pybind ${os_dependency_modules})
else()
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
target_sources(lite_pybind PUBLIC ${__lite_cc_files})
target_sources(lite_pybind PUBLIC ${__lite_cc_files} fbs_headers)
endif(WIN32)
if (LITE_ON_TINY_PUBLISH)
......
......@@ -82,16 +82,20 @@ void NeuronAdapter::InitFunctions() {
PADDLE_DLSYM(NeuronModel_setOperandValue);
PADDLE_DLSYM(NeuronModel_setOperandSymmPerChannelQuantParams);
PADDLE_DLSYM(NeuronModel_addOperation);
PADDLE_DLSYM(NeuronModel_addOperationExtension);
PADDLE_DLSYM(NeuronModel_identifyInputsAndOutputs);
PADDLE_DLSYM(NeuronCompilation_create);
PADDLE_DLSYM(NeuronCompilation_free);
PADDLE_DLSYM(NeuronCompilation_finish);
PADDLE_DLSYM(NeuronCompilation_createForDevices);
PADDLE_DLSYM(NeuronExecution_create);
PADDLE_DLSYM(NeuronExecution_free);
PADDLE_DLSYM(NeuronExecution_setInput);
PADDLE_DLSYM(NeuronExecution_setOutput);
PADDLE_DLSYM(NeuronExecution_compute);
PADDLE_DLSYM(Neuron_getDeviceCount);
PADDLE_DLSYM(Neuron_getDevice);
PADDLE_DLSYM(NeuronDevice_getName);
#undef PADDLE_DLSYM
}
......@@ -146,6 +150,25 @@ int NeuronModel_addOperation(NeuronModel* model,
model, type, inputCount, inputs, outputCount, outputs);
}
int NeuronModel_addOperationExtension(NeuronModel* model,
const char* name,
const char* vendor,
const NeuronDevice* device,
uint32_t inputCount,
const uint32_t* inputs,
uint32_t outputCount,
const uint32_t* outputs) {
return paddle::lite::NeuronAdapter::Global()
->NeuronModel_addOperationExtension()(model,
name,
vendor,
device,
inputCount,
inputs,
outputCount,
outputs);
}
int NeuronModel_identifyInputsAndOutputs(NeuronModel* model,
uint32_t inputCount,
const uint32_t* inputs,
......@@ -172,6 +195,15 @@ int NeuronCompilation_finish(NeuronCompilation* compilation) {
compilation);
}
int NeuronCompilation_createForDevices(NeuronModel* model,
const NeuronDevice* const* devices,
uint32_t numDevices,
NeuronCompilation** compilation) {
return paddle::lite::NeuronAdapter::Global()
->NeuronCompilation_createForDevices()(
model, devices, numDevices, compilation);
}
int NeuronExecution_create(NeuronCompilation* compilation,
NeuronExecution** execution) {
return paddle::lite::NeuronAdapter::Global()->NeuronExecution_create()(
......@@ -205,3 +237,18 @@ int NeuronExecution_compute(NeuronExecution* execution) {
return paddle::lite::NeuronAdapter::Global()->NeuronExecution_compute()(
execution);
}
int Neuron_getDeviceCount(uint32_t* numDevices) {
return paddle::lite::NeuronAdapter::Global()->Neuron_getDeviceCount()(
numDevices);
}
int Neuron_getDevice(uint32_t devIndex, NeuronDevice** device) {
return paddle::lite::NeuronAdapter::Global()->Neuron_getDevice()(devIndex,
device);
}
int NeuronDevice_getName(const NeuronDevice* device, const char** name) {
return paddle::lite::NeuronAdapter::Global()->NeuronDevice_getName()(device,
name);
}
......@@ -42,12 +42,25 @@ class NeuronAdapter final {
const uint32_t *,
uint32_t,
const uint32_t *);
using NeuronModel_addOperationExtension_Type = int (*)(NeuronModel *,
const char *,
const char *,
const NeuronDevice *,
uint32_t,
const uint32_t *,
uint32_t,
const uint32_t *);
using NeuronModel_identifyInputsAndOutputs_Type = int (*)(
NeuronModel *, uint32_t, const uint32_t *, uint32_t, const uint32_t *);
using NeuronCompilation_create_Type = int (*)(NeuronModel *,
NeuronCompilation **);
using NeuronCompilation_free_Type = void (*)(NeuronCompilation *);
using NeuronCompilation_finish_Type = int (*)(NeuronCompilation *);
using NeuronCompilation_createForDevices_Type =
int (*)(NeuronModel *,
const NeuronDevice *const *,
uint32_t,
NeuronCompilation **);
using NeuronExecution_create_Type = int (*)(NeuronCompilation *,
NeuronExecution **);
using NeuronExecution_free_Type = void (*)(NeuronExecution *);
......@@ -59,6 +72,10 @@ class NeuronAdapter final {
using NeuronExecution_setOutput_Type = int (*)(
NeuronExecution *, int32_t, const NeuronOperandType *, void *, size_t);
using NeuronExecution_compute_Type = int (*)(NeuronExecution *);
using Neuron_getDeviceCount_Type = int (*)(uint32_t *);
using Neuron_getDevice_Type = int (*)(uint32_t, NeuronDevice **);
using NeuronDevice_getName_Type = int (*)(const NeuronDevice *,
const char **);
Neuron_getVersion_Type Neuron_getVersion() {
CHECK(Neuron_getVersion_ != nullptr) << "Cannot load Neuron_getVersion!";
......@@ -105,6 +122,12 @@ class NeuronAdapter final {
return NeuronModel_addOperation_;
}
NeuronModel_addOperationExtension_Type NeuronModel_addOperationExtension() {
CHECK(NeuronModel_addOperationExtension_ != nullptr)
<< "Cannot load NeuronModel_addOperationExtension!";
return NeuronModel_addOperationExtension_;
}
NeuronModel_identifyInputsAndOutputs_Type
NeuronModel_identifyInputsAndOutputs() {
CHECK(NeuronModel_identifyInputsAndOutputs_ != nullptr)
......@@ -130,6 +153,12 @@ class NeuronAdapter final {
return NeuronCompilation_finish_;
}
NeuronCompilation_createForDevices_Type NeuronCompilation_createForDevices() {
CHECK(NeuronCompilation_createForDevices_ != nullptr)
<< "Cannot load NeuronCompilation_createForDevices!";
return NeuronCompilation_createForDevices_;
}
NeuronExecution_create_Type NeuronExecution_create() {
CHECK(NeuronExecution_create_ != nullptr)
<< "Cannot load NeuronExecution_create!";
......@@ -160,6 +189,23 @@ class NeuronAdapter final {
return NeuronExecution_compute_;
}
Neuron_getDeviceCount_Type Neuron_getDeviceCount() {
CHECK(Neuron_getDeviceCount_ != nullptr)
<< "Cannot load Neuron_getDeviceCount!";
return Neuron_getDeviceCount_;
}
Neuron_getDevice_Type Neuron_getDevice() {
CHECK(Neuron_getDevice_ != nullptr) << "Cannot load Neuron_getDevice!";
return Neuron_getDevice_;
}
NeuronDevice_getName_Type NeuronDevice_getName() {
CHECK(NeuronDevice_getName_ != nullptr)
<< "Cannot load NeuronDevice_getName!";
return NeuronDevice_getName_;
}
private:
NeuronAdapter();
NeuronAdapter(const NeuronAdapter &) = delete;
......@@ -176,16 +222,23 @@ class NeuronAdapter final {
NeuronModel_setOperandSymmPerChannelQuantParams_Type
NeuronModel_setOperandSymmPerChannelQuantParams_{nullptr};
NeuronModel_addOperation_Type NeuronModel_addOperation_{nullptr};
NeuronModel_addOperationExtension_Type NeuronModel_addOperationExtension_{
nullptr};
NeuronModel_identifyInputsAndOutputs_Type
NeuronModel_identifyInputsAndOutputs_{nullptr};
NeuronCompilation_create_Type NeuronCompilation_create_{nullptr};
NeuronCompilation_free_Type NeuronCompilation_free_{nullptr};
NeuronCompilation_finish_Type NeuronCompilation_finish_{nullptr};
NeuronCompilation_createForDevices_Type NeuronCompilation_createForDevices_{
nullptr};
NeuronExecution_create_Type NeuronExecution_create_{nullptr};
NeuronExecution_free_Type NeuronExecution_free_{nullptr};
NeuronExecution_setInput_Type NeuronExecution_setInput_{nullptr};
NeuronExecution_setOutput_Type NeuronExecution_setOutput_{nullptr};
NeuronExecution_compute_Type NeuronExecution_compute_{nullptr};
Neuron_getDeviceCount_Type Neuron_getDeviceCount_{nullptr};
Neuron_getDevice_Type Neuron_getDevice_{nullptr};
NeuronDevice_getName_Type NeuronDevice_getName_{nullptr};
};
} // namespace lite
} // namespace paddle
......@@ -127,8 +127,10 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
anchor_generator.cc
split_merge_lod_tenosr.cc
reduce_prod.cc
reduce_sum.cc
lstm.cc
clip.cc
pixel_shuffle.cc
scatter.cc
DEPS ${lite_kernel_deps} context tensor)
endif()
......@@ -620,8 +620,10 @@ void conv_depthwise_3x3_fp32(const void* din,
int pad = pad_w;
bool flag_bias = param.bias != nullptr;
bool pads_less = ((paddings[1] < 2) && (paddings[3] < 2));
bool ch_four = ch_in <= 4 * w_in;
if (stride == 1) {
if (pads_less && (pad_h == pad_w) && (pad < 2)) { // support pad = [0, 1]
if (ch_four && pads_less && (pad_h == pad_w) &&
(pad < 2)) { // support pad = [0, 1]
conv_depthwise_3x3s1_fp32(reinterpret_cast<const float*>(din),
reinterpret_cast<float*>(dout),
num,
......@@ -638,7 +640,6 @@ void conv_depthwise_3x3_fp32(const void* din,
act_param,
ctx);
} else {
#ifdef __aarch64__
conv_3x3s1_depthwise_fp32(reinterpret_cast<const float*>(din),
reinterpret_cast<float*>(dout),
num,
......@@ -653,30 +654,10 @@ void conv_depthwise_3x3_fp32(const void* din,
param,
act_param,
ctx);
#else
#ifdef LITE_WITH_ARM_CLANG
LOG(FATAL) << "fp32 depthwise conv3x3s1px doesnot support in v7-clang, "
"this can run in basic";
#else
conv_3x3s1_depthwise_fp32(reinterpret_cast<const float*>(din),
reinterpret_cast<float*>(dout),
num,
ch_out,
h_out,
w_out,
ch_in,
h_in,
w_in,
reinterpret_cast<const float*>(weights),
bias,
param,
act_param,
ctx);
#endif
#endif
}
} else if (stride == 2) {
if (pads_less && pad_h == pad_w && (pad < 2)) { // support pad = [0, 1]
if (ch_four && pads_less && pad_h == pad_w &&
(pad < 2)) { // support pad = [0, 1]
conv_depthwise_3x3s2_fp32(reinterpret_cast<const float*>(din),
reinterpret_cast<float*>(dout),
num,
......
......@@ -53,7 +53,9 @@
#include "lite/backends/arm/math/reduce_max.h"
#include "lite/backends/arm/math/reduce_mean.h"
#include "lite/backends/arm/math/reduce_prod.h"
#include "lite/backends/arm/math/reduce_sum.h"
#include "lite/backends/arm/math/scale.h"
#include "lite/backends/arm/math/scatter.h"
#include "lite/backends/arm/math/sequence_expand.h"
#include "lite/backends/arm/math/sequence_pool.h"
#include "lite/backends/arm/math/sequence_pool_grad.h"
......@@ -357,6 +359,15 @@ inline float32x4_t pow_ps(float32x4_t a, float32x4_t b) {
return exp_ps(vmulq_f32(b, log_ps(a)));
}
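// Pairwise horizontal add: returns {a0+a1, a2+a3, b0+b1, b2+b3} (same behavior as the AArch64 vpaddq_f32 intrinsic).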
inline float32x4_t vpaddq_f32(float32x4_t a, float32x4_t b) {
float32x4_t vrst;
vrst[0] = a[0] + a[1];
vrst[1] = a[2] + a[3];
vrst[2] = b[0] + b[1];
vrst[3] = b[2] + b[3];
return vrst;
}
template <typename T>
void fill_bias_fc(
T* tensor, const T* bias, int num, int channel, bool flag_relu);
......
......@@ -70,7 +70,8 @@ void bilinear_interp(const float* src,
int h_out,
float scale_x,
float scale_y,
bool with_align) {
bool align_corners,
bool align_mode) {
int* buf = new int[w_out + h_out + w_out * 2 + h_out * 2];
int* xofs = buf;
......@@ -78,14 +79,13 @@ void bilinear_interp(const float* src,
float* alpha = reinterpret_cast<float*>(buf + w_out + h_out);
float* beta = reinterpret_cast<float*>(buf + w_out + h_out + w_out * 2);
bool with_align = (align_mode == 0 && !align_corners);
float fx = 0.0f;
float fy = 0.0f;
int sx = 0;
int sy = 0;
if (with_align) {
scale_x = static_cast<float>(w_in - 1) / (w_out - 1);
scale_y = static_cast<float>(h_in - 1) / (h_out - 1);
if (!with_align) {
// calculate x axis coordinate
for (int dx = 0; dx < w_out; dx++) {
fx = dx * scale_x;
......@@ -105,8 +105,6 @@ void bilinear_interp(const float* src,
beta[dy * 2 + 1] = fy;
}
} else {
scale_x = static_cast<float>(w_in) / w_out;
scale_y = static_cast<float>(h_in) / h_out;
// calculate x axis coordinate
for (int dx = 0; dx < w_out; dx++) {
fx = scale_x * (dx + 0.5f) - 0.5f;
......@@ -468,15 +466,9 @@ void nearest_interp(const float* src,
float* dst,
int w_out,
int h_out,
float scale_x,
float scale_y,
float scale_w_new,
float scale_h_new,
bool with_align) {
float scale_w_new = (with_align)
? (static_cast<float>(w_in - 1) / (w_out - 1))
: (static_cast<float>(w_in) / (w_out));
float scale_h_new = (with_align)
? (static_cast<float>(h_in - 1) / (h_out - 1))
: (static_cast<float>(h_in) / (h_out));
if (with_align) {
for (int h = 0; h < h_out; ++h) {
float* dst_p = dst + h * w_out;
......@@ -506,7 +498,8 @@ void interpolate(lite::Tensor* X,
int out_height,
int out_width,
float scale,
bool with_align,
bool align_corners,
bool align_mode,
std::string interpolate_type) {
int in_h = X->dims()[2];
int in_w = X->dims()[3];
......@@ -531,12 +524,12 @@ void interpolate(lite::Tensor* X,
out_width = out_size_data[1];
}
}
float height_scale = scale;
float width_scale = scale;
if (out_width > 0 && out_height > 0) {
height_scale = static_cast<float>(out_height / X->dims()[2]);
width_scale = static_cast<float>(out_width / X->dims()[3]);
}
// float height_scale = scale;
// float width_scale = scale;
// if (out_width > 0 && out_height > 0) {
// height_scale = static_cast<float>(out_height / X->dims()[2]);
// width_scale = static_cast<float>(out_width / X->dims()[3]);
// }
int num_cout = X->dims()[0];
int c_cout = X->dims()[1];
Out->Resize({num_cout, c_cout, out_height, out_width});
......@@ -551,6 +544,10 @@ void interpolate(lite::Tensor* X,
int spatial_in = in_h * in_w;
int spatial_out = out_h * out_w;
float scale_x = (align_corners) ? (static_cast<float>(in_w - 1) / (out_w - 1))
: (static_cast<float>(in_w) / (out_w));
float scale_y = (align_corners) ? (static_cast<float>(in_h - 1) / (out_h - 1))
: (static_cast<float>(in_h) / (out_h));
if ("Bilinear" == interpolate_type) {
#pragma omp parallel for
for (int i = 0; i < count; ++i) {
......@@ -560,9 +557,10 @@ void interpolate(lite::Tensor* X,
dout + spatial_out * i,
out_w,
out_h,
1.f / width_scale,
1.f / height_scale,
with_align);
scale_x,
scale_y,
align_corners,
align_mode);
}
} else if ("Nearest" == interpolate_type) {
#pragma omp parallel for
......@@ -573,9 +571,9 @@ void interpolate(lite::Tensor* X,
dout + spatial_out * i,
out_w,
out_h,
1.f / width_scale,
1.f / height_scale,
with_align);
scale_x,
scale_y,
align_corners);
}
}
}
......
......@@ -30,7 +30,8 @@ void bilinear_interp(const float* src,
int h_out,
float scale_x,
float scale_y,
bool with_align);
bool align_corners,
bool align_mode);
void nearest_interp(const float* src,
int w_in,
......@@ -40,7 +41,7 @@ void nearest_interp(const float* src,
int h_out,
float scale_x,
float scale_y,
bool with_align);
bool align_corners);
void interpolate(lite::Tensor* X,
lite::Tensor* OutSize,
......@@ -50,7 +51,8 @@ void interpolate(lite::Tensor* X,
int out_height,
int out_width,
float scale,
bool with_align,
bool align_corners,
bool align_mode,
std::string interpolate_type);
} /* namespace math */
......
File mode changed from 100644 to 100755
......@@ -2224,7 +2224,13 @@ void pooling3x3s2p1_max(const float* din,
w_unroll_size -= 1;
w_unroll_remian = wout - w_unroll_size * 4;
}
float32x4_t vmin = vdupq_n_f32(std::numeric_limits<float>::lowest());
int w_needed = wout * 2 + 1;
int need_right = w_needed - win - pad_right;
int w_2 = need_right > 0 ? w_unroll_remian : w_unroll_remian + 1;
w_2 = w_unroll_size <= 0 ? w_2 - 1 : w_2;
need_right = wout > 1 ? need_right : 0;
float minval = std::numeric_limits<float>::lowest();
float32x4_t vmin = vdupq_n_f32(minval);
for (int n = 0; n < num; ++n) {
float* data_out_batch = data_out + n * chout * size_channel_out;
......@@ -2263,6 +2269,11 @@ void pooling3x3s2p1_max(const float* din,
break;
}
}
auto pr0 = dr0;
auto pr1 = dr1;
auto pr2 = dr2;
int cnt_num = w_unroll_size;
if (w_unroll_size > 0) {
#ifdef __aarch64__
......@@ -2316,27 +2327,60 @@ void pooling3x3s2p1_max(const float* din,
"q11",
"q15");
#endif
dr0 -= 8;
dr1 -= 8;
dr2 -= 8;
}
// deal with right pad
int wstart = w_unroll_size * 4 * S - P;
for (int j = 0; j < w_unroll_remian; ++j) {
int wend = std::min(wstart + K, win);
int st = wstart > 0 ? wstart : 0;
float tmp = dr0[0];
for (int i = 0; i < wend - st; i++) {
} else {
float tmp = minval;
int left_ = std::min(2, win);
for (int i = 0; i < left_; i++) {
tmp = std::max(tmp, dr0[i]);
tmp = std::max(tmp, dr1[i]);
tmp = std::max(tmp, dr2[i]);
}
*(dr_out++) = tmp;
dr0 += S - (st - wstart);
dr1 += S - (st - wstart);
dr2 += S - (st - wstart);
wstart += S;
dr_out[0] = tmp;
dr0++;
dr1++;
dr2++;
dr_out++;
}
for (int w = 0; w < w_2 - 1; w += 1) {
float32x4_t vr0 = vld1q_f32(dr0);
float32x4_t vr1 = vld1q_f32(dr1);
float32x4_t vr2 = vld1q_f32(dr2);
vr0 = vsetq_lane_f32(minval, vr0, 3);
vr1 = vsetq_lane_f32(minval, vr1, 3);
vr2 = vsetq_lane_f32(minval, vr2, 3);
float32x4_t vmax1 = vmaxq_f32(vr0, vr1);
vmax1 = vmaxq_f32(vmax1, vr2);
float32x2_t vmax2 =
vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1));
float32x2_t vmax = vpmax_f32(vmax2, vmax2);
dr_out[0] = vget_lane_f32(vmax, 0);
dr_out++;
dr0 += 2;
dr1 += 2;
dr2 += 2;
}
if (need_right) {
float tmp = minval;
int idx = win - 1;
tmp = std::max(tmp, std::max(pr0[idx], pr1[idx]));
tmp = std::max(tmp, pr2[idx]);
dr_out[0] = tmp;
if (win % 2) {
idx = win - 2;
tmp = std::max(tmp, std::max(pr0[idx], pr1[idx]));
tmp = std::max(tmp, pr2[idx]);
dr_out[0] = tmp;
}
}
data_out_channel += wout;
}
}
......@@ -2573,6 +2617,7 @@ void pooling3x3s2p0_max(const float* din,
int wend = std::min(tmp_val + K, win) - tmp_val;
float minval = std::numeric_limits<float>::lowest();
remain = right > 0 ? remain : remain + 1;
for (int n = 0; n < num; ++n) {
float* data_out_batch = data_out + n * chout * size_channel_out;
const float* data_in_batch = data_in + n * chin * size_channel_in;
......@@ -2663,13 +2708,14 @@ void pooling3x3s2p0_max(const float* din,
vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1));
float32x2_t vmax = vpmax_f32(vmax2, vmax2);
dr_out[0] = vget_lane_f32(vmax, 0);
dr_out++;
dr0 += 2;
dr1 += 2;
dr2 += 2;
}
if (right) {
float tmp = dr0[0]; // std::numeric_limits<float>::min();
if (right > 0) {
float tmp = dr0[0];
for (int i = 0; i < wend; i++) {
tmp = std::max(tmp, std::max(dr0[i], dr1[i]));
tmp = std::max(tmp, dr2[i]);
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/arm/math/reduce_sum.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
template <>
void reduce_sum_n<float>(const float* src,
float* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
int chw_size = channel_in * height_in * width_in;
if (num_in == 1) {
memcpy(dst, src, sizeof(float) * chw_size);
} else {
int cnt_n = num_in >> 2;
int remain_n = num_in & 3;
int cnt_chw = chw_size >> 3;
int cnt_rem = chw_size & 7;
int stride = chw_size << 2;
int stride_c = 0;
for (int c = 0; c < cnt_chw; c++) {
float32x4_t vsum0 = vdupq_n_f32(0.f);
float32x4_t vsum1 = vdupq_n_f32(0.f);
const float* din_ptr0 = src + stride_c;
const float* din_ptr1 = din_ptr0 + chw_size;
const float* din_ptr2 = din_ptr1 + chw_size;
const float* din_ptr3 = din_ptr2 + chw_size;
for (int n = 0; n < cnt_n; n++) {
float32x4_t va0 = vld1q_f32(din_ptr0);
float32x4_t vb0 = vld1q_f32(din_ptr1);
float32x4_t va1 = vld1q_f32(din_ptr0 + 4);
float32x4_t vb1 = vld1q_f32(din_ptr1 + 4);
float32x4_t vc0 = vld1q_f32(din_ptr2);
float32x4_t vd0 = vld1q_f32(din_ptr3);
float32x4_t vs00 = vaddq_f32(va0, vb0);
float32x4_t vc1 = vld1q_f32(din_ptr2 + 4);
float32x4_t vs10 = vaddq_f32(va1, vb1);
float32x4_t vd1 = vld1q_f32(din_ptr3 + 4);
float32x4_t vs01 = vaddq_f32(vc0, vd0);
vsum0 = vaddq_f32(vsum0, vs00);
float32x4_t vs11 = vaddq_f32(vc1, vd1);
vsum1 = vaddq_f32(vsum1, vs10);
din_ptr0 += stride;
din_ptr1 += stride;
vsum0 = vaddq_f32(vsum0, vs01);
din_ptr2 += stride;
din_ptr3 += stride;
vsum1 = vaddq_f32(vsum1, vs11);
}
for (int n = 0; n < remain_n; n++) {
float32x4_t va0 = vld1q_f32(din_ptr0);
float32x4_t va1 = vld1q_f32(din_ptr0 + 4);
vsum0 = vaddq_f32(vsum0, va0);
din_ptr0 += chw_size;
vsum1 = vaddq_f32(vsum1, va1);
}
vst1q_f32(dst, vsum0);
dst += 4;
stride_c += 8;
vst1q_f32(dst, vsum1);
dst += 4;
}
if (cnt_rem > 3) {
float32x4_t vsum0 = vdupq_n_f32(0.f);
const float* din_ptr0 = src + stride_c;
const float* din_ptr1 = din_ptr0 + chw_size;
const float* din_ptr2 = din_ptr1 + chw_size;
const float* din_ptr3 = din_ptr2 + chw_size;
for (int n = 0; n < cnt_n; n++) {
float32x4_t va0 = vld1q_f32(din_ptr0);
float32x4_t vb0 = vld1q_f32(din_ptr1);
float32x4_t vc0 = vld1q_f32(din_ptr2);
float32x4_t vd0 = vld1q_f32(din_ptr3);
float32x4_t vs00 = vaddq_f32(va0, vb0);
float32x4_t vs01 = vaddq_f32(vc0, vd0);
vsum0 = vaddq_f32(vsum0, vs00);
din_ptr0 += stride;
din_ptr1 += stride;
vsum0 = vaddq_f32(vsum0, vs01);
din_ptr2 += stride;
din_ptr3 += stride;
}
for (int n = 0; n < remain_n; n++) {
float32x4_t va0 = vld1q_f32(din_ptr0);
din_ptr0 += chw_size;
vsum0 = vaddq_f32(vsum0, va0);
}
stride_c += 4;
vst1q_f32(dst, vsum0);
dst += 4;
cnt_rem -= 4;
}
for (int c = 0; c < cnt_rem; c++) {
const float* din_ptr0 = src + stride_c;
const float* din_ptr1 = din_ptr0 + chw_size;
const float* din_ptr2 = din_ptr1 + chw_size;
const float* din_ptr3 = din_ptr2 + chw_size;
float sum = 0.0;
for (int n = 0; n < cnt_n; n++) {
float tmp0 = din_ptr0[0] + din_ptr1[0];
float tmp1 = din_ptr2[0] + din_ptr3[0];
din_ptr0 += stride;
din_ptr1 += stride;
sum += tmp0;
din_ptr2 += stride;
din_ptr3 += stride;
sum += tmp1;
}
for (int n = 0; n < remain_n; n++) {
sum += din_ptr0[0];
din_ptr0 += chw_size;
}
stride_c++;
dst[0] = sum;
dst++;
}
}
}
template <>
void reduce_sum_c<float>(const float* src,
float* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
int hw_size = height_in * width_in;
int chw_size = hw_size * channel_in;
for (int n = 0; n < num_in; ++n) {
reduce_sum_n<float>(src, dst, channel_in, 1, height_in, width_in);
src += chw_size;
dst += hw_size;
}
}
template <>
void reduce_sum_h<float>(const float* src,
float* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
int nc_size = num_in * channel_in;
int hw_size = height_in * width_in;
for (int n = 0; n < nc_size; ++n) {
reduce_sum_n<float>(src, dst, height_in, 1, 1, width_in);
src += hw_size;
dst += width_in;
}
}
template <>
void reduce_sum_w<float>(const float* src,
float* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
int nch_size = num_in * channel_in * height_in;
int cnt_w = width_in >> 3;
int cnt_n = nch_size >> 2;
int rem_w = width_in & 7;
int rem_n = nch_size & 3;
int stride = 0;
int stride_n = width_in << 2;
for (int n = 0; n < cnt_n; n++) {
const float* din_ptr0 = src + stride;
const float* din_ptr1 = din_ptr0 + width_in;
const float* din_ptr2 = din_ptr1 + width_in;
const float* din_ptr3 = din_ptr2 + width_in;
float32x4_t vsum = vdupq_n_f32(0.f);
int tmp = rem_w;
for (int w = 0; w < cnt_w; w++) {
float32x4_t va0 = vld1q_f32(din_ptr0);
float32x4_t va1 = vld1q_f32(din_ptr0 + 4);
float32x4_t vb0 = vld1q_f32(din_ptr1);
float32x4_t vb1 = vld1q_f32(din_ptr1 + 4);
float32x4_t vc0 = vld1q_f32(din_ptr2);
float32x4_t vc1 = vld1q_f32(din_ptr2 + 4);
float32x4_t vs0 = vaddq_f32(va0, va1);
float32x4_t vd0 = vld1q_f32(din_ptr3);
float32x4_t vs1 = vaddq_f32(vb0, vb1);
float32x4_t vd1 = vld1q_f32(din_ptr3 + 4);
float32x4_t vs2 = vaddq_f32(vc0, vc1);
din_ptr0 += 8;
float32x4_t vs3 = vaddq_f32(vd0, vd1);
din_ptr1 += 8;
float32x4_t vs00 = vpaddq_f32(vs0, vs1);
din_ptr2 += 8;
float32x4_t vs01 = vpaddq_f32(vs2, vs3);
din_ptr3 += 8;
float32x4_t vs = vpaddq_f32(vs00, vs01);
vsum = vaddq_f32(vs, vsum);
}
if (tmp > 3) {
float32x4_t va0 = vld1q_f32(din_ptr0);
float32x4_t vb0 = vld1q_f32(din_ptr1);
float32x4_t vc0 = vld1q_f32(din_ptr2);
float32x4_t vd0 = vld1q_f32(din_ptr3);
din_ptr0 += 4;
din_ptr1 += 4;
float32x4_t vs00 = vpaddq_f32(va0, vb0);
float32x4_t vs01 = vpaddq_f32(vc0, vd0);
din_ptr2 += 4;
din_ptr3 += 4;
float32x4_t vs = vpaddq_f32(vs00, vs01);
vsum = vaddq_f32(vs, vsum);
tmp -= 4;
}
for (int w = 0; w < tmp; w++) {
vsum[0] += *din_ptr0++;
vsum[1] += *din_ptr1++;
vsum[2] += *din_ptr2++;
vsum[3] += *din_ptr3++;
}
stride += stride_n;
vst1q_f32(dst, vsum);
dst += 4;
}
if (rem_n > 1) {
const float* din_ptr0 = src + stride;
const float* din_ptr1 = din_ptr0 + width_in;
float32x4_t vsum = vdupq_n_f32(0.f);
for (int w = 0; w < cnt_w; w++) {
float32x4_t va0 = vld1q_f32(din_ptr0);
din_ptr0 += 4;
float32x4_t vb0 = vld1q_f32(din_ptr1);
din_ptr1 += 4;
float32x4_t va1 = vld1q_f32(din_ptr0);
float32x4_t vb1 = vld1q_f32(din_ptr1);
float32x4_t vs0 = vpaddq_f32(va0, vb0);
din_ptr0 += 4;
float32x4_t vs1 = vpaddq_f32(va1, vb1);
din_ptr1 += 4;
float32x4_t vs00 = vpaddq_f32(vs0, vs1);
vsum = vaddq_f32(vs00, vsum);
}
int tmp = rem_w;
if (tmp > 3) {
float32x4_t va0 = vld1q_f32(din_ptr0);
float32x4_t vb0 = vld1q_f32(din_ptr1);
din_ptr0 += 4;
din_ptr1 += 4;
float32x4_t vs00 = vpaddq_f32(va0, vb0);
tmp -= 4;
vsum[0] += vs00[0];
vsum[2] += vs00[1];
vsum[1] += vs00[2];
vsum[3] += vs00[3];
}
vsum[0] += vsum[2];
vsum[1] += vsum[3];
for (int w = 0; w < tmp; w++) {
vsum[0] += *din_ptr0++;
vsum[1] += *din_ptr1++;
}
stride += width_in;
*dst++ = vsum[0];
stride += width_in;
*dst++ = vsum[1];
rem_n -= 2;
}
for (int n = 0; n < rem_n; n++) {
const float* din_ptr0 = src + stride;
float32x4_t vsum = vdupq_n_f32(0.f);
for (int w = 0; w < cnt_w; w++) {
float32x4_t va0 = vld1q_f32(din_ptr0);
float32x4_t va1 = vld1q_f32(din_ptr0 + 4);
float32x4_t vs0 = vaddq_f32(va0, va1);
din_ptr0 += 8;
vsum = vaddq_f32(vs0, vsum);
}
if (rem_w > 3) {
float32x4_t va0 = vld1q_f32(din_ptr0);
din_ptr0 += 4;
vsum = vaddq_f32(vsum, va0);
rem_w -= 4;
}
vsum[1] += vsum[2];
for (int w = 0; w < rem_w; w++) {
vsum[0] += *din_ptr0++;
}
vsum[1] += vsum[3];
vsum[0] += vsum[1];
*dst++ = vsum[0];
}
}
template <>
void reduce_sum_all<float>(const float* src, float* dst, int all_size) {
int cnt_n = all_size >> 4;
int rem_n = all_size & 15;
int cnt_rem = rem_n >> 2;
int rem_rem = rem_n & 3;
float32x4_t vsum = vdupq_n_f32(0.f);
for (int n = 0; n < cnt_n; n++) {
float32x4_t va0 = vld1q_f32(src);
float32x4_t va1 = vld1q_f32(src + 4);
float32x4_t va2 = vld1q_f32(src + 8);
float32x4_t va3 = vld1q_f32(src + 12);
src += 16;
float32x4_t vs0 = vaddq_f32(va0, va1);
float32x4_t vs1 = vaddq_f32(va2, va3);
float32x4_t vs = vpaddq_f32(vs0, vs1);
vsum = vaddq_f32(vsum, vs);
}
for (int n = 0; n < cnt_rem; n++) {
float32x4_t va0 = vld1q_f32(src);
src += 4;
vsum = vaddq_f32(vsum, va0);
}
vsum[1] += vsum[2];
for (int n = 0; n < rem_rem; n++) {
vsum[0] += *src++;
}
vsum[1] += vsum[3];
vsum[0] += vsum[1];
dst[0] = vsum[0];
}
template <>
void reduce_sum_nc<float>(const float* src,
float* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
// reduce nc.
int num = num_in * channel_in;
int size = height_in * width_in;
reduce_sum_n(src, dst, num, size, 1, 1);
}
template <>
void reduce_sum_ch<float>(const float* src,
float* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
int ch_size = channel_in * height_in;
int chw_size = ch_size * width_in;
for (int n = 0; n < num_in; n++) {
reduce_sum_n<float>(src, dst, ch_size, 1, 1, width_in);
src += chw_size;
dst += width_in;
}
}
template <>
void reduce_sum_hw<float>(const float* src,
float* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
int hw_size = height_in * width_in;
int nc_size = num_in * channel_in;
reduce_sum_w(src, dst, nc_size, 1, 1, hw_size);
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
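// For reference, the float specializations above compute the following sums
// (a sketch in NCHW index notation; the subscripts are only illustrative):
//   reduce_sum_n:   dst_{c,h,w} = \sum_{n} src_{n,c,h,w}
//   reduce_sum_c:   dst_{n,h,w} = \sum_{c} src_{n,c,h,w}
//   reduce_sum_h:   dst_{n,c,w} = \sum_{h} src_{n,c,h,w}
//   reduce_sum_w:   dst_{n,c,h} = \sum_{w} src_{n,c,h,w}
//   reduce_sum_nc:  dst_{h,w}   = \sum_{n,c} src_{n,c,h,w}
//   reduce_sum_ch:  dst_{n,w}   = \sum_{c,h} src_{n,c,h,w}
//   reduce_sum_hw:  dst_{n,c}   = \sum_{h,w} src_{n,c,h,w}
//   reduce_sum_all: dst_{0}     = \sum_{n,c,h,w} src_{n,c,h,w}
// The composite variants are implemented by reinterpreting the tensor shape
// and reusing reduce_sum_n / reduce_sum_w, as the calls above show.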
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle {
namespace lite {
namespace arm {
namespace math {
template <typename T>
void reduce_sum_n(const T* src,
T* dst,
int num_in,
int channel_in,
int height_in,
int width_in);
template <typename T>
void reduce_sum_c(const T* src,
T* dst,
int num_in,
int channel_in,
int height_in,
int width_in);
template <typename T>
void reduce_sum_h(const T* src,
T* dst,
int num_in,
int channel_in,
int height_in,
int width_in);
template <typename T>
void reduce_sum_w(const T* src,
T* dst,
int num_in,
int channel_in,
int height_in,
int width_in);
template <typename T>
void reduce_sum_nc(const T* src,
T* dst,
int num_in,
int channel_in,
int height_in,
int width_in);
template <typename T>
void reduce_sum_ch(const T* src,
T* dst,
int num_in,
int channel_in,
int height_in,
int width_in);
template <typename T>
void reduce_sum_hw(const T* src,
T* dst,
int num_in,
int channel_in,
int height_in,
int width_in);
template <typename T>
void reduce_sum_all(const T* src, T* dst, int all_size);
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
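// A minimal dispatch sketch for the declarations above, assuming the common
// convention that an empty dim list means "reduce over every axis". The
// function name and the dim encoding are illustrative only; the real
// reduce_sum kernel that wires these helpers together is not shown here.
#include <vector>

template <typename T>
void reduce_sum_nchw_sketch(const T* src,
                            T* dst,
                            int n, int c, int h, int w,
                            const std::vector<int>& dim) {
  using namespace paddle::lite::arm::math;  // declarations above
  if (dim.empty()) {
    reduce_sum_all(src, dst, n * c * h * w);
  } else if (dim == std::vector<int>{0}) {
    reduce_sum_n(src, dst, n, c, h, w);
  } else if (dim == std::vector<int>{1}) {
    reduce_sum_c(src, dst, n, c, h, w);
  } else if (dim == std::vector<int>{2}) {
    reduce_sum_h(src, dst, n, c, h, w);
  } else if (dim == std::vector<int>{3}) {
    reduce_sum_w(src, dst, n, c, h, w);
  } else if (dim == std::vector<int>{0, 1}) {
    reduce_sum_nc(src, dst, n, c, h, w);
  } else if (dim == std::vector<int>{1, 2}) {
    reduce_sum_ch(src, dst, n, c, h, w);
  } else if (dim == std::vector<int>{2, 3}) {
    reduce_sum_hw(src, dst, n, c, h, w);
  }
}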
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/arm/math/scatter.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
template <>
void scatter<float>(const int64_t* indexs,
const float* src,
float* dst,
int index_size,
int num,
int size,
bool overwrite) {
for (int i = 0; i < num; i++) {
const float* din = src + indexs[i] * size;
memcpy(dst, din, sizeof(float) * size);
dst += size;
}
if (overwrite) {
for (int i = num; i < index_size; i++) {
const float* din = src + indexs[i] * size;
float* dout = dst + indexs[i] * size;
memcpy(dout, din, sizeof(float) * size);
}
} else {
int cnt = size >> 3;
int rem = size & 7;
for (int i = num; i < index_size; i++) {
const float* din = src + indexs[i] * size;
float* dout = dst + indexs[i] * size;
for (int j = 0; j < cnt; j++) {
float32x4_t va0 = vld1q_f32(din);
float32x4_t vb0 = vld1q_f32(dout);
float32x4_t va1 = vld1q_f32(din + 4);
float32x4_t vb1 = vld1q_f32(dout + 4);
vb0 = vaddq_f32(va0, vb0);
vb1 = vaddq_f32(va1, vb1);
din += 8;
vst1q_f32(dout, vb0);
vst1q_f32(dout + 4, vb1);
dout += 8;
}
for (int j = 0; j < rem; j++) {
dout[0] += *din++;
dout++;
}
}
}
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
......@@ -13,21 +13,22 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdio.h>
namespace paddle_mobile {
namespace zynqmp {
class DLEngine {
public:
static DLEngine& get_instance() {
static DLEngine s_instance;
return s_instance;
}
private:
DLEngine();
};
} // namespace zynqmp
} // namespace paddle_mobile
#include <stdint.h>
namespace paddle {
namespace lite {
namespace arm {
namespace math {
template <typename T>
void scatter(const int64_t* indexs,
const T* updates,
T* dst,
int index_size,
int num,
int size,
bool overwrite);
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
......@@ -23,7 +23,7 @@ int TargetWrapperBM::device_id_ = 0;
std::map<int, void*> TargetWrapperBM::bm_hds_;
size_t TargetWrapperBM::num_devices() {
int count = 0;
int count = 1;
bm_status_t ret = bm_dev_getcount(&count);
CHECK_EQ(ret, BM_SUCCESS) << "Failed with error code: "
<< static_cast<int>(ret);
......
......@@ -48,7 +48,7 @@ __kernel void depth_conv2d_3x3(
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
int2 in_pos_in_one_block =
ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
ouput_pos_in_one_block * stride_xy + (int2)(offset + dilation - 1, offset + dilation - 1);
#ifdef BIASE_CH
CL_DTYPE4 output =
......@@ -77,13 +77,13 @@ __kernel void depth_conv2d_3x3(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - dilation,
pos_in_input_block.y + in_pos_in_one_block.y - dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 ||
in_pos_in_one_block.y - 1 < 0 ||
in_pos_in_one_block.x - 1 >= input_width ||
in_pos_in_one_block.y - 1 >= input_height)
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
inputs[1] = select(
......@@ -91,45 +91,37 @@ __kernel void depth_conv2d_3x3(
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x,
pos_in_input_block.y + in_pos_in_one_block.y - 1)),
pos_in_input_block.y + in_pos_in_one_block.y - dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 ||
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y - 1 >= input_height)
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
inputs[2] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + dilation,
pos_in_input_block.y + in_pos_in_one_block.y - dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 ||
in_pos_in_one_block.y - 1 < 0 ||
in_pos_in_one_block.x + 1 >= input_width ||
in_pos_in_one_block.y - 1 >= input_height)
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
inputs[3] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - dilation,
pos_in_input_block.y + in_pos_in_one_block.y)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - 1 >= input_width ||
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
/*
if (output_pos.x == 112 && output_pos.y == 0) {
CL_DTYPE4 input1 = inputs[3];
float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
printf(" input4 3 - %v4hlf \n", in);
printf(" --- %d ---\n", in_pos_in_one_block.x - 1);
}
*/
inputs[4] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
......@@ -147,11 +139,11 @@ __kernel void depth_conv2d_3x3(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + dilation,
pos_in_input_block.y + in_pos_in_one_block.y)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + 1 >= input_width ||
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
......@@ -159,13 +151,13 @@ __kernel void depth_conv2d_3x3(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - dilation,
pos_in_input_block.y + in_pos_in_one_block.y + dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 ||
in_pos_in_one_block.y + 1 < 0 ||
in_pos_in_one_block.x - 1 >= input_width ||
in_pos_in_one_block.y + 1 >= input_height)
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
inputs[7] = select(
......@@ -173,24 +165,24 @@ __kernel void depth_conv2d_3x3(
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
pos_in_input_block.y + in_pos_in_one_block.y + dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 ||
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y + 1 >= input_height)
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
inputs[8] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + dilation,
pos_in_input_block.y + in_pos_in_one_block.y + dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 ||
in_pos_in_one_block.y + 1 < 0 ||
in_pos_in_one_block.x + 1 >= input_width ||
in_pos_in_one_block.y + 1 >= input_height)
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
CL_DTYPE4 filters[9];
......@@ -221,14 +213,18 @@ __kernel void depth_conv2d_3x3(
/*
if (output_pos.x == 112 && output_pos.y == 0) {
if (output_pos.x == 0 && output_pos.y == 0) {
for (int i = 0; i < 9; ++i) {
CL_DTYPE4 input1 = inputs[i];
float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
printf(" input4 %d - %v4hlf \n", i, in);
printf(" input4[%d]: %v4hlf \n", i, in);
}
for (int i = 0; i < 9; ++i) {
CL_DTYPE4 filters1 = filters[i];
float4 f = (float4)(filters1.x, filters1.y, filters1.z, filters1.w);
printf(" weights4[%d]: %v4hlf \n", i, f);
}
float4 out = (float4)(output.x, output.y, output.z, output.w);
printf(" depth wise output output4 = %v4hlf \n", out);
printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x);
......
......@@ -24,6 +24,7 @@
#include <sys/types.h>
#elif defined(_WIN32)
#define NOMINMAX // msvc max/min macro conflict with std::min/max
#define GLOG_NO_ABBREVIATED_SEVERITIES
#include <windows.h>
#else
#include <unistd.h>
......
......@@ -61,3 +61,5 @@ math_library(search_fc DEPS blas dynload_mklml)
# cc_test(beam_search_test SRCS beam_search_test.cc DEPS beam_search)
# cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
# cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
math_library(box_coder DEPS math_function)
math_library(prior_box DEPS math_function)
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/x86/math/box_coder.h"
#include <string>
namespace paddle {
namespace lite {
namespace x86 {
namespace math {
void encode_center_size(const int64_t row, // N
const int64_t col, // M
const int64_t len, // 4
const float* target_box_data,
const float* prior_box_data,
const float* prior_box_var_data,
const bool normalized,
const std::vector<float> variance,
float* output) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
size_t offset = i * col * len + j * len;
float prior_box_width = prior_box_data[j * len + 2] -
prior_box_data[j * len] + (normalized == false);
float prior_box_height = prior_box_data[j * len + 3] -
prior_box_data[j * len + 1] +
(normalized == false);
float prior_box_center_x = prior_box_data[j * len] + prior_box_width / 2;
float prior_box_center_y =
prior_box_data[j * len + 1] + prior_box_height / 2;
float target_box_center_x =
(target_box_data[i * len + 2] + target_box_data[i * len]) / 2;
float target_box_center_y =
(target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2;
float target_box_width = target_box_data[i * len + 2] -
target_box_data[i * len] + (normalized == false);
float target_box_height = target_box_data[i * len + 3] -
target_box_data[i * len + 1] +
(normalized == false);
output[offset] =
(target_box_center_x - prior_box_center_x) / prior_box_width;
output[offset + 1] =
(target_box_center_y - prior_box_center_y) / prior_box_height;
output[offset + 2] =
std::log(std::fabs(target_box_width / prior_box_width));
output[offset + 3] =
std::log(std::fabs(target_box_height / prior_box_height));
}
}
if (prior_box_var_data) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(3)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
for (int64_t k = 0; k < len; ++k) {
size_t offset = i * col * len + j * len;
int prior_var_offset = j * len;
output[offset + k] /= prior_box_var_data[prior_var_offset + k];
}
}
}
} else if (!(variance.empty())) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(3)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
for (int64_t k = 0; k < len; ++k) {
size_t offset = i * col * len + j * len;
output[offset + k] /= variance[k];
}
}
}
}
}
void decode_center_size(const int axis,
const int var_size,
const int64_t row,
const int64_t col,
const int64_t len,
const float* target_box_data,
const float* prior_box_data,
const float* prior_box_var_data,
const bool normalized,
const std::vector<float> variance,
float* output) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
float var_data[4] = {1., 1., 1., 1.};
float* var_ptr = var_data;
size_t offset = i * col * len + j * len;
int prior_box_offset = axis == 0 ? j * len : i * len;
float prior_box_width = prior_box_data[prior_box_offset + 2] -
prior_box_data[prior_box_offset] +
(normalized == false);
float prior_box_height = prior_box_data[prior_box_offset + 3] -
prior_box_data[prior_box_offset + 1] +
(normalized == false);
float prior_box_center_x =
prior_box_data[prior_box_offset] + prior_box_width / 2;
float prior_box_center_y =
prior_box_data[prior_box_offset + 1] + prior_box_height / 2;
float target_box_center_x = 0, target_box_center_y = 0;
float target_box_width = 0, target_box_height = 0;
int prior_var_offset = axis == 0 ? j * len : i * len;
if (var_size == 2) {
std::memcpy(
var_ptr, prior_box_var_data + prior_var_offset, 4 * sizeof(float));
} else if (var_size == 1) {
var_ptr = const_cast<float*>(variance.data());
}
float box_var_x = *var_ptr;
float box_var_y = *(var_ptr + 1);
float box_var_w = *(var_ptr + 2);
float box_var_h = *(var_ptr + 3);
target_box_center_x =
box_var_x * target_box_data[offset] * prior_box_width +
prior_box_center_x;
target_box_center_y =
box_var_y * target_box_data[offset + 1] * prior_box_height +
prior_box_center_y;
target_box_width =
std::exp(box_var_w * target_box_data[offset + 2]) * prior_box_width;
target_box_height =
std::exp(box_var_h * target_box_data[offset + 3]) * prior_box_height;
output[offset] = target_box_center_x - target_box_width / 2;
output[offset + 1] = target_box_center_y - target_box_height / 2;
output[offset + 2] =
target_box_center_x + target_box_width / 2 - (normalized == false);
output[offset + 3] =
target_box_center_y + target_box_height / 2 - (normalized == false);
}
}
}
} // namespace math
} // namespace x86
} // namespace lite
} // namespace paddle
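// For reference, the two routines above implement the standard center-size
// box coding. A sketch of the math (the variances v_* are 1 when neither a
// variance tensor nor a variance vector is supplied):
//   encode:  t_x = \frac{x_c^{t} - x_c^{p}}{w^{p} v_x},\quad
//            t_y = \frac{y_c^{t} - y_c^{p}}{h^{p} v_y},\quad
//            t_w = \frac{1}{v_w}\log\frac{w^{t}}{w^{p}},\quad
//            t_h = \frac{1}{v_h}\log\frac{h^{t}}{h^{p}}
//   decode:  x_c = v_x t_x w^{p} + x_c^{p},\;  w = w^{p} e^{v_w t_w}
//            y_c = v_y t_y h^{p} + y_c^{p},\;  h = h^{p} e^{v_h t_h}
// where superscript t denotes the target box, p the prior (anchor) box, and
// the decoded corners are (x_c - w/2, y_c - h/2, x_c + w/2, y_c + h/2).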
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "lite/backends/x86/math/math_function.h"
namespace paddle {
namespace lite {
namespace x86 {
namespace math {
void encode_center_size(const int64_t row,
const int64_t col,
const int64_t len,
const float* target_box_data,
const float* prior_box_data,
const float* prior_box_var_data,
const bool normalized,
const std::vector<float> variance,
float* output);
void decode_center_size(const int axis,
const int var_size,
const int64_t row,
const int64_t col,
const int64_t len,
const float* target_box_data,
const float* prior_box_data,
const float* prior_box_var_data,
const bool normalized,
const std::vector<float> variance,
float* output);
} // namespace math
} // namespace x86
} // namespace lite
} // namespace paddle
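// A minimal usage sketch for encode_center_size, with made-up box values and
// variances; only the argument layout is taken from the declaration above
// (row = number of target boxes, col = number of prior boxes, len = 4).
#include <cstdint>
#include <vector>
#include "lite/backends/x86/math/box_coder.h"

void encode_center_size_example() {
  const int64_t row = 2, col = 1, len = 4;
  // Boxes stored as (xmin, ymin, xmax, ymax), normalized to [0, 1].
  std::vector<float> target = {0.10f, 0.10f, 0.30f, 0.40f,
                               0.50f, 0.20f, 0.90f, 0.60f};
  std::vector<float> prior = {0.05f, 0.05f, 0.35f, 0.35f};
  std::vector<float> out(row * col * len);
  paddle::lite::x86::math::encode_center_size(
      row, col, len,
      target.data(), prior.data(),
      /*prior_box_var_data=*/nullptr,  // fall back to the variance vector
      /*normalized=*/true,
      std::vector<float>{0.1f, 0.1f, 0.2f, 0.2f},
      out.data());
  // out now holds one (t_x, t_y, t_w, t_h) tuple per (target, prior) pair.
}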
......@@ -161,7 +161,7 @@ class ContextProjectFunctor {
sequence_width});
if (up_pad > 0) { // add up pad
int padding_rows = std::min(
int padding_rows = (std::min)(
up_pad, static_cast<int>(lod_level_0[i + 1] - lod_level_0[i]));
for (int k = 0; k < padding_rows; ++k) {
......@@ -180,10 +180,10 @@ class ContextProjectFunctor {
}
if (down_pad > 0) { // add down pad
int down_pad_begin_row =
std::max(0,
(sequence_height - context_start - context_length) + 1) +
(std::max)(
0, (sequence_height - context_start - context_length) + 1) +
1;
int padding_begin = std::max(0, context_start - sequence_height);
int padding_begin = (std::max)(0, context_start - sequence_height);
int padding_size =
sequence_height - context_start >= context_length
? 1
......
......@@ -67,8 +67,8 @@ class Pool2dFunctor<lite::TargetType::kX86, PoolProcess, T> {
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
hend = (std::min)(hstart + ksize_height, input_height);
hstart = (std::max)(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
......@@ -76,8 +76,8 @@ class Pool2dFunctor<lite::TargetType::kX86, PoolProcess, T> {
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
wend = (std::min)(wstart + ksize_width, input_width);
wstart = (std::max)(wstart, 0);
}
T ele = pool_process.initial();
......@@ -150,8 +150,8 @@ class Pool2dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
hend = (std::min)(hstart + ksize_height, input_height);
hstart = (std::max)(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
......@@ -159,8 +159,8 @@ class Pool2dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
wend = (std::min)(wstart + ksize_width, input_width);
wstart = (std::max)(wstart, 0);
}
int pool_size = (exclusive || adaptive)
? (hend - hstart) * (wend - wstart)
......@@ -228,12 +228,12 @@ class MaxPool2dGradFunctor<lite::TargetType::kX86, T> {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < output_height; ++ph) {
int hstart = ph * stride_height - padding_height;
int hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
int hend = (std::min)(hstart + ksize_height, input_height);
hstart = (std::max)(hstart, 0);
for (int pw = 0; pw < output_width; ++pw) {
int wstart = pw * stride_width - padding_width;
int wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
int wend = (std::min)(wstart + ksize_width, input_width);
wstart = (std::max)(wstart, 0);
bool stop = false;
for (int h = hstart; h < hend && !stop; ++h) {
......@@ -337,8 +337,8 @@ class Pool3dFunctor<lite::TargetType::kX86, PoolProcess, T> {
dend = AdaptEndIndex(pd, input_depth, output_depth);
} else {
dstart = pd * stride_depth - padding_depth;
dend = std::min(dstart + ksize_depth, input_depth);
dstart = std::max(dstart, 0);
dend = (std::min)(dstart + ksize_depth, input_depth);
dstart = (std::max)(dstart, 0);
}
for (int ph = 0; ph < output_height; ++ph) {
if (adaptive) {
......@@ -346,8 +346,8 @@ class Pool3dFunctor<lite::TargetType::kX86, PoolProcess, T> {
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
hend = (std::min)(hstart + ksize_height, input_height);
hstart = (std::max)(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
......@@ -355,8 +355,8 @@ class Pool3dFunctor<lite::TargetType::kX86, PoolProcess, T> {
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
wend = (std::min)(wstart + ksize_width, input_width);
wstart = (std::max)(wstart, 0);
}
int output_idx = (pd * output_height + ph) * output_width + pw;
T ele = pool_process.initial();
......@@ -441,8 +441,8 @@ class Pool3dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
dend = AdaptEndIndex(pd, input_depth, output_depth);
} else {
dstart = pd * stride_depth - padding_depth;
dend = std::min(dstart + ksize_depth, input_depth);
dstart = std::max(dstart, 0);
dend = (std::min)(dstart + ksize_depth, input_depth);
dstart = (std::max)(dstart, 0);
}
for (int ph = 0; ph < output_height; ++ph) {
if (adaptive) {
......@@ -450,8 +450,8 @@ class Pool3dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
hend = (std::min)(hstart + ksize_height, input_height);
hstart = (std::max)(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
......@@ -459,8 +459,8 @@ class Pool3dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
wend = (std::min)(wstart + ksize_width, input_width);
wstart = (std::max)(wstart, 0);
}
int pool_size =
......@@ -540,16 +540,16 @@ class MaxPool3dGradFunctor<lite::TargetType::kX86, T> {
for (int c = 0; c < output_channels; ++c) {
for (int pd = 0; pd < output_depth; ++pd) {
int dstart = pd * stride_depth - padding_depth;
int dend = std::min(dstart + ksize_depth, input_depth);
dstart = std::max(dstart, 0);
int dend = (std::min)(dstart + ksize_depth, input_depth);
dstart = (std::max)(dstart, 0);
for (int ph = 0; ph < output_height; ++ph) {
int hstart = ph * stride_height - padding_height;
int hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
int hend = (std::min)(hstart + ksize_height, input_height);
hstart = (std::max)(hstart, 0);
for (int pw = 0; pw < output_width; ++pw) {
int wstart = pw * stride_width - padding_width;
int wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
int wend = (std::min)(wstart + ksize_width, input_width);
wstart = (std::max)(wstart, 0);
bool stop = false;
for (int d = dstart; d < dend && !stop; ++d) {
for (int h = hstart; h < hend && !stop; ++h) {
......@@ -651,8 +651,8 @@ class MaxPool2dWithIndexFunctor<lite::TargetType::kX86, T1, T2> {
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
hend = (std::min)(hstart + ksize_height, input_height);
hstart = (std::max)(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
......@@ -660,8 +660,8 @@ class MaxPool2dWithIndexFunctor<lite::TargetType::kX86, T1, T2> {
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
wend = (std::min)(wstart + ksize_width, input_width);
wstart = (std::max)(wstart, 0);
}
T1 ele = static_cast<T1>(-FLT_MAX);
......@@ -794,8 +794,8 @@ class MaxPool3dWithIndexFunctor<lite::TargetType::kX86, T1, T2> {
dend = AdaptEndIndex(pd, input_depth, output_depth);
} else {
dstart = pd * stride_depth - padding_depth;
dend = std::min(dstart + ksize_depth, input_depth);
dstart = std::max(dstart, 0);
dend = (std::min)(dstart + ksize_depth, input_depth);
dstart = (std::max)(dstart, 0);
}
for (int ph = 0; ph < output_height; ++ph) {
if (adaptive) {
......@@ -803,8 +803,8 @@ class MaxPool3dWithIndexFunctor<lite::TargetType::kX86, T1, T2> {
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
hend = (std::min)(hstart + ksize_height, input_height);
hstart = (std::max)(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
......@@ -812,8 +812,8 @@ class MaxPool3dWithIndexFunctor<lite::TargetType::kX86, T1, T2> {
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
wend = (std::min)(wstart + ksize_width, input_width);
wstart = (std::max)(wstart, 0);
}
int output_idx = (pd * output_height + ph) * output_width + pw;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -12,91 +12,51 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef DENSITY_PRIORBOX_OP
#pragma once
#include <operators/kernel/prior_box_kernel.h>
#include "lite/backends/x86/math/prior_box.h"
#include <algorithm>
#include <cmath>
#include <vector>
namespace paddle_mobile {
namespace operators {
template <typename T>
struct ClipFunctor {
inline T operator()(T in) const {
return std::min<T>(std::max<T>(in, 0.), 1.);
}
};
template <typename P>
void DensityPriorBoxCompute(const DensityPriorBoxParam<CPU> &param) {
const auto *input_ = param.Input();
const auto &input_dims = input_->dims();
const auto *input_image = param.InputImage();
const auto &input_image_dims = input_image->dims();
auto densities = param.Densities();
auto fixed_ratios = param.FixedRatios();
auto fixed_sizes = param.FixedSizes();
const auto &variances = param.Variances();
const bool &clip = param.Clip();
const float &step_w = param.StepW();
const float &step_h = param.StepH();
const float &offset = param.Offset();
Tensor *output_boxes = param.OutputBoxes();
auto output_boxes_dataptr = output_boxes->mutable_data<float>();
Tensor *output_variances = param.OutputVariances();
auto output_variances_dataptr = output_variances->mutable_data<float>();
auto img_width = input_image_dims[3];
auto img_height = input_image_dims[2];
auto feature_width = input_dims[3];
auto feature_height = input_dims[2];
auto stride0 = output_boxes->dims()[1] * output_boxes->dims()[2] *
output_boxes->dims()[3];
auto stride1 = output_boxes->dims()[2] * output_boxes->dims()[3];
auto stride2 = output_boxes->dims()[3];
float step_width, step_height;
/// 300 / 19
if (step_w == 0 || step_h == 0) {
step_width = static_cast<float>(img_width) / feature_width;
step_height = static_cast<float>(img_height) / feature_height;
} else {
step_width = step_w;
step_height = step_h;
}
int num_priors = 0;
for (size_t i = 0; i < densities.size(); ++i) {
num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
}
auto box_dim = output_variances->dims();
output_boxes->Resize({feature_height, feature_width, num_priors, 4});
#include <string>
namespace paddle {
namespace lite {
namespace x86 {
namespace math {
void density_prior_box(const int64_t img_width,
const int64_t img_height,
const int64_t feature_width,
const int64_t feature_height,
const float* input_data,
const float* image_data,
const bool clip,
const std::vector<float> variances,
const std::vector<float> fixed_sizes,
const std::vector<float> fixed_ratios,
const std::vector<int> densities,
const float step_width,
const float step_height,
const float offset,
const int num_priors,
float* boxes_data,
float* vars_data) {
int step_average = static_cast<int>((step_width + step_height) * 0.5);
std::vector<float> sqrt_fixed_ratios;
for (size_t i = 0; i < fixed_ratios.size(); i++) {
sqrt_fixed_ratios.push_back(sqrt(fixed_ratios[i]));
}
for (int h = 0; h < feature_height; ++h) {
for (int w = 0; w < feature_width; ++w) {
/// map origin image
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for (int64_t h = 0; h < feature_height; ++h) {
for (int64_t w = 0; w < feature_width; ++w) {
float center_x = (w + offset) * step_width;
float center_y = (h + offset) * step_height;
int idx = 0;
int64_t offset = (h * feature_width + w) * num_priors * 4;
// Generate density prior boxes with fixed sizes.
for (size_t s = 0; s < fixed_sizes.size(); ++s) {
auto fixed_size = fixed_sizes[s];
int density = densities[s];
......@@ -111,51 +71,48 @@ void DensityPriorBoxCompute(const DensityPriorBoxParam<CPU> &param) {
for (int dj = 0; dj < density; ++dj) {
float center_x_temp = density_center_x + dj * shift;
float center_y_temp = density_center_y + di * shift;
output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
0] =
std::max((center_x_temp - box_width_ratio / 2.) / img_width,
0.);
output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
1] =
std::max((center_y_temp - box_height_ratio / 2.) / img_height,
0.);
output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
2] =
std::min((center_x_temp + box_width_ratio / 2.) / img_width,
1.);
output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
3] =
std::min((center_y_temp + box_height_ratio / 2.) / img_height,
1.);
idx++;
boxes_data[offset++] = std::max(
(center_x_temp - box_width_ratio / 2.) / img_width, 0.);
boxes_data[offset++] = std::max(
(center_y_temp - box_height_ratio / 2.) / img_height, 0.);
boxes_data[offset++] = std::min(
(center_x_temp + box_width_ratio / 2.) / img_width, 1.);
boxes_data[offset++] = std::min(
(center_y_temp + box_height_ratio / 2.) / img_height, 1.);
}
}
}
}
}
}
//! clip the prior's coordinate such that it is within [0, 1]
if (clip) {
math::Transform trans;
ClipFunctor<float> clip_func;
trans(output_boxes_dataptr, output_boxes_dataptr + output_boxes->numel(),
output_boxes_dataptr, clip_func);
int channel_size = feature_height * feature_width * num_priors * 4;
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (int d = 0; d < channel_size; ++d) {
boxes_data[d] = std::min(std::max(boxes_data[d], 0.f), 1.f);
}
}
//! set the variance.
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(3)
#endif
for (int h = 0; h < feature_height; ++h) {
for (int w = 0; w < feature_width; ++w) {
for (int i = 0; i < num_priors; ++i) {
int idx = ((h * feature_width + w) * num_priors + i) * 4;
vars_data[idx++] = variances[0];
vars_data[idx++] = variances[1];
vars_data[idx++] = variances[2];
vars_data[idx++] = variances[3];
}
if ((variances.size() != 4)) {
LOG(kLOG_ERROR) << " variances.size() must be 4.";
}
int64_t box_num = feature_height * feature_width * num_priors;
for (int i = 0; i < box_num; i++) {
output_variances_dataptr[4 * i] = variances[0];
output_variances_dataptr[4 * i + 1] = variances[1];
output_variances_dataptr[4 * i + 2] = variances[2];
output_variances_dataptr[4 * i + 3] = variances[3];
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
} // namespace math
} // namespace x86
} // namespace lite
} // namespace paddle
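// A sketch of the quantities involved above. The number of priors generated
// per feature-map cell is
//   num_priors = \sum_{s} |fixed\_ratios| \cdot densities[s]^2
// and, under the usual density-prior-box definition (an assumption here,
// since the box_width_ratio / box_height_ratio computation is elided from
// this hunk),
//   box\_width\_ratio_{s,r}  = fixed\_sizes[s] \cdot \sqrt{fixed\_ratios[r]}
//   box\_height\_ratio_{s,r} = fixed\_sizes[s] / \sqrt{fixed\_ratios[r]}
// with each density x density sub-grid spaced by step_average / density
// inside a cell of size (step_width, step_height).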
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "lite/backends/x86/math/math_function.h"
namespace paddle {
namespace lite {
namespace x86 {
namespace math {
void density_prior_box(const int64_t img_width,
const int64_t img_height,
const int64_t feature_width,
const int64_t feature_height,
const float* input_data,
const float* image_data,
const bool clip,
const std::vector<float> variances,
const std::vector<float> fixed_sizes,
const std::vector<float> fixed_ratios,
const std::vector<int> densities,
const float step_width,
const float step_height,
const float offset,
const int num_priors,
float* boxes_data,
float* vars_data);
} // namespace math
} // namespace x86
} // namespace lite
} // namespace paddle
......@@ -35,7 +35,7 @@ inline static uint64_t MaximumSequenceLength(
uint64_t seq_num = seq_offset.size() - 1;
uint64_t max_seq_len = 0;
for (size_t i = 0; i < seq_num; ++i) {
max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]);
max_seq_len = (std::max)(max_seq_len, seq_offset[i + 1] - seq_offset[i]);
}
return max_seq_len;
}
......
......@@ -26,7 +26,7 @@ namespace x86 {
static void SetNumThreads(int num_threads) {
#ifdef PADDLE_WITH_MKLML
int real_num_threads = std::max(num_threads, 1);
int real_num_threads = (std::max)(num_threads, 1);
x86::MKL_Set_Num_Threads(real_num_threads);
omp_set_num_threads(real_num_threads);
#endif
......@@ -52,14 +52,14 @@ static inline void RunParallelFor(const int64_t begin,
}
#ifdef PADDLE_WITH_MKLML
int64_t num_threads = std::min(GetMaxThreads(), end - begin);
int64_t num_threads = (std::min)(GetMaxThreads(), end - begin);
if (num_threads > 1) {
#pragma omp parallel num_threads(num_threads)
{
int64_t tid = omp_get_thread_num();
int64_t chunk_size = (end - begin + num_threads - 1) / num_threads;
int64_t begin_tid = begin + tid * chunk_size;
f(begin_tid, std::min(end, chunk_size + begin_tid));
f(begin_tid, (std::min)(end, chunk_size + begin_tid));
}
return;
}
......
......@@ -18,6 +18,27 @@
namespace paddle {
namespace lite {
void XPUScratchPad::Reserve(size_t new_size) {
if (new_size <= size_) {
return;
}
if (!is_l3_) {
TargetWrapperXPU::Free(addr_);
addr_ = TargetWrapperXPU::Malloc(new_size);
size_ = new_size;
} else {
CHECK(false) << "Not supported if is_l3_ == true";
}
}
void XPUScratchPadDeleter::operator()(XPUScratchPad* sp) const {
if (!sp->is_l3_) {
TargetWrapperXPU::Free(sp->addr_);
}
delete sp;
}
void* TargetWrapperXPU::Malloc(size_t size) {
void* ptr{nullptr};
XPU_CALL(xpu_malloc(&ptr, size));
......@@ -51,7 +72,7 @@ XPUScratchPadGuard TargetWrapperXPU::MallocScratchPad(size_t size,
ptr = TargetWrapperXPU::Malloc(size);
}
CHECK(ptr != nullptr) << "size = " << size << ", use_l3 = " << use_l3;
return XPUScratchPadGuard(new XPUScratchPad(ptr, use_l3));
return XPUScratchPadGuard(new XPUScratchPad(ptr, size, use_l3));
}
std::string TargetWrapperXPU::multi_encoder_precision; // NOLINT
......
......@@ -37,19 +37,19 @@ const int XPU_MAX_LOD_SEQ_LEN = 512;
using TargetWrapperXPU = TargetWrapper<TARGET(kXPU)>;
struct XPUScratchPad {
XPUScratchPad(void* addr, bool is_l3) : addr_(addr), is_l3_(is_l3) {}
XPUScratchPad(void* addr, size_t size, bool is_l3)
: addr_(addr), size_(size), is_l3_(is_l3) {}
// XXX(miaotianxiang): |size_| increases monotonically
void Reserve(size_t new_size);
void* addr_{nullptr};
size_t size_{0};
bool is_l3_{false};
};
struct XPUScratchPadDeleter {
void operator()(XPUScratchPad* sp) const {
if (!sp->is_l3_) {
XPU_CALL(xpu_free(sp->addr_));
}
delete sp;
}
void operator()(XPUScratchPad* sp) const;
};
using XPUScratchPadGuard = std::unique_ptr<XPUScratchPad, XPUScratchPadDeleter>;
......
......@@ -2,7 +2,7 @@ if (WITH_TESTING)
lite_cc_library(lite_gtest_main SRCS lite_gtest_main.cc DEPS gtest gflags)
endif()
lite_cc_library(target_wrapper SRCS target_wrapper.cc
DEPS target_wrapper_host place
DEPS target_wrapper_host place fbs_headers
X86_DEPS target_wrapper_x86
CUDA_DEPS target_wrapper_cuda
XPU_DEPS target_wrapper_xpu
......
......@@ -176,6 +176,9 @@ void get_cpu_arch(std::vector<ARMArch>* archs, const int cpu_num) {
case 0xd0a:
arch_type = kA75;
break;
case 0xd0d:
arch_type = kA77;
break;
case 0xd40:
arch_type = kA76;
break;
......@@ -637,6 +640,20 @@ void DeviceInfo::SetArchInfo(int argc, ...) {
bool DeviceInfo::SetCPUInfoByName() {
/* Snapdragon */
if (dev_name_.find("KONA") != std::string::npos) { // 865
core_num_ = 8;
core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
big_core_ids_ = {4, 5, 6, 7};
little_core_ids_ = {0, 1, 2, 3};
cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
SetArchInfo(2, kA77, kA55);
SetCacheInfo(0, 2, 192 * 1024, 256 * 1024);
SetCacheInfo(1, 2, 768 * 1024, 512 * 1024);
SetCacheInfo(2, 1, 4 * 1024 * 1024);
SetFP16Info(1, 1);
SetDotInfo(2, 1, 1);
return true;
}
if (dev_name_.find("SM8150") != std::string::npos) { // 855
core_num_ = 8;
core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
......
......@@ -17,6 +17,7 @@
#include <cstdarg>
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/core/tensor.h"
#include "lite/utils/cp_logging.h"
#ifdef LITE_WITH_MLU
......@@ -27,6 +28,7 @@
namespace paddle {
namespace lite {
using L3CacheSetMethod = lite_api::L3CacheSetMethod;
#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))
typedef enum {
......@@ -38,6 +40,8 @@ typedef enum {
kA73 = 73,
kA75 = 75,
kA76 = 76,
kA77 = 77,
kA78 = 78,
kARMArch_UNKOWN = -1
} ARMArch;
......@@ -65,11 +69,41 @@ class DeviceInfo {
int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }
// Methods for configuring the L3 cache workspace on the Arm platform.
// The enum class L3CacheSetMethod is declared in `lite/api/paddle_api.h`.
void SetArmL3CacheSize(
L3CacheSetMethod method = L3CacheSetMethod::kDeviceL3Cache,
int absolute_val = -1) {
l3_cache_method_ = method;
absolute_l3cache_size_ = absolute_val;
// Realloc memory for sgemm in this context.
workspace_.clear();
workspace_.Resize({llc_size()});
workspace_.mutable_data<int8_t>();
}
int llc_size() const {
auto size = L3_cache_[active_ids_[0]] > 0 ? L3_cache_[active_ids_[0]]
auto size = absolute_l3cache_size_;
switch (l3_cache_method_) {
// kDeviceL3Cache = 0, use the system L3 Cache size, best performance.
case L3CacheSetMethod::kDeviceL3Cache:
size = L3_cache_[active_ids_[0]] > 0 ? L3_cache_[active_ids_[0]]
: L2_cache_[active_ids_[0]];
break;
// kDeviceL2Cache = 1, use the system L2 cache size; trades some
// performance for lower memory consumption.
case L3CacheSetMethod::kDeviceL2Cache:
size = L2_cache_[active_ids_[0]];
break;
// kAbsolute = 2, use the external setting.
case L3CacheSetMethod::kAbsolute:
break;
default:
LOG(FATAL) << "Error: unknown l3_cache_method_ !";
}
return size > 0 ? size : 512 * 1024;
}
bool has_dot() const { return dot_[active_ids_[0]]; }
bool has_fp16() const { return fp16_[active_ids_[0]]; }
......@@ -121,6 +155,10 @@ class DeviceInfo {
void RequestPowerRandHighMode(int shift_num, int thread_num);
void RequestPowerRandLowMode(int shift_num, int thread_num);
// Settings for the L3 cache workspace on the Arm platform.
// The enum class L3CacheSetMethod is declared in `lite/api/paddle_api.h`.
L3CacheSetMethod l3_cache_method_{L3CacheSetMethod::kDeviceL3Cache};
int absolute_l3cache_size_{-1};
DeviceInfo() = default;
};
#endif // LITE_WITH_ARM
......
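// A minimal sketch of how SetArmL3CacheSize (shown in the hunk above) might
// be called. DeviceInfo::Global() and the include path are assumptions based
// on the surrounding code, and the absolute byte size is illustrative.
#include "lite/core/device_info.h"  // assumed header path

void l3_cache_config_sketch() {
  using paddle::lite::DeviceInfo;
  using paddle::lite::L3CacheSetMethod;
  // Default behaviour: size the workspace from the device L3 cache.
  DeviceInfo::Global().SetArmL3CacheSize(L3CacheSetMethod::kDeviceL3Cache);
  // Lower memory footprint: size the workspace from the L2 cache instead.
  DeviceInfo::Global().SetArmL3CacheSize(L3CacheSetMethod::kDeviceL2Cache);
  // Fully manual: pin the workspace to an absolute byte count.
  DeviceInfo::Global().SetArmL3CacheSize(L3CacheSetMethod::kAbsolute,
                                         512 * 1024);
}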
......@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include <algorithm>
#include <string>
#include "lite/api/paddle_place.h"
#include "lite/core/target_wrapper.h"
......@@ -140,20 +141,21 @@ class Buffer {
#ifdef LITE_WITH_OPENCL
template <typename T>
void ResetLazyImage2D(TargetType target,
const size_t img_w,
const size_t img_h,
const size_t img_w_req,
const size_t img_h_req,
void* host_ptr = nullptr) {
if (target != target_ || cl_image2d_width_ < img_w ||
cl_image2d_height_ < img_h || host_ptr != nullptr) {
if (target != target_ || cl_image2d_width_ < img_w_req ||
cl_image2d_height_ < img_h_req || host_ptr != nullptr) {
CHECK_EQ(own_data_, true) << "Can not reset unowned buffer.";
cl_image2d_width_ = std::max(cl_image2d_width_, img_w_req);
cl_image2d_height_ = std::max(cl_image2d_height_, img_h_req);
Free();
data_ = TargetWrapperCL::MallocImage<T>(img_w, img_h, host_ptr);
data_ = TargetWrapperCL::MallocImage<T>(
cl_image2d_width_, cl_image2d_height_, host_ptr);
target_ = target;
space_ = sizeof(T) * img_w * img_h *
space_ = sizeof(T) * cl_image2d_width_ * cl_image2d_height_ *
4; // un-used for opencl Image2D, 4 for RGBA,
cl_use_image2d_ = true;
cl_image2d_width_ = img_w;
cl_image2d_height_ = img_h;
}
}
#endif
......
......@@ -28,6 +28,12 @@ TEST(memory, test) {
ASSERT_TRUE(buf_cuda);
TargetFree(TARGET(kCUDA), buf_cuda);
#endif
#ifdef LITE_WITH_OPENCL
auto* buf_cl = TargetMalloc(TARGET(kOpenCL), 10);
ASSERT_TRUE(buf_cl);
TargetFree(TARGET(kOpenCL), buf_cl);
#endif
}
} // namespace lite
......
......@@ -244,6 +244,7 @@ class XPUConv2dBlock0Fuser : public FuseBase {
std::string output_name = "";
if (_with_relu) {
op_desc.SetAttr("act_type", std::string{"relu"});
output_name = matched.at("relu_out")->arg()->name;
} else {
output_name = matched.at("bn_out")->arg()->name;
......@@ -433,6 +434,7 @@ class XPUConv2dBlock1Fuser : public FuseBase {
TARGET(kXPU), PRECISION(kFloat), DATALAYOUT(kNCHW));
scope->NewTensor(max_output_name);
op_desc.SetOutput("OutputMax", {max_output_name});
op_desc.SetAttr("act_type", std::string{"relu"});
auto conv_op = LiteOpRegistry::Global().Create("__xpu__conv2d");
auto& valid_places = conv_old->valid_places();
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "lite/core/mir/fusion/conv_conv_fuse_pass.h"
#include <list>
#include <memory>
#include <vector>
#include "lite/core/mir/fusion/conv_conv_fuser.h"
......@@ -27,13 +28,10 @@ void ConvConvFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// initialize fuser params
std::vector<bool> conv_has_bias_cases{true, false};
std::vector<std::string> conv_type_cases{"conv2d", "depthwise_conv2d"};
bool has_fp32 = false;
bool has_int8 = false;
bool has_weight_quant = false;
for (auto& place : graph->valid_places()) {
if (place.target == TARGET(kARM) || place.target == TARGET(kHost)) {
if (place.precision == PRECISION(kFloat)) {
has_fp32 = true;
}
if (place.precision == PRECISION(kInt8)) {
has_int8 = true;
}
......@@ -42,8 +40,18 @@ void ConvConvFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
return;
}
}
const std::list<mir::Node>& nodes = graph->nodes();
for (auto& node : nodes) {
if (node.IsStmt()) {
auto* op_info = (node.stmt())->op_info();
if (op_info->HasAttr("quantization_type")) {
has_weight_quant = true;
break;
}
}
}
// only support arm-fp32
if (has_int8 || (has_fp32 && has_int8)) {
if (has_int8 || has_weight_quant) {
return;
}
// only support fp32 fusion
......
......@@ -61,5 +61,4 @@ void QuantDequantFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(lite_quant_dequant_fuse_pass,
paddle::lite::mir::QuantDequantFusePass)
.BindTargets({TARGET(kAny)})
.BindKernel("calib");
.BindTargets({TARGET(kAny)});
......@@ -148,7 +148,7 @@ void MemoryOptimizePass::CollectLifeCycleByDevice(
int cur_life =
(*lifecycles)[TargetToStr(target_type)][var_name].second;
(*lifecycles)[TargetToStr(target_type)][var_name].second =
std::max(max_lifecycle_, cur_life);
(std::max)(max_lifecycle_, cur_life);
}
}
++max_lifecycle_;
......
......@@ -61,7 +61,7 @@ class StaticKernelPickPass : public mir::StmtPass {
float final_score{-1.};
Place winner_place{places[0]};
const int kMax =
std::numeric_limits<core::KernelPickFactor::value_type>::max();
(std::numeric_limits<core::KernelPickFactor::value_type>::max)();
size_t place_size = places.size();
// NOTE: We compare kernel's place with place in valid_places to select the
......
......@@ -17,8 +17,6 @@
#include <cmath>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/test_helper.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
......
......@@ -82,8 +82,11 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph,
// This is not a robust check, but the source of this issue has not been
// traced from static_pick_kernel_pass to this pass.
auto is_host = [](TargetType x) -> bool {
return x == TARGET(kHost) || x == TARGET(kX86) || x == TARGET(kARM);
};
auto* in_arg_type = const_cast<Type*>(in->AsArg().type);
if (in_arg_type->target() == TARGET(kARM) &&
if (is_host(in_arg_type->target()) &&
in_arg_type->layout() == DATALAYOUT(kImageDefault)) {
return;
}
......
......@@ -233,67 +233,98 @@ bool OpInfo::GetOutputIndex(const std::string &output_name, int *out) const {
return false;
}
bool OpInfo::HasInputScale(const std::string &input_name) const {
bool OpInfo::HasInputScale(const std::string &name, bool is_scale_name) const {
bool res = false;
if (is_scale_name) {
res = HasAttr(name);
} else {
std::string argname;
int index;
if (GetInputArgname(input_name, &argname) &&
GetInputIndex(input_name, &index)) {
return HasAttr(argname + to_string(index) + "_scale");
} else {
return false;
if (GetInputArgname(name, &argname) && GetInputIndex(name, &index)) {
res = HasAttr(argname + to_string(index) + "_scale");
}
}
return res;
}
bool OpInfo::HasOutputScale(const std::string &output_name) const {
bool OpInfo::HasOutputScale(const std::string &name, bool is_scale_name) const {
bool res = false;
if (is_scale_name) {
res = HasAttr(name);
} else {
std::string argname;
int index;
if (GetOutputArgname(output_name, &argname) &&
GetOutputIndex(output_name, &index)) {
return HasAttr(argname + to_string(index) + "_scale");
} else {
return false;
if (GetOutputArgname(name, &argname) && GetOutputIndex(name, &index)) {
res = HasAttr(argname + to_string(index) + "_scale");
}
}
return res;
}
void OpInfo::SetInputScale(const std::string &input_name,
const std::vector<float> &scale_value) {
void OpInfo::SetInputScale(const std::string &name,
const std::vector<float> &scale_value,
bool is_scale_name) {
std::string scale_name;
if (is_scale_name) {
scale_name = name;
} else {
std::string argname;
int index;
CHECK(GetInputArgname(input_name, &argname));
CHECK(GetInputIndex(input_name, &index));
CHECK(GetInputArgname(name, &argname));
CHECK(GetInputIndex(name, &index));
CHECK(scale_value.size() > 0)
<< "Error in SetInputScale: the scales should not be empty";
SetAttr<std::vector<float>>(argname + to_string(index) + "_scale",
scale_value);
scale_name = argname + to_string(index) + "_scale";
}
SetAttr<std::vector<float>>(scale_name, scale_value);
}
void OpInfo::SetOutputScale(const std::string &output_name,
const std::vector<float> &scale_value) {
void OpInfo::SetOutputScale(const std::string &name,
const std::vector<float> &scale_value,
bool is_scale_name) {
std::string scale_name;
if (is_scale_name) {
scale_name = name;
} else {
std::string argname;
int index;
CHECK(GetOutputArgname(output_name, &argname));
CHECK(GetOutputIndex(output_name, &index));
CHECK(GetOutputArgname(name, &argname));
CHECK(GetOutputIndex(name, &index));
CHECK(scale_value.size() > 0)
<< "Error in SetOutputScale: the scales should not be empty";
SetAttr<std::vector<float>>(argname + to_string(index) + "_scale",
scale_value);
scale_name = argname + to_string(index) + "_scale";
}
SetAttr<std::vector<float>>(scale_name, scale_value);
}
std::vector<float> OpInfo::GetInputScale(const std::string &input_name) const {
std::vector<float> OpInfo::GetInputScale(const std::string &name,
bool is_scale_name) const {
std::string scale_name;
if (is_scale_name) {
scale_name = name;
} else {
std::string argname;
int index;
CHECK(GetInputArgname(input_name, &argname));
CHECK(GetInputIndex(input_name, &index));
return GetAttr<std::vector<float>>(argname + to_string(index) + "_scale");
CHECK(GetInputArgname(name, &argname));
CHECK(GetInputIndex(name, &index));
scale_name = argname + to_string(index) + "_scale";
}
return GetAttr<std::vector<float>>(scale_name);
}
std::vector<float> OpInfo::GetOutputScale(
const std::string &output_name) const {
std::vector<float> OpInfo::GetOutputScale(const std::string &name,
bool is_scale_name) const {
std::string scale_name;
if (is_scale_name) {
scale_name = name;
} else {
std::string argname;
int index;
CHECK(GetOutputArgname(output_name, &argname));
CHECK(GetOutputIndex(output_name, &index));
return GetAttr<std::vector<float>>(argname + to_string(index) + "_scale");
CHECK(GetOutputArgname(name, &argname));
CHECK(GetOutputIndex(name, &index));
scale_name = argname + to_string(index) + "_scale";
}
return GetAttr<std::vector<float>>(scale_name);
}
} // namespace lite
......
......@@ -251,19 +251,31 @@ class OpInfo : public cpp::OpDesc {
bool GetInputIndex(const std::string &input_name, int *out) const;
bool GetOutputIndex(const std::string &output_name, int *out) const;
bool HasInputScale(const std::string &input_name) const;
bool HasOutputScale(const std::string &output_name) const;
// Suppose a quantized op has two input argnames (X, Y) and one output
// argname (Out). The scales of input argname X are saved in the op desc as
// (X0_scale, scale_value_0), (X1_scale, scale_value_1), ...
// The following APIs get or set the quantized scales in the op desc.
// When passing an input or output tensor name, is_scale_name should be
// false; when passing a scale name such as "X0_scale", it should be true.
bool HasInputScale(const std::string &name, bool is_scale_name = false) const;
bool HasOutputScale(const std::string &name,
bool is_scale_name = false) const;
void SetInputScale(const std::string &input_name,
const std::vector<float> &scale_value);
const std::vector<float> &scale_value,
bool is_scale_name = false);
void SetOutputScale(const std::string &output_name,
const std::vector<float> &scale_value);
const std::vector<float> &scale_value,
bool is_scale_name = false);
// For conv2d, depthwise_conv2d and mul, the weight scales form a vector.
// Otherwise, all input and output scales are scalars, but they are still
// stored as vectors.
std::vector<float> GetInputScale(const std::string &input_name) const;
std::vector<float> GetOutputScale(const std::string &output_name) const;
std::vector<float> GetInputScale(const std::string &name,
bool is_scale_name = false) const;
std::vector<float> GetOutputScale(const std::string &name,
bool is_scale_name = false) const;
};
} // namespace lite
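For orientation, here is a minimal sketch (not part of this patch) of how a pass could use the two addressing modes introduced above; the helper name, the tensor names and the include path are assumptions made purely for illustration:

#include <string>
#include <vector>
#include "lite/core/op_lite.h"  // assumed location of OpInfo

// Hypothetical helper: propagate an input scale to an output.
void PropagateScale(paddle::lite::OpInfo* op_info,
                    const std::string& in_name,     // e.g. "conv2d_0.tmp_0"
                    const std::string& out_name) {  // e.g. "conv2d_1.tmp_0"
  // Address by input/output name: is_scale_name defaults to false, and the
  // attribute key "<argname><index>_scale" is derived internally.
  if (op_info->HasInputScale(in_name)) {
    op_info->SetOutputScale(out_name, op_info->GetInputScale(in_name));
  }
  // Address by scale name directly, e.g. when iterating over raw attributes.
  if (op_info->HasInputScale("X0_scale", /*is_scale_name=*/true)) {
    auto scales = op_info->GetInputScale("X0_scale", /*is_scale_name=*/true);
    (void)scales;
  }
}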
......
......@@ -80,8 +80,8 @@ class Optimizer {
InitControlFlowOpUnusedInputsAndOutputsEliminatePass();
if (passes.empty() || passes.size() == 1) {
std::vector<std::string> passes_local{
{"lite_quant_dequant_fuse_pass", //
std::vector<std::string> passes_local{{
"lite_quant_dequant_fuse_pass", //
"weight_quantization_preprocess_pass", //
"lite_conv_elementwise_fuse_pass", // conv-elemwise-bn
"lite_conv_bn_fuse_pass", //
......@@ -108,6 +108,7 @@ class Optimizer {
#endif
"identity_dropout_eliminate_pass",
"__xpu__resnet_fuse_pass",
"__xpu__resnet_d_fuse_pass",
"__xpu__resnet_cbam_fuse_pass",
"__xpu__conv2d_fuse_pass",
"__xpu__conv2d_link_previous_out_max_pass",
......@@ -169,8 +170,9 @@ class Optimizer {
"runtime_context_assign_pass",
"argument_type_display_pass",
"lite_reshape_fuse_pass",
"memory_optimize_pass"}};
"memory_optimize_pass" // you can comment this line when enable
// PRECISION_PROFILE
}};
if (passes.size() == 1) {
// multi_stream_analysis_pass must be in the front of
......
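The inline note on memory_optimize_pass above exists presumably because that pass lets kernels share output buffers, so intermediate results may be overwritten before the precision profiler can inspect them; commenting the pass out keeps each output intact when PRECISION_PROFILE is enabled.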
......@@ -18,10 +18,18 @@
* of each kernel.
*/
#pragma once
#include <sys/time.h>
#include <time.h>
#include <cmath>
#include <cstdlib>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "lite/core/program.h"
#include "lite/utils/io.h"
#ifdef LITE_WITH_X86
#include "lite/fluid/float16.h"
#endif
......@@ -40,14 +48,50 @@ namespace paddle {
namespace lite {
namespace profile {
static const std::string get_date_str() {
struct tm tm_time;
time_t timestamp = time(NULL);
localtime_r(&timestamp, &tm_time);
struct timeval tv;
gettimeofday(&tv, NULL);
// print date / time
std::string date_str =
std::to_string(1900 + tm_time.tm_year) +
std::to_string(1 + tm_time.tm_mon) + std::to_string(tm_time.tm_mday) +
'_' + std::to_string(tm_time.tm_hour) + std::to_string(tm_time.tm_min) +
std::to_string(tm_time.tm_sec) + '_' + std::to_string(tv.tv_usec / 1000);
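  // e.g. 2020-08-15 09:30:12.005 -> "2020815_93012_5" (fields are not zero-padded)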
return date_str;
}
inline std::string generate_valid_tensor_name(const std::string& name) {
std::string new_name("");
for (size_t i = 0; i < name.length(); ++i) {
if (name[i] != '/') {
new_name += name[i];
} else {
new_name += "_";
}
}
return new_name;
}
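// For example, generate_valid_tensor_name("fc_0.tmp_1/out") returns
// "fc_0.tmp_1_out"; '/' is the only character that gets replaced.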
template <typename dtype>
static bool write_tensorfile(const Tensor* tensor, const std::string& locate) {
if (locate.find('/') != std::string::npos) {
return false;
static bool write_tensorfile(
const Tensor* tensor,
const std::string& tensor_name,
const std::string prefix_path = "/storage/emulated/0/") {
std::string new_tensor_name = generate_valid_tensor_name(tensor_name);
if (tensor_name.find('/') != std::string::npos) {
LOG(ERROR) << "--> tensor name is abnormal with '\\':" << tensor_name
<< " !!!, replace with '_'," << new_tensor_name
<< new_tensor_name;
}
FILE* fp = fopen(locate.c_str(), "w");
std::string tensor_save_path = prefix_path + new_tensor_name + ".txt";
FILE* fp = fopen(tensor_save_path.c_str(), "w");
if (fp == nullptr) {
LOG(ERROR) << "file open field " << locate;
LOG(ERROR) << "failed open file " << tensor_save_path;
return false;
} else {
const dtype* data = tensor->data<dtype>();
......@@ -56,19 +100,23 @@ static bool write_tensorfile(const Tensor* tensor, const std::string& locate) {
}
}
fclose(fp);
LOG(INFO) << "write tensor " << tensor_name
<< " to file:" << tensor_save_path;
return true;
}
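// With the defaults above, each dumped tensor lands at
// <prefix_path><sanitized_tensor_name>.txt; the callers in this patch pass
// log_dir_, i.e. /storage/emulated/0/PaddleLite_<date>/<tensor_name>.txt.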
static bool write_precision_summary_tofile(const std::string& string,
const std::string& log_dir = "") {
if (log_dir == "") {
LOG(INFO) << "The `log_dir` of precision summary file is not set. log_dir:"
<< log_dir;
static bool write_precision_summary_tofile(
const std::string& string, const std::string& summary_log_dir = "") {
if (summary_log_dir == "") {
LOG(INFO) << "The `summary_log_dir` of precision summary file is not set. "
"summary_log_dir:"
<< summary_log_dir;
return false;
}
FILE* fp = fopen(log_dir.c_str(), "a");
FILE* fp = fopen(summary_log_dir.c_str(), "a");
if (fp == nullptr) {
LOG(INFO) << "Open precision summary file:" << log_dir << "failed.";
LOG(INFO) << "Open precision summary file:" << summary_log_dir << "failed.";
return false;
} else {
fprintf(fp, "%s\n", string.c_str());
......@@ -85,7 +133,14 @@ class PrecisionProfiler {
std::string inst_precison_str = GetInstPrecision(inst);
}
PrecisionProfiler() {}
PrecisionProfiler() {
MkDirRecur(log_dir_);
const char* write_to_file_raw =
std::getenv("PADDLELITE_PRECISION_WRITE_TO_FILE");
    write_result_to_file_ = write_to_file_raw && atoi(write_to_file_raw) > 0;
}
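  // log_dir_ is created up front; per-tensor dumping is enabled only when the
  // environment variable PADDLELITE_PRECISION_WRITE_TO_FILE is set to a
  // positive integer.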
std::string GetSummaryHeader() {
using std::setw;
......@@ -102,9 +157,9 @@ class PrecisionProfiler {
<< " " << setw(15) << left << "std_deviation"
<< " " << setw(15) << left << "ave_grow_rate*" << std::endl;
// write to file with path: `log_dir`
if (log_dir_ != "") {
FILE* fp = fopen(log_dir_.c_str(), "a");
// write to file with path: `summary_log_dir`
if (summary_log_dir_ != "") {
FILE* fp = fopen(summary_log_dir_.c_str(), "a");
std::string header_str{ss.str()};
fprintf(fp, "%s\n", header_str.c_str());
fclose(fp);
......@@ -112,6 +167,18 @@ class PrecisionProfiler {
return ss.str();
}
std::string GetSummaryTail() {
STL::stringstream ss;
ss << "[note]" << std::endl;
ss << "1. `ave_grow_rate`: show the sequence value of tensor when std_dev "
"& mean are same."
<< std::endl;
ss << "2. Enable write each output tensor to file: `export "
"PADDLELITE_PRECISION_WRITE_TO_FILE=1` on ADB command line."
<< std::endl;
return ss.str();
}
template <typename T>
double compute_mean(const T* in, const size_t length) {
double sum = 0.;
......@@ -157,6 +224,17 @@ class PrecisionProfiler {
return false;
}
std::string rename_out_for_mem_reuse_pass(const std::string& old_name) {
if (out_tensor_names_map.find(old_name) == out_tensor_names_map.end()) {
out_tensor_names_map[old_name] = 1;
} else {
++out_tensor_names_map[old_name];
}
std::string new_name =
old_name + "_" + std::to_string(out_tensor_names_map[old_name]);
return new_name;
}
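  // memory_optimize_pass can make several kernels reuse the same output tensor
  // name, so a running counter keeps the profiler rows distinct, e.g. the first
  // occurrence of "conv2d_0.tmp_0" becomes "conv2d_0.tmp_0_1", the second
  // "conv2d_0.tmp_0_2" (the tensor name here is only an illustration).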
void compute_tensor_precision_info(const Tensor* in,
TargetType target_type,
PrecisionType precision_type,
......@@ -180,7 +258,7 @@ class PrecisionProfiler {
*std_dev =
compute_standard_deviation<float>(ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<float>(ptr, in->numel());
write_result_to_file&& write_tensorfile<float>(in, name);
write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
return;
}
case PRECISION(kAny): {
......@@ -189,7 +267,7 @@ class PrecisionProfiler {
*std_dev =
compute_standard_deviation<float>(ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<float>(ptr, in->numel());
write_result_to_file&& write_tensorfile<float>(in, name);
write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
return;
}
case PRECISION(kInt8): {
......@@ -198,7 +276,7 @@ class PrecisionProfiler {
*std_dev =
compute_standard_deviation<int8_t>(ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<int8_t>(ptr, in->numel());
write_result_to_file&& write_tensorfile<int8_t>(in, name);
write_result_to_file&& write_tensorfile<int8_t>(in, name, log_dir_);
return;
}
case PRECISION(kInt32): {
......@@ -207,7 +285,7 @@ class PrecisionProfiler {
*std_dev = compute_standard_deviation<int32_t>(
ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<int32_t>(ptr, in->numel());
write_result_to_file&& write_tensorfile<int32_t>(in, name);
write_result_to_file&& write_tensorfile<int32_t>(in, name, log_dir_);
return;
}
case PRECISION(kInt64): {
......@@ -254,7 +332,14 @@ class PrecisionProfiler {
real_out_v.data(), in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<float>(real_out_v.data(),
real_out_v.size());
write_result_to_file&& write_tensorfile<float>(in, name);
std::shared_ptr<lite::Tensor> real_out_t(new lite::Tensor);
real_out_t->Resize(in->dims());
float* real_out_data = real_out_t->mutable_data<float>();
memcpy(real_out_data,
real_out_v.data(),
real_out_v.size() * sizeof(float));
write_result_to_file&& write_tensorfile<float>(
real_out_t.get(), name, log_dir_);
return;
}
case DATALAYOUT(kNCHW): {
......@@ -269,7 +354,14 @@ class PrecisionProfiler {
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<float>(in_data_v.data(), in->numel());
write_result_to_file&& write_tensorfile<float>(in, name);
std::shared_ptr<lite::Tensor> real_out_t(new lite::Tensor);
real_out_t->Resize(in->dims());
float* real_out_data = real_out_t->mutable_data<float>();
memcpy(real_out_data,
in_data_v.data(),
in_data_v.size() * sizeof(float));
write_result_to_file&& write_tensorfile<float>(
real_out_t.get(), name, log_dir_);
return;
}
default:
......@@ -296,7 +388,7 @@ class PrecisionProfiler {
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<float>(in_data_v.data(), in->numel());
write_result_to_file&& write_tensorfile<float>(in, name);
write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
return;
}
case PRECISION(kInt32): {
......@@ -311,7 +403,7 @@ class PrecisionProfiler {
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<int>(in_data_v.data(), in->numel());
write_result_to_file&& write_tensorfile<float>(in, name);
write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
return;
}
case PRECISION(kInt64): {
......@@ -326,7 +418,7 @@ class PrecisionProfiler {
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<int64_t>(in_data_v.data(), in->numel());
write_result_to_file&& write_tensorfile<float>(in, name);
write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
return;
}
case PRECISION(kFP16): {
......@@ -347,7 +439,7 @@ class PrecisionProfiler {
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<float>(in_data_v.data(), in->numel());
write_result_to_file&& write_tensorfile<float>(in, name);
write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
return;
}
default:
......@@ -372,12 +464,12 @@ class PrecisionProfiler {
using std::left;
using std::fixed;
STL::stringstream ss;
bool write_result_to_file = false;
VLOG(1) << ">> Running kernel: " << inst->op()->op_info()->Repr()
<< " registered on " << TargetToStr(inst->kernel()->target()) << "/"
<< PrecisionToStr(inst->kernel()->precision()) << "/"
<< DataLayoutToStr(inst->kernel()->layout());
<< DataLayoutToStr(inst->kernel()->layout())
<< ", write_result_to_file_:" << write_result_to_file_;
std::string kernel_repr = inst->op()->op_info()->Repr();
std::string kernel_place = TargetToStr(inst->kernel()->target()) + "/" +
......@@ -404,6 +496,7 @@ class PrecisionProfiler {
std::string mean_str{"unused"};
std::string std_dev_str{"unused"};
std::string ave_grow_rate_str{"unused"};
std::string new_out_name = rename_out_for_mem_reuse_pass(out_name);
if (!is_unused(tout)) {
compute_tensor_precision_info(tout,
......@@ -413,14 +506,14 @@ class PrecisionProfiler {
&mean,
&std_dev,
&ave_grow_rate,
out_name,
write_result_to_file);
new_out_name,
write_result_to_file_);
mean_str = std::to_string(mean);
std_dev_str = std::to_string(std_dev);
ave_grow_rate_str = std::to_string(ave_grow_rate);
}
std::string kernel_info = op_name + ":" + kernel_place;
std::string output_arg_info = out_name + ":" +
std::string output_arg_info = new_out_name + ":" +
TargetToStr(type->target()) + "/" +
PrecisionToStr(type->precision()) +
"/" + DataLayoutToStr(type->layout());
......@@ -441,6 +534,7 @@ class PrecisionProfiler {
std::string mean_str{"unused"};
std::string std_dev_str{"unused"};
std::string ave_grow_rate_str{"unused"};
std::string new_out_name = rename_out_for_mem_reuse_pass(out_name);
if (!is_unused(tout)) {
compute_tensor_precision_info(tout,
......@@ -450,14 +544,14 @@ class PrecisionProfiler {
&mean,
&std_dev,
&ave_grow_rate,
out_name,
write_result_to_file);
new_out_name,
write_result_to_file_);
mean_str = std::to_string(mean);
std_dev_str = std::to_string(std_dev);
ave_grow_rate_str = std::to_string(ave_grow_rate);
}
std::string kernel_info = op_name + ":" + kernel_place;
std::string output_arg_info = out_name + ":" +
std::string output_arg_info = new_out_name + ":" +
TargetToStr(type->target()) + "/" +
PrecisionToStr(type->precision()) +
"/" + DataLayoutToStr(type->layout());
......@@ -471,12 +565,16 @@ class PrecisionProfiler {
}
}
}
write_precision_summary_tofile(ss.str(), log_dir_);
write_precision_summary_tofile(ss.str(), summary_log_dir_);
return ss.str();
}
private:
std::string log_dir_{"/storage/emulated/0/precision.log"};
std::string log_dir_{"/storage/emulated/0/PaddleLite_" + get_date_str() +
"/"};
std::string summary_log_dir_{log_dir_ + "precision_summary.log"};
std::map<std::string, size_t> out_tensor_names_map;
bool write_result_to_file_{false};
};
} // namespace profile
......
......@@ -302,7 +302,9 @@ void RuntimeProgram::Run() {
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 1);
#endif
#ifdef LITE_WITH_PRECISION_PROFILE
LOG(INFO) << "\n" << precision_profiler_summary;
LOG(INFO) << "\n"
<< precision_profiler_summary
<< inst_precision_profiler.GetSummaryTail();
#endif
}
......
......@@ -29,6 +29,21 @@ int64_t ShapeProduction(const shape_t& shape) {
return res;
}
std::string ShapePrint(const std::vector<shape_t>& shapes) {
std::string shapes_str{""};
for (size_t shape_idx = 0; shape_idx < shapes.size(); ++shape_idx) {
auto shape = shapes[shape_idx];
std::string shape_str;
for (auto i : shape) {
shape_str += std::to_string(i) + ",";
}
shapes_str += shape_str;
shapes_str +=
(shape_idx != 0 && shape_idx == shapes.size() - 1) ? "" : " : ";
}
return shapes_str;
}
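// e.g. ShapePrint({{1, 3, 224, 224}, {1, 5}}) yields "1,3,224,224, : 1,5,"
// (the trailing commas come from the per-dimension formatting above).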
std::string ShapePrint(const shape_t& shape) {
std::string shape_str{""};
for (auto i : shape) {
......@@ -37,6 +52,37 @@ std::string ShapePrint(const shape_t& shape) {
return shape_str;
}
std::vector<std::string> split_string(const std::string& str_in) {
std::vector<std::string> str_out;
std::string tmp_str = str_in;
while (!tmp_str.empty()) {
size_t next_offset = tmp_str.find(":");
str_out.push_back(tmp_str.substr(0, next_offset));
if (next_offset == std::string::npos) {
break;
} else {
tmp_str = tmp_str.substr(next_offset + 1);
}
}
return str_out;
}
std::vector<int64_t> get_shape(const std::string& str_shape) {
std::vector<int64_t> shape;
std::string tmp_str = str_shape;
while (!tmp_str.empty()) {
int dim = atoi(tmp_str.data());
shape.push_back(dim);
size_t next_offset = tmp_str.find(",");
if (next_offset == std::string::npos) {
break;
} else {
tmp_str = tmp_str.substr(next_offset + 1);
}
}
return shape;
}
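// Parsing example for the two helpers above:
//   split_string("1,3,224,224:1,5") -> {"1,3,224,224", "1,5"}
//   get_shape("1,3,224,224")        -> {1, 3, 224, 224}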
template <typename T>
double compute_mean(const T* in, const size_t length) {
double sum = 0.;
......@@ -70,7 +116,7 @@ inline double GetCurrentUS() {
}
void RunModel(std::string model_dir,
const shape_t& input_shape,
const std::vector<shape_t>& input_shapes,
size_t repeats,
size_t warmup,
size_t print_output_elem,
......@@ -111,12 +157,19 @@ void RunModel(std::string model_dir,
CreatePaddlePredictor<MobileConfig>(config);
// 3. Prepare input data
std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
input_tensor->Resize(
{input_shape[0], input_shape[1], input_shape[2], input_shape[3]});
auto* data = input_tensor->mutable_data<float>();
for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
data[i] = 1;
std::cout << "input_shapes.size():" << input_shapes.size() << std::endl;
for (int j = 0; j < input_shapes.size(); ++j) {
auto input_tensor = predictor->GetInput(j);
input_tensor->Resize(input_shapes[j]);
auto input_data = input_tensor->mutable_data<float>();
int input_num = 1;
for (int i = 0; i < input_shapes[j].size(); ++i) {
input_num *= input_shapes[j][i];
}
for (int i = 0; i < input_num; ++i) {
input_data[i] = 1.f;
}
}
// 4. Run predictor
......@@ -142,7 +195,7 @@ void RunModel(std::string model_dir,
}
avg_duration = sum_duration / static_cast<float>(repeats);
std::cout << "\n======= benchmark summary =======\n"
<< "input_shape(NCHW):" << ShapePrint(input_shape) << "\n"
<< "input_shape(s) (NCHW):" << ShapePrint(input_shapes) << "\n"
<< "model_dir:" << model_dir << "\n"
<< "warmup:" << warmup << "\n"
<< "repeats:" << repeats << "\n"
......@@ -184,18 +237,19 @@ void RunModel(std::string model_dir,
}
int main(int argc, char** argv) {
shape_t input_shape{1, 3, 224, 224}; // shape_t ==> std::vector<int64_t>
std::vector<std::string> str_input_shapes;
std::vector<shape_t> input_shapes{
{1, 3, 224, 224}}; // shape_t ==> std::vector<int64_t>
int repeats = 10;
int warmup = 10;
int print_output_elem = 0;
if (argc > 2 && argc < 9) {
if (argc > 2 && argc < 6) {
std::cerr << "usage: ./" << argv[0] << "\n"
<< " <naive_buffer_model_dir>\n"
<< " <input_n>\n"
<< " <input_c>\n"
<< " <input_h>\n"
<< " <input_w>\n"
<< " <raw_input_shapes>, eg: 1,3,224,224 for 1 input; "
"1,3,224,224:1,5 for 2 inputs\n"
<< " <repeats>\n"
<< " <warmup>\n"
<< " <print_output>" << std::endl;
......@@ -203,14 +257,19 @@ int main(int argc, char** argv) {
}
std::string model_dir = argv[1];
if (argc >= 9) {
input_shape[0] = atoi(argv[2]);
input_shape[1] = atoi(argv[3]);
input_shape[2] = atoi(argv[4]);
input_shape[3] = atoi(argv[5]);
repeats = atoi(argv[6]);
warmup = atoi(argv[7]);
print_output_elem = atoi(argv[8]);
if (argc >= 6) {
input_shapes.clear();
std::string raw_input_shapes = argv[2];
std::cout << "raw_input_shapes: " << raw_input_shapes << std::endl;
str_input_shapes = split_string(raw_input_shapes);
for (size_t i = 0; i < str_input_shapes.size(); ++i) {
std::cout << "input shape: " << str_input_shapes[i] << std::endl;
input_shapes.push_back(get_shape(str_input_shapes[i]));
}
repeats = atoi(argv[3]);
warmup = atoi(argv[4]);
print_output_elem = atoi(argv[5]);
}
// set arm power mode:
// 0 for big cluster, high performance
......@@ -220,7 +279,7 @@ int main(int argc, char** argv) {
size_t power_mode = 0;
RunModel(
model_dir, input_shape, repeats, warmup, print_output_elem, power_mode);
model_dir, input_shapes, repeats, warmup, print_output_elem, power_mode);
return 0;
}
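With the argument parsing above, the command line takes the model path, one colon-separated shape string, and then repeats, warmup and print_output, e.g. (binary and model names are illustrative only):

  ./demo_binary ./mobilenet_v1 1,3,224,224 100 10 0            # single input
  ./demo_binary ./multi_input_model 1,3,224,224:1,5 100 10 0   # two inputs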
......@@ -128,7 +128,7 @@ bool test_convert(bool cv_run,
for (int i = 0; i < test_iter; i++) {
clock_t begin = clock();
// resize default linear
image_preprocess.imageConvert(src, resize_lite);
image_preprocess.image_convert(src, resize_lite);
clock_t end = clock();
to_lite += (end - begin);
}
......@@ -226,7 +226,7 @@ bool test_flip(bool cv_run,
for (int i = 0; i < test_iter; i++) {
clock_t begin = clock();
// resize default linear
image_preprocess.imageFlip(src, resize_lite);
image_preprocess.image_flip(src, resize_lite);
clock_t end = clock();
to_lite += (end - begin);
}
......@@ -330,7 +330,7 @@ bool test_rotate(bool cv_run,
for (int i = 0; i < test_iter; i++) {
clock_t begin = clock();
// resize default linear
image_preprocess.imageRotate(src, resize_lite);
image_preprocess.image_rotate(src, resize_lite);
clock_t end = clock();
to_lite += (end - begin);
}
......@@ -426,7 +426,7 @@ bool test_resize(bool cv_run,
for (int i = 0; i < test_iter; i++) {
clock_t begin = clock();
// resize default linear
image_preprocess.imageResize(src, resize_lite);
image_preprocess.image_resize(src, resize_lite);
clock_t end = clock();
to_lite += (end - begin);
}
......@@ -526,7 +526,7 @@ bool test_crop(bool cv_run,
std::cout << "lite compute:" << std::endl;
for (int i = 0; i < test_iter; i++) {
clock_t begin = clock();
image_preprocess.imageCrop(
image_preprocess.image_crop(
src, resize_lite, dstFormat, srcw, srch, left_x, left_y, dstw, dsth);
clock_t end = clock();
to_lite += (end - begin);
......
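The hunks above reflect a rename of the CV preprocessing API from camelCase to snake_case: imageConvert, imageFlip, imageRotate, imageResize and imageCrop become image_convert, image_flip, image_rotate, image_resize and image_crop (and, in the demo hunk below, image2Tensor becomes image_to_tensor); the demo exercises the renamed calls in order.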
......@@ -88,13 +88,13 @@ void pre_process(const cv::Mat& img, int width, int height, Tensor dstTensor) {
uint8_t* rgb_ptr = new uint8_t[img.cols * img.rows * 3];
uint8_t* resize_ptr = new uint8_t[width * height * 3];
// do convert bgr--rgb
img_process.imageConvert(img_ptr, rgb_ptr);
img_process.image_convert(img_ptr, rgb_ptr);
// do resize
img_process.imageResize(rgb_ptr, resize_ptr);
img_process.image_resize(rgb_ptr, resize_ptr);
// data--tensor and normalize
float means[3] = {103.94f, 116.78f, 123.68f};
float scales[3] = {0.017f, 0.017f, 0.017f};
img_process.image2Tensor(
img_process.image_to_tensor(
resize_ptr, &dstTensor, LayoutType::kNCHW, means, scales);
float* data = dstTensor.mutable_data<float>();
#else
......
......@@ -14,6 +14,8 @@ lite_cc_library(subgraph_bridge_act_op_apu SRCS act_op.cc DEPS ${apu_subgraph_br
lite_cc_library(subgraph_bridge_pool_op_apu SRCS pool_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_softmax_op_apu SRCS softmax_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_fc_op_apu SRCS fc_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_concat_op_apu SRCS concat_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_conv_transpose_op_apu SRCS conv_transpose_op.cc DEPS ${apu_subgraph_bridge_deps})
set(apu_subgraph_bridges
......@@ -25,6 +27,8 @@ set(apu_subgraph_bridges
subgraph_bridge_softmax_op_apu
subgraph_bridge_fc_op_apu
subgraph_bridge_pool_op_apu
subgraph_bridge_conv_transpose_op_apu
subgraph_bridge_concat_op_apu
CACHE INTERNAL "apu_subgraph_bridges")
message(STATUS "+++++ apu_subgraph_bridges: ${apu_subgraph_bridges}")
(Diffs for 4 more files are collapsed and not shown here.)
......@@ -28,7 +28,7 @@ int Graph::Add(const std::string& name, std::shared_ptr<Node> node) {
LOG(FATAL) << "[APU] Node" << name << " is redefined.";
return -1;
} else {
VLOG(3) << " Add: " << name << " : " << node->index();
VLOG(5) << " Add: " << name << " : " << node->index();
auto ret = nodes_.insert(
std::make_pair(name, std::vector<std::shared_ptr<Node>>()));
CHECK(ret.second);
......
......@@ -22,3 +22,6 @@ USE_SUBGRAPH_BRIDGE(elementwise_mul, kAPU);
USE_SUBGRAPH_BRIDGE(fc, kAPU);
USE_SUBGRAPH_BRIDGE(pool2d, kAPU);
USE_SUBGRAPH_BRIDGE(softmax, kAPU);
USE_SUBGRAPH_BRIDGE(concat, kAPU);
USE_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, kAPU);
USE_SUBGRAPH_BRIDGE(conv2d_transpose, kAPU);
此差异已折叠。
......@@ -64,12 +64,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
xType.dimensions = &dims_x[0];
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
// input operand already exist
x_node = graph->Get(x_name);
VLOG(3) << "Graph has " << x_name << ",index: " << x_node->index();
} else {
// add input operand
NeuronModel_addOperand(model, &xType); // 0: input
NeuronModel_addOperand(model, &xType); // Operand 0: input
x_node = graph->Add(x_name, dims_x);
}
VLOG(3) << "input_scale size: " << input_scale
......@@ -80,7 +78,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
NeuronOperandType betaType;
betaType.type = NEURON_FLOAT32;
betaType.dimensionCount = 0;
NeuronModel_addOperand(model, &betaType); // 1: beta
NeuronModel_addOperand(model, &betaType); // Operand 1: beta
std::shared_ptr<Node> beta_node = nullptr;
beta_node = graph->Add(x_name + "_beta", dims_int32);
......@@ -88,7 +86,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
NeuronOperandType axisType;
axisType.type = NEURON_INT32;
axisType.dimensionCount = 0;
NeuronModel_addOperand(model, &axisType); // 2: axis
NeuronModel_addOperand(model, &axisType); // Operand 2: axis
std::shared_ptr<Node> axis_node = nullptr;
axis_node = graph->Add(x_name + "_axis", dims_int32);
......@@ -99,7 +97,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
outType.zeroPoint = 128;
outType.dimensionCount = x_dims.size();
outType.dimensions = &dims_x[0];
NeuronModel_addOperand(model, &outType); // 3: output
NeuronModel_addOperand(model, &outType); // Operand 3: output
std::shared_ptr<Node> out_node = nullptr;
out_node = graph->Add(out_name, dims_x);
VLOG(3) << "out_scale: " << out_scale;
......@@ -112,8 +110,9 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
axis_val[0] = axis;
NeuronModel_setOperandValue(
model, axis_node->index(), axis_val, sizeof(int32_t) * 1);
std::vector<uint32_t> addInIndex = {
x_node->index(), beta_node->index(), axis_node->index()};
std::vector<uint32_t> addInIndex = {x_node->index(), // 0: input
beta_node->index(), // 1: beta
axis_node->index()}; // 2: axis
std::vector<uint32_t> addOutIndex = {out_node->index()};
int neuron_errCode = NeuronModel_addOperation(model,
NEURON_SOFTMAX,
......
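To recap the operand layout the softmax bridge assembles above: operand 0 is the input tensor, operand 1 the float32 beta scalar, operand 2 the int32 axis scalar, and operand 3 the output tensor; NEURON_SOFTMAX is then added with inputs {input, beta, axis} and the single output operand.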
(Diffs for the remaining files are collapsed and not shown here.)