diff --git a/.gitignore b/.gitignore index dc0a38edcb563589ce3845803174598ca68ec396..be97cf2f3ff9878774913ecf8dab0130179bbf16 100644 --- a/.gitignore +++ b/.gitignore @@ -116,4 +116,10 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models metal/MobileNetDemo/MobileNetDemo/Resources +#flatbuffers +lite/model_parser/flatbuffers/framework_generated.h + build* + +# hiai libs +ai_ddk_lib* diff --git a/.gitmodules b/.gitmodules index 107036c70292cf33e945f45a8bac935dea554ece..37af6a724560144190539ab677c8f17524f5e645 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,6 @@ [submodule "third-party/protobuf-host"] path = third-party/protobuf-host url = https://github.com/protocolbuffers/protobuf.git +[submodule "third-party/flatbuffers"] + path = third-party/flatbuffers + url = https://github.com/google/flatbuffers.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ac227f0154feb64178d9a99b6784bfd6db40d50..55375994031850d93caa89ec7050a9e8e657d04f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,7 +106,8 @@ lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kerne lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF) # cv build options lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF) -lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." ON) +lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." OFF) +lite_option(CUDA_WITH_FP16 "Compile with cuda half support" OFF) lite_option(LITE_WITH_ARM_CLANG "when arm lang is clang, its ON." OFF) # TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter. @@ -168,6 +169,7 @@ if(LITE_WITH_RKNPU) include(device/rknpu) endif() +include(external/flatbuffers) # for mobile if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake index 4fc59ccd62671c5862a298832b1ec03d4e96d05a..e6193e0bb3c93292d2264501fc4d5739ff8766ee 100644 --- a/cmake/cross_compiling/android.cmake +++ b/cmake/cross_compiling/android.cmake @@ -35,7 +35,11 @@ endif() if(NOT DEFINED ANDROID_API_LEVEL) set(ANDROID_API_LEVEL "23") if(ARM_TARGET_ARCH_ABI STREQUAL "armv7") - set(ANDROID_API_LEVEL "22") + if(LITE_WITH_NPU AND NOT LITE_ON_TINY_PUBLISH) + set(ANDROID_API_LEVEL "24") # HIAI DDK depends on android-24 + else() + set(ANDROID_API_LEVEL "22") + endif() endif() endif() diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index e7df3f0fd6f0b0efcaf9cd859df5fb84a0cadfc4..eb8e26218ad1d8adc920b1834abd9ba10669a3e9 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -2,6 +2,10 @@ if(NOT LITE_WITH_CUDA) return() endif() +if(WITH_CUDA_FP16) + add_definitions("-DCUDA_WITH_FP16") +endif() + set(paddle_known_gpu_archs "30 35 50 52 60 61 70") set(paddle_known_gpu_archs7 "30 35 50 52") set(paddle_known_gpu_archs8 "30 35 50 52 53 60 61 62") @@ -167,6 +171,10 @@ elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x add_definitions("-DPADDLE_CUDA_BINVER=\"100\"") endif() +if (CUDA_WITH_FP16) + STRING(REGEX REPLACE "30|35|50|52" "" paddle_known_gpu_archs ${paddle_known_gpu_archs}) +endif() + include_directories(${CUDA_INCLUDE_DIRS}) if(NOT WITH_DSO) if(WIN32) diff --git a/cmake/device/xpu.cmake b/cmake/device/xpu.cmake index 823048552f3cb5f05375e97e94cd5b5ad63e7563..16fc7dcf4191a6b2a145d4d6e70e915fe5321a6b 100644 --- a/cmake/device/xpu.cmake +++ b/cmake/device/xpu.cmake @@ -39,7 +39,7 @@ else() endif() find_library(XPU_SDK_XPU_RT_FILE NAMES xpurt - PATHS 
${XPU_SDK_ROOT}/XTDK/shlib + PATHS ${XPU_SDK_ROOT}/XTDK/runtime/shlib ${XPU_SDK_ROOT}/XTDK/shlib # libxpurt.so may have been moved to XTDK/runtime/shlib NO_DEFAULT_PATH) if(NOT XPU_SDK_XPU_RT_FILE) diff --git a/cmake/external/flatbuffers.cmake b/cmake/external/flatbuffers.cmake new file mode 100644 index 0000000000000000000000000000000000000000..12c6b162f686f0c08f1c90610767b3508130d0da --- /dev/null +++ b/cmake/external/flatbuffers.cmake @@ -0,0 +1,116 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +# Introduce variables: +# * CMAKE_INSTALL_LIBDIR +INCLUDE(GNUInstallDirs) +SET(LIBDIR "lib") +if(CMAKE_INSTALL_LIBDIR MATCHES ".*lib64$") + SET(LIBDIR "lib64") +endif() + +SET(FLATBUFFERS_PREFIX_DIR ${THIRD_PARTY_PATH}/flatbuffers) +SET(FLATBUFFERS_SOURCES_DIR ${CMAKE_SOURCE_DIR}/third-party/flatbuffers) +SET(FLATBUFFERS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/flatbuffers) +SET(FLATBUFFERS_INCLUDE_DIR "${FLATBUFFERS_SOURCES_DIR}/include" CACHE PATH "flatbuffers include directory." FORCE) +IF(WIN32) + set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE) +ELSE(WIN32) + set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.a" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE) +ENDIF(WIN32) + +INCLUDE_DIRECTORIES(${FLATBUFFERS_INCLUDE_DIR}) + +if(NOT HOST_CXX_COMPILER) + set(HOST_CXX_COMPILER ${CMAKE_CXX_COMPILER}) + set(HOST_C_COMPILER ${CMAKE_C_COMPILER}) +endif() + +SET(OPTIONAL_ARGS "-DCMAKE_CXX_COMPILER=${HOST_CXX_COMPILER}" + "-DCMAKE_C_COMPILER=${HOST_C_COMPILER}") + +ExternalProject_Add( + extern_flatbuffers + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/google/flatbuffers.git" + GIT_TAG "v1.12.0" + SOURCE_DIR ${FLATBUFFERS_SOURCES_DIR} + PREFIX ${FLATBUFFERS_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DBUILD_STATIC_LIBS=ON + -DCMAKE_INSTALL_PREFIX=${FLATBUFFERS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_TESTING=OFF + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR} + -DFLATBUFFERS_BUILD_TESTS=OFF + ${CROSS_COMPILE_CMAKE_ARGS} + ${OPTIONAL_ARGS} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${FLATBUFFERS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} +) +IF(WIN32) + IF(NOT EXISTS "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib") + add_custom_command(TARGET extern_flatbuffers POST_BUILD + COMMAND cmake -E copy ${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/flatbuffers_static.lib ${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib + ) + ENDIF() +ENDIF(WIN32) +ADD_LIBRARY(flatbuffers STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET flatbuffers PROPERTY IMPORTED_LOCATION ${FLATBUFFERS_LIBRARIES}) +ADD_DEPENDENCIES(flatbuffers extern_flatbuffers) + +SET(FLATBUFFERS_FLATC_EXECUTABLE ${FLATBUFFERS_INSTALL_DIR}/bin/flatc) + 
+function(register_generated_output file_name) + get_property(tmp GLOBAL PROPERTY FBS_GENERATED_OUTPUTS) + list(APPEND tmp ${file_name}) + set_property(GLOBAL PROPERTY FBS_GENERATED_OUTPUTS ${tmp}) +endfunction(register_generated_output) + +function(compile_flatbuffers_schema_to_cpp_opt TARGET SRC_FBS OPT) + if(FLATBUFFERS_BUILD_LEGACY) + set(OPT ${OPT};--cpp-std c++0x) + else() + # --cpp-std is defined by flatc default settings. + endif() + message(STATUS "`${SRC_FBS}`: add generation of C++ code with '${OPT}'") + get_filename_component(SRC_FBS_DIR ${SRC_FBS} PATH) + message(STATUS "SRC_FBS_DIR: ${SRC_FBS_DIR}") + string(REGEX REPLACE "\\.fbs$" "_generated.h" GEN_HEADER ${SRC_FBS}) + add_custom_command( + OUTPUT ${GEN_HEADER} + COMMAND "${FLATBUFFERS_FLATC_EXECUTABLE}" + --cpp --gen-mutable --gen-object-api --reflect-names + --cpp-ptr-type flatbuffers::unique_ptr # Used to test with C++98 STLs + ${OPT} + -I "${CMAKE_CURRENT_SOURCE_DIR}/tests/include_test" + -o "${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS_DIR}" + "${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS}" + DEPENDS flatbuffers + COMMENT "Run generation: '${GEN_HEADER}'") + register_generated_output(${GEN_HEADER}) + add_custom_target(${TARGET} ALL DEPENDS ${GEN_HEADER}) +endfunction() + +set(FRAMEWORK_FBS_DIR "lite/model_parser/flatbuffers") +set(FRAMEWORK_SCHEMA_PATH "${FRAMEWORK_FBS_DIR}/framework.fbs") +compile_flatbuffers_schema_to_cpp_opt(framework_fbs_header ${FRAMEWORK_SCHEMA_PATH} "--no-includes;--gen-compare;--force-empty") +include_directories(${FLATBUFFERS_INCLUDE_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS_DIR}) + diff --git a/lite/demo/cxx/train_demo/README.md b/docs/demo_guides/cpp_train_demo.md similarity index 82% rename from lite/demo/cxx/train_demo/README.md rename to docs/demo_guides/cpp_train_demo.md index 56f4513d45676a1deb51bfb93096db156ddd0449..c10f2091f9c14f6fc81563248c75e72abd713666 100644 --- a/lite/demo/cxx/train_demo/README.md +++ b/docs/demo_guides/cpp_train_demo.md @@ -1,8 +1,10 @@ +# C++ Train Demo -# Introduction - 我们都知道,PaddleLite可以做移动端预测,事实上PaddleLite支持在移动端做模型训练。本文给出使用PaddleLite做训练的例子,这一例子对应的任务是“波士顿房价预测”,又称作“fit-a-line”。 +## Introduction + +我们都知道,PaddleLite可以做移动端预测,事实上PaddleLite支持在移动端做模型训练。本文给出使用PaddleLite做训练的例子,这一例子对应的任务是“波士顿房价预测”,又称作“fit-a-line”。 - 你可以通过book库中的 +你可以通过book库中的 [文档](https://paddlepaddle.org.cn/documentation/docs/zh/user_guides/simple_case/fit_a_line/README.cn.html) 和 [源码](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line) @@ -10,18 +12,16 @@ 其使用线性回归(Linear Regression) 模型做建模。本文主要介绍如何将其迁移至Paddle-Lite进行训练。 -注:这是一篇使用C++ API做模型训练的教程,其他API暂时不支持训练功能。 - -# Requirements +## Requirements - 一部安卓手机,用于运行训练程序 -- 装了Paddle (version: 1.7.0) 的python +- 装了Paddle (version >= 1.7.0) 的python -# Quick start +## Quick start -## Step1 build paddle-lite +### Step1 build paddle-lite -请按照[paddle-lite官方文档](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#paddlelite) 的教程编译full_publish的paddle-lite lib。以Linux上编译为例,其具体的命令为: +请按照paddle-lite官方文档的教程编译full_publish的paddle-lite lib。以Linux上编译为例,其具体的命令为: ```shell ## 配置环境 @@ -51,7 +51,7 @@ cd Paddle-Lite Paddle-Lite/build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/cxx/lib/libpaddle_full_api_shared.so ``` -## Step2 编译lr_trainer +### Step2 编译lr_trainer ```shell cd Paddle-Lite/lite/demo/cxx/train_demo/cplus_train/ @@ -64,7 +64,7 @@ bin/ `-- demo_trainer ``` -## Step3 download model and run it! +### Step3 download model and run it! 
在你的笔记本电脑上,用usb连接到手机,开启开发者模式,在任意目录下执行: @@ -102,7 +102,7 @@ sample 8: Loss: 248.445 sample 9: Loss: 325.135 ``` -# 更多细节 +## 更多细节 上面提到的模型是直接下载得到的,如果你想自己生成,可以执行以下命令: ```shell @@ -125,9 +125,9 @@ md5sum fc_0.w_0: 2c7b3649b2a9cf7bcd19f8b256ce795d 如果你想生成自己的模型用于训练,可以参考`train.py`中保存模型的方式。 -# 与Paddle训练结果做校对 +## 与Paddle训练结果做校对 -## 前10个Loss值 +### 前10个Loss值 为了验证paddle与lite的一致性,我们控制模型参数一致、数据一致、batch size = 1的情况下,训练10个batch, 记录了二者的loss值。 @@ -171,11 +171,11 @@ sample 8: Loss: 248.445 sample 9: Loss: 325.135 ``` -## Loss 曲线 +### Loss 曲线 控制训练时的batch size为20,每个epoch对训练数据做全局shuffle,训练100个epoch后,paddle和lite的loss曲线对比如下。 -![lr_loss](image/lr_loss.png) +![lr_loss](../images/lr_loss.png) 如果想复现上述效果,paddle+python的运行命令为: diff --git a/docs/demo_guides/python_demo.md b/docs/demo_guides/python_demo.md index d6a7b15bd9be638ef586e6b589e35eecbf1613c2..59f81783c0b2e791f9623e84cf57c269cbb7d6f2 100644 --- a/docs/demo_guides/python_demo.md +++ b/docs/demo_guides/python_demo.md @@ -86,19 +86,28 @@ config.set_model_from_file(/YOU_MODEL_PATH/mobilenet_v1_opt.nb) predictor = create_paddle_predictor(config) ``` -(3) 设置输入数据 +(3) 从图片读入数据 + +```python +image = Image.open('./example.jpg') +resized_image = image.resize((224, 224), Image.BILINEAR) +image_data = np.array(resized_image).flatten().tolist() +``` + +(4) 设置输入数据 + ```python input_tensor = predictor.get_input(0) input_tensor.resize([1, 3, 224, 224]) -input_tensor.set_float_data([1.] * 3 * 224 * 224) +input_tensor.set_float_data(image_data) ``` -(4) 执行预测 +(5) 执行预测 ```python predictor.run() ``` -(5) 得到输出数据 +(6) 得到输出数据 ```python output_tensor = predictor.get_output(0) print(output_tensor.shape()) diff --git a/lite/demo/cxx/train_demo/image/lr_loss.png b/docs/images/lr_loss.png similarity index 100% rename from lite/demo/cxx/train_demo/image/lr_loss.png rename to docs/images/lr_loss.png diff --git a/docs/index.rst b/docs/index.rst index c241f091ed2cae906879f98b769bc6b7ce830fe1..b2fba7daba51c68207af27e249559c18ab10235f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -59,7 +59,14 @@ Welcome to Paddle-Lite's documentation! demo_guides/baidu_xpu demo_guides/rockchip_npu demo_guides/mediatek_apu - + +.. toctree:: + :maxdepth: 1 + :caption: 训练示例(预览) + :name: sec-train_demo_guides + + demo_guides/cpp_train_demo + .. 
toctree:: :maxdepth: 1 :caption: API文档 diff --git a/docs/user_guides/Compile/iOS.md b/docs/user_guides/Compile/iOS.md index 355cc11875ce8f8db891fb843d2f1624180b71ff..60375ad1085dfac090442f9c0dad86cf71b64c9e 100644 --- a/docs/user_guides/Compile/iOS.md +++ b/docs/user_guides/Compile/iOS.md @@ -61,7 +61,7 @@ inference_lite_lib.ios64.armv8 iOS预测库和头文件 - 裁剪预测库方法(只编译模型中的kernel&OP,降低预测库体积): ```shell -./lite/tools/build_android.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir +./lite/tools/build_ios.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir ``` ```shell --with_strip: (OFF|ON); 是否根据输入模型裁剪预测库,默认为OFF diff --git a/docs/user_guides/model_optimize_tool.md b/docs/user_guides/model_optimize_tool.md index fed728cb0e06c9758a0497a9cbb93d7edf39bda7..4c80d638d224d294e247ad3f5300498dd536be62 100644 --- a/docs/user_guides/model_optimize_tool.md +++ b/docs/user_guides/model_optimize_tool.md @@ -21,11 +21,11 @@ pip install paddlelite - 方法二: 下载opt可执行文件 从[release界面](https://github.com/PaddlePaddle/Paddle-Lite/releases),选择当前预测库对应版本的`opt`转化工具 -本文提供`release/v2.6`和`release/v2.2.0`版本的优化工具下载 +本文提供`release/v2.6.1`和`release/v2.2.0`版本的优化工具下载 |版本 | Linux | MacOS| |---|---|---| -| `release/v2.3`| [opt](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt) | [opt_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt_mac) | +| `release/v2.6.1` | [opt](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/opt/opt) | [opt_mac](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/opt/opt_mac) | |`release/v2.2.0` | [model_optimize_tool](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool) | [model_optimize_tool_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool_mac) | - 方法三: 源码编译opt diff --git a/docs/user_guides/tutorial.md b/docs/user_guides/tutorial.md index 338449bfcb92e4029763c4357eb6d1fd5b820272..ee156038a6ea144921258734c92e9a2ea757d6ec 100644 --- a/docs/user_guides/tutorial.md +++ b/docs/user_guides/tutorial.md @@ -49,4 +49,4 @@ $ ./opt \ ## 五. 
测试工具 -为了使您更好的了解并使用Lite框架,我们向有进一步使用需求的用户开放了 [Debug工具](debug#debug) 和 [Profile工具](debug#profiler)。Lite Model Debug Tool可以用来查找Lite框架与PaddlePaddle框架在执行预测时模型中的对应变量值是否有差异,进一步快速定位问题Op,方便复现与排查问题。Profile Monitor Tool可以帮助您了解每个Op的执行时间消耗,其会自动统计Op执行的次数,最长、最短、平均执行时间等等信息,为性能调优做一个基础参考。您可以通过 [相关专题](debug) 了解更多内容。 +为了使您更好的了解并使用Lite框架,我们向有进一步使用需求的用户开放了 [Debug工具](debug) 和 [Profile工具](debug)。Lite Model Debug Tool可以用来查找Lite框架与PaddlePaddle框架在执行预测时模型中的对应变量值是否有差异,进一步快速定位问题Op,方便复现与排查问题。Profile Monitor Tool可以帮助您了解每个Op的执行时间消耗,其会自动统计Op执行的次数,最长、最短、平均执行时间等等信息,为性能调优做一个基础参考。您可以通过 [相关专题](debug) 了解更多内容。 diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index ff4d00dbb1051320f817c8220a11a77edde7fb05..eeea3b3adf4caf2e3ea57eb365c32f24626851e6 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -48,11 +48,13 @@ if (WITH_TESTING) endif() if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "GoogleNet_inference.tar.gz") - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz") + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "step_rnn.tar.gz") + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "bert.tar.gz") + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "ernie.tar.gz") endif() endif() @@ -242,7 +244,6 @@ if (LITE_WITH_X86) add_dependencies(publish_inference_x86_cxx_lib test_model_bin) add_custom_target(publish_inference_x86_cxx_demos ${TARGET} - COMMAND rm -rf "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_light_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_light" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_full_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_full" diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 38a698d2ef608435b5aaa8274958ee6b8c7a8e03..0a8cf165996c6f1d3948cd29e3c0562b23570561 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -2,7 +2,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR (NOT LITE_WITH_LOG)) lite_cc_library(place SRCS paddle_place.cc DEPS logging) else() lite_cc_library(place SRCS paddle_place.cc DEPS glog) -endif(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) +endif() if (LITE_ON_TINY_PUBLISH) set(CMAKE_CXX_FLAGS_RELEASE "-Os -DNDEBUG") @@ -15,8 +15,9 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH #full api dynamic library lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc DEPS paddle_api paddle_api_light paddle_api_full) - add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto) - target_link_libraries(paddle_full_api_shared framework_proto) + target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files}) + add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto op_registry framework_fbs_header) + target_link_libraries(paddle_full_api_shared framework_proto op_registry) if(LITE_WITH_X86) add_dependencies(paddle_full_api_shared xxhash) target_link_libraries(paddle_full_api_shared xxhash) @@ -70,7 +71,7 @@ else() 
set(TARGET_COMIPILE_FLAGS "${TARGET_COMIPILE_FLAGS} -flto") endif() set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "${TARGET_COMIPILE_FLAGS}") - add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) + add_dependencies(paddle_light_api_shared op_list_h kernel_list_h framework_fbs_header) if (LITE_WITH_NPU) # Need to add HIAI runtime libs (libhiai.so) dependency target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs}) @@ -368,6 +369,9 @@ endif() if (LITE_WITH_PYTHON) add_subdirectory(python) + # add library for opt_base + lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils) + add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h) endif() if (LITE_ON_TINY_PUBLISH) @@ -375,9 +379,6 @@ if (LITE_ON_TINY_PUBLISH) endif() -# add library for opt_base -lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils) -add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h) if (LITE_ON_MODEL_OPTIMIZE_TOOL) message(STATUS "Compiling opt") diff --git a/lite/api/android/jni/native/CMakeLists.txt b/lite/api/android/jni/native/CMakeLists.txt index d46e9f7cdec1cf422340ff11165ee166c7520bab..2929e24117c616a99ff4e078fd77fe8827186cb1 100644 --- a/lite/api/android/jni/native/CMakeLists.txt +++ b/lite/api/android/jni/native/CMakeLists.txt @@ -17,6 +17,7 @@ if (NOT LITE_ON_TINY_PUBLISH) # Unlike static library, module library has to link target to be able to work # as a single .so lib. target_link_libraries(paddle_lite_jni ${lib_DEPS} ${arm_kernels} ${npu_kernels}) + add_dependencies(paddle_lite_jni framework_fbs_header) if (LITE_WITH_NPU) # Strips the symbols of our protobuf functions to fix the conflicts during # loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so) @@ -31,7 +32,7 @@ else() endif() set_target_properties(paddle_lite_jni PROPERTIES COMPILE_FLAGS ${TARGET_COMIPILE_FLAGS}) target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc) - add_dependencies(paddle_lite_jni op_list_h kernel_list_h) + add_dependencies(paddle_lite_jni op_list_h kernel_list_h framework_fbs_header) if (LITE_WITH_NPU) # Need to add HIAI runtime libs (libhiai.so) dependency target_link_libraries(paddle_lite_jni ${npu_builder_libs} ${npu_runtime_libs}) diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index 1ed0f8a0e4c0d1bffbc8c8cd75261208a80ed546..505f42f98725a595e3a2e0c0b412d11ae7ad709e 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -13,18 +13,24 @@ // limitations under the License. 
#include "lite/api/cxx_api.h" + #include #include #include #include #include #include + #include "lite/api/paddle_use_passes.h" #include "lite/utils/io.h" namespace paddle { namespace lite { +std::vector GetAllOps() { + return OpLiteFactory::Global().GetAllOps(); +} + void Predictor::SaveModel(const std::string &dir, lite_api::LiteModelType model_type, bool record_info) { @@ -326,10 +332,8 @@ void Predictor::Build(const std::shared_ptr &desc, } } if (is_quantized_model) { -#ifdef LITE_WITH_ARM inner_places.insert(inner_places.begin(), Place{TARGET(kARM), PRECISION(kInt8)}); -#endif } Program program(*desc.get(), scope_, inner_places); diff --git a/lite/api/cxx_api.h b/lite/api/cxx_api.h index 62e29dd71cca1692546517762e3dba72497acb6a..8206912bb6621764dc5d5d3b0fb5a0eae19d862c 100644 --- a/lite/api/cxx_api.h +++ b/lite/api/cxx_api.h @@ -41,6 +41,8 @@ static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] = ".tailored_kernels_source_list"; static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list"; +std::vector GetAllOps(); + /* * Predictor for inference, input a model, it will optimize and execute it. */ diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc index 5fca2c9d70c18dfc731c720ba788f18e73c17742..db4d8a98ff86cd4a85dfbb2f9a8e25da0ea4390b 100644 --- a/lite/api/cxx_api_impl.cc +++ b/lite/api/cxx_api_impl.cc @@ -52,12 +52,10 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { if (!status_is_cloned_) { #ifdef LITE_WITH_MLU Env::Init(); - lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(), - config.mlu_core_number(), - config.mlu_use_first_conv(), - config.mlu_first_conv_mean(), - config.mlu_first_conv_std(), - config.mlu_input_layout()); + lite::TargetWrapperMlu::SetMLURunMode(config.mlu_core_version(), + config.mlu_core_number(), + config.mlu_input_layout(), + config.mlu_firstconv_param()); #endif // LITE_WITH_MLU auto use_layout_preprocess_pass = config.model_dir().find("OPENCL_PRE_PRECESS"); @@ -75,6 +73,10 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { mode_ = config.power_mode(); threads_ = config.threads(); +#ifdef LITE_WITH_NPU + Context::SetSubgraphModelCacheDir( + config.subgraph_model_cache_dir()); +#endif #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) int num_threads = config.x86_math_library_num_threads(); diff --git a/lite/api/light_api.cc b/lite/api/light_api.cc index 5f57ed40ddb762f2d80fce2327a01100bae741d9..f0d1fb96fe4dfd5f8fa57808a2098cbc42db6a11 100644 --- a/lite/api/light_api.cc +++ b/lite/api/light_api.cc @@ -15,8 +15,6 @@ #include "lite/api/light_api.h" #include #include -#include "paddle_use_kernels.h" // NOLINT -#include "paddle_use_ops.h" // NOLINT namespace paddle { namespace lite { diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc index 938079c51285bc8e8a7a25cd9e2d3682a739b567..cd640581a7a631aad733fd377bfc405869e90322 100644 --- a/lite/api/paddle_api.cc +++ b/lite/api/paddle_api.cc @@ -13,6 +13,9 @@ // limitations under the License. 
#include "lite/api/paddle_api.h" + +#include + #include "lite/core/context.h" #include "lite/core/device_info.h" #include "lite/core/target_wrapper.h" @@ -21,6 +24,13 @@ #ifdef LITE_WITH_CUDA #include "lite/backends/cuda/target_wrapper.h" #endif +#ifdef LITE_WITH_XPU +#include "lite/backends/xpu/target_wrapper.h" +#endif + +#ifdef LITE_WITH_MLU +#include "lite/backends/mlu/target_wrapper.h" +#endif namespace paddle { namespace lite_api { @@ -106,6 +116,13 @@ void Tensor::CopyFromCpu(const T *src_data) { data, src_data, num * sizeof(T), lite::IoDirection::HtoD, *io_stream_); #else LOG(FATAL) << "Please compile the lib with CUDA."; +#endif + } else if (type == TargetType::kMLU) { +#ifdef LITE_WITH_MLU + lite::TargetWrapperMlu::MemcpySync( + data, src_data, num * sizeof(T), lite::IoDirection::HtoD); +#else + LOG(FATAL) << "Please compile the lib with MLU."; #endif } else { LOG(FATAL) << "The CopyFromCpu interface just support kHost, kARM, kCUDA"; @@ -127,6 +144,13 @@ void Tensor::CopyToCpu(T *data) const { lite::TargetWrapperCuda::StreamSync(*io_stream_); #else LOG(FATAL) << "Please compile the lib with CUDA."; +#endif + } else if (type == TargetType::kMLU) { +#ifdef LITE_WITH_MLU + lite::TargetWrapperMlu::MemcpySync( + data, src_data, num * sizeof(T), lite::IoDirection::DtoH); +#else + LOG(FATAL) << "Please compile the lib with MLU."; #endif } else { LOG(FATAL) << "The CopyToCpu interface just support kHost, kARM, kCUDA"; @@ -148,6 +172,11 @@ template void Tensor::CopyFromCpu(const int64_t *); template void Tensor::CopyFromCpu(const float *); template void Tensor::CopyFromCpu(const int8_t *); +template void Tensor::CopyFromCpu(const int *); +template void Tensor::CopyFromCpu(const int64_t *); +template void Tensor::CopyFromCpu(const float *); +template void Tensor::CopyFromCpu(const int8_t *); + template void Tensor::CopyToCpu(float *) const; template void Tensor::CopyToCpu(int *) const; template void Tensor::CopyToCpu(int8_t *) const; @@ -238,13 +267,9 @@ void CxxConfig::set_mlu_core_number(int core_number) { void CxxConfig::set_mlu_input_layout(DataLayoutType layout) { mlu_input_layout_ = layout; } -void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) { - mlu_use_first_conv_ = use_first_conv; -} -void CxxConfig::set_mlu_first_conv_mean(const std::vector &mean) { +void CxxConfig::set_mlu_firstconv_param(const std::vector &mean, + const std::vector &std) { mlu_first_conv_mean_ = mean; -} -void CxxConfig::set_mlu_first_conv_std(const std::vector &std) { mlu_first_conv_std_ = std; } lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const { @@ -252,18 +277,15 @@ lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const { } int CxxConfig::mlu_core_number() const { return mlu_core_number_; } DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; } -bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; } -const std::vector &CxxConfig::mlu_first_conv_mean() const { - return mlu_first_conv_mean_; -} -const std::vector &CxxConfig::mlu_first_conv_std() const { - return mlu_first_conv_std_; +std::pair, std::vector> +CxxConfig::mlu_firstconv_param() const { + return std::make_pair(mlu_first_conv_mean_, mlu_first_conv_std_); } #endif void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) { #ifdef LITE_WITH_XPU - lite::Context::SetWorkspaceL3Size(l3_size); + lite::TargetWrapperXPU::workspace_l3_size_per_thread = l3_size; #else LOG(WARNING) << "The invoking of the function " "'set_xpu_workspace_l3_size_per_thread' is ignored, please " @@ 
-273,7 +295,7 @@ void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) { void CxxConfig::set_xpu_dev_per_thread(int dev_no) { #ifdef LITE_WITH_XPU - lite::Context::SetDev(dev_no); + lite::TargetWrapperXPU::SetDev(dev_no); #else LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is " "ignored, please rebuild it with LITE_WITH_XPU=ON."; @@ -282,7 +304,7 @@ void CxxConfig::set_xpu_dev_per_thread(int dev_no) { void CxxConfig::set_xpu_multi_encoder_precision(const std::string &precision) { #ifdef LITE_WITH_XPU - lite::Context::_multi_encoder_precision = precision; + lite::TargetWrapperXPU::multi_encoder_precision = precision; #else LOG(WARNING) << "The invoking of the function " "'set_xpu_multi_encoder_precision' is " diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index 9cf2e580bf7927b17bc62fb1c524a977ee806307..9c8e18f4c8f0505b9d909d4cf81b4dec6feece77 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -21,6 +21,7 @@ #define PADDLE_LITE_API_H_ #include #include +#include #include #include "paddle_place.h" // NOLINT @@ -174,9 +175,8 @@ class LITE_API CxxConfig : public ConfigBase { lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270}; int mlu_core_number_{1}; DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)}; - bool mlu_use_first_conv_{false}; - std::vector mlu_first_conv_mean_; - std::vector mlu_first_conv_std_; + std::vector mlu_first_conv_mean_{}; + std::vector mlu_first_conv_std_{}; #endif public: @@ -232,24 +232,22 @@ class LITE_API CxxConfig : public ConfigBase { void set_mlu_core_version(lite_api::MLUCoreVersion core_version); // set MLU core number, which is used when compiling MLU kernels void set_mlu_core_number(int core_number); - // set MLU input layout. User can specify layout of input data to be NHWC, - // default is NCHW - void set_mlu_input_layout(DataLayoutType layout); // whether use MLU's first conv kernel. First conv is a special kernel // provided by MLU, its input is uint8, and also needs two 3-dimentional // vectors which save all inputs' mean and std values - void set_mlu_use_first_conv(bool use_first_conv); - // set the 3-dimentional mean vector used by MLU's first conv - void set_mlu_first_conv_mean(const std::vector& mean); - // set the 3-dimentional std vector used by MLU's first conv - void set_mlu_first_conv_std(const std::vector& std); + // set the 3-dimentional mean vector and 3-dimentional std vector used by + // MLU's first conv + void set_mlu_firstconv_param(const std::vector& mean, + const std::vector& std); + // set MLU input layout. 
User can specify layout of input data to be NHWC, + // default is NCHW + void set_mlu_input_layout(DataLayoutType layout); lite_api::MLUCoreVersion mlu_core_version() const; int mlu_core_number() const; DataLayoutType mlu_input_layout() const; - bool mlu_use_first_conv() const; - const std::vector& mlu_first_conv_mean() const; - const std::vector& mlu_first_conv_std() const; + // std::pair + std::pair, std::vector> mlu_firstconv_param() const; #endif // XPU only, set the size of the workspace memory from L3 cache for the diff --git a/lite/api/paddle_api_test.cc b/lite/api/paddle_api_test.cc index 832867df079efa1baebf08da4c0d8e37958460f1..4edd61277059e20f7dfb1b8410a784fd04d85502 100644 --- a/lite/api/paddle_api_test.cc +++ b/lite/api/paddle_api_test.cc @@ -15,8 +15,11 @@ #include "lite/api/paddle_api.h" #include #include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" #include "lite/utils/cp_logging.h" #include "lite/utils/io.h" + DEFINE_string(model_dir, "", ""); namespace paddle { diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index 485bd10770d6e5a29963f336dfdf6d47302ccbc0..2ec4965d3d526c82c41b51954f9564488c5126e1 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -55,6 +55,8 @@ USE_MIR_PASS(apu_subgraph_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass); USE_MIR_PASS(lite_scale_activation_fuse_pass); USE_MIR_PASS(__xpu__resnet_fuse_pass); +USE_MIR_PASS(__xpu__resnet_cbam_fuse_pass); USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass); USE_MIR_PASS(__xpu__fc_fuse_pass); +USE_MIR_PASS(__xpu__mmdnn_fuse_pass); diff --git a/lite/api/test_yolov3_lite_bm.cc b/lite/api/test_yolov3_lite_bm.cc index d70ecf3c03955286244aa13cfe65f19569a55930..ded851d93313c3e155dd7f8860eee7446e56e715 100644 --- a/lite/api/test_yolov3_lite_bm.cc +++ b/lite/api/test_yolov3_lite_bm.cc @@ -59,9 +59,9 @@ void TestModel(const std::vector& valid_places) { } auto* image_tensor = predictor.GetInput(1); image_tensor->Resize(DDim(std::vector({1, 2}))); - data = image_tensor->mutable_data(); - data[0] = FLAGS_im_height; - data[1] = FLAGS_im_width; + auto* data_1 = image_tensor->mutable_data(); + data_1[0] = FLAGS_im_height; + data_1[1] = FLAGS_im_width; for (int i = 0; i < FLAGS_warmup; ++i) { predictor.Run(); diff --git a/lite/backends/arm/math/CMakeLists.txt b/lite/backends/arm/math/CMakeLists.txt index d50b46d5bd26e3186e5def2100042e5b22ce4977..9cf8f6a507401656bb0df214bd463a09fd82a61d 100644 --- a/lite/backends/arm/math/CMakeLists.txt +++ b/lite/backends/arm/math/CMakeLists.txt @@ -127,5 +127,6 @@ if (NOT HAS_ARM_MATH_LIB_DIR) split_merge_lod_tenosr.cc reduce_prod.cc lstm.cc + clip.cc DEPS ${lite_kernel_deps} context tensor) endif() diff --git a/lite/backends/arm/math/activation.cc b/lite/backends/arm/math/activation.cc index 8e94e212fcb5ff83e8dbfa9d70652cbdaca50656..01f25cbd36d327f7a3c252fdc675262d39748318 100644 --- a/lite/backends/arm/math/activation.cc +++ b/lite/backends/arm/math/activation.cc @@ -763,24 +763,6 @@ void act_thresholded_relu( } } -#ifdef LITE_WITH_TRAIN -template <> -void act_square_grad(const float* din, - const float* dout_grad, - float* din_grad, - int size, - int threads) { - const float* ptr_out_grad = dout_grad; - float* ptr_in_grad = din_grad; - for (int i = 0; i < size; ++i) { - ptr_in_grad[0] = ptr_out_grad[0] * 2.0 * din[0]; - ptr_out_grad++; - ptr_in_grad++; - din++; - } -} -#endif - } // namespace math } // namespace arm } // namespace lite diff 
--git a/lite/backends/arm/math/activation.h b/lite/backends/arm/math/activation.h index 0a849e9ec711a8c554388d9b69a25b79a7b392ec..b0147040cd11a888ec045948f0914a13aa932a2f 100644 --- a/lite/backends/arm/math/activation.h +++ b/lite/backends/arm/math/activation.h @@ -90,12 +90,6 @@ template void act_thresholded_relu( const T* din, T* dout, int size, float threshold, int threads); -#ifdef LITE_WITH_TRAIN -template -void act_square_grad( - const T* din, const T* dout_grad, T* din_grad, int size, int threads); -#endif - } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/clip.cc b/lite/backends/arm/math/clip.cc new file mode 100644 index 0000000000000000000000000000000000000000..8f8b48db53b9fe1b50a0832a64b3849faa417fb8 --- /dev/null +++ b/lite/backends/arm/math/clip.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/arm/math/clip.h" +#include +#include +#include +#include "lite/backends/arm/math/funcs.h" +#include "lite/backends/arm/math/saturate.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void clip_kernel_fp32( + const float* input, int64_t num, float min, float max, float* output) { + float tmp; + for (int64_t i = 0; i < num; i++) { + tmp = *input; + tmp = tmp > min ? tmp : min; + *output = tmp < max ? tmp : max; + input++; + output++; + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/clip.h b/lite/backends/arm/math/clip.h new file mode 100644 index 0000000000000000000000000000000000000000..cd74a8880abfb660c13c630ca708fa9c8f849d12 --- /dev/null +++ b/lite/backends/arm/math/clip.h @@ -0,0 +1,33 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include "lite/operators/op_params.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void clip_kernel_fp32( + const float* input, int64_t num, float min, float max, float* output); +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/elementwise.cc b/lite/backends/arm/math/elementwise.cc index 4d08c1e957d43b5b748ffdb90fd14a07a61d0183..04373992e4802a0b0c2529daac851e00ebcb56cf 100644 --- a/lite/backends/arm/math/elementwise.cc +++ b/lite/backends/arm/math/elementwise.cc @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #include "lite/backends/arm/math/elementwise.h" +#include #include #include "lite/backends/arm/math/funcs.h" @@ -1254,6 +1254,19 @@ void elementwise_max_relu_broadcast(const float* dinx, } } +template <> +void elementwise_div(const int64_t* dinx, + const int64_t* diny, + int64_t* dout, + int num) { + for (int i = 0; i < num; i++) { + *dout = *dinx / *diny; + dout++; + dinx++; + diny++; + } +} + template <> void elementwise_div(const float* dinx, const float* diny, @@ -1306,6 +1319,28 @@ void elementwise_div(const float* dinx, } } +template <> +void elementwise_div_broadcast(const int64_t* dinx, + const int64_t* diny, + int64_t* dout, + int batch, + int channels, + int num) { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const int64_t* din_ptr = dinx + offset; + const int64_t diny_data = diny[j]; + int64_t* dout_ptr = dout + offset; + for (int p = 0; p < num; p++) { + *dout_ptr = *din_ptr / diny_data; + dout_ptr++; + din_ptr++; + } + } + } +} + template <> void elementwise_div_broadcast(const float* dinx, const float* diny, @@ -1541,6 +1576,87 @@ void elementwise_div_relu_broadcast(const float* dinx, } } +template +void elementwise_mod_broadcast( + const T* dinx, const T* diny, T* dout, int batch, int channels, int num) { +#pragma omp parallel for collapse(2) + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const T* din_ptr = dinx + offset; + const T diny_data = diny[j]; + T* dout_ptr = dout + offset; + + int cnt = num >> 2; + int remain = num % 4; + for (int k = 0; k < cnt; ++k) { + register T dinx0 = din_ptr[0]; + register T dinx1 = din_ptr[1]; + register T dinx2 = din_ptr[2]; + register T dinx3 = din_ptr[3]; + dout_ptr[0] = dinx0 % diny_data; + dout_ptr[1] = dinx1 % diny_data; + dout_ptr[2] = dinx2 % diny_data; + dout_ptr[3] = dinx3 % diny_data; + din_ptr += 4; + dout_ptr += 4; + } + if (remain > 0) { + for (int p = 0; p < remain; p++) { + *dout_ptr++ = *din_ptr++ % diny_data; + } + } + } + } +} + +template +void elementwise_mod(const T* dinx, const T* diny, T* dout, int num) { + int cnt = num >> 2; + int remain = num % 4; +#pragma omp parallel for + for (int i = 0; i < cnt; i++) { + const T* dinx_ptr = dinx + (i << 2); + const T* diny_ptr = diny + (i << 2); + T* dout_ptr = dout + (i << 2); + + register T dinx0 = dinx_ptr[0]; + register T dinx1 = dinx_ptr[1]; + register T dinx2 = dinx_ptr[2]; + register T dinx3 = dinx_ptr[3]; + + register T diny0 = diny_ptr[0]; + register T diny1 = diny_ptr[1]; + register T diny2 = diny_ptr[2]; + register T diny3 = diny_ptr[3]; + + dout_ptr[0] = dinx0 % diny0; + dout_ptr[1] 
= dinx1 % diny1; + dout_ptr[2] = dinx2 % diny2; + dout_ptr[3] = dinx3 % diny3; + } + if (remain > 0) { + const T* dinx_ptr = dinx + (cnt << 2); + const T* diny_ptr = diny + (cnt << 2); + T* dout_ptr = dout + (cnt << 2); + for (int i = 0; i < remain; i++) { + *dout_ptr++ = *dinx_ptr++ % *diny_ptr++; + } + } +} + +template void elementwise_mod(const int64_t* dinx, + const int64_t* diny, + int64_t* dout, + int num); + +template void elementwise_mod_broadcast(const int64_t* dinx, + const int64_t* diny, + int64_t* dout, + int batch, + int channels, + int num); + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/elementwise.h b/lite/backends/arm/math/elementwise.h index 06ecab08edcaf06614de94b99084be2ee80647aa..0b400fcce26c7d307777cc6e25d8d25e0d6234bc 100644 --- a/lite/backends/arm/math/elementwise.h +++ b/lite/backends/arm/math/elementwise.h @@ -253,6 +253,13 @@ template void elementwise_div_relu_broadcast( const T* dinx, const T* diny, T* dout, int batch, int channels, int num); +template +void elementwise_mod(const T* dinx, const T* diny, T* dout, int num); + +template +void elementwise_mod_broadcast( + const T* dinx, const T* diny, T* dout, int batch, int channels, int num); + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/funcs.h b/lite/backends/arm/math/funcs.h index e975160c97b6e7396ab208805a4d685586ac00c8..75dcc971b80e53c3874ffcbb108afdc0e0faa705 100644 --- a/lite/backends/arm/math/funcs.h +++ b/lite/backends/arm/math/funcs.h @@ -25,6 +25,7 @@ #include "lite/backends/arm/math/axpy.h" #include "lite/backends/arm/math/beam_search.h" #include "lite/backends/arm/math/box_coder.h" +#include "lite/backends/arm/math/clip.h" #include "lite/backends/arm/math/col_im_transform.h" #include "lite/backends/arm/math/concat.h" #include "lite/backends/arm/math/conv_block_utils.h" diff --git a/lite/backends/arm/math/softmax.cc b/lite/backends/arm/math/softmax.cc index 65d41b049123680f26674cc05d3c02172a260b31..b7f82e9f376e8b62195d884e8de19a142d76b316 100644 --- a/lite/backends/arm/math/softmax.cc +++ b/lite/backends/arm/math/softmax.cc @@ -531,7 +531,7 @@ void softmax_inner1_large_axis(const float* din, } float32x2_t vhmax = vmax_f32(vget_high_f32(vmax), vget_low_f32(vmax)); float max_data = std::max(vget_lane_f32(vhmax, 0), vget_lane_f32(vhmax, 1)); - for (j = 4 * j; j < axis_size; ++j) { + for (j = 4 * nn; j < axis_size; ++j) { max_data = std::max(max_data, din_max_ptr[0]); din_max_ptr++; } @@ -557,7 +557,7 @@ void softmax_inner1_large_axis(const float* din, float32x2_t vhsum = vadd_f32(vget_high_f32(vsum), vget_low_f32(vsum)); float sum_data = vget_lane_f32(vhsum, 0) + vget_lane_f32(vhsum, 1); - for (j = 4 * j; j < axis_size; ++j) { + for (j = 4 * nn; j < axis_size; ++j) { dout_sum_ptr[0] = expf(din_sum_ptr[0] - max_data); sum_data += dout_sum_ptr[0]; din_sum_ptr++; diff --git a/lite/backends/cuda/cuda_utils.h b/lite/backends/cuda/cuda_utils.h index 4c7cedaa97e22f74caebc5288fad8543f61bc88d..012004a65fa7d531ed85837e27b880c8c493ffca 100644 --- a/lite/backends/cuda/cuda_utils.h +++ b/lite/backends/cuda/cuda_utils.h @@ -41,6 +41,8 @@ << "CUDA: " << cudaGetErrorString(e); \ } +#define CUDA_POST_KERNEL_CHECK CUDA_CALL(cudaPeekAtLastError()) + #define CUBLAS_CALL(func) \ { \ auto e = (func); \ @@ -127,6 +129,10 @@ static const char* CudnnGetErrorInfo(cudnnStatus_t status) { return "CUDNN_STATUS_RUNTIME_IN_PROGRESS"; case CUDNN_STATUS_RUNTIME_FP_OVERFLOW: return "CUDNN_STATUS_RUNTIME_FP_OVERFLOW"; +#endif +#if 
CUDNN_VERSION_MIN(8, 0, 0) + case CUDNN_STATUS_VERSION_MISMATCH: + return "CUDNN_STATUS_VERSION_MISMATCH"; #endif } return "Unknown cudnn status"; diff --git a/lite/backends/cuda/math/CMakeLists.txt b/lite/backends/cuda/math/CMakeLists.txt index 9e33d38feedbe682f3c4d962b4ccb85b74af3a7b..7f96308a5dcaf5742bd5dcef7c2e5f146cdb7c59 100644 --- a/lite/backends/cuda/math/CMakeLists.txt +++ b/lite/backends/cuda/math/CMakeLists.txt @@ -13,6 +13,8 @@ nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps}) nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps}) nv_library(cuda_gemm SRCS gemm.cc DEPS ${cuda_static_deps}) nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps}) +nv_library(cuda_strided_gemm SRCS strided_gemm.cc DEPS ${cuda_static_deps}) +nv_library(cuda_sequence_padding SRCS sequence_padding.cu DEPS ${cuda_static_deps}) set ( math_cuda @@ -25,6 +27,8 @@ set ( cudnn_pool cuda_gemm cuda_batched_gemm + cuda_strided_gemm + cuda_sequence_padding ) set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda") diff --git a/lite/backends/cuda/math/cudnn_conv.cc b/lite/backends/cuda/math/cudnn_conv.cc index 19ace2762af7d2088d5235e20387d8a4d941be30..5db41302c0cb0133e3badad0b5fa167d2c88f9df 100644 --- a/lite/backends/cuda/math/cudnn_conv.cc +++ b/lite/backends/cuda/math/cudnn_conv.cc @@ -161,15 +161,17 @@ bool CudnnConv2D::create(const operators::ConvParam& param, search_func); } else { - CUDNN_CHECK( - cudnnGetConvolutionForwardAlgorithm(this->handle_, - this->input_desc_, - this->filter_desc_, - this->conv_desc_, - this->output_desc_, - this->preference_, - this->workspace_limit_bytes_, - &this->fwd_algo_)); + int requestedAlgoCount = 1; + int returnedAlgoCount; + CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(this->handle_, + this->input_desc_, + this->filter_desc_, + this->conv_desc_, + this->output_desc_, + requestedAlgoCount, + &returnedAlgoCount, + &this->algo_perf_)); + this->fwd_algo_ = this->algo_perf_.algo; } CUDNN_CHECK( cudnnGetConvolutionForwardWorkspaceSize(this->handle_, diff --git a/lite/backends/cuda/math/cudnn_conv.h b/lite/backends/cuda/math/cudnn_conv.h index f73f1db7b1785814b6e97f28c8624b76fa75f89c..a084edefa17a5882f7e6d67407e1f48a818e3407 100644 --- a/lite/backends/cuda/math/cudnn_conv.h +++ b/lite/backends/cuda/math/cudnn_conv.h @@ -81,6 +81,7 @@ class CudnnConv2DBase { cudaStream_t stream_; cudnnHandle_t handle_; cudnnConvolutionFwdAlgo_t fwd_algo_; + cudnnConvolutionFwdAlgoPerf_t algo_perf_; cudnnTensorDescriptor_t input_desc_; cudnnTensorDescriptor_t output_desc_; cudnnTensorDescriptor_t bias_desc_; @@ -98,8 +99,6 @@ class CudnnConv2DBase { const bool use_tensor_core_ = true; const size_t workspace_limit_bytes_ = 4 * 1024 * 1024; - const cudnnConvolutionFwdPreference_t preference_ = - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST; // For int8 Tensor temp_tensor_; diff --git a/lite/backends/cuda/math/sequence_padding.cu b/lite/backends/cuda/math/sequence_padding.cu new file mode 100644 index 0000000000000000000000000000000000000000..3a32be2a3446e420cac53a33506f141a001d61f0 --- /dev/null +++ b/lite/backends/cuda/math/sequence_padding.cu @@ -0,0 +1,166 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/math/sequence_padding.h" +#include "lite/backends/cuda/math/utils.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +enum CopyType { kSeqToPad, kPadToSeq }; + +template +__global__ void SequencePadKernel(T* dst, + const T* src, + const T* pad_value, + bool is_constant_pad, + const size_t* seq_offsets, + const int seq_num, + const int pad_seq_len, + const int step_width) { + size_t seq_idx = blockIdx.y; + size_t seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx]; + + size_t step_idx = blockIdx.x * blockDim.y + threadIdx.y; + size_t seq_data_offset = (seq_offsets[seq_idx] + step_idx) * step_width; + size_t pad_data_offset = (seq_idx * pad_seq_len + step_idx) * step_width; + T* dst_data = dst + (Type == kSeqToPad ? pad_data_offset : seq_data_offset); + const T* src_data = + src + (Type == kSeqToPad ? seq_data_offset : pad_data_offset); + + if (step_idx < seq_len) { + for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) { + dst_data[i] = src_data[i]; + } + } else if (step_idx < pad_seq_len && Type == kSeqToPad) { + for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) { + dst_data[i] = is_constant_pad ? pad_value[0] : pad_value[i]; + } + } +} + +template +void SequencePadding(T* pad_data, + const T* seq_data, + const T* pad_value_data, + bool is_constant_pad, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream) { + const int kBlockSize = 512; + /* At least use 32 threads to copy sequence_width elements, + * and at least 8 elements for each thread. + */ + size_t block_dim_x = + std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + size_t block_dim_y = kBlockSize / block_dim_x; + dim3 threads(block_dim_x, block_dim_y); + + size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = seq_num; + dim3 grid(grid_dim_x, grid_dim_y); + + SequencePadKernel<<>>( + pad_data, + seq_data, + pad_value_data, + is_constant_pad, + seq_offsets_data, + seq_num, + pad_seq_len, + step_width); + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); +} + +template +void SequenceUnpadding(T* seq_data, + const T* pad_data, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream) { + const int kBlockSize = 512; + /* At least use 32 threads to copy sequence_width elements, + * and at least 8 elements for each thread. 
+ */ + size_t block_dim_x = + std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + size_t block_dim_y = kBlockSize / block_dim_x; + dim3 threads(block_dim_x, block_dim_y); + + size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = seq_num; + dim3 grid(grid_dim_x, grid_dim_y); + + SequencePadKernel<<>>( + seq_data, + pad_data, + nullptr, + false, + seq_offsets_data, + seq_num, + pad_seq_len, + step_width); + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); +} + +template void SequencePadding(float* pad_data, + const float* seq_data, + const float* pad_value_data, + bool is_constant_pad, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +template void SequencePadding(half* pad_data, + const half* seq_data, + const half* pad_value_data, + bool is_constant_pad, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +template void SequenceUnpadding(float* seq_data, + const float* pad_data, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +template void SequenceUnpadding(half* seq_data, + const half* pad_data, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/sequence_padding.h b/lite/backends/cuda/math/sequence_padding.h new file mode 100644 index 0000000000000000000000000000000000000000..cfbac9b5bce2cad75174695ee85c28720a3eaf11 --- /dev/null +++ b/lite/backends/cuda/math/sequence_padding.h @@ -0,0 +1,51 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +void SequenceUnpadding(T* seq_data, + const T* pad_data, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +template +void SequencePadding(T* pad_data, + const T* seq_data, + const T* pad_value_data, + bool is_constant_pad, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/strided_gemm.cc b/lite/backends/cuda/math/strided_gemm.cc new file mode 100644 index 0000000000000000000000000000000000000000..91013d977702682a42050407f49356bf7445bcbd --- /dev/null +++ b/lite/backends/cuda/math/strided_gemm.cc @@ -0,0 +1,136 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/cuda/math/strided_gemm.h" + +#include + +#include "lite/core/device_info.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +bool StridedGemm::init(const bool trans_a, + const bool trans_b, + Context* ctx) { + if (cu_handle_ == nullptr) { + this->exe_stream_ = ctx->exec_stream(); + CUBLAS_CALL(cublasCreate(&cu_handle_)); + CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_)); + } + cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N; + return true; +} + +template <> +bool StridedGemm::run(const float alpha, + const float beta, + const int m, + const int n, + const int k, + const float* a_data, + const float* b_data, + float* c_data, + const int batch_size, + const int64_t stride_a, + const int64_t stride_b) { + lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m; + ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? n : k; + ldc_ = n; + m_ = m; + n_ = n; + k_ = k; + const int64_t stride_c = m_ * n_; + CUBLAS_CALL(cublasGemmStridedBatchedEx(cu_handle_, + cu_trans_b_, + cu_trans_a_, + n_, + m_, + k_, + &alpha, + b_data, + CUDA_R_32F, + ldb_, + stride_b, + a_data, + CUDA_R_32F, + lda_, + stride_a, + &beta, + c_data, + CUDA_R_32F, + ldc_, + stride_c, + batch_size, + CUDA_R_32F, + algo_)); + return true; +} + +template <> +bool StridedGemm::run(const half alpha, + const half beta, + const int m, + const int n, + const int k, + const half* a_data, + const half* b_data, + half* c_data, + const int batch_size, + const int64_t stride_a, + const int64_t stride_b) { + lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m; + ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? n : k; + ldc_ = n; + m_ = m; + n_ = n; + k_ = k; + const int64_t stride_c = m_ * n_; + CUBLAS_CALL(cublasGemmStridedBatchedEx(cu_handle_, + cu_trans_b_, + cu_trans_a_, + n_, + m_, + k_, + &alpha, + b_data, + CUDA_R_16F, + ldb_, + stride_b, + a_data, + CUDA_R_16F, + lda_, + stride_a, + &beta, + c_data, + CUDA_R_16F, + ldc_, + stride_c, + batch_size, + CUDA_R_16F, + algo_)); + return true; +} + +template class StridedGemm; +template class StridedGemm; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/strided_gemm.h b/lite/backends/cuda/math/strided_gemm.h new file mode 100644 index 0000000000000000000000000000000000000000..4a0fe7143a2569eda36d203d9c905f2a4a0c772c --- /dev/null +++ b/lite/backends/cuda/math/strided_gemm.h @@ -0,0 +1,72 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include + +#include "lite/api/paddle_place.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/context.h" +#include "lite/core/target_wrapper.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +class StridedGemm { + public: + StridedGemm() : cu_handle_(nullptr) {} + ~StridedGemm() {} + + bool init(const bool trans_a, + const bool trans_b, + Context* ctx); + + bool run(const PtypeIn alpha, + const PtypeIn beta, + const int m, + const int n, + const int k, + const PtypeIn* a_data, + const PtypeIn* b_data, + PtypeOut* c_data, + const int batch_size, + const int64_t stride_a, + const int64_t stride_b); + + private: + cudaStream_t exe_stream_; + cublasHandle_t cu_handle_; + cublasOperation_t cu_trans_a_; + cublasOperation_t cu_trans_b_; + int m_{-1}; + int n_{-1}; + int k_{-1}; + int lda_{-1}; + int ldb_{-1}; + int ldc_{-1}; + cublasGemmAlgo_t algo_{CUBLAS_GEMM_DEFAULT_TENSOR_OP}; +}; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/transpose.cu b/lite/backends/cuda/math/transpose.cu index c50840fe269657965db8c58b171fce6819009775..d919bd757fbbcfcc5e5f8a3a4c18fbd1ed9ac53f 100644 --- a/lite/backends/cuda/math/transpose.cu +++ b/lite/backends/cuda/math/transpose.cu @@ -174,24 +174,9 @@ void Transpose::transpose(T* dst, TransposeCUDAImpl(src_dims, axes, src, dst, &Y_dims_, &strides_, stream); } -// template -// void Transpose::transpose(T* dst, -// const T* src, -// const std::vector& src_dims, -// const std::vector& axes, -// cudaStream_t* stream) { -// std::vector _src_dims(src_dims.size(), 0); -// std::transform( -// src_dims.begin(), -// src_dims.end(), -// _src_dims.begin(), -// [](int data) -> int64_t { return static_cast(data); }); -// TransposeCUDAImpl(_src_dims, axes, src, dst, &Y_dims_, &strides_, -// stream); -//} - template class Transpose; template class Transpose; +template class Transpose; } // namespace math } // namespace cuda diff --git a/lite/backends/mlu/target_wrapper.cc b/lite/backends/mlu/target_wrapper.cc index 2385f69246a163830e0df855082d728da2743e02..b98854946db7eda4f133d773ae0f5ba9e45a77cc 100644 --- a/lite/backends/mlu/target_wrapper.cc +++ b/lite/backends/mlu/target_wrapper.cc @@ -15,6 +15,7 @@ #include "lite/backends/mlu/target_wrapper.h" #include +#include #include "lite/backends/mlu/mlu_utils.h" @@ -36,6 +37,13 @@ void cnrtMemcpyDtoH(void* dst, const void* src, size_t size) { } // namespace mlu +thread_local cnmlCoreVersion_t TargetWrapperMlu::mlu_core_version_{CNML_MLU270}; +thread_local int TargetWrapperMlu::mlu_core_number_{1}; +thread_local bool TargetWrapperMlu::use_first_conv_{false}; +thread_local std::vector TargetWrapperMlu::mean_vec_; +thread_local std::vector TargetWrapperMlu::std_vec_; +thread_local DataLayoutType TargetWrapperMlu::input_layout_{DATALAYOUT(kNCHW)}; + size_t TargetWrapperMlu::num_devices() { uint32_t dev_count = 0; CNRT_CALL(cnrtGetDeviceCount(&dev_count)) << " cnrt get device count failed"; @@ -77,15 +85,42 @@ 
void TargetWrapperMlu::MemcpySync(void* dst, LOG(FATAL) << "Unsupported IoDirection" << static_cast(dir); } } +void TargetWrapperMlu::SetMLURunMode( + lite_api::MLUCoreVersion core_version, + int core_number, + DataLayoutType input_layout, + std::pair, std::vector> firstconv_param) { + switch (core_version) { + case (lite_api::MLUCoreVersion::MLU_220): + mlu_core_version_ = CNML_MLU220; + break; + case (lite_api::MLUCoreVersion::MLU_270): + mlu_core_version_ = CNML_MLU270; + break; + default: + mlu_core_version_ = CNML_MLU270; + break; + } + mlu_core_number_ = core_number; + mean_vec_ = firstconv_param.first; + std_vec_ = firstconv_param.second; + use_first_conv_ = !(mean_vec_.empty() || std_vec_.empty()); + input_layout_ = input_layout; +} + +cnmlCoreVersion_t TargetWrapperMlu::MLUCoreVersion() { + return mlu_core_version_; +} + +int TargetWrapperMlu::MLUCoreNumber() { return mlu_core_number_; } + +bool TargetWrapperMlu::UseFirstConv() { return use_first_conv_; } + +const std::vector& TargetWrapperMlu::MeanVec() { return mean_vec_; } + +const std::vector& TargetWrapperMlu::StdVec() { return std_vec_; } -// void TargetWrapperMlu::MemcpyAsync(void* dst, -// const void* src, -// size_t size, -// IoDirection dir, -// const stream_t& stream) { -// LOG(WARNING) << "Mlu unsupported MemcpyAsync now, use MemcpySync."; -// MemcpySync(dst, src, size, dir); -// } +DataLayoutType TargetWrapperMlu::InputLayout() { return input_layout_; } } // namespace lite } // namespace paddle diff --git a/lite/backends/mlu/target_wrapper.h b/lite/backends/mlu/target_wrapper.h index 2d9e10806f78e56f50b04d408dab219c923456fc..2566ae153e2f9539d1ad5739f208bc5f946a7542 100644 --- a/lite/backends/mlu/target_wrapper.h +++ b/lite/backends/mlu/target_wrapper.h @@ -13,6 +13,8 @@ // limitations under the License. 
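The run-mode state set by `SetMLURunMode` lives in `thread_local` statics (declared in the header below), so each predictor thread carries its own core version, core count, first-conv parameters, and input layout without locking; a configuration made on one thread is invisible to the others. A small standalone illustration of that pattern (names are illustrative, not the Lite API):

```cpp
#include <iostream>
#include <thread>

struct RunMode {
  static thread_local int core_number;  // one instance per thread
};
thread_local int RunMode::core_number = 1;

int main() {
  std::thread worker([] {
    RunMode::core_number = 16;  // visible only to this thread
    std::cout << "worker sees " << RunMode::core_number << "\n";  // prints 16
  });
  worker.join();
  std::cout << "main sees " << RunMode::core_number << "\n";  // still prints 1
  return 0;
}
```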
#pragma once +#include +#include #include "lite/backends/mlu/mlu_utils.h" #include "lite/core/target_wrapper.h" @@ -43,11 +45,25 @@ class TargetWrapper { const void* src, size_t size, IoDirection dir); - // static void MemcpyAsync(void* dst, - // const void* src, - // size_t size, - // IoDirection dir, - // const queue_t& queue); + static void SetMLURunMode( + lite_api::MLUCoreVersion core_version, + int core_number, + DataLayoutType input_layout, + std::pair, std::vector> firstconv_param); + static cnmlCoreVersion_t MLUCoreVersion(); + static int MLUCoreNumber(); + static bool UseFirstConv(); + static const std::vector& MeanVec(); + static const std::vector& StdVec(); + static DataLayoutType InputLayout(); + + private: + static thread_local cnmlCoreVersion_t mlu_core_version_; + static thread_local int mlu_core_number_; + static thread_local bool use_first_conv_; + static thread_local std::vector mean_vec_; + static thread_local std::vector std_vec_; + static thread_local DataLayoutType input_layout_; }; } // namespace lite diff --git a/lite/backends/npu/device.cc b/lite/backends/npu/device.cc index f9803aa8810ada33b9eecafe1502515501514e41..22f760e39f86b29ccf025a83b2a43c87882f9e02 100644 --- a/lite/backends/npu/device.cc +++ b/lite/backends/npu/device.cc @@ -20,96 +20,122 @@ namespace paddle { namespace lite { namespace npu { -bool WriteToOMFile(const domi::ModelBufferData& om_model_buff, - std::string om_file_path) { - FILE* fp; - fp = fopen(om_file_path.c_str(), "wb"); - CHECK(fp != nullptr) << om_file_path << " open failed!"; - - uint32_t write_size = - (uint32_t)fwrite(om_model_buff.data, 1, om_model_buff.length, fp); - CHECK_EQ(write_size, om_model_buff.length) << "write om file failed !"; - - fclose(fp); - return true; -} - -bool ReadFromOMFile(domi::ModelBufferData* om_model_buff, - std::string om_file_path) { - FILE* fp; - fp = fopen(om_file_path.c_str(), "rb"); - CHECK(fp != nullptr) << om_file_path << " open failed!"; - - fseek(fp, 0, SEEK_END); - uint32_t model_length = (uint32_t)ftell(fp); - fseek(fp, 0, SEEK_SET); - om_model_buff->data = malloc(model_length); - om_model_buff->length = model_length; - uint32_t read_size = - (uint32_t)fread(om_model_buff->data, 1, model_length, fp); - CHECK_EQ(read_size, model_length) << "read om file failed !"; - - fclose(fp); - return true; -} - -std::shared_ptr Device::Build( - const std::string model_name, // NOLINT - std::vector& input_nodes, // NOLINT - std::vector& output_nodes, // NOLINT - const std::string model_cache_full_dir = "" // NOLINT - ) { - VLOG(3) << "[NPU] Build model"; - // Build the HiAI IR graph to the HiAI om model - ge::Graph ir_graph("graph"); - ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes); - ge::Model om_model("model", "model"); - om_model.SetGraph(ir_graph); - domi::HiaiIrBuild ir_build; - domi::ModelBufferData om_model_buf; - - if (!model_cache_full_dir.empty() && IsFileExists(model_cache_full_dir)) { - VLOG(3) << "Will read om model from " << model_cache_full_dir; - ReadFromOMFile(&om_model_buf, model_cache_full_dir); - } else { - if (!ir_build.CreateModelBuff(om_model, om_model_buf)) { - LOG(WARNING) << "[NPU] CreateModelBuff failed!"; - return nullptr; - } - if (!ir_build.BuildIRModel(om_model, om_model_buf)) { - LOG(WARNING) << "[NPU] BuildIRModel failed!"; - ir_build.ReleaseModelBuff(om_model_buf); - return nullptr; - } - if (!model_cache_full_dir.empty()) { - VLOG(3) << "Will write om model to " << model_cache_full_dir; - WriteToOMFile(om_model_buf, model_cache_full_dir); - } - } - +std::shared_ptr 
Device::Load( + const std::string& model_name, + std::vector* model_buffer, + bool* model_comp) { // Create a HiAI model manager client to load the HiAI om model - std::shared_ptr model_client( - new hiai::AiModelMngerClient()); + auto model_client = std::make_shared(); if (model_client->Init(nullptr) != hiai::AI_SUCCESS) { - LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!"; - ir_build.ReleaseModelBuff(om_model_buf); + LOG(WARNING) << "[NPU] Init hiai model client failed!"; return nullptr; } + // Check HiAI DDK version + const char* ddk_version = model_client->GetVersion(); + if (ddk_version) { + LOG(INFO) << "[NPU] HiAI DDK version: " << ddk_version; + } else { + LOG(WARNING) << "[NPU] Unable to get HiAI DDK version!"; + } + // Check model compatibility auto model_desc = std::make_shared( model_name, freq_level(), framework_type(), model_type(), device_type()); - model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length); - std::vector> model_descs; - model_descs.push_back(model_desc); + model_desc->SetModelBuffer( + reinterpret_cast(model_buffer->data()), + model_buffer->size()); + if (!*model_comp && + model_client->CheckModelCompatibility(*model_desc, *model_comp) != + hiai::AI_SUCCESS) { + *model_comp = false; + VLOG(3) << "[NPU] model is NOT compatiblitiable, setting model_comp to " + << *model_comp; + } else { + *model_comp = true; + VLOG(3) << "[NPU] model is compatiblitiable, setting model_comp to " + << *model_comp; + } + // Rebuild and write the data of the compatible model to the model buffer + if (!*model_comp) { + std::shared_ptr model_builder = + std::make_shared(model_client); + hiai::MemBuffer* org_model_buffer = model_builder->InputMemBufferCreate( + reinterpret_cast(model_buffer->data()), model_buffer->size()); + if (org_model_buffer) { + std::vector org_model_buffers; + org_model_buffers.push_back(org_model_buffer); + hiai::MemBuffer* new_model_buffer = model_builder->OutputMemBufferCreate( + framework_type(), org_model_buffers); + // VLOG(3) << "[NPU] new model buffer memeory size is " << + // new_model_buffer->GetMemBufferSize(); + if (new_model_buffer) { + uint32_t new_model_size = 0; + if (model_builder->BuildModel(org_model_buffers, + new_model_buffer, + new_model_size) == hiai::AI_SUCCESS) { + // need to change to new_model_size as GetMemBufferSize is not + // correct. 
+ model_buffer->resize(new_model_size); + memcpy(reinterpret_cast(model_buffer->data()), + new_model_buffer->GetMemBufferData(), + new_model_size); + // Reset the model buffer + model_desc->SetModelBuffer( + reinterpret_cast(model_buffer->data()), + model_buffer->size()); + VLOG(3) << "[NPU] Rebuild the compatible model done."; + } else { + LOG(WARNING) << "[NPU] Rebuild the compatible model failed!"; + } + model_builder->MemBufferDestroy(new_model_buffer); + } else { + LOG(WARNING) << "[NPU] OutputMemBufferCreate failed!"; + } + model_builder->MemBufferDestroy(org_model_buffer); + } else { + LOG(WARNING) << "[NPU] InputMemBufferCreate failed!"; + } + } + // Load the compatible model + std::vector> model_descs{ + model_desc}; if (model_client->Load(model_descs) != hiai::AI_SUCCESS) { LOG(WARNING) << "[NPU] AiModelMngerClient load model failed!"; - ir_build.ReleaseModelBuff(om_model_buf); return nullptr; } - ir_build.ReleaseModelBuff(om_model_buf); - VLOG(3) << "[NPU] Build done"; + VLOG(3) << "[NPU] Load model done."; return model_client; } +bool Device::Build(std::vector& input_nodes, // NOLINT + std::vector& output_nodes, // NOLINT + std::vector* model_buffer) { + // Convert the HiAI IR graph to the HiAI om model + ge::Graph ir_graph("graph"); + ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes); + ge::Model om_model("model", "model"); + om_model.SetGraph(ir_graph); + + // Build the HiAI om model, serialize and output it to the om buffer + domi::HiaiIrBuild ir_build; + domi::ModelBufferData om_buffer; + if (!ir_build.CreateModelBuff(om_model, om_buffer)) { + LOG(WARNING) << "[NPU] CreateModelBuff failed!"; + return false; + } + if (!ir_build.BuildIRModel(om_model, om_buffer)) { + LOG(WARNING) << "[NPU] BuildIRModel failed!"; + ir_build.ReleaseModelBuff(om_buffer); + return false; + } + model_buffer->resize(om_buffer.length); + memcpy(reinterpret_cast(model_buffer->data()), + reinterpret_cast(om_buffer.data), + om_buffer.length); + ir_build.ReleaseModelBuff(om_buffer); + VLOG(3) << "[NPU] Build model done."; + return true; +} + } // namespace npu } // namespace lite } // namespace paddle diff --git a/lite/backends/npu/device.h b/lite/backends/npu/device.h index cf03e097194bf20ab428677b09b840991e8a902c..5862f0b393292d95b6500ae75171fab07a5279a6 100644 --- a/lite/backends/npu/device.h +++ b/lite/backends/npu/device.h @@ -38,14 +38,18 @@ class Device { int model_type() { return model_type_; } int device_type() { return device_type_; } + // Load the HiAI om model from buffer, rebuild the model if it's incompatible + // with the current device, then create a HiAI model manager client(from HiAI + // Server) to run inference + std::shared_ptr Load( + const std::string& model_name, + std::vector* model_buffer, + bool* model_comp); // Build the HiAI IR graph to om model, return HiAI model manager client to // load om model and run inference. 
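With `Build` and `Load` split this way, om-model caching moves out of the device layer: the caller serializes the IR graph into a raw buffer once, persists it however it likes, and hands the buffer to `Load`, which rewrites it in place when the device reports it as incompatible. A hypothetical caller-side sketch of that flow (`ReadCache`/`WriteCache` are illustrative helpers, not part of this API, and the node type is assumed to be `ge::Operator`):

```cpp
// Illustrative flow only; ReadCache/WriteCache are hypothetical helpers.
std::shared_ptr<hiai::AiModelMngerClient> BuildOrLoad(
    paddle::lite::npu::Device* device,
    const std::string& model_name,
    const std::string& cache_path,
    std::vector<ge::Operator>& input_nodes,     // NOLINT
    std::vector<ge::Operator>& output_nodes) {  // NOLINT
  std::vector<char> model_buffer;
  bool from_cache = ReadCache(cache_path, &model_buffer);
  if (!from_cache && !device->Build(input_nodes, output_nodes, &model_buffer)) {
    return nullptr;
  }
  // A cached buffer was already rebuilt for this device, so Load can skip the
  // compatibility check; a freshly built one starts with model_comp = false.
  bool model_comp = from_cache;
  auto client = device->Load(model_name, &model_buffer, &model_comp);
  if (client != nullptr && (!from_cache || !model_comp)) {
    WriteCache(cache_path, model_buffer);  // persist the freshly built or rebuilt buffer
  }
  return client;
}
```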
- std::shared_ptr Build( - const std::string model_name, // NOLINT - std::vector& input_nodes, // NOLINT - std::vector& output_nodes, // NOLINT - const std::string model_cache_name // NOLINT - ); // NOLINT + bool Build(std::vector& input_nodes, // NOLINT + std::vector& output_nodes, // NOLINT + std::vector* model_buffer); private: int freq_level_{3}; diff --git a/lite/backends/opencl/cl_context.cc b/lite/backends/opencl/cl_context.cc index 67d679fdd596b109b714bf7ba3cd45b2632b9420..002073517bc61af60da213db9af6e56da5f5b501 100644 --- a/lite/backends/opencl/cl_context.cc +++ b/lite/backends/opencl/cl_context.cc @@ -119,7 +119,7 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) { } } -cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size, +cl::NDRange CLContext::LocalWorkSizeTune(cl::NDRange global_work_size, size_t max_work_size, int divisor) { int preferred_lws = 0; @@ -157,7 +157,7 @@ cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size, static_cast(gws0)}; #endif } -cl::NDRange CLContext::LocalWorkSizeTurnReverse(cl::NDRange global_work_size, +cl::NDRange CLContext::LocalWorkSizeTuneReverse(cl::NDRange global_work_size, size_t max_work_size, int divisor) { int preferred_lws = 0; diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h index 82d15bee5ec460a1fb06430571f007fcef23f66f..c204a8510402b8741c761938c3b2c37ac07fe961 100644 --- a/lite/backends/opencl/cl_context.h +++ b/lite/backends/opencl/cl_context.h @@ -62,10 +62,10 @@ class CLContext { cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size); - cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size, + cl::NDRange LocalWorkSizeTune(cl::NDRange global_work_size, size_t max_work_size, int divitor = 2); - cl::NDRange LocalWorkSizeTurnReverse(cl::NDRange global_work_size, + cl::NDRange LocalWorkSizeTuneReverse(cl::NDRange global_work_size, size_t max_work_size, int divitor = 2); bool IsArmMali(); diff --git a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl index a14748c69f3eafce515c90f2b8a226703fe5883d..080ce2b457421970409431dee6841ac4f7d57bb5 100644 --- a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl +++ b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl @@ -405,7 +405,9 @@ void fc_gemm_4x4(__global const CL_DTYPE* a, } else { for (int cidx = col; cidx < N; ++cidx) { for (int ridx = row; ridx < M; ++ridx) { - CL_COMPUTE_DTYPE a0, b0, c0 = bias ? bias[cidx] : 0; + CL_COMPUTE_DTYPE a0 = 0; + CL_COMPUTE_DTYPE b0 = 0; + CL_COMPUTE_DTYPE c0 = bias ? 
bias[cidx] : 0; for (int p = 0; p < K; ++p) { a0 = *(a + ridx * K + p); b0 = *(b + p * N + cidx), diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl index 1c808da68ddc923e12234bc4b6ac99b35bfffb0b..9209f0e0f8d04fad5e788f3742c7922af8e13f49 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl @@ -6,9 +6,7 @@ __kernel void conv2d_1x1_opt( __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t new_scale, __read_only image2d_t new_biase, @@ -284,9 +282,7 @@ __kernel void conv2d_1x1_simple( __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t new_scale, __read_only image2d_t new_biase, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl index 771765ea6063a08784ae824a757b28450d808f6d..6a3aa6455daf8d20430a434ff6f47dac382f1f74 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl @@ -19,9 +19,7 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int offset, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl index 79f3922e89549fc15b7a849efb0e2b6595357102..739f852a7c6b60e4c38cb2523dfb745af65bc8df 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl @@ -19,9 +19,7 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -264,9 +262,7 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl index d856af6a1d4026b1595bc287901e53f64267dc81..f08d53fa4968d041337adfe3252529bca3b5c55e 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl @@ -5,9 +5,7 @@ __kernel void conv2d_5x5(__private const int global_size_dim0, __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t 
new_scale, __read_only image2d_t new_biase, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl index 4ed2e072022dc4b457a86d634bf4bc21ab62bc45..4cce039f27b750950a1475ac266e0f5117c6d259 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl @@ -20,9 +20,7 @@ __kernel void conv2d_5x5_opt(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -268,9 +266,7 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -513,4 +509,4 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]); } -} \ No newline at end of file +} diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl index 4998dc99279fffad8750ef3b6495597e9fc4ad65..2a2f210601e760651ee850686391af3c040fbe7f 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl @@ -5,9 +5,7 @@ __kernel void conv2d_7x7(__private const int global_size_dim0, __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t new_scale, __read_only image2d_t new_biase, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl index d82f4b4c96b586b6ecf948827402afd0766dcea4..4eadcd9f8032996abae04660b6878ab5beaff9a7 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl @@ -20,9 +20,7 @@ __kernel void conv2d_7x7_opt(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -268,9 +266,7 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -513,4 +509,4 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]); } -} \ No newline at end of file +} diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl index 27313aea23ed16ecc7a6763dfbbbe63bca18941a..465b9f8f925a130b4d1b059ab15e93bc29128ec7 100755 --- 
a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl @@ -19,9 +19,7 @@ __kernel void depth_conv2d(__private const int global_size_dim0, __private const int global_size_dim2, __read_only image2d_t input, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t new_scale, __read_only image2d_t new_biase, diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl index 5626fe6be7d451d4ffe22a2008affa7d82298bc3..6fbdc21f934f21dd26c3eb66885f7087e3d340c0 100755 --- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl @@ -20,9 +20,7 @@ __kernel void depth_conv2d_3x3( __private const int global_size_dim2, __read_only image2d_t input, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int offset, @@ -249,9 +247,7 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, __private const int ou_nh, __read_only image2d_t input, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, diff --git a/lite/backends/opencl/cl_kernel/image/transpose_kernel.cl b/lite/backends/opencl/cl_kernel/image/transpose_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..b8533076b79aa2e94e30e38dd34d3f2292fdf88a --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/transpose_kernel.cl @@ -0,0 +1,160 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
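Dropping the `#if defined(BIASE_CH) || defined(BIASE_ELE)` guards in these conv and depthwise kernels keeps the kernel signature identical across all macro variants, which lets the host bind arguments at fixed indices instead of conditionally skipping the bias slot; when no bias is fused, a dummy image can be bound. A rough host-side sketch of the resulting binding (the helper name and the starting index are illustrative only):

```cpp
#include <CL/cl2.hpp>

// Illustrative only: with the bias image always in the signature, the argument
// indices no longer depend on which BIASE_* macro the kernel was compiled with.
void BindConvImages(cl::Kernel* kernel,
                    const cl::Image2D& input,
                    const cl::Image2D& filter,
                    const cl::Image2D& bias,  // bound unconditionally now
                    const cl::Image2D& output) {
  cl_uint idx = 3;  // assuming the leading __private int arguments occupy 0..2
  kernel->setArg(idx++, input);
  kernel->setArg(idx++, filter);
  kernel->setArg(idx++, bias);
  kernel->setArg(idx++, output);
}
```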
*/ + +#include + +__kernel void transpose_4d(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = 1; + const int out_h = out_nh % out_H; + const int out_c0 = out_c * 4; + const int out_c1 = out_c * 4 + 1; + const int out_c2 = out_c * 4 + 2; + const int out_c3 = out_c * 4 + 3; + + const int in_n = out_n; + const int in_c = out_w * 0.25; + const int in_h0 = out_c0; + const int in_h1 = out_c1; + const int in_h2 = out_c2; + const int in_h3 = out_c3; + const int in_w = out_h; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + + input_pos0.x = in_W * in_c + in_w; + input_pos0.y = in_n * in_h0; + + input_pos1.x = in_W * in_c + in_w; + input_pos1.y = in_n * in_h1; + + input_pos2.x = in_W * in_c + in_w; + input_pos2.y = in_n * in_h2; + + input_pos3.x = in_W * in_c + in_w; + input_pos3.y = in_n * in_h3; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 input0; + CL_DTYPE4 input1; + CL_DTYPE4 input2; + CL_DTYPE4 input3; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos0); + + if (out_w % 4 == 0) { + output.x = input0.x; + } else if (out_w % 4 == 1) { + output.x = input0.y; + } else if (out_w % 4 == 2) { + output.x = input0.z; + } else { + output.x = input0.w; + } + if (out_C - out_c * 4 >= 2) { + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos1); + if(out_w % 4 == 0) { + output.y = input1.x; + } else if(out_w % 4 == 1) { + output.y = input1.y; + } else if(out_w % 4 == 2) { + output.y = input1.z; + } else { + output.y = input1.w; + } + } else { + output.y = 0.0f; + } + + if (out_C - out_c * 4 >= 3) { + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos2); + if (out_w % 4 == 0){ + output.z = input2.x; + } else if (out_w % 4 == 1) { + output.z = input2.y; + } else if (out_w % 4 == 2) { + output.z = input2.z; + } else { + output.z = input2.w; + } + } else { + output.z = 0.0f; + } + + if (out_C - out_c * 4 >= 4) { + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos3); + if (out_w % 4 == 0) { + output.w = input3.x; + } else if (out_w % 4 == 1) { + output.w = input3.y; + } else if (out_w % 4 == 2) { + output.w = input3.z; + } else { + output.w = input3.w; + } + } else { + output.w = 0.0f; + } + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = 1; + const int out_h = out_nh % out_H; + + const int in_n = 1; + const int in_c = out_c; + const int in_w = out_h; + const int in_h = out_w; + + int2 input_pos; + int2 output_pos; + input_pos.x = in_c * in_W + in_w; + input_pos.y = in_n * in_h; + + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_n * out_h; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 input; + CL_DTYPE4 output; + 
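The coordinate arithmetic in these transpose kernels follows the image2d layout this backend appears to use for 4-D tensors: every RGBA texel packs four consecutive channels, the x-axis spans channel-blocks times width, and the y-axis spans batch times height (with the batch fixed to 1 here, y reduces to the row index). A small helper expressing that mapping, stated as an assumption for reference:

```cpp
struct ImageCoord {
  int x;
  int y;
};

// Assumed NCHW -> image2d mapping behind the input_pos/output_pos arithmetic above:
// each texel holds channels [4*(c/4), 4*(c/4)+3] of one (n, h, w) position.
ImageCoord NCHWToImageCoord(int n, int c, int h, int w, int H, int W) {
  return {(c / 4) * W + w, n * H + h};
}
```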
input = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos); + + output = input; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input); +} \ No newline at end of file diff --git a/lite/backends/x86/dynamic_loader.cc b/lite/backends/x86/dynamic_loader.cc index 4978dfb84a4ee5770df011c54dccde59a62135b7..0d4301c5b6a56e50eba2d9a6ae13ce353a9b1e2e 100644 --- a/lite/backends/x86/dynamic_loader.cc +++ b/lite/backends/x86/dynamic_loader.cc @@ -20,8 +20,8 @@ limitations under the License. */ #include "lite/backends/x86/cupti_lib_path.h" #include "lite/backends/x86/port.h" #include "lite/backends/x86/warpctc_lib_path.h" +#include "lite/utils/cp_logging.h" #include "lite/utils/env.h" -#include "lite/utils/paddle_enforce.h" // DEFINE_string(cudnn_dir, // "", @@ -178,7 +178,7 @@ auto error_msg = #endif // !_WIN32 if (throw_on_error) { CHECK(dso_handle != nullptr); - // PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, errorno); + // CHECK(nullptr != dso_handle, error_msg, dlPath, errorno); } else if (nullptr == dso_handle) { // LOG(WARNING) << string::Sprintf(error_msg, dlPath, errorno); } diff --git a/lite/backends/x86/jit/benchmark.cc b/lite/backends/x86/jit/benchmark.cc index c49984691e5beca5a42defd68243e1352372cf11..6318916dfa53d5cce0c33d0149a520ccb9288c28 100644 --- a/lite/backends/x86/jit/benchmark.cc +++ b/lite/backends/x86/jit/benchmark.cc @@ -319,8 +319,8 @@ void BenchKernelSgd() { const T lr = 0.1; auto UnDuplicatedRandomVec = []( int n, const int64_t lower, const int64_t upper) -> std::vector { - PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1); - PADDLE_ENFORCE_GT(n, 0); + CHECK_LE(static_cast(upper - lower), n - 1); + CHECK_GT(n, 0); std::vector all, out; for (int i = 0; i < n; ++i) { all.push_back(i); diff --git a/lite/backends/x86/jit/gen/embseqpool.cc b/lite/backends/x86/jit/gen/embseqpool.cc index 7e697014ed241a75693b783127633b255964f80b..e6628058d03959a2a58b403a6ad61af6c50b431c 100644 --- a/lite/backends/x86/jit/gen/embseqpool.cc +++ b/lite/backends/x86/jit/gen/embseqpool.cc @@ -129,11 +129,11 @@ class EmbSeqPoolCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const emb_seq_pool_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.table_height, 0); - PADDLE_ENFORCE_GT(attr.table_width, 0); - PADDLE_ENFORCE_GT(attr.index_height, 0); - PADDLE_ENFORCE_GT(attr.index_width, 0); - PADDLE_ENFORCE_GT(attr.out_width, 0); + CHECK_GT(attr.table_height, 0); + CHECK_GT(attr.table_width, 0); + CHECK_GT(attr.index_height, 0); + CHECK_GT(attr.index_width, 0); + CHECK_GT(attr.out_width, 0); return make_unique(attr, CodeSize(attr)); } }; diff --git a/lite/backends/x86/jit/gen/embseqpool.h b/lite/backends/x86/jit/gen/embseqpool.h index 7bb248dd1d384af949fd3cd190df3d90d21921ef..d013887be5ecec1f67fa022b49b889f9cee9ade4 100644 --- a/lite/backends/x86/jit/gen/embseqpool.h +++ b/lite/backends/x86/jit/gen/embseqpool.h @@ -17,7 +17,7 @@ #include #include "lite/backends/x86/jit/gen/jitcode.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/string.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/jit/gen/matmul.cc b/lite/backends/x86/jit/gen/matmul.cc index f78df73f66532f891721c74cff9c78cc3bb61922..87fe758809e3e7e18d2f939a26f3729b937bf6f6 100644 --- a/lite/backends/x86/jit/gen/matmul.cc +++ b/lite/backends/x86/jit/gen/matmul.cc @@ -27,7 +27,7 @@ void MatMulJitCode::genCode() { preCode(); int block, rest; const auto groups = packed_groups(n_, k_, &block, &rest); - 
PADDLE_ENFORCE_GT(groups.front(), 0); + CHECK_GT(groups.front(), 0); const int block_len = sizeof(float) * block; const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1; @@ -116,9 +116,9 @@ class MatMulCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const matmul_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.m, 0); - PADDLE_ENFORCE_GT(attr.n, 0); - PADDLE_ENFORCE_GT(attr.k, 0); + CHECK_GT(attr.m, 0); + CHECK_GT(attr.n, 0); + CHECK_GT(attr.k, 0); return make_unique(attr, CodeSize(attr)); } }; diff --git a/lite/backends/x86/jit/gen/matmul.h b/lite/backends/x86/jit/gen/matmul.h index 95edc14201ac94d302ff806d0a4b8f5f50b2835c..8bc1e41d0a17d548c47819b5e11daf7ed5065e86 100644 --- a/lite/backends/x86/jit/gen/matmul.h +++ b/lite/backends/x86/jit/gen/matmul.h @@ -19,7 +19,7 @@ #include #include "lite/backends/x86/jit/gen/jitcode.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/string.h" namespace paddle { namespace lite { @@ -32,7 +32,7 @@ class MatMulJitCode : public JitCode { size_t code_size = 256 * 1024, void* code_ptr = nullptr) : JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) { - PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet"); + CHECK_EQ(m_, 1) << "Only support m==1 yet"; this->genCode(); } diff --git a/lite/backends/x86/jit/gen/seqpool.cc b/lite/backends/x86/jit/gen/seqpool.cc index 4c80737aac4bc9cd09f4ff222c8fad8c441887ec..c54093e4dfa00f89f51c70840c45518f3eddfd3d 100644 --- a/lite/backends/x86/jit/gen/seqpool.cc +++ b/lite/backends/x86/jit/gen/seqpool.cc @@ -69,8 +69,8 @@ class SeqPoolCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const seq_pool_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.w, 0); - PADDLE_ENFORCE_GT(attr.h, 0); + CHECK_GT(attr.w, 0); + CHECK_GT(attr.h, 0); return make_unique(attr, CodeSize(attr)); } }; diff --git a/lite/backends/x86/jit/gen/seqpool.h b/lite/backends/x86/jit/gen/seqpool.h index a00428f3e0982889665cd23b21a5978c7c239399..a1bde4a9b66f22ef8815bdc61fe866065e7f4203 100644 --- a/lite/backends/x86/jit/gen/seqpool.h +++ b/lite/backends/x86/jit/gen/seqpool.h @@ -17,7 +17,7 @@ #include #include "lite/backends/x86/jit/gen/jitcode.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/string.h" namespace paddle { namespace lite { @@ -125,8 +125,8 @@ class SeqPoolJitCode : public JitCode { vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); reg_idx++; } - PADDLE_ENFORCE_EQ( - reg_idx, rest_used_num_regs, "All heights should use same regs"); + CHECK_EQ(reg_idx, rest_used_num_regs) + << "All heights should use same regs"; for (int i = 0; i < reg_idx; ++i) { vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs)); } diff --git a/lite/backends/x86/jit/gen/sgd.cc b/lite/backends/x86/jit/gen/sgd.cc index 44e083366132c675b339b2da4bbb3b7c1c6b7569..f91f1305ee30af708443e6a9a8bbb3fae2cc0b80 100644 --- a/lite/backends/x86/jit/gen/sgd.cc +++ b/lite/backends/x86/jit/gen/sgd.cc @@ -17,7 +17,7 @@ #include #include #include "lite/backends/x86/jit/registry.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -113,9 +113,9 @@ class SgdCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const sgd_attr_t& attr) const override { - PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width); - PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height); - PADDLE_ENFORCE_GE(attr.selected_rows_size, 0); + CHECK_EQ(attr.param_width, 
attr.grad_width); + CHECK_LE(attr.selected_rows_size, attr.grad_height); + CHECK_GE(attr.selected_rows_size, 0); return make_unique(attr, CodeSize(attr)); } }; diff --git a/lite/backends/x86/jit/gen/vbroadcast.cc b/lite/backends/x86/jit/gen/vbroadcast.cc index fb1e71f7b0b1e6f68a331d264682e80fbab7c219..7c4860ba5084860b67b6ecb7e3eed8aafb16cb2c 100644 --- a/lite/backends/x86/jit/gen/vbroadcast.cc +++ b/lite/backends/x86/jit/gen/vbroadcast.cc @@ -16,7 +16,7 @@ #include #include #include "lite/backends/x86/jit/registry.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -76,7 +76,7 @@ class VBroadcastCreator : public JitCodeCreator { return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8; } std::unique_ptr CreateJitCode(const int64_t& w) const override { - PADDLE_ENFORCE_GT(w, 0); + CHECK_GT(w, 0); return make_unique(w, CodeSize(w)); } }; diff --git a/lite/backends/x86/jit/gen_base.cc b/lite/backends/x86/jit/gen_base.cc index a3376be423828b25c6eda6fff30a56578c7bbbe5..a9a89fdb205ad54268986eeee628aec75ac01b74 100644 --- a/lite/backends/x86/jit/gen_base.cc +++ b/lite/backends/x86/jit/gen_base.cc @@ -21,8 +21,8 @@ // posix_memalign #include "lite/backends/x86/cpu_info.h" #include "lite/backends/x86/jit/macro.h" +#include "lite/utils/cp_logging.h" #include "lite/utils/env.h" -#include "lite/utils/paddle_enforce.h" #ifndef _WIN32 #define posix_memalign_free free @@ -62,12 +62,10 @@ void* GenBase::operator new(size_t size) { #ifdef _WIN32 ptr = _aligned_malloc(size, alignment); #else - PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), - 0, - "GenBase Alloc %ld error!", - size); + CHECK_EQ(posix_memalign(&ptr, alignment, size), 0) << "GenBase Alloc " << size + << " error!"; #endif - PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size); + CHECK(ptr) << "Fail to allocate GenBase CPU memory: size = " << size; return ptr; } diff --git a/lite/backends/x86/jit/helper.cc b/lite/backends/x86/jit/helper.cc index 8322f7ebd2ce78f99979574983d81cebe5139606..f80a24d15c4666eacd31770c46f8a7ad4e7cfb37 100644 --- a/lite/backends/x86/jit/helper.cc +++ b/lite/backends/x86/jit/helper.cc @@ -14,9 +14,10 @@ #include "lite/backends/x86/jit/helper.h" #include // tolower +#include #include #include -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -104,12 +105,12 @@ void pack_weights(const float* src, float* dst, int n, int k) { int block, rest; const auto groups = packed_groups(n, k, &block, &rest); std::for_each(groups.begin(), groups.end(), [&](int i) { - PADDLE_ENFORCE_GT(i, 0, "each element of groups should be larger than 0."); + CHECK_GT(i, 0) << "each element of groups should be larger than 0."; }); int sum = std::accumulate(groups.begin(), groups.end(), 0); std::memset(dst, 0, k * sum * block * sizeof(float)); - PADDLE_ENFORCE_GE( - sum * block, n, "The packed n should be equal to or larger than n"); + CHECK_GE(sum * block, n) + << "The packed n should be equal to or larger than n"; const int block_len = sizeof(float) * block; int n_offset = 0; diff --git a/lite/backends/x86/jit/helper.h b/lite/backends/x86/jit/helper.h index f741edbbed5b721fb9104a9c9a171a12532e4705..57a3611bb671c6d83ec3212702a57e3fc7d7f35f 100644 --- a/lite/backends/x86/jit/helper.h +++ b/lite/backends/x86/jit/helper.h @@ -23,7 +23,7 @@ #include "lite/backends/x86/jit/kernel_base.h" #include "lite/backends/x86/jit/kernel_key.h" #include "lite/backends/x86/jit/kernel_pool.h" -#include 
"lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -78,8 +78,8 @@ inline const Kernel* GetReferKernel() { auto& ref_pool = ReferKernelPool::Instance().AllKernels(); KernelKey kkey(KernelTuple::kernel_type, lite::fluid::CPUPlace()); auto ref_iter = ref_pool.find(kkey); - PADDLE_ENFORCE(ref_iter != ref_pool.end(), - "Every Kernel should have reference function."); + CHECK(ref_iter != ref_pool.end()) + << "Every Kernel should have reference function."; auto& ref_impls = ref_iter->second; for (auto& impl : ref_impls) { auto i = dynamic_cast*>(impl.get()); @@ -94,7 +94,7 @@ template inline typename KernelTuple::func_type GetReferFunc() { auto ker = GetReferKernel(); auto p = dynamic_cast*>(ker); - PADDLE_ENFORCE(p, "The Refer kernel should exsit"); + CHECK(p) << "The Refer kernel should exsit"; return p->GetFunc(); } @@ -125,7 +125,7 @@ std::vector GetAllCandidateKernels( // The last implementation should be reference function on CPUPlace. auto ref = GetReferKernel(); - PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty."); + CHECK(ref != nullptr) << "Refer Kernel can not be empty."; res.emplace_back(ref); return res; } @@ -140,11 +140,11 @@ GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) { std::string name = k->ImplType(); if (name == "JitCode") { auto i = dynamic_cast(k); - PADDLE_ENFORCE(i, "jitcode kernel cast can not fail."); + CHECK(i) << "jitcode kernel cast can not fail."; res.emplace_back(std::make_pair(name, i->template getCode())); } else { auto i = dynamic_cast*>(k); - PADDLE_ENFORCE(i, "kernel cast can not fail."); + CHECK(i) << "kernel cast can not fail."; res.emplace_back(std::make_pair(name, i->GetFunc())); } } @@ -166,7 +166,7 @@ template typename KernelTuple::func_type GetDefaultBestFunc( const typename KernelTuple::attr_type& attr) { auto funcs = GetAllCandidateFuncs(attr); - PADDLE_ENFORCE_GE(funcs.size(), 1UL); + CHECK_GE(funcs.size(), 1UL); // Here could do some runtime benchmark of this attr and return the best one. // But yet just get the first one as the default best one, // which is searched in order and tuned by offline. 
diff --git a/lite/backends/x86/jit/kernel_key.cc b/lite/backends/x86/jit/kernel_key.cc index a6288fcf19d6867e1e1eb0bce32e559a4f303929..30397ffe1c4980e4af19a7a0eb44b47585b44f2c 100644 --- a/lite/backends/x86/jit/kernel_key.cc +++ b/lite/backends/x86/jit/kernel_key.cc @@ -14,7 +14,7 @@ #include "lite/backends/x86/jit/kernel_key.h" #include // XXH64: 13.8 GB/s -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/jit/more/mkl/mkl.h b/lite/backends/x86/jit/more/mkl/mkl.h index 6bc791e64575b8f481f91ea3c28ea4896fe1860d..473e1253194513c16d6d8c3b52eac110512e806e 100644 --- a/lite/backends/x86/jit/more/mkl/mkl.h +++ b/lite/backends/x86/jit/more/mkl/mkl.h @@ -18,7 +18,7 @@ #include #include #include "lite/backends/x86/jit/kernel_base.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -104,11 +104,11 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out, const emb_seq_pool_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width); + CHECK_EQ(attr->table_width * attr->index_width, attr->out_width); auto check_idx_value_valid = [&](int64_t i) { - PADDLE_ENFORCE_LT( - idx[i], attr->table_height, "idx value: %d, i: %d", idx[i], i); - PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i); + CHECK_LT(idx[i], attr->table_height) << "idx value: " << idx[i] + << " i: " << i; + CHECK_GE(idx[i], 0) << "idx value: " << idx[i] << " i: " << i; }; for (int64_t w = 0; w != attr->index_width; ++w) { @@ -175,22 +175,22 @@ void Sgd(const T* lr, const int64_t* rows, T* out, const sgd_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width); - PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height); + CHECK_EQ(attr->param_width, attr->grad_width); + CHECK_LE(attr->selected_rows_size, attr->grad_height); T scalar = -lr[0]; int width = attr->grad_width; if (out == param) { for (int64_t i = 0; i < attr->selected_rows_size; ++i) { auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); + CHECK_LT(h_idx, attr->param_height); + CHECK_GE(h_idx, 0); VAXPY(scalar, grad + i * width, out + h_idx * width, width); } } else { for (int64_t i = 0; i < attr->selected_rows_size; ++i) { auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); + CHECK_LT(h_idx, attr->param_height); + CHECK_GE(h_idx, 0); VScal(&scalar, grad + i * width, out + h_idx * width, width); VAdd(param + h_idx * width, out + h_idx * width, diff --git a/lite/backends/x86/jit/refer/refer.h b/lite/backends/x86/jit/refer/refer.h index d8c8d86911ab9a7794192aa68fb0c0571b1e4d26..b7243dfda350e8d0ea5909cf84ae3aa76d845055 100644 --- a/lite/backends/x86/jit/refer/refer.h +++ b/lite/backends/x86/jit/refer/refer.h @@ -22,7 +22,6 @@ #include "lite/backends/x86/jit/kernel_base.h" #include "lite/backends/x86/jit/macro.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/paddle_enforce.h" namespace paddle { namespace lite { @@ -480,12 +479,12 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out, const emb_seq_pool_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width); + CHECK_EQ(attr->table_width * attr->index_width, attr->out_width); auto check_idx_value_valid = [&](int64_t i) { - PADDLE_ENFORCE_LT( - idx[i], attr->table_height, "idx value: %d, i: %d", idx[i], i); - PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: 
%d", idx[i], i); + CHECK_LT(idx[i], attr->table_height) << "idx value: " << idx[i] + << " i: " << i; + CHECK_GE(idx[i], 0) << "idx value: " << idx[i] << " i: " << i; }; for (int64_t w = 0; w != attr->index_width; ++w) { @@ -527,12 +526,12 @@ void Sgd(const T* lr, const int64_t* rows, T* out, const lite::jit::sgd_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width); - PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height); + CHECK_EQ(attr->param_width, attr->grad_width); + CHECK_LE(attr->selected_rows_size, attr->grad_height); for (int64_t i = 0; i < attr->selected_rows_size; ++i) { auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); + CHECK_LT(h_idx, attr->param_height); + CHECK_GE(h_idx, 0); for (int64_t j = 0; j < attr->grad_width; ++j) { out[h_idx * attr->grad_width + j] = param[h_idx * attr->grad_width + j] - diff --git a/lite/backends/x86/jit/test.cc b/lite/backends/x86/jit/test.cc index aafcad579fdefd675323e0e2a6f40bd89c2a0166..03570a56d9c766271be630fe1d2e3048c6c42608 100644 --- a/lite/backends/x86/jit/test.cc +++ b/lite/backends/x86/jit/test.cc @@ -910,8 +910,8 @@ void TestKernelSgd() { const T lr = 0.1; auto UnDuplicatedRandomVec = []( int n, const int64_t lower, const int64_t upper) -> std::vector { - PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1); - PADDLE_ENFORCE_GT(n, 0); + CHECK_LE(static_cast(upper - lower), n - 1); + CHECK_GT(n, 0); std::vector all, out; for (int i = 0; i < n; ++i) { all.push_back(i); diff --git a/lite/backends/x86/math/beam_search.cc b/lite/backends/x86/math/beam_search.cc index 5d7e98629cb89bd7a3fdee852507e0f381e54931..274e8836dd6e59d610ddeb7a63f898cdc1b19cc1 100644 --- a/lite/backends/x86/math/beam_search.cc +++ b/lite/backends/x86/math/beam_search.cc @@ -116,7 +116,7 @@ class BeamSearchFunctor { lod[0].assign(high_level.begin(), high_level.end()); lod[1].assign(low_level.begin(), low_level.end()); // if (!lite::fluid::CheckLoD(lod)) { - // //PADDLE_THROW("lod %s is not right", framework::LoDToString(lod)); + // //LOG(FATAL)<<"lod %s is not right", framework::LoDToString(lod)); //} selected_ids->set_lod(lod); selected_scores->set_lod(lod); diff --git a/lite/backends/x86/math/blas.cc b/lite/backends/x86/math/blas.cc index 3bc5f9f67ad96e7ec699400ff6369fe48c745b7e..4c6bf06951f81e90a73c91c2378f904db5678495 100644 --- a/lite/backends/x86/math/blas.cc +++ b/lite/backends/x86/math/blas.cc @@ -23,7 +23,7 @@ namespace math { MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim, int num_flatten_cols, bool trans) { - PADDLE_ENFORCE_GT(tensor_dim.size(), 1u); + CHECK_GT(tensor_dim.size(), 1u); MatDescriptor retv; if (num_flatten_cols > 1) { auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols); diff --git a/lite/backends/x86/math/blas_impl.h b/lite/backends/x86/math/blas_impl.h index 34b258892be05625ae88076eff175f56a53d3537..4a64e45ea945f2d46c06ba31d67bd2a0fbf7c635 100644 --- a/lite/backends/x86/math/blas_impl.h +++ b/lite/backends/x86/math/blas_impl.h @@ -287,22 +287,22 @@ struct CBlas { template <> struct CBlas { - static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); } + static void GEMM(...) { LOG(FATAL) << "float16 GEMM not supported on CPU"; } static void SMM_GEMM(...) { - PADDLE_THROW("float16 SMM_GEMM not supported on CPU"); + LOG(FATAL) << "float16 SMM_GEMM not supported on CPU"; } - static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } - static void VEXP(...) 
{ PADDLE_THROW("float16 VEXP not supported on CPU"); } + static void VMUL(...) { LOG(FATAL) << "float16 VMUL not supported on CPU"; } + static void VEXP(...) { LOG(FATAL) << "float16 VEXP not supported on CPU"; } static void VSQUARE(...) { - PADDLE_THROW("float16 VSQUARE not supported on CPU"); + LOG(FATAL) << "float16 VSQUARE not supported on CPU"; } - static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); } - static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; - static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); }; - static void ASUM(...) { PADDLE_THROW("float16 ASUM not supported on CPU"); }; + static void VPOW(...) { LOG(FATAL) << "float16 VPOW not supported on CPU"; } + static void DOT(...) { LOG(FATAL) << "float16 DOT not supported on CPU"; }; + static void SCAL(...) { LOG(FATAL) << "float16 SCAL not supported on CPU"; }; + static void ASUM(...) { LOG(FATAL) << "float16 ASUM not supported on CPU"; }; #ifdef PADDLE_WITH_MKLML static void GEMM_BATCH(...) { - PADDLE_THROW("float16 GEMM_BATCH not supported on CPU"); + LOG(FATAL) << "float16 GEMM_BATCH not supported on CPU"; } #endif }; @@ -461,11 +461,11 @@ void Blas::MatMul(const lite::Tensor &mat_a, auto dim_a = mat_a.dims(); auto dim_b = mat_b.dims(); auto dim_out = mat_out->dims(); - PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, - "The input and output of matmul be matrix"); - // PADDLE_ENFORCE( - // mat_a.target() == mat_b.target() && mat_a.target() == mat_out->target(), - // "The targets of matrices must be same"); + CHECK(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2) + << "The input and output of matmul be matrix"; + // CHECK( + // mat_a.target() == mat_b.target() && mat_a.target() == mat_out->target()) + // << "The targets of matrices must be same"; int M = dim_out[0]; int N = dim_out[1]; @@ -746,7 +746,7 @@ void Blas::MatMul(const lite::Tensor &mat_a, T alpha, lite::Tensor *mat_out, T beta) const { - PADDLE_ENFORCE_EQ(dim_a.width_, dim_b.height_); + CHECK_EQ(dim_a.width_, dim_b.height_); CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans; if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) { @@ -761,8 +761,8 @@ void Blas::MatMul(const lite::Tensor &mat_a, beta, mat_out->template mutable_data()); } else { - PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ || - dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0); + CHECK(dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 || + dim_b.batch_size_ == 0); this->template BatchedGEMM( transA, transB, diff --git a/lite/backends/x86/math/context_project.h b/lite/backends/x86/math/context_project.h index 0c56e0d759fd9b1e3abba5209f43d7a0c8fe194e..72a2f4ce12cbd72b26cd87e97d0178275a4b4abd 100644 --- a/lite/backends/x86/math/context_project.h +++ b/lite/backends/x86/math/context_project.h @@ -146,7 +146,7 @@ class ContextProjectFunctor { } } if (padding_trainable) { - PADDLE_ENFORCE(padding_data != nullptr); + CHECK(padding_data != nullptr); for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { if (lod_level_0[i] == lod_level_0[i + 1]) continue; diff --git a/lite/backends/x86/math/cpu_vec.h b/lite/backends/x86/math/cpu_vec.h index 9ff64d53f069d2e4c5b639d273af5b4aa5738b2b..0e721cc8c272eee4b1df1f4b254b5e1d0c1ebb0a 100644 --- a/lite/backends/x86/math/cpu_vec.h +++ b/lite/backends/x86/math/cpu_vec.h @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include #include #include "lite/backends/x86/cpu_info.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" #ifdef PADDLE_WITH_MKLML #include "lite/backends/x86/mklml.h" @@ -652,7 +652,7 @@ class VecActivations { } else if (type == "identity" || type == "") { return vec_identity; } - PADDLE_THROW("Not support type: %s", type); + LOG(FATAL) << "Not support type: " << type; } }; diff --git a/lite/backends/x86/math/cross_entropy.cc b/lite/backends/x86/math/cross_entropy.cc index 941a34643669f060cdd18f38f92c39e529da7b19..2419620111b7ace292d8a2d366fc1dce2353a15c 100644 --- a/lite/backends/x86/math/cross_entropy.cc +++ b/lite/backends/x86/math/cross_entropy.cc @@ -57,7 +57,7 @@ class CrossEntropyFunctor { for (int i = 0; i < batch_size; ++i) { for (int j = 0; j < num_remain; j++) { int lbl = label_data[i * num_remain + j]; - PADDLE_ENFORCE((lbl >= 0 && lbl < axis_dim) || lbl == ignore_index); + CHECK((lbl >= 0 && lbl < axis_dim) || lbl == ignore_index); int index = i * num_classes + lbl * num_remain + j; int loss_idx = i * num_remain + j; loss_data[loss_idx] = diff --git a/lite/backends/x86/math/cross_entropy.h b/lite/backends/x86/math/cross_entropy.h index 6b66f0b08548c1306681409345c051d1ab40a7c0..d2a66083ac1a72de9e5e469618fc387a5ea784dc 100644 --- a/lite/backends/x86/math/cross_entropy.h +++ b/lite/backends/x86/math/cross_entropy.h @@ -27,7 +27,7 @@ namespace math { template struct TolerableValue { HOSTDEVICE T operator()(const T& x) const { - PADDLE_ENFORCE(static_cast(std::is_floating_point::value)); + CHECK(static_cast(std::is_floating_point::value)); const T kApproInf = 1e20; if (x == INFINITY) return kApproInf; diff --git a/lite/backends/x86/math/detail/activation_functions.h b/lite/backends/x86/math/detail/activation_functions.h index 6a13a3d471e10970b36120a12b21a36256350803..dc3c3eac1989f256378e408b8e8e4401bea43e7c 100644 --- a/lite/backends/x86/math/detail/activation_functions.h +++ b/lite/backends/x86/math/detail/activation_functions.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include #include "lite/backends/x86/cpu_info.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -46,8 +46,6 @@ inline ActivationType GetActivationType(const std::string &type) { return ActivationType::kIdentity; } LOG(ERROR) << "Not support type " << type; - // PADDLE_ENFORCE(false, "Not support type %s", type); - // PADDLE_THROW("Not support type %s.", type); return ActivationType(); } diff --git a/lite/backends/x86/math/gru_compute.h b/lite/backends/x86/math/gru_compute.h index 86b7a91f4127de50aeb5c5fb02122bced0af4188..767e9b9da0e2977f566c793c2fdc71f83ab5b6d4 100644 --- a/lite/backends/x86/math/gru_compute.h +++ b/lite/backends/x86/math/gru_compute.h @@ -13,7 +13,7 @@ limitations under the License. */ #include "lite/backends/x86/math/detail/activation_functions.h" #include "lite/core/context.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/math/im2col.cc b/lite/backends/x86/math/im2col.cc index b916c912ffc2a4d62b63b98fdce150b353ba087e..abbd9b0e2811913f6aff79561e365d20bffbeae4 100644 --- a/lite/backends/x86/math/im2col.cc +++ b/lite/backends/x86/math/im2col.cc @@ -15,7 +15,7 @@ limitations under the License. 
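For context on the `CrossEntropyFunctor` hunk above: it operates on already-softmaxed probabilities with hard labels, and the per-position loss being assigned is, in standard form and up to the `TolerableValue` clamp (which maps overflow such as the log of zero to a large finite value),

```latex
\ell_{ij} =
\begin{cases}
0, & y_{ij} = \text{ignore\_index},\\[2pt]
-\log p_{i,\,y_{ij},\,j}, & \text{otherwise,}
\end{cases}
```

where i indexes the batch, j the remaining (e.g. spatial) positions, and y_{ij} is the integer label validated by the `CHECK` just before the assignment.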
*/ #include "lite/backends/x86/math/im2col.h" #include #include "lite/backends/x86/math/im2col_cfo_cpu.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -38,8 +38,8 @@ class Im2ColFunctor& stride, const std::vector& padding, lite::Tensor* col) { - PADDLE_ENFORCE(im.dims().size() == 3); - PADDLE_ENFORCE(col->dims().size() == 5); + CHECK_EQ(im.dims().size(), 3); + CHECK_EQ(col->dims().size(), 5); if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 && dilation[1] == 1) { @@ -72,8 +72,8 @@ class Col2ImFunctor& stride, const std::vector& padding, lite::Tensor* im) { - PADDLE_ENFORCE(im->dims().size() == 3); - PADDLE_ENFORCE(col.dims().size() == 5); + CHECK_EQ(im->dims().size(), 3); + CHECK_EQ(col.dims().size(), 5); int im_channels = im->dims()[0]; int im_height = im->dims()[1]; int im_width = im->dims()[2]; @@ -82,20 +82,20 @@ class Col2ImFunctor& stride, const std::vector& padding, lite::Tensor* col) { - PADDLE_ENFORCE(im.dims().size() == 3); - PADDLE_ENFORCE(col->dims().size() == 5); + CHECK_EQ(im.dims().size(), 3); + CHECK_EQ(col->dims().size(), 5); int im_channels = im.dims()[0]; int im_height = im.dims()[1]; int im_width = im.dims()[2]; @@ -214,8 +214,8 @@ class Col2ImFunctor& stride, const std::vector& padding, lite::Tensor* im) { - PADDLE_ENFORCE(im->dims().size() == 3); - PADDLE_ENFORCE(col.dims().size() == 5); + CHECK_EQ(im->dims().size(), 3); + CHECK_EQ(col.dims().size(), 5); int im_channels = im->dims()[0]; int im_height = im->dims()[1]; int im_width = im->dims()[2]; @@ -224,16 +224,16 @@ class Col2ImFunctortemplate mutable_data(); const T* col_data = col.data(); diff --git a/lite/backends/x86/math/lstm_compute.h b/lite/backends/x86/math/lstm_compute.h index ddb7bea9995ebcca978be97f8295eb07b0e4e17e..b403770cca7248fba10e62708dddfb91f2789488 100644 --- a/lite/backends/x86/math/lstm_compute.h +++ b/lite/backends/x86/math/lstm_compute.h @@ -16,7 +16,7 @@ limitations under the License. */ #include "lite/backends/x86/math/detail/activation_functions.h" #include "lite/core/context.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/math/math_function.cc b/lite/backends/x86/math/math_function.cc index cb1781db2199c1b7a12aaec80b1904f65b23b534..cc4aa5d9fa54c50eb944714c14a5f6b15634a181 100644 --- a/lite/backends/x86/math/math_function.cc +++ b/lite/backends/x86/math/math_function.cc @@ -121,8 +121,8 @@ struct RowwiseAdd { lite::Tensor* output) { const auto& in_dims = input.dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector.numel(), size); - PADDLE_ENFORCE_EQ(output->dims(), in_dims); + CHECK_EQ(vector.numel(), size); + CHECK_EQ(output->dims(), in_dims); const T* input_data = input.data(); const T* vector_data = vector.data(); diff --git a/lite/backends/x86/math/math_function.h b/lite/backends/x86/math/math_function.h index 8f629b5f171814f0df8e51e61123c7c0aabf7643..7081ec0053e0b4194730e6f4353e1274d6019bb4 100644 --- a/lite/backends/x86/math/math_function.h +++ b/lite/backends/x86/math/math_function.h @@ -20,8 +20,8 @@ limitations under the License. 
*/ #include "lite/core/op_lite.h" #include "lite/core/tensor.h" #include "lite/fluid/float16.h" -#include "lite/utils/paddle_enforce.h" -//#include "lite/tensor_util.h" +#include "lite/utils/cp_logging.h" +// #include "lite/tensor_util.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/math/math_function_impl.h b/lite/backends/x86/math/math_function_impl.h index acfb76759f6fc9fa4122afd2388bc3adf8f5ea22..9bbfebcfb2feb0e3c9d68261240bed18888350c3 100644 --- a/lite/backends/x86/math/math_function_impl.h +++ b/lite/backends/x86/math/math_function_impl.h @@ -59,7 +59,7 @@ void ColwiseSum::operator()(const lite::Context& context, lite::TensorLite* out) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(out->numel(), size); + CHECK_EQ(out->numel(), size); auto in = lite::fluid::EigenMatrix::From(input); auto vec = lite::fluid::EigenVector::Flatten(*out); @@ -81,7 +81,7 @@ class ColwiseSum { auto& in_dims = input.dims(); auto height = in_dims[0]; auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), size); + CHECK_EQ(out->numel(), size); T* out_buf = out->template mutable_data(out->target()); const T* in_buf = input.data(); @@ -103,8 +103,8 @@ void RowwiseMean::operator()(const lite::Context& context, const lite::TensorLite& input, lite::TensorLite* out) { auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); - PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + CHECK_EQ(in_dims.size(), 2U); + CHECK_EQ(out->numel(), in_dims[0]); auto in = lite::fluid::EigenMatrix::From(input); auto vec = lite::fluid::EigenVector::Flatten(*out); @@ -124,10 +124,10 @@ class RowwiseMean { const lite::TensorLite& input, lite::TensorLite* out) { auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + CHECK_EQ(in_dims.size(), 2U); auto height = in_dims[0]; auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), height); + CHECK_EQ(out->numel(), height); auto inv_size = 1.0 / size; T* out_buf = out->template mutable_data(out->target()); const T* in_buf = input.data(); @@ -147,8 +147,8 @@ void RowwiseSum::operator()(const lite::Context& context, const lite::TensorLite& input, lite::TensorLite* out) { auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); - PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + CHECK_EQ(in_dims.size(), 2U); + CHECK_EQ(out->numel(), in_dims[0]); auto in = lite::fluid::EigenMatrix::From(input); auto vec = lite::fluid::EigenVector::Flatten(*out); @@ -168,10 +168,10 @@ class RowwiseSum { const lite::TensorLite& input, lite::TensorLite* out) { auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + CHECK_EQ(in_dims.size(), 2U); auto height = in_dims[0]; auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), height); + CHECK_EQ(out->numel(), height); T* out_buf = out->template mutable_data(out->target()); const T* in_buf = input.data(); diff --git a/lite/backends/x86/math/math_function_test.cc b/lite/backends/x86/math/math_function_test.cc index 19122a6169fbbe1729e38389b0006b11190bc206..b3511ca3521634a771965348e754e10bfd72e19f 100644 --- a/lite/backends/x86/math/math_function_test.cc +++ b/lite/backends/x86/math/math_function_test.cc @@ -273,7 +273,7 @@ TEST(math_funciton, set_constant) { auto* ctx = new paddle::platform::CPUDeviceContext(); paddle::operators::math::set_constant(*ctx, &t, 10); for (int64_t i = 0; i < t.numel(); ++i) { - PADDLE_ENFORCE_EQ(10, t.data()[i]); + CHECK_EQ(10, t.data()[i]); } delete ctx; } diff --git a/lite/backends/x86/math/sampler.h 
b/lite/backends/x86/math/sampler.h index efd9e48e5443186b6b735287cc150f99cb42be81..07cca52e1f436c2979a331dd27c2ddc554c0dad8 100644 --- a/lite/backends/x86/math/sampler.h +++ b/lite/backends/x86/math/sampler.h @@ -32,7 +32,7 @@ namespace math { class Sampler { public: explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) { - // PADDLE_ENFORCE_GT(range, 0, "Range should be greater than 0."); + // CHECK_GT(range, 0, "Range should be greater than 0."); if (seed == 0) { std::random_device r; seed_ = r(); diff --git a/lite/backends/x86/math/selected_rows_functor.cc b/lite/backends/x86/math/selected_rows_functor.cc index 03a18587f4a029bcaebe484ca1ab1951e7c3ecad..8e2a81905b871902aa8ec79c9dd718a62c9f6dec 100644 --- a/lite/backends/x86/math/selected_rows_functor.cc +++ b/lite/backends/x86/math/selected_rows_functor.cc @@ -31,7 +31,7 @@ struct SelectedRowsAdd { const fluid::SelectedRows& input2, fluid::SelectedRows* output) { auto in1_height = input1.height(); - PADDLE_ENFORCE_EQ(in1_height, input2.height()); + CHECK_EQ(in1_height, input2.height()); output->set_height(in1_height); auto& in1_rows = input1.rows(); @@ -49,8 +49,8 @@ struct SelectedRowsAdd { auto& in2_value = input2.value(); auto in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); - PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); + CHECK_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); + CHECK_EQ(in1_row_numel, out_value->numel() / out_rows.size()); auto* out_data = out_value->template mutable_data(); auto* in1_data = in1_value.data(); @@ -73,15 +73,15 @@ struct SelectedRowsAddTensor { auto in1_height = input1.height(); auto in2_dims = input2.dims(); auto out_dims = output->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); - PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); + CHECK_EQ(in1_height, in2_dims[0]); + CHECK_EQ(in1_height, out_dims[0]); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); - PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); + CHECK_EQ(in1_row_numel, input2.numel() / in1_height); + CHECK_EQ(in1_row_numel, output->numel() / in1_height); SetConstant functor; functor(context, output, 0.0); @@ -113,7 +113,7 @@ struct SelectedRowsAddTo { const int64_t input2_offset, fluid::SelectedRows* input2) { auto in1_height = input1.height(); - PADDLE_ENFORCE_EQ(in1_height, input2->height()); + CHECK_EQ(in1_height, input2->height()); auto& in1_rows = input1.rows(); auto& in2_rows = *(input2->mutable_rows()); @@ -149,7 +149,7 @@ struct SelectedRowsSumTo { auto& in_rows = (*iter)->rows(); size += in_rows.end() - in_rows.begin(); auto in1_height = (*iter)->height(); - PADDLE_ENFORCE_EQ(in1_height, input2->height()); + CHECK_EQ(in1_height, input2->height()); } // concat rows std::vector in2_rows; @@ -185,13 +185,13 @@ struct SelectedRowsAddToTensor { auto in1_height = input1.height(); auto in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + CHECK_EQ(in1_height, in2_dims[0]); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + CHECK_EQ(in1_row_numel, input2->numel() / in1_height); auto* in1_data = in1_value.data(); auto* input2_data = input2->template mutable_data(); @@ -291,12 +291,11 @@ struct 
MergeAdd { if (input->rows().size() == 0) { continue; } - PADDLE_ENFORCE_EQ(input_width, - input->value().dims()[1], - "all input should have same " - "dimension except for the first one"); - PADDLE_ENFORCE_EQ( - input_height, input->height(), "all input should have same height"); + CHECK_EQ(input_width, input->value().dims()[1]) + << "all input should have same " + "dimension except for the first one"; + CHECK_EQ(input_height, input->height()) + << "all input should have same height"; row_num += input->rows().size(); merged_row_set.insert(input->rows().begin(), input->rows().end()); } @@ -376,13 +375,13 @@ struct UpdateToTensor { lite::Tensor* input2) { auto in1_height = input1.height(); auto in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + CHECK_EQ(in1_height, in2_dims[0]); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + CHECK_EQ(in1_row_numel, input2->numel() / in1_height); auto* in1_data = in1_value.data(); auto* input2_data = input2->template data(); diff --git a/lite/backends/x86/math/sequence2batch.cc b/lite/backends/x86/math/sequence2batch.cc index aa7aeac532e2fa1f90d452924b364be1896ee862..597521b6e7cac49ac91dbddac71af22bb5a8760c 100644 --- a/lite/backends/x86/math/sequence2batch.cc +++ b/lite/backends/x86/math/sequence2batch.cc @@ -30,12 +30,10 @@ class CopyMatrixRowsFunctor { const uint64_t* index = index_lod.data(); const auto& src_dims = src.dims(); const auto& dst_dims = dst->dims(); - PADDLE_ENFORCE_EQ( - src_dims.size(), 2UL, "The src must be matrix with rank 2."); - PADDLE_ENFORCE_EQ( - dst_dims.size(), 2UL, "The dst must be matrix with rank 2."); - PADDLE_ENFORCE_EQ( - src_dims[1], dst_dims[1], "The width of src and dst must be same."); + CHECK_EQ(src_dims.size(), 2UL) << "The src must be matrix with rank 2."; + CHECK_EQ(dst_dims.size(), 2UL) << "The dst must be matrix with rank 2."; + CHECK_EQ(src_dims[1], dst_dims[1]) + << "The width of src and dst must be same."; auto height = dst_dims[0]; auto width = dst_dims[1]; auto* src_data = src.data(); diff --git a/lite/backends/x86/math/sequence2batch.h b/lite/backends/x86/math/sequence2batch.h index 796894cb7d18ec4db7b670276bb3d3fc5b1427f8..953576eea4170cca57f10bb977ca9bfecb36ae6d 100644 --- a/lite/backends/x86/math/sequence2batch.h +++ b/lite/backends/x86/math/sequence2batch.h @@ -19,7 +19,7 @@ limitations under the License. 
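The SelectedRowsAdd* checks above all assert the same invariant: a SelectedRows pairs a dense value tensor of shape [rows.size(), row_numel] with a row-index vector and a logical height, so row_numel can be recovered as value.numel() / rows.size(). A tiny numeric sketch of that invariant (all values are made up):

    #include <cstdint>
    #include <vector>
    #include "lite/utils/cp_logging.h"

    void SelectedRowsInvariantSketch() {
      int64_t height = 8;                     // logical number of rows
      std::vector<int64_t> rows = {1, 4, 6};  // rows actually materialized
      int64_t value_numel = 3 * 5;            // value tensor has shape [3, 5]
      int64_t row_numel = value_numel / static_cast<int64_t>(rows.size());
      // SelectedRowsAdd requires both inputs to agree on height and row_numel,
      // which is exactly what the CHECK_EQs above express.
      CHECK_EQ(row_numel, 5);
      CHECK_EQ(height, 8);
    }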
*/ #include "lite/core/context.h" #include "lite/core/tensor.h" #include "lite/fluid/eigen.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -66,21 +66,18 @@ class LoDTensor2BatchFunctor { bool is_reverse = false) const { if (!is_cal_batch_lod) { auto lods = batch->lod(); - PADDLE_ENFORCE_GT(lods.size(), - 2UL, - "The LoD of LoDTensor should inlcude at least 2-level " - "sequence information."); - PADDLE_ENFORCE_EQ( - lods[1].size(), - static_cast(lod_tensor.dims()[0]), - "The LoD information should be consistent with the dims."); + CHECK_GT(lods.size(), 2UL) + << "The LoD of LoDTensor should inlcude at least 2-level " + "sequence information."; + CHECK_EQ(lods[1].size(), static_cast(lod_tensor.dims()[0])) + << "The LoD information should be consistent with the dims."; CopyMatrixRowsFunctor to_batch; to_batch(context, lod_tensor, lods[1], batch, true); return; } auto lods = lod_tensor.lod(); - PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now."); + CHECK_EQ(lods.size(), 1UL) << "Only support one level sequence now."; const auto& lod = lods[0]; @@ -165,14 +162,11 @@ class Batch2LoDTensorFunctor { const lite::Tensor& batch, lite::Tensor* lod_tensor) const { auto in_lod = batch.lod(); - PADDLE_ENFORCE_GT(in_lod.size(), - 2UL, - "The LoD of LoDTensor should inlcude at least 2-level " - "sequence information."); - PADDLE_ENFORCE_EQ( - in_lod[1].size(), - static_cast(lod_tensor->dims()[0]), - "The LoD information should be consistent with the dims."); + CHECK_GT(in_lod.size(), 2UL) + << "The LoD of LoDTensor should inlcude at least 2-level " + "sequence information."; + CHECK_EQ(in_lod[1].size(), static_cast(lod_tensor->dims()[0])) + << "The LoD information should be consistent with the dims."; CopyMatrixRowsFunctor to_seq; to_seq(context, batch, in_lod[1], lod_tensor, false); } diff --git a/lite/backends/x86/math/sequence_padding.cc b/lite/backends/x86/math/sequence_padding.cc index eb977dc2d23f4cfaeec7dd5a6e2834ca23345f76..3b2f8bfc4f58a4bfcab968a9288eb8d1d817d78d 100644 --- a/lite/backends/x86/math/sequence_padding.cc +++ b/lite/backends/x86/math/sequence_padding.cc @@ -37,10 +37,9 @@ void CopyValidData(lite::Tensor* dst_tensor, layout == kBatchLengthWidth ? step_width : seq_num * step_width; for (int seq_idx = 0; seq_idx < seq_num; ++seq_idx) { int valid_seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx]; - PADDLE_ENFORCE_GE( - pad_seq_len, - valid_seq_len, - "The padded sequence length can not be less than its original length."); + CHECK_GE(pad_seq_len, valid_seq_len) << "The padded sequence length can " + "not be less than its original " + "length."; int seq_data_offset = seq_offsets[seq_idx] * step_width; int pad_data_offset = layout == kBatchLengthWidth ? 
seq_idx * pad_seq_len * step_width @@ -108,9 +107,9 @@ class PaddingLoDTensorFunctor { pad_seq_len, step_width, layout); - PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width, - "The numel of 'pad_value' can only be 1 or be equal to the " - "'step_width'."); + CHECK(pad_value.numel() == 1 || pad_value.numel() == step_width) + << "The numel of 'pad_value' can only be 1 or be equal to the " + "'step_width'."; // fill padding value T* pad_data = pad_tensor->template mutable_data(); diff --git a/lite/backends/x86/math/sequence_padding.h b/lite/backends/x86/math/sequence_padding.h index 43407014dea0ed0c78ab29da7fb8ebb0e0310566..5512c4aa11fb5dc05283d01b1d6d3da7fb83c064 100644 --- a/lite/backends/x86/math/sequence_padding.h +++ b/lite/backends/x86/math/sequence_padding.h @@ -19,7 +19,7 @@ limitations under the License. */ #include "lite/core/context.h" #include "lite/core/tensor.h" #include "lite/fluid/lod.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -46,15 +46,14 @@ inline static void CheckDims(const lite::DDim& seq_tensor_dims, int64_t padded_seq_len, int64_t step_width, const PadLayout& layout) { - PADDLE_ENFORCE_EQ(static_cast(seq_tensor_dims[0]), - seq_offset.back(), - "Value of 1st dimension of the sequence tensor should be " - "equal to sum of lengths of all sequences."); + CHECK_EQ(static_cast(seq_tensor_dims[0]), seq_offset.back()) + << "Value of 1st dimension of the sequence tensor should be " + "equal to sum of lengths of all sequences."; - PADDLE_ENFORCE(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() || - seq_tensor_dims.size() == pad_tensor_dims.size(), - "pad_tensor's rank should be 1 greater than seq_tensor's " - "rank, or be equal with it."); + CHECK(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() || + seq_tensor_dims.size() == pad_tensor_dims.size()) + << "pad_tensor's rank should be 1 greater than seq_tensor's " + "rank, or be equal with it."; } /* diff --git a/lite/backends/x86/math/sequence_pooling.cc b/lite/backends/x86/math/sequence_pooling.cc index 2d00ebad61840da5b14fbf12d9255394b2b2df1a..c1ddb030349a7f7f46fd6b98d3f967eb6fdfe48e 100644 --- a/lite/backends/x86/math/sequence_pooling.cc +++ b/lite/backends/x86/math/sequence_pooling.cc @@ -46,12 +46,12 @@ class MaxSeqPoolFunctor { auto in_dims = input.dims(); auto out_dims = output->dims(); auto idx_dims = index->dims(); - PADDLE_ENFORCE_GT(in_dims.size(), 1u); - PADDLE_ENFORCE_GT(out_dims.size(), 1u); + CHECK_GT(in_dims.size(), 1u); + CHECK_GT(out_dims.size(), 1u); for (size_t i = 1; i < in_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); + CHECK_EQ(in_dims[i], out_dims[i]); } - PADDLE_ENFORCE_EQ(idx_dims, out_dims); + CHECK_EQ(idx_dims, out_dims); auto starts = input.lod()[0]; const T* in_data = input.data(); @@ -95,10 +95,10 @@ class MaxSeqPoolFunctor { lite::Tensor* index) { auto in_dims = input.dims(); auto out_dims = output->dims(); - PADDLE_ENFORCE_GT(in_dims.size(), 1u); - PADDLE_ENFORCE_GT(out_dims.size(), 1u); + CHECK_GT(in_dims.size(), 1u); + CHECK_GT(out_dims.size(), 1u); for (size_t i = 1; i < in_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); + CHECK_EQ(in_dims[i], out_dims[i]); } auto starts = input.lod()[0]; @@ -136,12 +136,12 @@ class MaxSeqPoolGradFunctor { auto og_dims = out_grad.dims(); auto ig_dims = in_grad->dims(); auto idx_dims = index.dims(); - PADDLE_ENFORCE_GT(og_dims.size(), 1); - PADDLE_ENFORCE_GT(ig_dims.size(), 1); + CHECK_GT(og_dims.size(), 1); + 
CHECK_GT(ig_dims.size(), 1); for (size_t i = 1; i < og_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]); + CHECK_EQ(og_dims[i], ig_dims[i]); } - PADDLE_ENFORCE_EQ(idx_dims, og_dims); + CHECK_EQ(idx_dims, og_dims); const T* og_data = out_grad.data(); const int* max_index = index.data(); @@ -236,7 +236,7 @@ class SumSeqPoolGradFunctor { auto lod = in_grad->lod()[0]; int64_t out_w = out_grad.numel() / out_grad.dims()[0]; int64_t in_w = in_grad->numel() / in_grad->dims()[0]; - PADDLE_ENFORCE(in_w == out_w); + CHECK(in_w == out_w); const T* out_g_data = out_grad.data(); T* in_g_data = in_grad->template mutable_data(TARGET(kX86)); auto blas = math::GetBlas(context); @@ -330,7 +330,7 @@ class SequencePoolFunctor { out_e.device(eigen_device) = in_e.sum(Eigen::array({{0}})) / std::sqrt(static_cast(h)); } else { - PADDLE_THROW("unsupported pooling pooltype"); + LOG(FATAL) << "unsupported pooling pooltype"; } } } @@ -389,7 +389,7 @@ class SequencePoolGradFunctor { } else if (pooltype == "FIRST") { in_g_e.chip(0, 0).device(eigen_device) = out_g_e_v; } else { - PADDLE_THROW("unsupported pooling pooltype"); + LOG(FATAL) << "unsupported pooling pooltype"; } } } diff --git a/lite/backends/x86/math/sequence_pooling_test.cc b/lite/backends/x86/math/sequence_pooling_test.cc index b91f43a571994bef95650361a6dc62c0465837a7..8bba0f92055dbee5a81bf12ab2fa5cc6592bd60c 100644 --- a/lite/backends/x86/math/sequence_pooling_test.cc +++ b/lite/backends/x86/math/sequence_pooling_test.cc @@ -50,9 +50,9 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { in_grad.mutable_data(in_dims, context->GetPlace()); // check tensor contruction result - PADDLE_ENFORCE_EQ(in_grad.dims().size(), out_grad.dims().size()); + CHECK_EQ(in_grad.dims().size(), out_grad.dims().size()); for (int64_t i = 1; i < out_grad.dims().size(); ++i) { - PADDLE_ENFORCE_EQ(in_grad.dims()[i], out_grad.dims()[i]); + CHECK_EQ(in_grad.dims()[i], out_grad.dims()[i]); } // call functor diff --git a/lite/backends/x86/math/tree2col.cc b/lite/backends/x86/math/tree2col.cc index c54bb2099edd0a7e6be61cfdff6340734f09116a..bcab1e77c0bef356453bf1ea1f30aabfc9f1dff0 100644 --- a/lite/backends/x86/math/tree2col.cc +++ b/lite/backends/x86/math/tree2col.cc @@ -55,7 +55,7 @@ void Tree2ColUtil::construct_tree(const lite::Tensor &EdgeSet, std::vector> *tr, size_t *node_count) { auto edge_set_dims = EdgeSet.dims(); - PADDLE_ENFORCE_EQ(edge_set_dims[1], 2); + CHECK_EQ(edge_set_dims[1], 2); int64_t edge_count = EdgeSet.numel(); const int *edge_data = EdgeSet.data(); diff --git a/lite/backends/x86/math/unpooling.cc b/lite/backends/x86/math/unpooling.cc index 119d7294e9ec21e67f09776ad20d04f15b8b81ce..7ff132cbf121172b5bf35966637080d599eaf498 100644 --- a/lite/backends/x86/math/unpooling.cc +++ b/lite/backends/x86/math/unpooling.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
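Several of the checks above (the LoD checks in sequence2batch and CheckDims in sequence_padding) rest on the level-0 LoD convention: offsets start at 0, are cumulative, and the last entry equals the first dimension of the stacked sequence tensor. A small illustration with made-up lengths:

    #include <cstdint>
    #include <vector>
    #include "lite/utils/cp_logging.h"

    void LodOffsetsSketch() {
      // Three sequences of lengths 2, 3 and 1 give offsets {0, 2, 5, 6}.
      std::vector<uint64_t> seq_offset = {0, 2, 5, 6};
      uint64_t seq_tensor_rows = 6;  // first dimension of the sequence tensor
      CHECK_EQ(seq_tensor_rows, seq_offset.back())
          << "Value of 1st dimension of the sequence tensor should be "
             "equal to sum of lengths of all sequences.";
    }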
*/ #include "lite/backends/x86/math/unpooling.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -41,7 +41,7 @@ class Unpool2dMaxFunctor { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { int index = indices_data[i]; - PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + CHECK(index < output_feasize) << "err index in unpooling!"; output_data[index] = input_data[i]; } input_data += input_feasize; @@ -77,7 +77,7 @@ class Unpool2dMaxGradFunctor { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { int index = indices_data[i]; - PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + CHECK(index < output_feasize) << "err index in unpooling!"; input_grad_data[i] = output_grad_data[index]; } input_grad_data += input_feasize; diff --git a/lite/backends/x86/math/vol2col.cc b/lite/backends/x86/math/vol2col.cc index 91979bb7fdcfe66d84ded3f9797144ddafc8769e..8e8f44be55fc2df342092ad399f00bcc7941908d 100644 --- a/lite/backends/x86/math/vol2col.cc +++ b/lite/backends/x86/math/vol2col.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "lite/backends/x86/math/vol2col.h" #include -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -36,8 +36,8 @@ class Vol2ColFunctor { const std::vector& strides, const std::vector& paddings, lite::Tensor* col) const { - PADDLE_ENFORCE(vol.dims().size() == 4); - PADDLE_ENFORCE(col->dims().size() == 7); + CHECK_EQ(vol.dims().size(), 4); + CHECK_EQ(col->dims().size(), 7); int input_channels = vol.dims()[0]; int input_depth = vol.dims()[1]; @@ -52,27 +52,27 @@ class Vol2ColFunctor { int channels_col = input_channels * filter_depth * filter_height * filter_width; - PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1, - output_depth, - "input_depth and output_depth are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1, - output_height, - "input_height and output_height are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1, - output_width, - "input_width and output_width are " - "mismatching."); + CHECK_EQ((input_depth + 2 * paddings[0] - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1, + output_depth) + << "input_depth and output_depth are " + "mismatching."; + CHECK_EQ((input_height + 2 * paddings[1] - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1, + output_height) + << "input_height and output_height are " + "mismatching."; + CHECK_EQ((input_width + 2 * paddings[2] - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1, + output_width) + << "input_width and output_width are " + "mismatching."; const T* vol_data = vol.data(); T* col_data = col->template mutable_data(); @@ -122,8 +122,8 @@ class Col2VolFunctor { const std::vector& strides, const std::vector& paddings, lite::Tensor* vol) const { - PADDLE_ENFORCE(vol->dims().size() == 4); - PADDLE_ENFORCE(col.dims().size() == 7); + CHECK_EQ(vol->dims().size(), 4); + CHECK_EQ(col.dims().size(), 7); int input_channels = vol->dims()[0]; int input_depth = vol->dims()[1]; @@ -138,27 +138,27 @@ class Col2VolFunctor { int channels_col = input_channels * filter_depth * filter_height * filter_width; - 
PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1, - output_depth, - "input_depth and output_depth are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1, - output_height, - "input_height and output_height are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1, - output_width, - "input_width and output_width are " - "mismatching."); + CHECK_EQ((input_depth + 2 * paddings[0] - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1, + output_depth) + << "input_depth and output_depth are " + "mismatching."; + CHECK_EQ((input_height + 2 * paddings[1] - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1, + output_height) + << "input_height and output_height are " + "mismatching."; + CHECK_EQ((input_width + 2 * paddings[2] - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1, + output_width) + << "input_width and output_width are " + "mismatching."; T* vol_data = vol->template mutable_data(); const T* col_data = col.data(); diff --git a/lite/backends/xpu/debug.h b/lite/backends/xpu/debug.h new file mode 100644 index 0000000000000000000000000000000000000000..75d18b6f4bf461a871c26c7665d8b48bc2f3db38 --- /dev/null +++ b/lite/backends/xpu/debug.h @@ -0,0 +1,131 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
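The Vol2ColFunctor/Col2VolFunctor checks above assert the usual convolution output-size relation along each spatial axis. Written out once, with illustrative numbers (the helper name is not part of the patch):

    // out = (in + 2 * pad - (dilation * (kernel - 1) + 1)) / stride + 1
    inline int ConvOutputSize(int in, int kernel, int pad, int stride, int dilation) {
      return (in + 2 * pad - (dilation * (kernel - 1) + 1)) / stride + 1;
    }
    // e.g. in = 5, kernel = 3, pad = 1, stride = 1, dilation = 1
    //   -> (5 + 2 - 3) / 1 + 1 = 5, so output_depth must equal 5.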
+ +#pragma once + +#include +#include +#include +#include +#include +#include "lite/backends/xpu/xpu_header_sitter.h" + +namespace paddle { +namespace lite { +namespace xpu { + +template +void DumpCPUMem(const T* ptr, + size_t len, + const std::string& comment = "", + size_t stride = 1, + size_t item_per_line = 30) { + size_t after_stride_len = (len + stride - 1) / stride; + std::unique_ptr after_stride(new T[after_stride_len]); + for (size_t i = 0; i < after_stride_len; ++i) { + after_stride[i] = ptr[i * stride]; + } + double sum = 0; + for (size_t i = 0; i < len; ++i) { + sum += ptr[i]; + } + + printf( + "------------------------------ [%s] len=%zd stride=%zd sum=%f BEGIN " + "------------------------------\n", + comment.c_str(), + len, + stride, + sum); + size_t nline = (after_stride_len + item_per_line - 1) / item_per_line; + for (size_t i = 0; i < nline; ++i) { + size_t line_begin = i * item_per_line; + size_t line_end = line_begin + item_per_line; + printf("line[%04zd] -- ", i); + for (size_t ii = line_begin; (ii < line_end) && (ii < after_stride_len); + ++ii) { + if (std::is_same::value) { + printf("%.6f, ", static_cast(after_stride[ii])); + } else if (std::is_same::value) { + printf("%d ", static_cast(after_stride[ii])); + } else { + // CHECK(false) << "unknown type"; + } + } + printf("\n"); + } + printf( + "------------------------------ [%s] len=%zd stride=%zd sum=%f END " + "------------------------------\n", + comment.c_str(), + len, + stride, + sum); +} + +template +void DumpXPUMem(const T* ptr, + size_t len, + const std::string& comment = "", + size_t stride = 1, + size_t item_per_line = 30) { + size_t after_stride_len = (len + stride - 1) / stride; + std::unique_ptr cpu_mem(new T[len]); + xpu_memcpy( + cpu_mem.get(), ptr, len * sizeof(T), XPUMemcpyKind::XPU_DEVICE_TO_HOST); + std::unique_ptr after_stride(new T[after_stride_len]); + for (size_t i = 0; i < after_stride_len; ++i) { + after_stride[i] = cpu_mem[i * stride]; + } + double sum = 0; + for (size_t i = 0; i < len; ++i) { + sum += cpu_mem[i]; + } + + printf( + "------------------------------ [%s] len=%zd stride=%zd sum=%f BEGIN " + "------------------------------\n", + comment.c_str(), + len, + stride, + sum); + size_t nline = (after_stride_len + item_per_line - 1) / item_per_line; + for (size_t i = 0; i < nline; ++i) { + size_t line_begin = i * item_per_line; + size_t line_end = line_begin + item_per_line; + printf("line[%04zd] -- ", i); + for (size_t ii = line_begin; (ii < line_end) && (ii < after_stride_len); + ++ii) { + if (std::is_same::value) { + printf("%.6f, ", static_cast(after_stride[ii])); + } else if (std::is_same::value) { + printf("%d ", static_cast(after_stride[ii])); + } else { + // CHECK(false) << "unknown type"; + } + } + printf("\n"); + } + printf( + "------------------------------ [%s] len=%zd stride=%zd sum=%f END " + "------------------------------\n", + comment.c_str(), + len, + stride, + sum); +} + +} // namespace xpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/target_wrapper.cc b/lite/backends/xpu/target_wrapper.cc index 5dcbc1e275cca8c32003cbef74dfb1f6d4caee93..85a0023590858ab72e9e4f258d62dce809888918 100644 --- a/lite/backends/xpu/target_wrapper.cc +++ b/lite/backends/xpu/target_wrapper.cc @@ -13,7 +13,6 @@ // limitations under the License. 
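The new lite/backends/xpu/debug.h added above provides two printf-based dump helpers for inspecting buffers while debugging XPU kernels; DumpXPUMem first copies the device buffer back to the host with xpu_memcpy. A minimal usage sketch (the buffer names and stride value are illustrative):

    #include "lite/backends/xpu/debug.h"

    void InspectConvOutput(const float* host_ptr, const float* xpu_ptr, size_t len) {
      // Print every 4th element of a host-side buffer, 30 items per line by default.
      paddle::lite::xpu::DumpCPUMem(host_ptr, len, "conv_out (host)", /*stride=*/4);
      // Copies the device buffer back to the host internally, then prints it.
      paddle::lite::xpu::DumpXPUMem(xpu_ptr, len, "conv_out (xpu)");
    }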
#include "lite/backends/xpu/target_wrapper.h" -#include "lite/backends/xpu/xpu_header_sitter.h" namespace paddle { namespace lite { @@ -42,5 +41,21 @@ void TargetWrapperXPU::MemcpySync(void* dst, } } +XPUScratchPadGuard TargetWrapperXPU::MallocScratchPad(size_t size, + bool use_l3) { + void* ptr{nullptr}; + if (use_l3) { + ptr = xdnn::alloc_workspace(GetRawContext(), size); + } else { + ptr = TargetWrapperXPU::Malloc(size); + } + CHECK(ptr != nullptr); + return XPUScratchPadGuard(new XPUScratchPad(ptr, use_l3)); +} + +std::string TargetWrapperXPU::multi_encoder_precision; // NOLINT +int TargetWrapperXPU::workspace_l3_size_per_thread{0}; +thread_local xdnn::Context* TargetWrapperXPU::tls_raw_ctx_{nullptr}; + } // namespace lite } // namespace paddle diff --git a/lite/backends/xpu/target_wrapper.h b/lite/backends/xpu/target_wrapper.h index c42d4139246085d8b9a367b45b60699209d0b668..b84b5d75e74a14e81091b003aa3ae5514e53a42c 100644 --- a/lite/backends/xpu/target_wrapper.h +++ b/lite/backends/xpu/target_wrapper.h @@ -14,6 +14,8 @@ #pragma once +#include // std::unique_ptr +#include "lite/backends/xpu/xpu_header_sitter.h" // xpu_free #include "lite/core/target_wrapper.h" namespace paddle { @@ -21,6 +23,24 @@ namespace lite { using TargetWrapperXPU = TargetWrapper; +struct XPUScratchPad { + XPUScratchPad(void* addr, bool is_l3) : addr_(addr), is_l3_(is_l3) {} + + void* addr_{nullptr}; + bool is_l3_{false}; +}; + +struct XPUScratchPadDeleter { + void operator()(XPUScratchPad* sp) const { + if (!sp->is_l3_) { + xpu_free(sp->addr_); + } + delete sp; + } +}; + +using XPUScratchPadGuard = std::unique_ptr; + template <> class TargetWrapper { public: @@ -34,6 +54,41 @@ class TargetWrapper { const void* src, size_t size, IoDirection dir); + + static XPUScratchPadGuard MallocScratchPad(size_t size, bool use_l3 = true); + + static xdnn::Context* GetRawContext() { + if (tls_raw_ctx_ == nullptr) { + tls_raw_ctx_ = xdnn::create_context(); + CHECK(tls_raw_ctx_); + int r = xdnn::set_workspace_l3_size(tls_raw_ctx_, + workspace_l3_size_per_thread); + if (r != 0) { + LOG(WARNING) << "xdnn::set_workspace_l3_size() failed, r = " << r + << ", workspace_l3_size_per_thread = " + << workspace_l3_size_per_thread; + } + } + return tls_raw_ctx_; + } + + // **DEPRECATED**, use xpu_set_device() at the very beginning of each worker + // thread + static void SetDev(int dev_no = 0) { + const char* dev_env = getenv("LITE_XPU_DEV"); + if (dev_env) { + xpu_set_device(atoi(dev_env)); + return; + } + + xpu_set_device(dev_no); + } + + static std::string multi_encoder_precision; // NOLINT + static int workspace_l3_size_per_thread; + + private: + static thread_local xdnn::Context* tls_raw_ctx_; }; } // namespace lite diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index 56a5c9b8f7ea0ed47d21629d7ccf083b4f9fa232..af2bfbe86aaa1b3f145838015a6d6a62090cb3b1 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -121,7 +121,7 @@ lite_cc_library(kernel SRCS kernel.cc PROFILE_DEPS lite_profiler ) lite_cc_library(op SRCS op_lite.cc DEPS scope op_registry target_wrapper kernel - cpp_op_desc tensor + cpp_op_desc tensor utils ) add_dependencies(kernel kernel_list_h) diff --git a/lite/core/arena/framework.cc b/lite/core/arena/framework.cc index 731215f542567ec3ff0cc87d6990624bfa6b2bc2..1138a3bcc2e3e3f3c77d94bf8128b8231f930550 100644 --- a/lite/core/arena/framework.cc +++ b/lite/core/arena/framework.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "lite/core/arena/framework.h" +#include #include "lite/core/context.h" #include "lite/operators/subgraph_op.h" @@ -22,7 +23,14 @@ namespace arena { void TestCase::CreateInstruction() { std::shared_ptr op = nullptr; - if (place_.target == TARGET(kNPU) || place_.target == TARGET(kXPU)) { + static const std::set subgraph_op_supported_targets( + {TARGET(kNPU), TARGET(kXPU)}); + bool enable_subgraph_op = subgraph_op_supported_targets.find(place_.target) != + subgraph_op_supported_targets.end(); +#if defined(LITE_WITH_XPU) && !defined(LITE_WITH_XTCL) + enable_subgraph_op = false; // Use XPU kernel directly if XTCL is disabled. +#endif + if (enable_subgraph_op) { // Create a new block desc to wrap the original op desc int sub_block_idx = 0; auto sub_block_desc = new cpp::BlockDesc(); @@ -47,7 +55,7 @@ void TestCase::CreateInstruction() { op = LiteOpRegistry::Global().Create(op_desc().Type()); } CHECK(op) << "no op for " << op_desc().Type(); - op->Attach(*op_desc_, inst_scope_); + op->Attach(*op_desc_, inst_scope_.get()); auto kernels = op->CreateKernels({place_}); // filter out the target kernel CHECK(!kernels.empty()) << "No kernel found for place " @@ -72,53 +80,35 @@ void TestCase::CreateInstruction() { void TestCase::PrepareInputsForInstruction() { for (auto& arg : op_desc().InputArgumentNames()) { for (auto& var : op_desc().Input(arg)) { - std::string kernel_key = instruction_->kernel()->key_with_alias(); - const auto* param_type = ParamTypeRegistry::Global().RetrieveInArgument( - place_, kernel_key, arg); - - const Type* inst_type = nullptr; - if (param_type->type->IsTensor()) { - inst_type = Type::GetTensorTy(TARGET(kHost)); - } else if (param_type->type->IsTensorList()) { - inst_type = Type::GetTensorListTy(TARGET(kHost)); - } else { - LOG(FATAL) << "unsupported param_type"; - } - - CHECK(scope_->FindVar(var)); - if (!TargetCompatibleTo(*inst_type, *param_type->type)) { - /// Create a tensor or tensor_array in the instruction's scope, - /// alloc memory and then copy data there. - if (param_type->type->IsTensor()) { - const auto* shared_tensor = scope_->FindTensor(var); - auto* target_tensor = inst_scope_->NewTensor(var); - CHECK(!shared_tensor->dims().empty()) << "shared_tensor is empty yet"; - target_tensor->Resize(shared_tensor->dims()); - TargetCopy(param_type->type->target(), - target_tensor->mutable_data(param_type->type->target(), - shared_tensor->memory_size()), - shared_tensor->raw_data(), - shared_tensor->memory_size()); - } else if (param_type->type->IsTensorList()) { - const auto* shared_tensor_array = - scope_->FindVar(var)->GetMutable>(); - auto* target_tensor_array = - inst_scope_->Var(var)->GetMutable>(); - CHECK(!shared_tensor_array->empty()) - << "shared_tensor_array is empty yet"; - target_tensor_array->resize(shared_tensor_array->size()); - for (size_t i = 0; i < shared_tensor_array->size(); i++) { - target_tensor_array->at(i).Resize( - shared_tensor_array->at(i).dims()); - TargetCopy(param_type->type->target(), - target_tensor_array->at(i).mutable_data( - param_type->type->target(), - shared_tensor_array->at(i).memory_size()), - shared_tensor_array->at(i).raw_data(), - shared_tensor_array->at(i).memory_size()); - } - } else { - LOG(FATAL) << "not support"; + const auto* type = instruction_->kernel()->GetInputDeclType(arg); + CHECK(base_scope_->FindVar(var)); + /// Create a tensor or tensor_array in the instruction's scope, + /// alloc memory and then copy data there. 
+ if (type->IsTensor() && + !TargetCompatibleTo(*Type::GetTensorTy(TARGET(kHost)), *type)) { + const auto* base_tensor = base_scope_->FindTensor(var); + auto* inst_tensor = inst_scope_->FindMutableTensor(var); + CHECK(!base_tensor->dims().empty()) + << "The dims of input tensor is empty yet"; + TargetCopy(type->target(), + inst_tensor->mutable_data(type->target(), + base_tensor->memory_size()), + base_tensor->raw_data(), + base_tensor->memory_size()); + } else if (type->IsTensorList() && + !TargetCompatibleTo(*Type::GetTensorListTy(TARGET(kHost)), + *type)) { + const auto* base_tensor_list = base_scope_->FindTensorList(var); + auto* inst_tensor_list = inst_scope_->FindMutableTensorList(var); + CHECK_EQ(base_tensor_list->size(), inst_tensor_list->size()); + for (size_t i = 0; i < base_tensor_list->size(); i++) { + CHECK(!base_tensor_list->at(i).dims().empty()) + << "The dims of input tensor[" << i << "] is empty yet"; + TargetCopy(type->target(), + inst_tensor_list->at(i).mutable_data( + type->target(), base_tensor_list->at(i).memory_size()), + inst_tensor_list->at(i).raw_data(), + inst_tensor_list->at(i).memory_size()); } } } @@ -126,78 +116,88 @@ void TestCase::PrepareInputsForInstruction() { } template -bool TestCase::CheckTensorPrecision(const Tensor* a_tensor, - const Tensor* b_tensor, +bool TestCase::CheckTensorPrecision(const Tensor* inst_tensor, + const Tensor* base_tensor, float abs_error) { - CHECK(a_tensor); - CHECK(b_tensor); + CHECK(inst_tensor); + CHECK(base_tensor); - CHECK(ShapeEquals(a_tensor->dims(), b_tensor->dims())); + CHECK(ShapeEquals(inst_tensor->dims(), base_tensor->dims())); - CHECK(a_tensor->lod() == b_tensor->lod()) << "lod not match"; + CHECK(inst_tensor->lod() == base_tensor->lod()) << "lod not match"; // The baseline should output in host devices. - CHECK(b_tensor->target() == TARGET(kHost) || - b_tensor->target() == TARGET(kX86) || - b_tensor->target() == TARGET(kARM)); - - const T* a_data{}; - switch (a_tensor->target()) { + CHECK(base_tensor->target() == TARGET(kHost) || + base_tensor->target() == TARGET(kX86) || + base_tensor->target() == TARGET(kARM)); + const T* inst_data{}; + Tensor inst_host_tensor; + inst_host_tensor.Resize(inst_tensor->dims()); + switch (inst_tensor->target()) { case TARGET(kX86): case TARGET(kHost): case TARGET(kARM): - a_data = static_cast(a_tensor->raw_data()); + inst_data = static_cast(inst_tensor->raw_data()); + break; +#ifdef LITE_WITH_XPU + case TARGET(kXPU): + CopySync(inst_host_tensor.mutable_data(), + inst_tensor->raw_data(), + sizeof(T) * inst_tensor->dims().production(), + IoDirection::DtoH); + inst_data = inst_host_tensor.data(); break; +#endif default: // Before compare, need to copy data from `target` device to host. 
LOG(FATAL) << "Not supported"; } - CHECK(a_data); + CHECK(inst_data); - const T* b_data = static_cast(b_tensor->raw_data()); + const T* base_data = static_cast(base_tensor->raw_data()); bool success = true; - for (int i = 0; i < a_tensor->dims().production(); i++) { - EXPECT_NEAR(a_data[i], b_data[i], abs_error); - if (fabsf(a_data[i] - b_data[i]) > abs_error) { + for (int i = 0; i < inst_tensor->dims().production(); i++) { + EXPECT_NEAR(inst_data[i], base_data[i], abs_error); + if (fabsf(inst_data[i] - base_data[i]) > abs_error) { success = false; } } return success; } -bool TestCase::CheckPrecision(const Tensor* a_tensor, - const Tensor* b_tensor, +bool TestCase::CheckPrecision(const Tensor* inst_tensor, + const Tensor* base_tensor, float abs_error, PrecisionType precision_type) { PrecisionType precision_type_t = precision_type; if (precision_type == PRECISION(kAny)) { - precision_type_t = b_tensor->precision(); + precision_type_t = base_tensor->precision(); } - CHECK(precision_type_t == b_tensor->precision()) + CHECK(precision_type_t == base_tensor->precision()) << "arg precision type and base tensor precision type are not matched! " "arg precision type is: " << PrecisionToStr(precision_type) << ", base tensor precision type is: " - << PrecisionToStr(b_tensor->precision()); - CHECK(a_tensor->precision() == b_tensor->precision()) + << PrecisionToStr(base_tensor->precision()); + CHECK(inst_tensor->precision() == base_tensor->precision()) << "real tensor precision type and base tensor precision type are not " "matched! real tensor precision type is: " - << PrecisionToStr(a_tensor->precision()) + << PrecisionToStr(inst_tensor->precision()) << ", base tensor precision type is: " - << PrecisionToStr(b_tensor->precision()); + << PrecisionToStr(base_tensor->precision()); switch (precision_type_t) { case PRECISION(kFloat): - return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + return CheckTensorPrecision(inst_tensor, base_tensor, abs_error); case PRECISION(kInt8): - return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + return CheckTensorPrecision(inst_tensor, base_tensor, abs_error); case PRECISION(kInt32): - return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + return CheckTensorPrecision(inst_tensor, base_tensor, abs_error); case PRECISION(kInt64): - return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + return CheckTensorPrecision(inst_tensor, base_tensor, abs_error); case PRECISION(kBool): - return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + return CheckTensorPrecision(inst_tensor, base_tensor, abs_error); default: LOG(FATAL) << "not support type: " << PrecisionToStr(precision_type); return false; @@ -209,24 +209,24 @@ bool TestCase::CheckPrecision(const std::string& var_name, PrecisionType precision_type) { bool success = true; if (inst_scope_->FindVar(var_name)->IsType()) { - auto a_tensor = inst_scope_->FindTensor(var_name); - auto b_tensor = base_scope_->FindTensor(var_name); - success = success && - CheckPrecision(a_tensor, b_tensor, abs_error, precision_type); + auto inst_tensor = inst_scope_->FindTensor(var_name); + auto base_tensor = base_scope_->FindTensor(var_name); + success = + success && + CheckPrecision(inst_tensor, base_tensor, abs_error, precision_type); } else if (inst_scope_->FindVar(var_name)->IsType>()) { - auto a_tensor_array = - inst_scope_->FindVar(var_name)->GetMutable>(); - auto b_tensor_array = - base_scope_->FindVar(var_name)->GetMutable>(); - CHECK_EQ(a_tensor_array->size(), b_tensor_array->size()); - for (size_t i = 0; i 
< a_tensor_array->size(); i++) { - Tensor* a_tensor = &(a_tensor_array->at(i)); - Tensor* b_tensor = &(b_tensor_array->at(i)); - if (a_tensor->dims().size() == 0 && b_tensor->dims().size() == 0) { + auto inst_tensor_list = inst_scope_->FindMutableTensorList(var_name); + auto base_tensor_list = base_scope_->FindMutableTensorList(var_name); + CHECK_EQ(inst_tensor_list->size(), base_tensor_list->size()); + for (size_t i = 0; i < inst_tensor_list->size(); i++) { + Tensor* inst_tensor = &(inst_tensor_list->at(i)); + Tensor* base_tensor = &(base_tensor_list->at(i)); + if (inst_tensor->dims().size() == 0 && base_tensor->dims().size() == 0) { continue; } - success = success && - CheckPrecision(a_tensor, b_tensor, abs_error, precision_type); + success = + success && + CheckPrecision(inst_tensor, base_tensor, abs_error, precision_type); } } else { LOG(FATAL) << "unsupported var type"; diff --git a/lite/core/arena/framework.h b/lite/core/arena/framework.h index cf864a32044e3dfd03ecd03327a0db69275ef586..4e73768e53576f03e47158618fa4f0eac0851382 100644 --- a/lite/core/arena/framework.h +++ b/lite/core/arena/framework.h @@ -28,7 +28,7 @@ #include "lite/core/program.h" #include "lite/core/scope.h" #include "lite/core/types.h" -#include "lite/model_parser/cpp/op_desc.h" +#include "lite/model_parser/cpp_desc.h" namespace paddle { namespace lite { @@ -40,13 +40,15 @@ namespace arena { class TestCase { public: explicit TestCase(const Place& place, const std::string& alias) - : place_(place), scope_(new Scope), alias_(alias) { + : place_(place), + alias_(alias), + inst_scope_(new Scope), + base_scope_(new Scope) { ctx_ = ContextScheduler::Global().NewContext(place_.target); } virtual ~TestCase(); void Prepare() { - PrepareScopes(); PrepareData(); op_desc_.reset(new cpp::OpDesc); PrepareOpDesc(op_desc_.get()); @@ -91,16 +93,15 @@ class TestCase { // kernel registry. void CheckKernelConsistWithDefinition() {} - Scope& scope() { return *scope_; } - - Scope* baseline_scope() { return base_scope_; } - Scope* inst_scope() { return inst_scope_; } + Scope* baseline_scope() { return base_scope_.get(); } + Scope* inst_scope() { return inst_scope_.get(); } protected: // Prepare inputs in scope() for Tester. virtual void PrepareData() = 0; - /// Prepare a tensor in host. The tensors will be created in scope_. + /// Prepare a tensor in host. The tensors will be created both in base_scope_ + /// and inst_scope_. /// Need to specify the targets other than X86 or ARM. template void SetCommonTensor(const std::string& var_name, @@ -108,42 +109,47 @@ class TestCase { const T* data, const LoD& lod = {}, bool is_persistable = false) { - auto* tensor = scope_->NewTensor(var_name); - tensor->Resize(ddim); - auto* d = tensor->mutable_data(); - memcpy(d, data, ddim.production() * sizeof(T)); + // Create and fill a input tensor with the given data for baseline + auto* base_tensor = base_scope_->NewTensor(var_name); + base_tensor->Resize(ddim); + memcpy(base_tensor->mutable_data(), data, ddim.production() * sizeof(T)); // set lod - if (!lod.empty()) *tensor->mutable_lod() = lod; + if (!lod.empty()) *base_tensor->mutable_lod() = lod; // set persistable - tensor->set_persistable(is_persistable); + base_tensor->set_persistable(is_persistable); + + // Create a copy for instruction + auto* inst_tensor = inst_scope_->NewTensor(var_name); + inst_tensor->CopyDataFrom(*base_tensor); } /// Prepare a tensor_array in host. The tensors will be created in scope_. /// Need to specify the targets other than X86 or ARM. 
template void SetCommonTensorList(const std::string& var_name, - const std::vector& array_tensor_dims, + const std::vector& ddims, const std::vector>& datas, const std::vector& lods = {}) { - CHECK_EQ(array_tensor_dims.size(), datas.size()); + // Create a tensor array for baseline, and a copy for instruction + CHECK_EQ(ddims.size(), datas.size()); if (!lods.empty()) { - CHECK_EQ(array_tensor_dims.size(), lods.size()); + CHECK_EQ(ddims.size(), lods.size()); } - auto* tensor_array = - scope_->Var(var_name)->GetMutable>(); - for (int i = 0; i < array_tensor_dims.size(); i++) { - Tensor tmp; - tmp.Resize(array_tensor_dims[i]); - auto* tmp_data = tmp.mutable_data(); - memcpy(tmp_data, + auto* base_tensor_list = base_scope_->NewTensorList(var_name); + auto* inst_tensor_list = inst_scope_->NewTensorList(var_name); + for (int i = 0; i < ddims.size(); i++) { + Tensor item; + item.Resize(ddims[i]); + memcpy(item.mutable_data(), datas[i].data(), - array_tensor_dims[i].production() * sizeof(T)); + ddims[i].production() * sizeof(T)); if (!lods.empty()) { - tmp.set_lod(lods[i]); + item.set_lod(lods[i]); } - tensor_array->push_back(tmp); + base_tensor_list->push_back(item); + inst_tensor_list->push_back(item); } } @@ -157,11 +163,6 @@ class TestCase { std::unique_ptr ctx_; void CreateInstruction(); - void PrepareScopes() { - inst_scope_ = &scope_->NewScope(); - base_scope_ = &scope_->NewScope(); - } - // Check shape // TODO(Superjomn) Move this method to utils or DDim? bool ShapeEquals(const DDim& a, const DDim& b) { @@ -172,25 +173,23 @@ class TestCase { return true; } - /// Copy the input tensors to target devices needed by the instruction. + // Copy the host tensors to the device tensors if needed by the instruction. void PrepareInputsForInstruction(); // Create output tensors and variables. void PrepareOutputsForInstruction() { for (auto x : op_desc().output_vars()) { - inst_scope_->NewTensor(x); - base_scope_->NewTensor(x); + inst_scope_->Var(x); } } private: Place place_; - std::shared_ptr scope_; std::string alias_; // The workspace for the Instruction. - Scope* inst_scope_{}; + std::shared_ptr inst_scope_; // The workspace for the baseline implementation. 
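With the arena::TestCase refactor above, the single shared scope is replaced by two independent scopes, and SetCommonTensor()/SetCommonTensorList() now fill the baseline scope and copy the data into the instruction scope. A rough sketch of how a derived test case feeds an input under the new scheme; the tensor name, shape and values are invented, and the remaining pure-virtual TestCase members are elided:

    #include <vector>
    #include "lite/core/arena/framework.h"

    class HypotheticalTestCase : public paddle::lite::arena::TestCase {
     public:
      using TestCase::TestCase;

     protected:
      void PrepareData() override {
        std::vector<float> x(4, 1.f);
        // Creates "x" in base_scope_ and copies the same tensor into inst_scope_,
        // so the baseline and the instruction no longer share one scope.
        SetCommonTensor("x", paddle::lite::DDim(std::vector<int64_t>{2, 2}), x.data());
      }
      // PrepareOpDesc(), RunBaseline() and the rest of the TestCase interface
      // are omitted from this sketch.
    };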
- Scope* base_scope_{}; + std::shared_ptr base_scope_; std::unique_ptr op_desc_; std::unique_ptr instruction_; }; diff --git a/lite/core/context.cc b/lite/core/context.cc index eb8f90d7fa90d459846b24bc93b5d26cdfc3969a..f14d1dfddea806ab3839f6f897b9d4d3fe396ca8 100644 --- a/lite/core/context.cc +++ b/lite/core/context.cc @@ -21,10 +21,10 @@ namespace lite { std::string Context::subgraph_model_cache_dir_{""}; // NOLINT #endif -#ifdef LITE_WITH_XPU -std::string Context::_multi_encoder_precision; // NOLINT -thread_local xdnn::Context* Context::_tls_raw_ctx{nullptr}; -int Context::_workspace_l3_size_per_thread{0}; +#ifdef LITE_WITH_MLU +int Context::next_queue_id_{0}; +std::map Context::queue_id_map_; +std::mutex Context::map_mutex_; #endif } // namespace lite diff --git a/lite/core/context.h b/lite/core/context.h index f606eeffaf8ccf932e2d17f03478d4d893ee482d..c3993d9589eeac442eaa827152fd1293852396db 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -25,6 +25,7 @@ #ifdef LITE_WITH_MLU #include #include +#include // NOLINT #include "lite/backends/mlu/mlu_utils.h" #endif #ifdef LITE_WITH_XPU @@ -143,45 +144,12 @@ class Context { void CopySharedTo(XPUContext* ctx) {} + // TODO(miaotianxiang): remove this static xdnn::Context* GetRawContext() { - if (_tls_raw_ctx == nullptr) { - _tls_raw_ctx = xdnn::create_context(); - CHECK(_tls_raw_ctx); - int r = xdnn::set_workspace_l3_size(_tls_raw_ctx, - _workspace_l3_size_per_thread); - if (r != 0) { - LOG(WARNING) << "xdnn::set_workspace_l3_size() failed, r = " << r - << ", _workspace_l3_size_per_thread = " - << _workspace_l3_size_per_thread; - } - } - return _tls_raw_ctx; - } - - static void SetWorkspaceL3Size(int l3_size = 0xfffc00) { - _workspace_l3_size_per_thread = l3_size; - } - - // **DEPRECATED**, use xpu_set_device() at the very beginning of each worker - // thread - static void SetDev(int dev_no = 0) { - const char* dev_env = getenv("LITE_XPU_DEV"); - if (dev_env) { - xpu_set_device(atoi(dev_env)); - return; - } - - xpu_set_device(dev_no); + return TargetWrapperXPU::GetRawContext(); } std::string name() const { return "XPUContext"; } - - public: - static std::string _multi_encoder_precision; // NOLINT - - private: - static thread_local xdnn::Context* _tls_raw_ctx; - static int _workspace_l3_size_per_thread; }; #endif @@ -249,11 +217,11 @@ class Context { void InitOnce() {} MLUContext& operator=(const MLUContext& ctx) { - this->Init(ctx.device_id_, ctx.exec_queue_id_, ctx.io_queue_id_); + this->Init(ctx.device_id_, ctx.exec_queue_id_); return *this; } - void Init(int dev_id, int exec_queue_id = 0, int io_queue_id = 0) { + void Init(int dev_id, int exec_queue_id = 0) { CHECK_GT(devs.size(), 0UL) << "Env is not initialized or current target is not exit!"; if (dev_id >= static_cast(devs.size())) { @@ -264,21 +232,19 @@ class Context { device_id_ = dev_id; } SetMluDevice(device_id_); - if (io_queue_id >= devs[dev_id].max_queue()) { - LOG(WARNING) << "data queue index exceeds the maximum queue number, " - "set to default qeueu(0)!"; - io_queue_id = 0; - } - if (exec_queue_id >= devs[dev_id].max_queue()) { - LOG(WARNING) << "exec queue index exceeds the maximum queue number, " - "set to default qeueu(0)!"; - exec_queue_id = 0; + + // get queue id from map + std::unique_lock lk(map_mutex_); + if (queue_id_map_.find(exec_queue_id) == queue_id_map_.end()) { + queue_id_map_[exec_queue_id] = + next_queue_id_++ % devs[dev_id].max_queue(); } - io_queue_ = devs[dev_id].io_queues()[io_queue_id]; - exec_queue_ = 
devs[dev_id].exec_queues()[exec_queue_id]; + exec_queue_id_ = queue_id_map_[exec_queue_id]; + VLOG(4) << "pick mlu queue id: " << exec_queue_id_; + lk.unlock(); - exec_queue_id_ = exec_queue_id; - io_queue_id_ = io_queue_id; + io_queue_ = devs[dev_id].io_queues()[exec_queue_id_]; + exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id_]; } void CopySharedTo(MLUContext* ctx) { ctx->forward_param_ = forward_param_; } @@ -290,10 +256,12 @@ class Context { void SetIoQueue(cnrtQueue_t queue) { io_queue_ = queue; } cnmlCoreVersion_t MLUCoreVersion() { - return DeviceInfo::Global().MLUCoreVersion(); + return paddle::lite::TargetWrapperMlu::MLUCoreVersion(); } - int MLUCoreNumber() { return DeviceInfo::Global().MLUCoreNumber(); } + int MLUCoreNumber() { + return paddle::lite::TargetWrapperMlu::MLUCoreNumber(); + } u32_t affinity() { return affinity_; } @@ -304,10 +272,12 @@ class Context { std::string name() const { return "MLUContext"; } private: + static int next_queue_id_; + static std::map queue_id_map_; + static std::mutex map_mutex_; int device_id_; // overall information int exec_queue_id_; - int io_queue_id_; cnrtQueue_t io_queue_; cnrtQueue_t exec_queue_; @@ -455,7 +425,7 @@ class ContextScheduler { case TARGET(kMLU): { int dev_id = TargetWrapper::GetCurDevice(); auto& context = ctx->As(); - context.Init(dev_id); + context.Init(dev_id, exec_stream_id); kernel_contexts_[TargetType::kMLU].As().CopySharedTo( &context); LOG(INFO) << "New Context for MLU"; diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc index ac79ede37406188f495690179b4a4886bc009d80..6d404cee9718a94d2646728c8f2d79576ceb7860 100644 --- a/lite/core/device_info.cc +++ b/lite/core/device_info.cc @@ -66,15 +66,6 @@ thread_local std::vector DeviceInfo::active_ids_; thread_local TensorLite DeviceInfo::workspace_; thread_local int64_t DeviceInfo::count_ = 0; -#ifdef LITE_WITH_MLU -thread_local cnmlCoreVersion_t DeviceInfo::mlu_core_version_{CNML_MLU270}; -thread_local int DeviceInfo::mlu_core_number_{1}; -thread_local bool DeviceInfo::use_first_conv_{false}; -thread_local std::vector DeviceInfo::mean_vec_; -thread_local std::vector DeviceInfo::std_vec_; -thread_local DataLayoutType DeviceInfo::input_layout_{DATALAYOUT(kNCHW)}; -#endif - #ifdef TARGET_IOS const int DEFAULT_L1_CACHE_SIZE = 64 * 1024; const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024; @@ -1089,45 +1080,6 @@ int DeviceInfo::Setup() { return 0; } -#ifdef LITE_WITH_MLU -void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version, - int core_number, - bool use_first_conv, - const std::vector& mean_vec, - const std::vector& std_vec, - DataLayoutType input_layout) { - switch (core_version) { - case (lite_api::MLUCoreVersion::MLU_220): - mlu_core_version_ = CNML_MLU220; - break; - case (lite_api::MLUCoreVersion::MLU_270): - mlu_core_version_ = CNML_MLU270; - break; - default: - mlu_core_version_ = CNML_MLU270; - break; - } - mlu_core_number_ = core_number; - use_first_conv_ = use_first_conv; - mean_vec_ = mean_vec; - std_vec_ = std_vec; - input_layout_ = input_layout; -} - -cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; } - -int DeviceInfo::MLUCoreNumber() { return mlu_core_number_; } - -bool DeviceInfo::UseFirstConv() { return use_first_conv_; } - -const std::vector& DeviceInfo::MeanVec() const { return mean_vec_; } - -const std::vector& DeviceInfo::StdVec() const { return std_vec_; } - -DataLayoutType DeviceInfo::InputLayout() const { return input_layout_; } - -#endif // LITE_WITH_MLU - void 
DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) { #ifdef ARM_WITH_OMP thread_num = std::min(thread_num, core_num_); diff --git a/lite/core/device_info.h b/lite/core/device_info.h index 603e3e6b91dc6035ffce2265a27bed4d59db5a9c..7aa3131d8fb1a5f8d573c483bafcb7f4d5c62ec7 100644 --- a/lite/core/device_info.h +++ b/lite/core/device_info.h @@ -59,20 +59,6 @@ class DeviceInfo { int Setup(); void SetRunMode(lite_api::PowerMode mode, int thread_num); -#ifdef LITE_WITH_MLU - void SetMLURunMode(lite_api::MLUCoreVersion core_version, - int core_number, - bool use_first_conv, - const std::vector& mean_vec, - const std::vector& std_vec, - DataLayoutType input_layout); - cnmlCoreVersion_t MLUCoreVersion(); - int MLUCoreNumber(); - bool UseFirstConv(); - const std::vector& MeanVec() const; - const std::vector& StdVec() const; - DataLayoutType InputLayout() const; -#endif void SetCache(int l1size, int l2size, int l3size); void SetArch(ARMArch arch) { arch_ = arch; } @@ -124,15 +110,6 @@ class DeviceInfo { static thread_local TensorLite workspace_; static thread_local int64_t count_; -#ifdef LITE_WITH_MLU - static thread_local cnmlCoreVersion_t mlu_core_version_; - static thread_local int mlu_core_number_; - static thread_local bool use_first_conv_; - static thread_local std::vector mean_vec_; - static thread_local std::vector std_vec_; - static thread_local DataLayoutType input_layout_; -#endif - void SetDotInfo(int argc, ...); void SetFP16Info(int argc, ...); void SetFP32Info(int argc, ...); diff --git a/lite/core/kernel.h b/lite/core/kernel.h index 9fffcc60012060327612345528c705bcf7722f17..361d014acc512dc2a46061f86efa83e1e1845807 100644 --- a/lite/core/kernel.h +++ b/lite/core/kernel.h @@ -66,7 +66,7 @@ class KernelBase { virtual void SetProfileRuntimeKernelInfo( paddle::lite::profile::OpCharacter* ch) { ch->kernel_func_name = std::string("NotImpl"); -#ifdef LITE_WITH_ARM +#ifdef LITE_WITH_OPENCL ch->cl_event = event_; #endif } diff --git a/lite/core/memory.cc b/lite/core/memory.cc index 1f2f7fed7d61b67a76f54a092b6d48951bc9fcbd..83e41d2c0960d87a0201b55b943529a9df4f6ab2 100644 --- a/lite/core/memory.cc +++ b/lite/core/memory.cc @@ -140,6 +140,11 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) { dst, src, size, IoDirection::HtoD); break; #endif +#ifdef LITE_WITH_XPU + case TargetType::kXPU: + TargetWrapperXPU::MemcpySync(dst, src, size, IoDirection::HtoD); + break; +#endif #ifdef LITE_WITH_OPENCL case TargetType::kOpenCL: TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD); diff --git a/lite/core/memory.h b/lite/core/memory.h index a1013910019251271ddfccfbc700297c45226fe6..c80c8fb6b6e1356ebfa52920a8ee39f61ed20692 100644 --- a/lite/core/memory.h +++ b/lite/core/memory.h @@ -97,6 +97,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) { case TARGET(kBM): TargetWrapper::MemcpySync(dst, src, size, dir); break; +#endif +#ifdef LITE_WITH_XPU + case TARGET(kXPU): + TargetWrapperXPU::MemcpySync(dst, src, size, dir); + break; #endif default: LOG(FATAL) diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt index 2540bb56d4082570c984e8eea009b5575825fec9..be09ed4b1a63154b8561f4d39cff7d987a9fcba7 100644 --- a/lite/core/mir/CMakeLists.txt +++ b/lite/core/mir/CMakeLists.txt @@ -23,9 +23,11 @@ lite_cc_library(mir_passes fusion/sequence_pool_concat_fuse_pass.cc fusion/scale_activation_fuse_pass.cc fusion/__xpu__resnet_fuse_pass.cc + fusion/__xpu__resnet_cbam_fuse_pass.cc fusion/__xpu__multi_encoder_fuse_pass.cc 
        fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc
        fusion/__xpu__fc_fuse_pass.cc
+       fusion/__xpu__mmdnn_fuse_pass.cc
        elimination/identity_scale_eliminate_pass.cc
        elimination/identity_dropout_eliminate_pass.cc
        elimination/elementwise_mul_constant_eliminate_pass.cc
diff --git a/lite/core/mir/elimination/remove_tf_redundant_ops_pass.cc b/lite/core/mir/elimination/remove_tf_redundant_ops_pass.cc
index f4226820d0437db8cad0cfdac92be15359bb90bd..673854b118a8adaca73cb905eda4892b6903665c 100644
--- a/lite/core/mir/elimination/remove_tf_redundant_ops_pass.cc
+++ b/lite/core/mir/elimination/remove_tf_redundant_ops_pass.cc
@@ -18,7 +18,7 @@
 #include "lite/core/mir/pass.h"
 #include "lite/core/mir/pass_registry.h"
 #include "lite/core/mir/pattern_matcher.h"
-#include "lite/model_parser/cpp/var_desc.h"
+#include "lite/model_parser/cpp_desc.h"
 
 namespace paddle {
 namespace lite {
diff --git a/lite/core/mir/fusion/__xpu__mmdnn_fuse_pass.cc b/lite/core/mir/fusion/__xpu__mmdnn_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..61aeb2ab1f51ddcd6b153971253f8239472a1031
--- /dev/null
+++ b/lite/core/mir/fusion/__xpu__mmdnn_fuse_pass.cc
@@ -0,0 +1,1183 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
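The fusers defined in this new file (and the resnet_cbam pass added later in this patch) repeatedly convert FP32 weight tensors to int16 using paddle::lite::xpu::math::FindMaxAbs and ConvertFP32ToInt16, then record the per-tensor maximum as an attribute such as "__xpu__w_max" so the fused XPU kernels can dequantize at run time. A minimal, self-contained sketch of what that conversion amounts to, assuming a plain symmetric max-abs mapping (the real helpers in lite/backends/xpu/math.h may differ in rounding or saturation details):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Illustrative stand-ins for xpu::math::FindMaxAbs / ConvertFP32ToInt16.
static float FindMaxAbsSketch(const float* data, int len) {
  float max_abs = 0.f;
  for (int i = 0; i < len; ++i) {
    max_abs = std::max(max_abs, std::fabs(data[i]));
  }
  return max_abs;
}

static void ConvertFP32ToInt16Sketch(const float* src, int16_t* dst,
                                     float max_abs, int len) {
  // Map [-max_abs, max_abs] onto the int16 range; a kernel can later recover
  // an approximate float value as dst[i] * max_abs / 32767.
  const float scale = (max_abs > 0.f) ? 32767.f / max_abs : 0.f;
  for (int i = 0; i < len; ++i) {
    dst[i] = static_cast<int16_t>(std::round(src[i] * scale));
  }
}

int main() {
  std::vector<float> w = {0.5f, -1.25f, 2.0f};
  std::vector<int16_t> w_i16(w.size());
  float max_f = FindMaxAbsSketch(w.data(), static_cast<int>(w.size()));
  ConvertFP32ToInt16Sketch(w.data(), w_i16.data(), max_f,
                           static_cast<int>(w.size()));
  // max_f (2.0 here) is what the pass would store as "__xpu__w_max".
  return (w_i16[2] == 32767) ? 0 : 1;
}
```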
+ +#include +#include +#include "lite/backends/xpu/math.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class XPUMmdnnFloat2Fix { + public: + void operator()(SSAGraph* graph) { + for (auto* node : graph->StmtTopologicalOrder()) { + CHECK(node->IsStmt()); + auto* op_info = node->stmt()->op_info(); + std::string op_type = op_info->Type(); + + static const std::vector target_ops{"var_conv_2d", + "search_fc"}; + if (std::find(target_ops.begin(), target_ops.end(), op_type) != + target_ops.end()) { + std::string weight_name = op_info->Input("W").front(); + auto* scope = node->stmt()->op()->scope(); + auto* weight_t = scope->FindMutableTensor(weight_name); + auto weight_dims = weight_t->dims(); + auto weight_len = weight_t->numel(); + float* weight_on_host = weight_t->mutable_data(); + float max_f = + paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len); + std::unique_ptr weight_int16(new int16_t[weight_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + weight_on_host, weight_int16.get(), max_f, weight_len); + memcpy( + weight_on_host, weight_int16.get(), weight_len * sizeof(int16_t)); + + auto update_op_info = *op_info; + update_op_info.SetAttr("__xpu__float_to_fix", true); + update_op_info.SetAttr("__xpu__w_max", max_f); + node->stmt()->ResetOp(update_op_info, graph->valid_places()); + VLOG(3) << "Float2Fix, op_type=" << op_type + << ", weight_name=" << weight_name; + } else if (op_type == "match_matrix_tensor") { + std::string weight_name = op_info->Input("W").front(); + auto* scope = node->stmt()->op()->scope(); + auto* weight_t = scope->FindMutableTensor(weight_name); + auto weight_dims = weight_t->dims(); + auto weight_len = weight_t->numel(); + float* weight_on_host = weight_t->mutable_data(); + float max_f = + paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len); + std::unique_ptr weight_int16(new int16_t[weight_len]); + std::unique_ptr weight_trans_int16(new int16_t[weight_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + weight_on_host, weight_int16.get(), max_f, weight_len); + paddle::lite::xpu::math::Transpose(weight_int16.get(), + weight_trans_int16.get(), + weight_dims[0], + weight_dims[1] * weight_dims[2]); + memcpy(weight_on_host, + weight_trans_int16.get(), + weight_len * sizeof(int16_t)); + + auto update_op_info = *op_info; + update_op_info.SetAttr("__xpu__float_to_fix", true); + update_op_info.SetAttr("__xpu__w_max", max_f); + node->stmt()->ResetOp(update_op_info, graph->valid_places()); + VLOG(3) << "Float2Fix && Transposed, op_type=" << op_type + << ", weight_name=" << weight_name; + } else if (op_type == "search_grnn") { + auto* scope = node->stmt()->op()->scope(); + + std::string wi_name = op_info->Input("Wi").front(); + auto* wi_t = scope->FindMutableTensor(wi_name); + auto wi_dims = wi_t->dims(); + auto wi_len = wi_t->numel(); + auto wi_stride_len = wi_len / 3; + float* wi_on_host = wi_t->mutable_data(); + std::unique_ptr wi_int16(new int16_t[wi_len]); + std::vector wi_max(3); + for (int i = 0; i < 3; ++i) { + float max_f = paddle::lite::xpu::math::FindMaxAbs( + wi_on_host + i * wi_stride_len, wi_stride_len); + paddle::lite::xpu::math::ConvertFP32ToInt16( + wi_on_host + i * wi_stride_len, + wi_int16.get() + i * wi_stride_len, + max_f, + wi_stride_len); + wi_max[i] = max_f; + } + memcpy(wi_on_host, wi_int16.get(), wi_len * sizeof(int16_t)); + + std::string wh_name = op_info->Input("Wh").front(); + auto* wh_t 
= scope->FindMutableTensor(wh_name); + auto wh_dims = wh_t->dims(); + auto wh_len = wh_t->numel(); + auto wh_stride_len = wh_len / 3; + float* wh_on_host = wh_t->mutable_data(); + std::unique_ptr wh_int16(new int16_t[wh_len]); + std::vector wh_max(3); + for (int i = 0; i < 3; ++i) { + float max_f = paddle::lite::xpu::math::FindMaxAbs( + wh_on_host + i * wh_stride_len, wh_stride_len); + paddle::lite::xpu::math::ConvertFP32ToInt16( + wh_on_host + i * wh_stride_len, + wh_int16.get() + i * wh_stride_len, + max_f, + wh_stride_len); + wh_max[i] = max_f; + } + memcpy(wh_on_host, wh_int16.get(), wh_len * sizeof(int16_t)); + + auto update_op_info = *op_info; + update_op_info.SetAttr("__xpu__float_to_fix", true); + update_op_info.SetAttr>("__xpu__wi_max", wi_max); + update_op_info.SetAttr>("__xpu__wh_max", wh_max); + node->stmt()->ResetOp(update_op_info, graph->valid_places()); + VLOG(3) << "Float2Fix, op_type=" << op_type << ", wi_name=" << wi_name + << ", wh_name=" << wh_name; + } + } + } +}; + +class XPUMmdnnSearchAttentionFuser : public FuseBase { + public: + void BuildPattern() override { + auto* input = VarNode("input")->AsInput(); + + auto* search_group_padding = + OpNode("search_group_padding", "search_group_padding"); + auto* out_emb_padding = + VarNode("out_emb_padding") + ->assert_is_op_output("search_group_padding", "Out_emb_padding") + ->AsIntermediate(); + auto* out_new = VarNode("out_new") + ->assert_is_op_output("search_group_padding", "Out_new") + ->AsIntermediate(); + auto* out_padding = + VarNode("out_padding") + ->assert_is_op_output("search_group_padding", "Out_padding") + ->AsIntermediate(); + + auto* search_seq_fc_w = VarNode("search_seq_fc_w") + ->assert_is_op_input("search_seq_fc", "W") + ->AsInput(); + auto* search_seq_fc_b = VarNode("search_seq_fc_b") + ->assert_is_op_input("search_seq_fc", "b") + ->AsInput(); + auto* search_seq_fc = + OpNode("search_seq_fc", "search_seq_fc")->AsIntermediate(); + auto* search_seq_fc_out = VarNode("search_seq_fc_out") + ->assert_is_op_output("search_seq_fc", "Out") + ->AsIntermediate(); + + auto* search_aligned_mat_mul = + OpNode("search_aligned_mat_mul", "search_aligned_mat_mul") + ->AsIntermediate(); + auto* search_aligned_mat_mul_out = + VarNode("search_aligned_mat_mul_out") + ->assert_is_op_output("search_aligned_mat_mul", "Out") + ->AsIntermediate(); + auto* search_aligned_mat_mul_a = + VarNode("search_aligned_mat_mul_a") + ->assert_is_op_output("search_aligned_mat_mul", "_a_addr") + ->AsIntermediate(); + auto* search_aligned_mat_mul_b = + VarNode("search_aligned_mat_mul_b") + ->assert_is_op_output("search_aligned_mat_mul", "_b_addr") + ->AsIntermediate(); + auto* search_aligned_mat_mul_c = + VarNode("search_aligned_mat_mul_c") + ->assert_is_op_output("search_aligned_mat_mul", "_c_addr") + ->AsIntermediate(); + + auto* search_attention_padding_mask = + OpNode("search_attention_padding_mask", "search_attention_padding_mask") + ->AsIntermediate(); + auto* search_attention_padding_mask_out = + VarNode("search_attention_padding_mask_out") + ->assert_is_op_output("search_attention_padding_mask", "Out") + ->AsIntermediate(); + auto* search_attention_padding_mask_pad_begin = + VarNode("search_attention_padding_mask_pad_begin") + ->assert_is_op_output("search_attention_padding_mask", "pad_begin") + ->AsIntermediate(); + + auto* search_seq_softmax = + OpNode("search_seq_softmax", "search_seq_softmax")->AsIntermediate(); + auto* search_seq_softmax_out = + VarNode("search_seq_softmax_out") + ->assert_is_op_output("search_seq_softmax", "Out") + 
->AsIntermediate(); + auto* search_seq_softmax_out_log = + VarNode("search_seq_softmax_out_log") + ->assert_is_op_output("search_seq_softmax", "Out_log") + ->AsIntermediate(); + + auto* search_aligned_mat_mul_2 = + OpNode("search_aligned_mat_mul_2", "search_aligned_mat_mul") + ->AsIntermediate(); + auto* search_aligned_mat_mul_2_out = + VarNode("search_aligned_mat_mul_2_out") + ->assert_is_op_output("search_aligned_mat_mul", "Out") + ->AsIntermediate(); + auto* search_aligned_mat_mul_2_a = + VarNode("search_aligned_mat_mul_2_a") + ->assert_is_op_output("search_aligned_mat_mul", "_a_addr") + ->AsIntermediate(); + auto* search_aligned_mat_mul_2_b = + VarNode("search_aligned_mat_mul_2_b") + ->assert_is_op_output("search_aligned_mat_mul", "_b_addr") + ->AsIntermediate(); + auto* search_aligned_mat_mul_2_c = + VarNode("search_aligned_mat_mul_2_c") + ->assert_is_op_output("search_aligned_mat_mul", "_c_addr") + ->AsIntermediate(); + + auto* search_seq_depadding = + OpNode("search_seq_depadding")->AsIntermediate(); + auto* search_seq_depadding_out = + VarNode("search_seq_depadding_out")->AsOutput(); + + *input >> *search_group_padding >> *out_emb_padding; + *search_group_padding >> *out_new; + *search_group_padding >> *out_padding; + + *search_seq_fc_w >> *search_seq_fc; + *search_seq_fc_b >> *search_seq_fc; + *out_emb_padding >> *search_seq_fc; + *search_seq_fc >> *search_seq_fc_out; + + *search_seq_fc_out >> *search_aligned_mat_mul; + *out_emb_padding >> *search_aligned_mat_mul; + *search_aligned_mat_mul >> *search_aligned_mat_mul_out; + *search_aligned_mat_mul >> *search_aligned_mat_mul_a; + *search_aligned_mat_mul >> *search_aligned_mat_mul_b; + *search_aligned_mat_mul >> *search_aligned_mat_mul_c; + + *search_aligned_mat_mul_out >> *search_attention_padding_mask; + *out_padding >> *search_attention_padding_mask; + *search_attention_padding_mask >> *search_attention_padding_mask_out; + *search_attention_padding_mask >> *search_attention_padding_mask_pad_begin; + + *search_attention_padding_mask_out >> *search_seq_softmax; + *search_seq_softmax >> *search_seq_softmax_out; + *search_seq_softmax >> *search_seq_softmax_out_log; + + *search_seq_softmax_out >> *search_aligned_mat_mul_2; + *out_emb_padding >> *search_aligned_mat_mul_2; + *search_aligned_mat_mul_2 >> *search_aligned_mat_mul_2_out; + *search_aligned_mat_mul_2 >> *search_aligned_mat_mul_2_a; + *search_aligned_mat_mul_2 >> *search_aligned_mat_mul_2_b; + *search_aligned_mat_mul_2 >> *search_aligned_mat_mul_2_c; + + *search_aligned_mat_mul_2_out >> *search_seq_depadding; + *out_new >> *search_seq_depadding; + *search_seq_depadding >> *search_seq_depadding_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_search_attention"); + op_desc.SetInput("X", {matched.at("input")->arg()->name}); + op_desc.SetInput("W", {matched.at("search_seq_fc_w")->arg()->name}); + op_desc.SetInput("b", {matched.at("search_seq_fc_b")->arg()->name}); + op_desc.SetOutput("Out", + {matched.at("search_seq_depadding_out")->arg()->name}); + + auto* padding_op_info = + matched.at("search_group_padding")->stmt()->op_info(); + op_desc.SetAttr("pad_id", padding_op_info->GetAttr("pad_id")); + auto* matmul_0_op_info = + matched.at("search_aligned_mat_mul")->stmt()->op_info(); + op_desc.SetAttr("alpha0", matmul_0_op_info->GetAttr("alpha")); + auto* matmul_1_op_info = + matched.at("search_aligned_mat_mul_2")->stmt()->op_info(); + op_desc.SetAttr("alpha1", 
matmul_1_op_info->GetAttr("alpha")); + auto* mask_op_info = + matched.at("search_attention_padding_mask")->stmt()->op_info(); + op_desc.SetAttr("mask", mask_op_info->GetAttr("mask")); + + auto* new_stmt = matched.at("search_group_padding")->stmt(); + auto* scope = new_stmt->op()->scope(); + auto w_name = matched.at("search_seq_fc_w")->arg()->name; + auto* w_t = scope->FindMutableTensor(w_name); + auto w_dims = w_t->dims(); + int w_len = w_t->numel(); + float* w_on_host = w_t->mutable_data(); + + float max_f = paddle::lite::xpu::math::FindMaxAbs(w_on_host, w_len); + std::unique_ptr w_int16(new int16_t[w_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + w_on_host, w_int16.get(), max_f, w_len); + memcpy(w_on_host, w_int16.get(), w_len * sizeof(int16_t)); + op_desc.SetAttr("W_max", max_f); + + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, scope); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + DirectedLink(matched.at("search_seq_fc_w"), + matched.at("search_group_padding")); + DirectedLink(matched.at("search_seq_fc_b"), + matched.at("search_group_padding")); + IR_OP_VAR_LINK(matched.at("search_group_padding"), + matched.at("search_seq_depadding_out")); + } +}; + +class XPUMmdnnMatchConvTopkFuser : public FuseBase { + public: + void BuildPattern() override { + auto* input_x = VarNode("input_x") + ->assert_is_op_input("match_matrix_tensor", "X") + ->AsInput(); + auto* input_y = VarNode("input_y") + ->assert_is_op_input("match_matrix_tensor", "Y") + ->AsInput(); + auto* input_w = VarNode("input_w") + ->assert_is_op_input("match_matrix_tensor", "W") + ->AsInput(); + + auto* match_matrix_tensor = + OpNode("match_matrix_tensor", "match_matrix_tensor"); + auto* match_out = VarNode("match_out") + ->assert_is_op_output("match_matrix_tensor", "Out") + ->AsIntermediate(); + auto* match_tmp = VarNode("match_tmp") + ->assert_is_op_output("match_matrix_tensor", "Tmp") + ->AsIntermediate(); + auto* relu0 = OpNode("relu0", "relu")->AsIntermediate(); + auto* relu0_out = VarNode("relu0_out") + ->assert_is_op_output("relu", "Out") + ->AsIntermediate(); + auto* conv_w = + VarNode("conv_w")->assert_is_op_input("var_conv_2d", "W")->AsInput(); + auto* conv = OpNode("conv", "var_conv_2d")->AsIntermediate(); + auto* conv_out = VarNode("conv_out") + ->assert_is_op_output("var_conv_2d", "Out") + ->AsIntermediate(); + auto* conv_col = VarNode("conv_col") + ->assert_is_op_output("var_conv_2d", "Col") + ->AsIntermediate(); + auto* relu1 = OpNode("relu1", "relu")->AsIntermediate(); + auto* relu1_out = VarNode("relu1_out") + ->assert_is_op_output("relu", "Out") + ->AsIntermediate(); + auto* seq_concat = + OpNode("seq_concat", "sequence_concat")->AsIntermediate(); + auto* seq_concat_out = + VarNode("seq_concat_out") + ->assert_is_op_output("sequence_concat", "Out") + ->assert_is_op_input("sequence_topk_avg_pooling", "X") + ->AsIntermediate(); + auto* topk_col = + VarNode("topk_col") + ->assert_is_op_input("sequence_topk_avg_pooling", "COLUMN") + ->AsInput(); + auto* topk_row = + VarNode("topk_row") + ->assert_is_op_input("sequence_topk_avg_pooling", "ROW") + ->AsInput(); + auto* topk = OpNode("topk", "sequence_topk_avg_pooling")->AsIntermediate(); + auto* topk_out = + VarNode("topk_out") + ->assert_is_op_output("sequence_topk_avg_pooling", "Out") + ->AsOutput(); + auto* topk_pos = + VarNode("topk_pos") + 
->assert_is_op_output("sequence_topk_avg_pooling", "pos") + ->AsIntermediate(); + + *input_x >> *match_matrix_tensor; + *input_y >> *match_matrix_tensor; + *input_w >> *match_matrix_tensor; + *match_matrix_tensor >> *match_out >> *relu0 >> *relu0_out; + *match_matrix_tensor >> *match_tmp; + + *relu0_out >> *conv >> *conv_out >> *relu1 >> *relu1_out; + *conv_w >> *conv; + *conv >> *conv_col; + + *relu0_out >> *seq_concat; + *relu1_out >> *seq_concat; + *seq_concat >> *seq_concat_out >> *topk >> *topk_out; + *topk_col >> *topk; + *topk_row >> *topk; + *topk >> *topk_pos; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_match_conv_topk"); + op_desc.SetInput("input_x", {matched.at("input_x")->arg()->name}); + op_desc.SetInput("input_y", {matched.at("input_y")->arg()->name}); + op_desc.SetInput("input_w", {matched.at("input_w")->arg()->name}); + op_desc.SetInput("conv_w", {matched.at("conv_w")->arg()->name}); + op_desc.SetOutput("topk_out", {matched.at("topk_out")->arg()->name}); + + auto* match_op_info = matched.at("match_matrix_tensor")->stmt()->op_info(); + op_desc.SetAttr("input_w_max", + match_op_info->GetAttr("w_max")); + op_desc.SetAttr("dim_t", match_op_info->GetAttr("dim_t")); + auto* conv_op_info = matched.at("conv")->stmt()->op_info(); + op_desc.SetAttr("conv_w_max", conv_op_info->GetAttr("w_max")); + auto* topk_op_info = matched.at("topk")->stmt()->op_info(); + op_desc.SetAttr>( + "topks", topk_op_info->GetAttr>("topks")); + op_desc.SetAttr("channel_num", + topk_op_info->GetAttr("channel_num")); + + auto* new_stmt = matched.at("match_matrix_tensor")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + // XXX(miaotianxiang): redundant links around |topk| are automatically + // removed as |topk| is + // marked intermediate. 
+ // RemoveDirectedLink(matched.at("topk_col"), matched.at("topk")); + // RemoveDirectedLink(matched.at("topk_row"), matched.at("topk")); + std::vector arg_names{"conv_w"}; + for (auto name : arg_names) { + DirectedLink(matched.at(name), matched.at("match_matrix_tensor")); + } + std::vector out_names{"topk_out"}; + for (auto name : out_names) { + IR_OP_VAR_LINK(matched.at("match_matrix_tensor"), matched.at(name)); + } + } +}; + +class XPUMmdnnBidSeqRevEmbEltwiseFuser : public FuseBase { + public: + void BuildPattern() override { + auto* input0 = VarNode("input0")->AsInput(); + auto* input1 = VarNode("input1")->AsInput(); + auto* emb_tbl = VarNode("emb_tbl")->AsInput(); + + // fwd emb + auto* emb0 = OpNode("emb0", "lookup_table"); + auto* emb0_out = + VarNode("emb0_out")->assert_is_op_output("lookup_table", "Out"); + auto* emb1 = OpNode("emb1", "lookup_table"); + auto* emb1_out = + VarNode("emb1_out")->assert_is_op_output("lookup_table", "Out"); + + auto* eltwise01 = OpNode("eltwise01", "search_seq_arithmetic"); + auto* eltwise01_out = + VarNode("eltwise01_out") + ->assert_is_op_output("search_seq_arithmetic", "Out") + ->AsOutput(); + + // rev emb + auto* seq_rev2 = OpNode("seq_rev2", "sequence_reverse")->AsIntermediate(); + auto* seq_rev2_out = VarNode("seq_rev2_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + auto* seq_rev3 = OpNode("seq_rev3", "sequence_reverse")->AsIntermediate(); + auto* seq_rev3_out = VarNode("seq_rev3_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + auto* emb2 = OpNode("emb2", "lookup_table")->AsIntermediate(); + auto* emb2_out = VarNode("emb2_out") + ->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + auto* emb3 = OpNode("emb3", "lookup_table")->AsIntermediate(); + auto* emb3_out = VarNode("emb3_out") + ->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + + auto* eltwise23 = + OpNode("eltwise23", "search_seq_arithmetic")->AsIntermediate(); + auto* eltwise23_out = + VarNode("eltwise23_out") + ->assert_is_op_output("search_seq_arithmetic", "Out") + ->AsOutput(); + + *input0 >> *emb0 >> *emb0_out >> *eltwise01 >> *eltwise01_out; + *emb_tbl >> *emb0; + *input1 >> *emb1 >> *emb1_out >> *eltwise01; + *emb_tbl >> *emb1; + + *input0 >> *seq_rev2 >> *seq_rev2_out >> *emb2 >> *emb2_out >> *eltwise23 >> + *eltwise23_out; + *emb_tbl >> *emb2; + *input1 >> *seq_rev3 >> *seq_rev3_out >> *emb3 >> *emb3_out >> *eltwise23; + *emb_tbl >> *emb3; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("sequence_reverse"); + op_desc.SetInput("X", {matched.at("eltwise01_out")->arg()->name}); + op_desc.SetOutput("Y", {matched.at("eltwise23_out")->arg()->name}); + + auto emb0_op = matched.at("emb0")->stmt()->op(); + auto new_seq_rev_op = LiteOpRegistry::Global().Create("sequence_reverse"); + new_seq_rev_op->Attach(op_desc, emb0_op->scope()); + auto* new_seq_rev_node = + graph->GraphCreateInstructNode(new_seq_rev_op, emb0_op->valid_places()); + + DirectedLink(matched.at("eltwise01_out"), new_seq_rev_node); + DirectedLink(new_seq_rev_node, matched.at("eltwise23_out")); + } +}; + +class XPUMmdnnBidEmbAttFuser : public FuseBase { + public: + void BuildPattern() override { + auto* input0 = VarNode("input0")->AsInput(); + auto* input1 = VarNode("input1")->AsInput(); + auto* emb_tbl = VarNode("emb_tbl")->AsInput(); + + auto* emb0 = OpNode("emb0", "lookup_table"); + auto* emb0_out = VarNode("emb0_out") + 
->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + auto* emb1 = OpNode("emb1", "lookup_table")->AsIntermediate(); + auto* emb1_out = VarNode("emb1_out") + ->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + auto* eltwise01 = + OpNode("eltwise01", "search_seq_arithmetic")->AsIntermediate(); + auto* eltwise01_out = + VarNode("eltwise01_out") + ->assert_is_op_output("search_seq_arithmetic", "Out") + ->AsOutput(); + + auto* att_2in1_w = + VarNode("att_2in1_w") + ->assert_is_op_input("__xpu__mmdnn_search_attention", "W") + ->AsInput(); + auto* att_2in1_b = + VarNode("att_2in1_b") + ->assert_is_op_input("__xpu__mmdnn_search_attention", "b") + ->AsInput(); + auto* att_2in1 = + OpNode("att_2in1", "__xpu__mmdnn_search_attention")->AsIntermediate(); + auto* att_2in1_out = + VarNode("att_2in1_out") + ->assert_is_op_output("__xpu__mmdnn_search_attention", "Out") + ->AsIntermediate(); + auto* seq_pool_2in1 = + OpNode("seq_pool_2in1", "sequence_pool")->AsIntermediate(); + auto* seq_pool_2in1_out = VarNode("seq_pool_2in1_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_2in1_max_idx = + VarNode("seq_pool_2in1_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + *input0 >> *emb0 >> *emb0_out >> *eltwise01 >> *eltwise01_out; + *emb_tbl >> *emb0; + *input1 >> *emb1 >> *emb1_out >> *eltwise01; + *emb_tbl >> *emb1; + + *eltwise01_out >> *att_2in1 >> *att_2in1_out >> *seq_pool_2in1 >> + *seq_pool_2in1_out; + *seq_pool_2in1 >> *seq_pool_2in1_max_idx; + *att_2in1_w >> *att_2in1; + *att_2in1_b >> *att_2in1; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_bid_emb_att"); + op_desc.SetInput("id0", {matched.at("input0")->arg()->name}); + op_desc.SetInput("id1", {matched.at("input1")->arg()->name}); + op_desc.SetInput("emb_tbl", {matched.at("emb_tbl")->arg()->name}); + op_desc.SetInput("att_fc_w", {matched.at("att_2in1_w")->arg()->name}); + op_desc.SetInput("att_fc_b", {matched.at("att_2in1_b")->arg()->name}); + op_desc.SetOutput("att_pool_out", + {matched.at("seq_pool_2in1_out")->arg()->name}); + op_desc.SetOutput("emb_fw_out", {matched.at("eltwise01_out")->arg()->name}); + + auto* att_fc_op_info = matched.at("att_2in1")->stmt()->op_info(); + op_desc.SetAttr("att_fc_w_max", + att_fc_op_info->GetAttr("W_max")); + + auto* new_stmt = matched.at("emb0")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + std::vector arg_names{ + "input1", "att_2in1_w", "att_2in1_b", + }; + for (auto name : arg_names) { + DirectedLink(matched.at(name), matched.at("emb0")); + } + std::vector out_names{ + "seq_pool_2in1_out", "eltwise01_out", + }; + for (auto name : out_names) { + IR_OP_VAR_LINK(matched.at("emb0"), matched.at(name)); + } + } +}; + +class XPUMmdnnBidEmbGrnnAttFuser : public FuseBase { + public: + void BuildPattern() override { + auto* input0 = VarNode("input0")->AsInput(); + auto* input1 = VarNode("input1")->AsInput(); + auto* emb_tbl = VarNode("emb_tbl")->AsInput(); + + auto* emb0 = OpNode("emb0", "lookup_table"); + auto* emb0_out = VarNode("emb0_out") + ->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + auto* emb1 = OpNode("emb1", 
"lookup_table")->AsIntermediate(); + auto* emb1_out = VarNode("emb1_out") + ->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + auto* eltwise01 = + OpNode("eltwise01", "search_seq_arithmetic")->AsIntermediate(); + auto* eltwise01_out = + VarNode("eltwise01_out") + ->assert_is_op_output("search_seq_arithmetic", "Out") + ->AsOutput(); + + auto* seq_rev_right0 = + OpNode("seq_rev_right0", "sequence_reverse")->AsIntermediate(); + auto* seq_rev_right0_out = + VarNode("seq_rev_right0_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + auto* grnn_right_wh = VarNode("grnn_right_wh") + ->assert_is_op_input("search_grnn", "Wh") + ->AsInput(); + auto* grnn_right_wi = VarNode("grnn_right_wi") + ->assert_is_op_input("search_grnn", "Wi") + ->AsInput(); + auto* grnn_right = OpNode("grnn_right", "search_grnn")->AsIntermediate(); + auto* grnn_right_out = VarNode("grnn_right_out") + ->assert_is_op_output("search_grnn", "Out") + ->AsIntermediate(); + auto* grnn_right_idx_sorted_by_width = + VarNode("grnn_right_idx_sorted_by_width") + ->assert_is_op_output("search_grnn", "idx_sorted_by_width") + ->AsIntermediate(); + auto* grnn_right_layout_input = + VarNode("grnn_right_layout_input") + ->assert_is_op_output("search_grnn", "layout_input") + ->AsIntermediate(); + auto* grnn_right_tmp_buffer = + VarNode("grnn_right_tmp_buffer") + ->assert_is_op_output("search_grnn", "tmp_buffer") + ->AsIntermediate(); + auto* seq_rev_right1 = + OpNode("seq_rev_right1", "sequence_reverse")->AsIntermediate(); + auto* seq_rev_right1_out = + VarNode("seq_rev_right1_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + auto* seq_pool_right = + OpNode("seq_pool_right", "sequence_pool")->AsIntermediate(); + auto* seq_pool_right_out = VarNode("seq_pool_right_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_right_max_idx = + VarNode("seq_pool_right_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* grnn_left_wh = VarNode("grnn_left_wh") + ->assert_is_op_input("search_grnn", "Wh") + ->AsInput(); + auto* grnn_left_wi = VarNode("grnn_left_wi") + ->assert_is_op_input("search_grnn", "Wi") + ->AsInput(); + auto* grnn_left = OpNode("grnn_left", "search_grnn")->AsIntermediate(); + auto* grnn_left_out = VarNode("grnn_left_out") + ->assert_is_op_output("search_grnn", "Out") + ->AsIntermediate(); + auto* grnn_left_idx_sorted_by_width = + VarNode("grnn_left_idx_sorted_by_width") + ->assert_is_op_output("search_grnn", "idx_sorted_by_width") + ->AsIntermediate(); + auto* grnn_left_layout_input = + VarNode("grnn_left_layout_input") + ->assert_is_op_output("search_grnn", "layout_input") + ->AsIntermediate(); + auto* grnn_left_tmp_buffer = + VarNode("grnn_left_tmp_buffer") + ->assert_is_op_output("search_grnn", "tmp_buffer") + ->AsIntermediate(); + auto* seq_pool_left = + OpNode("seq_pool_left", "sequence_pool")->AsIntermediate(); + auto* seq_pool_left_out = VarNode("seq_pool_left_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_left_max_idx = + VarNode("seq_pool_left_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* concat_2in1 = OpNode("concat_2in1", "concat")->AsIntermediate(); + auto* concat_2in1_out = VarNode("concat_2in1_out") + ->assert_is_op_output("concat", "Out") + ->AsIntermediate(); + auto* att_2in1_w = + VarNode("att_2in1_w") + ->assert_is_op_input("__xpu__mmdnn_search_attention", "W") + ->AsInput(); + auto* 
att_2in1_b = + VarNode("att_2in1_b") + ->assert_is_op_input("__xpu__mmdnn_search_attention", "b") + ->AsInput(); + auto* att_2in1 = + OpNode("att_2in1", "__xpu__mmdnn_search_attention")->AsIntermediate(); + auto* att_2in1_out = + VarNode("att_2in1_out") + ->assert_is_op_output("__xpu__mmdnn_search_attention", "Out") + ->AsIntermediate(); + auto* seq_pool_2in1 = + OpNode("seq_pool_2in1", "sequence_pool")->AsIntermediate(); + auto* seq_pool_2in1_out = VarNode("seq_pool_2in1_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_2in1_max_idx = + VarNode("seq_pool_2in1_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* concat_3in1 = OpNode("concat_3in1", "concat")->AsIntermediate(); + auto* concat_3in1_out = VarNode("concat_3in1_out") + ->assert_is_op_output("concat", "Out") + ->AsOutput(); + + *input0 >> *emb0 >> *emb0_out >> *eltwise01 >> *eltwise01_out; + *emb_tbl >> *emb0; + *input1 >> *emb1 >> *emb1_out >> *eltwise01; + *emb_tbl >> *emb1; + + *eltwise01_out >> *seq_rev_right0 >> *seq_rev_right0_out >> *grnn_right >> + *grnn_right_out >> *seq_rev_right1 >> *seq_rev_right1_out; + *grnn_right_out >> *seq_pool_right >> *seq_pool_right_out; + *seq_pool_right >> *seq_pool_right_max_idx; + *grnn_right_wh >> *grnn_right; + *grnn_right_wi >> *grnn_right; + *grnn_right >> *grnn_right_idx_sorted_by_width; + *grnn_right >> *grnn_right_layout_input; + *grnn_right >> *grnn_right_tmp_buffer; + + *eltwise01_out >> *grnn_left >> *grnn_left_out >> *seq_pool_left >> + *seq_pool_left_out; + *seq_pool_left >> *seq_pool_left_max_idx; + *grnn_left_wh >> *grnn_left; + *grnn_left_wi >> *grnn_left; + *grnn_left >> *grnn_left_idx_sorted_by_width; + *grnn_left >> *grnn_left_layout_input; + *grnn_left >> *grnn_left_tmp_buffer; + + *seq_rev_right1_out >> *concat_2in1; + *grnn_left_out >> *concat_2in1; + *concat_2in1 >> *concat_2in1_out >> *att_2in1 >> *att_2in1_out >> + *seq_pool_2in1 >> *seq_pool_2in1_out; + *seq_pool_2in1 >> *seq_pool_2in1_max_idx; + *att_2in1_w >> *att_2in1; + *att_2in1_b >> *att_2in1; + + *eltwise01_out >> *concat_3in1; + *seq_rev_right1_out >> *concat_3in1; + *grnn_left_out >> *concat_3in1; + *concat_3in1 >> *concat_3in1_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_bid_emb_grnn_att"); + op_desc.SetInput("id0", {matched.at("input0")->arg()->name}); + op_desc.SetInput("id1", {matched.at("input1")->arg()->name}); + op_desc.SetInput("emb_tbl", {matched.at("emb_tbl")->arg()->name}); + op_desc.SetInput("grnn_fw_wh", {matched.at("grnn_left_wh")->arg()->name}); + op_desc.SetInput("grnn_fw_wi", {matched.at("grnn_left_wi")->arg()->name}); + op_desc.SetInput("grnn_rv_wh", {matched.at("grnn_right_wh")->arg()->name}); + op_desc.SetInput("grnn_rv_wi", {matched.at("grnn_right_wi")->arg()->name}); + op_desc.SetInput("att_fc_w", {matched.at("att_2in1_w")->arg()->name}); + op_desc.SetInput("att_fc_b", {matched.at("att_2in1_b")->arg()->name}); + op_desc.SetOutput("grnn_fw_pool_out", + {matched.at("seq_pool_left_out")->arg()->name}); + op_desc.SetOutput("grnn_rv_pool_out", + {matched.at("seq_pool_right_out")->arg()->name}); + op_desc.SetOutput("att_pool_out", + {matched.at("seq_pool_2in1_out")->arg()->name}); + op_desc.SetOutput("concat_3in1_out", + {matched.at("concat_3in1_out")->arg()->name}); + op_desc.SetOutput("emb_fw_out", {matched.at("eltwise01_out")->arg()->name}); + + auto* grnn_fw_op_info = matched.at("grnn_left")->stmt()->op_info(); + 
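The "wh_max"/"wi_max" vectors read off grnn_left/grnn_right just below were attached earlier by XPUMmdnnFloat2Fix, which quantizes each search_grnn weight (Wi, Wh) in three equal chunks with one max-abs scale per chunk; this fuser only forwards them as the grnn_fw_*/grnn_rv_* attributes of the fused op. A small self-contained sketch of that per-chunk scheme (helper names and the exact rounding are illustrative, not the real xpu::math behavior):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Quantize a weight in three equal chunks, one max-abs scale per chunk.
// The returned maxima correspond to what the pass stores as
// "__xpu__wi_max" / "__xpu__wh_max" and later "grnn_fw_wi_maxs" etc.
static std::vector<float> QuantizeInThreeChunks(const std::vector<float>& w,
                                                std::vector<int16_t>* out) {
  const int stride = static_cast<int>(w.size()) / 3;
  std::vector<float> maxs(3, 0.f);
  out->resize(w.size());
  for (int c = 0; c < 3; ++c) {
    const float* chunk = w.data() + c * stride;
    for (int i = 0; i < stride; ++i) {
      maxs[c] = std::max(maxs[c], std::fabs(chunk[i]));
    }
    const float scale = (maxs[c] > 0.f) ? 32767.f / maxs[c] : 0.f;
    for (int i = 0; i < stride; ++i) {
      (*out)[c * stride + i] =
          static_cast<int16_t>(std::round(chunk[i] * scale));
    }
  }
  return maxs;
}

int main() {
  std::vector<float> wi = {0.1f, -0.2f, 1.0f, -1.5f, 3.0f, 0.5f};
  std::vector<int16_t> wi_q;
  std::vector<float> wi_max = QuantizeInThreeChunks(wi, &wi_q);
  return (wi_max.size() == 3 && wi_max[2] == 3.0f) ? 0 : 1;
}
```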
op_desc.SetAttr>( + "grnn_fw_wh_maxs", + grnn_fw_op_info->GetAttr>("wh_max")); + op_desc.SetAttr>( + "grnn_fw_wi_maxs", + grnn_fw_op_info->GetAttr>("wi_max")); + auto* grnn_rv_op_info = matched.at("grnn_right")->stmt()->op_info(); + op_desc.SetAttr>( + "grnn_rv_wh_maxs", + grnn_rv_op_info->GetAttr>("wh_max")); + op_desc.SetAttr>( + "grnn_rv_wi_maxs", + grnn_rv_op_info->GetAttr>("wi_max")); + auto* att_fc_op_info = matched.at("att_2in1")->stmt()->op_info(); + op_desc.SetAttr("att_fc_w_max", + att_fc_op_info->GetAttr("W_max")); + + auto* new_stmt = matched.at("emb0")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + std::vector arg_names{ + "input1", + "grnn_left_wh", + "grnn_left_wi", + "grnn_right_wh", + "grnn_right_wi", + "att_2in1_w", + "att_2in1_b", + }; + for (auto name : arg_names) { + DirectedLink(matched.at(name), matched.at("emb0")); + } + std::vector out_names{ + "seq_pool_left_out", + "seq_pool_right_out", + "seq_pool_2in1_out", + "concat_3in1_out", + "eltwise01_out", + }; + for (auto name : out_names) { + IR_OP_VAR_LINK(matched.at("emb0"), matched.at(name)); + } + } +}; + +class XPUMmdnnMergeAllFuser : public FuseBase { + public: + void BuildPattern() override { + auto* concat_7in1_input0 = VarNode("concat_7in1_input0") + ->assert_is_op_nth_input("concat", "X", 0) + ->AsInput(); + auto* concat_7in1_input1 = VarNode("concat_7in1_input1") + ->assert_is_op_nth_input("concat", "X", 1) + ->AsInput(); + auto* concat_7in1_input2 = VarNode("concat_7in1_input2") + ->assert_is_op_nth_input("concat", "X", 2) + ->AsInput(); + auto* concat_7in1_input3 = VarNode("concat_7in1_input3") + ->assert_is_op_nth_input("concat", "X", 3) + ->AsInput(); + auto* concat_7in1_input4 = VarNode("concat_7in1_input4") + ->assert_is_op_nth_input("concat", "X", 4) + ->AsInput(); + auto* concat_7in1_input5 = VarNode("concat_7in1_input5") + ->assert_is_op_nth_input("concat", "X", 5) + ->AsInput(); + auto* concat_7in1_input6 = VarNode("concat_7in1_input6") + ->assert_is_op_nth_input("concat", "X", 6) + ->AsInput(); + auto* concat_7in1 = OpNode("concat_7in1", "concat"); + auto* concat_7in1_out = VarNode("concat_7in1_out") + ->assert_is_op_output("concat", "Out") + ->AsIntermediate(); + auto* search_fc0_w = VarNode("search_fc0_w") + ->assert_is_op_input("search_fc", "W") + ->AsInput(); + auto* search_fc0_b = VarNode("search_fc0_b") + ->assert_is_op_input("search_fc", "b") + ->AsInput(); + auto* search_fc0 = OpNode("search_fc0", "search_fc")->AsIntermediate(); + auto* search_fc0_out = VarNode("search_fc0_out") + ->assert_is_op_output("search_fc", "Out") + ->AsIntermediate(); + auto* relu0 = OpNode("relu0", "relu")->AsIntermediate(); + auto* relu0_out = VarNode("relu0_out") + ->assert_is_op_output("relu", "Out") + ->AsIntermediate(); + + auto* concat_2in1_input0 = VarNode("concat_2in1_input0") + ->assert_is_op_nth_input("concat", "X", 0) + ->AsInput(); + auto* concat_2in1_input1 = VarNode("concat_2in1_input1") + ->assert_is_op_nth_input("concat", "X", 1) + ->AsInput(); + auto* concat_2in1 = OpNode("concat_2in1", "concat")->AsIntermediate(); + auto* concat_2in1_out = VarNode("concat_2in1_out") + ->assert_is_op_output("concat", "Out") + ->AsIntermediate(); + auto* seq_rev = OpNode("seq_rev", "sequence_reverse")->AsIntermediate(); + auto* seq_rev_out = 
VarNode("seq_rev_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + + auto* grnn_rv_wh = VarNode("grnn_rv_wh") + ->assert_is_op_input("search_grnn", "Wh") + ->AsInput(); + auto* grnn_rv_wi = VarNode("grnn_rv_wi") + ->assert_is_op_input("search_grnn", "Wi") + ->AsInput(); + auto* grnn_rv = OpNode("grnn_rv", "search_grnn")->AsIntermediate(); + auto* grnn_rv_out = VarNode("grnn_rv_out") + ->assert_is_op_output("search_grnn", "Out") + ->AsIntermediate(); + auto* grnn_rv_idx_sorted_by_width = + VarNode("grnn_rv_idx_sorted_by_width") + ->assert_is_op_output("search_grnn", "idx_sorted_by_width") + ->AsIntermediate(); + auto* grnn_rv_layout_input = + VarNode("grnn_rv_layout_input") + ->assert_is_op_output("search_grnn", "layout_input") + ->AsIntermediate(); + auto* grnn_rv_tmp_buffer = + VarNode("grnn_rv_tmp_buffer") + ->assert_is_op_output("search_grnn", "tmp_buffer") + ->AsIntermediate(); + auto* seq_pool_rv = + OpNode("seq_pool_rv", "sequence_pool")->AsIntermediate(); + auto* seq_pool_rv_out = VarNode("seq_pool_rv_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsIntermediate(); + auto* seq_pool_rv_max_idx = + VarNode("seq_pool_rv_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* grnn_fw_wh = VarNode("grnn_fw_wh") + ->assert_is_op_input("search_grnn", "Wh") + ->AsInput(); + auto* grnn_fw_wi = VarNode("grnn_fw_wi") + ->assert_is_op_input("search_grnn", "Wi") + ->AsInput(); + auto* grnn_fw = OpNode("grnn_fw", "search_grnn")->AsIntermediate(); + auto* grnn_fw_out = VarNode("grnn_fw_out") + ->assert_is_op_output("search_grnn", "Out") + ->AsIntermediate(); + auto* grnn_fw_idx_sorted_by_width = + VarNode("grnn_fw_idx_sorted_by_width") + ->assert_is_op_output("search_grnn", "idx_sorted_by_width") + ->AsIntermediate(); + auto* grnn_fw_layout_input = + VarNode("grnn_fw_layout_input") + ->assert_is_op_output("search_grnn", "layout_input") + ->AsIntermediate(); + auto* grnn_fw_tmp_buffer = + VarNode("grnn_fw_tmp_buffer") + ->assert_is_op_output("search_grnn", "tmp_buffer") + ->AsIntermediate(); + auto* seq_pool_fw = + OpNode("seq_pool_fw", "sequence_pool")->AsIntermediate(); + auto* seq_pool_fw_out = VarNode("seq_pool_fw_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsIntermediate(); + auto* seq_pool_fw_max_idx = + VarNode("seq_pool_fw_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* rv_fw_concat = OpNode("rv_fw_concat", "concat")->AsIntermediate(); + auto* rv_fw_concat_out = VarNode("rv_fw_concat_out") + ->assert_is_op_output("concat", "Out") + ->AsIntermediate(); + + auto* last_concat = OpNode("last_concat", "concat")->AsIntermediate(); + auto* last_concat_out = VarNode("last_concat_out") + ->assert_is_op_output("concat", "Out") + ->AsIntermediate(); + auto* search_fc1_w = VarNode("search_fc1_w") + ->assert_is_op_input("search_fc", "W") + ->AsInput(); + auto* search_fc1_b = VarNode("search_fc1_b") + ->assert_is_op_input("search_fc", "b") + ->AsInput(); + auto* search_fc1 = OpNode("search_fc1", "search_fc")->AsIntermediate(); + auto* search_fc1_out = VarNode("search_fc1_out") + ->assert_is_op_output("search_fc", "Out") + ->AsIntermediate(); + auto* relu1 = OpNode("relu1", "relu")->AsIntermediate(); + auto* relu1_out = VarNode("relu1_out") + ->assert_is_op_output("relu", "Out") + ->AsIntermediate(); + auto* search_fc2_w = VarNode("search_fc2_w") + ->assert_is_op_input("search_fc", "W") + ->AsInput(); + auto* search_fc2_b = VarNode("search_fc2_b") + 
->assert_is_op_input("search_fc", "b") + ->AsInput(); + auto* search_fc2 = OpNode("search_fc2", "search_fc")->AsIntermediate(); + auto* search_fc2_out = VarNode("search_fc2_out") + ->assert_is_op_output("search_fc", "Out") + ->AsOutput(); + + *concat_7in1_input0 >> *concat_7in1; + *concat_7in1_input1 >> *concat_7in1; + *concat_7in1_input2 >> *concat_7in1; + *concat_7in1_input3 >> *concat_7in1; + *concat_7in1_input4 >> *concat_7in1; + *concat_7in1_input5 >> *concat_7in1; + *concat_7in1_input6 >> *concat_7in1; + *concat_7in1 >> *concat_7in1_out >> *search_fc0 >> *search_fc0_out >> + *relu0 >> *relu0_out; + *search_fc0_w >> *search_fc0; + *search_fc0_b >> *search_fc0; + + *concat_2in1_input0 >> *concat_2in1; + *concat_2in1_input1 >> *concat_2in1; + *concat_2in1 >> *concat_2in1_out >> *seq_rev >> *seq_rev_out; + + *seq_rev_out >> *grnn_rv >> *grnn_rv_out >> *seq_pool_rv >> + *seq_pool_rv_out; + *seq_pool_rv >> *seq_pool_rv_max_idx; + *grnn_rv_wh >> *grnn_rv; + *grnn_rv_wi >> *grnn_rv; + *grnn_rv >> *grnn_rv_idx_sorted_by_width; + *grnn_rv >> *grnn_rv_layout_input; + *grnn_rv >> *grnn_rv_tmp_buffer; + + *concat_2in1_out >> *grnn_fw >> *grnn_fw_out >> *seq_pool_fw >> + *seq_pool_fw_out; + *seq_pool_fw >> *seq_pool_fw_max_idx; + *grnn_fw_wh >> *grnn_fw; + *grnn_fw_wi >> *grnn_fw; + *grnn_fw >> *grnn_fw_idx_sorted_by_width; + *grnn_fw >> *grnn_fw_layout_input; + *grnn_fw >> *grnn_fw_tmp_buffer; + + *seq_pool_rv_out >> *rv_fw_concat; + *seq_pool_fw_out >> *rv_fw_concat; + *rv_fw_concat >> *rv_fw_concat_out; + + *rv_fw_concat_out >> *last_concat; + *relu0_out >> *last_concat; + *last_concat >> *last_concat_out >> *search_fc1 >> *search_fc1_out >> + *relu1 >> *relu1_out >> *search_fc2 >> *search_fc2_out; + *search_fc1_w >> *search_fc1; + *search_fc1_b >> *search_fc1; + *search_fc2_w >> *search_fc2; + *search_fc2_b >> *search_fc2; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_merge_all"); + auto* concat_7in1_op_info = matched.at("concat_7in1")->stmt()->op_info(); + op_desc.SetInput("concat_7in1_x", concat_7in1_op_info->Input("X")); + auto* concat_2in1_op_info = matched.at("concat_2in1")->stmt()->op_info(); + op_desc.SetInput("concat_2in1_x", concat_2in1_op_info->Input("X")); + op_desc.SetInput("grnn_fw_wh", {matched.at("grnn_fw_wh")->arg()->name}); + op_desc.SetInput("grnn_fw_wi", {matched.at("grnn_fw_wi")->arg()->name}); + op_desc.SetInput("grnn_rv_wh", {matched.at("grnn_rv_wh")->arg()->name}); + op_desc.SetInput("grnn_rv_wi", {matched.at("grnn_rv_wi")->arg()->name}); + op_desc.SetInput("fc0_w", {matched.at("search_fc0_w")->arg()->name}); + op_desc.SetInput("fc0_b", {matched.at("search_fc0_b")->arg()->name}); + op_desc.SetInput("fc1_w", {matched.at("search_fc1_w")->arg()->name}); + op_desc.SetInput("fc1_b", {matched.at("search_fc1_b")->arg()->name}); + op_desc.SetInput("fc2_w", {matched.at("search_fc2_w")->arg()->name}); + op_desc.SetInput("fc2_b", {matched.at("search_fc2_b")->arg()->name}); + + op_desc.SetOutput("out", {matched.at("search_fc2_out")->arg()->name}); + + auto* grnn_fw_op_info = matched.at("grnn_fw")->stmt()->op_info(); + op_desc.SetAttr>( + "grnn_fw_wh_maxs", + grnn_fw_op_info->GetAttr>("wh_max")); + op_desc.SetAttr>( + "grnn_fw_wi_maxs", + grnn_fw_op_info->GetAttr>("wi_max")); + auto* grnn_rv_op_info = matched.at("grnn_rv")->stmt()->op_info(); + op_desc.SetAttr>( + "grnn_rv_wh_maxs", + grnn_rv_op_info->GetAttr>("wh_max")); + op_desc.SetAttr>( + "grnn_rv_wi_maxs", + 
grnn_rv_op_info->GetAttr>("wi_max")); + auto* fc0_op_info = matched.at("search_fc0")->stmt()->op_info(); + op_desc.SetAttr("fc0_w_max", fc0_op_info->GetAttr("w_max")); + auto* fc1_op_info = matched.at("search_fc1")->stmt()->op_info(); + op_desc.SetAttr("fc1_w_max", fc1_op_info->GetAttr("w_max")); + auto* fc2_op_info = matched.at("search_fc2")->stmt()->op_info(); + op_desc.SetAttr("fc2_w_max", fc2_op_info->GetAttr("w_max")); + + auto* new_stmt = matched.at("concat_7in1")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + std::vector arg_names{ + "concat_2in1_input0", + "concat_2in1_input1", + "grnn_fw_wh", + "grnn_fw_wi", + "grnn_rv_wh", + "grnn_rv_wi", + "search_fc0_w", + "search_fc0_b", + "search_fc1_w", + "search_fc1_b", + "search_fc2_w", + "search_fc2_b", + }; + for (auto name : arg_names) { + DirectedLink(matched.at(name), matched.at("concat_7in1")); + } + std::vector out_names{ + "search_fc2_out", + }; + for (auto name : out_names) { + IR_OP_VAR_LINK(matched.at("concat_7in1"), matched.at(name)); + } + } +}; + +} // namespace fusion + +class XPUMmdnnFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return; + + fusion::XPUMmdnnFloat2Fix float_2_fix; + float_2_fix(graph.get()); + fusion::XPUMmdnnSearchAttentionFuser search_att_fuser; + search_att_fuser(graph.get()); + fusion::XPUMmdnnMatchConvTopkFuser match_conv_topk_fuser; + match_conv_topk_fuser(graph.get()); + + fusion::XPUMmdnnBidSeqRevEmbEltwiseFuser bi_seq_rev_emb_eltwise_fuser; + bi_seq_rev_emb_eltwise_fuser(graph.get()); + fusion::XPUMmdnnBidEmbGrnnAttFuser bid_emb_grnn_att_fuser; + bid_emb_grnn_att_fuser(graph.get()); + fusion::XPUMmdnnBidEmbAttFuser bid_emb_att_fuser; + bid_emb_att_fuser(graph.get()); + fusion::XPUMmdnnMergeAllFuser merge_all_fuser; + merge_all_fuser(graph.get()); + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(__xpu__mmdnn_fuse_pass, paddle::lite::mir::XPUMmdnnFusePass) + .BindTargets({TARGET(kXPU)}) + .BindKernel("__xpu__mmdnn_search_attention") + .BindKernel("__xpu__mmdnn_bid_emb_grnn_att") + .BindKernel("__xpu__mmdnn_bid_emb_att") + .BindKernel("__xpu__mmdnn_match_conv_topk") + .BindKernel("__xpu__mmdnn_merge_all"); diff --git a/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc index 525042e44b2997013943f392f592d812bd68fa0b..04988612192b79824b1294428fa9b1c38d784979 100644 --- a/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc +++ b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc @@ -639,20 +639,21 @@ class XPUMultiEncoderFusePass : public ProgramPass { std::set fc_int31_ids; #ifdef LITE_WITH_XPU // TODO(miaotianxiang): core/mir/*_pass.cc are compiled anyway and need to - // access Context::_multi_encoder_precision, but this static member - // variable in class specialization defined in lite/core/context.cc - // is only compiled iff LITE_WITH_XPU==ON. To suppress linkage error, we use + // access TargetWrapperXPU::multi_encoder_precision, but this static member + // variable in class specialization defined in + // lite/backends/xpu/target_wrapper.cc is only compiled iff + // LITE_WITH_XPU==ON. 
To suppress linkage error, we use // #ifdef here. Any better idea? if (GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int31" || - lite::Context::_multi_encoder_precision == "int31") { + lite::TargetWrapperXPU::multi_encoder_precision == "int31") { fc_int31_ids = {0, 1, 2, 3, 4, 5}; VLOG(3) << "Use int31 in XPUMultiEncoderOp, " - << "lite::Context<>::_multi_encoder_precision=" - << lite::Context::_multi_encoder_precision; + << "lite::TargetWrapperXPU::multi_encoder_precision=" + << lite::TargetWrapperXPU::multi_encoder_precision; } else { VLOG(3) << "Use int16 in XPUMultiEncoderOp, " - << "lite::Context<>::_multi_encoder_precision=" - << lite::Context::_multi_encoder_precision; + << "lite::TargetWrapperXPU::multi_encoder_precision=" + << lite::TargetWrapperXPU::multi_encoder_precision; } #endif diff --git a/lite/core/mir/fusion/__xpu__resnet_cbam_fuse_pass.cc b/lite/core/mir/fusion/__xpu__resnet_cbam_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..b25eb084f286fccfa4afe8832f9dc1ff8384d552 --- /dev/null +++ b/lite/core/mir/fusion/__xpu__resnet_cbam_fuse_pass.cc @@ -0,0 +1,1389 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/backends/xpu/math.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" +#include "lite/operators/subgraph_op.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class XPUResNetCbamBlock0Fuser : public FuseBase { + public: + XPUResNetCbamBlock0Fuser() {} + + void BuildPattern() override { + auto* input = + VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput(); + + auto* left_conv1_weight = VarNode("left_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv1 = OpNode("left_conv1", "conv2d"); + auto* left_conv1_out = VarNode("left_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn1_scale = VarNode("left_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn1_bias = VarNode("left_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn1_mean = VarNode("left_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn1_var = VarNode("left_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn1 = OpNode("left_bn1", "batch_norm")->AsIntermediate(); + auto* left_bn1_out = VarNode("left_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* left_bn1_mean_out = VarNode("left_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn1_var_out = + VarNode("left_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* 
left_bn1_saved_mean = + VarNode("left_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn1_saved_var = + VarNode("left_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* left_relu1 = OpNode("left_relu1", "relu")->AsIntermediate(); + auto* left_relu1_out = VarNode("left_relu1_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* left_conv2_weight = VarNode("left_conv2_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv2 = OpNode("left_conv2", "conv2d")->AsIntermediate(); + auto* left_conv2_out = VarNode("left_conv2_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn2_scale = VarNode("left_bn2_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn2_bias = VarNode("left_bn2_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn2_mean = VarNode("left_bn2_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn2_var = VarNode("left_bn2_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn2 = OpNode("left_bn2", "batch_norm")->AsIntermediate(); + auto* left_bn2_out = VarNode("left_bn2_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* left_bn2_mean_out = VarNode("left_bn2_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn2_var_out = + VarNode("left_bn2_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* left_bn2_saved_mean = + VarNode("left_bn2_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn2_saved_var = + VarNode("left_bn2_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* left_relu2 = OpNode("left_relu2", "relu")->AsIntermediate(); + auto* left_relu2_out = VarNode("left_relu2_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* left_conv3_weight = VarNode("left_conv3_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv3 = OpNode("left_conv3", "conv2d")->AsIntermediate(); + auto* left_conv3_out = VarNode("left_conv3_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn3_scale = VarNode("left_bn3_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn3_bias = VarNode("left_bn3_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn3_mean = VarNode("left_bn3_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn3_var = VarNode("left_bn3_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn3 = OpNode("left_bn3", "batch_norm")->AsIntermediate(); + auto* left_bn3_out = VarNode("left_bn3_out") + ->assert_is_op_output("batch_norm", "Y") + ->AsIntermediate(); + auto* left_bn3_mean_out = VarNode("left_bn3_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn3_var_out = + VarNode("left_bn3_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* 
left_bn3_saved_mean = + VarNode("left_bn3_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn3_saved_var = + VarNode("left_bn3_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + // cbam specific + auto* reduce_mean = OpNode("reduce_mean", "reduce_mean")->AsIntermediate(); + auto* reduce_mean_out = VarNode("reduce_mean_out") + ->assert_is_op_output("reduce_mean", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + auto* reduce_max = OpNode("reduce_max", "reduce_max")->AsIntermediate(); + auto* reduce_max_out = VarNode("reduce_max_out") + ->assert_is_op_output("reduce_max", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + auto* concat = OpNode("concat", "concat")->AsIntermediate(); + auto* concat_out = VarNode("concat_out") + ->assert_is_op_output("concat", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + auto* left_conv4_weight = VarNode("left_conv4_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv4 = OpNode("left_conv4", "conv2d")->AsIntermediate(); + auto* left_conv4_out = VarNode("left_conv4_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("sigmoid", "X") + ->AsIntermediate(); + auto* sigmoid = OpNode("sigmoid", "sigmoid")->AsIntermediate(); + auto* sigmoid_out = VarNode("sigmoid_out") + ->assert_is_op_output("sigmoid", "Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* reshape = OpNode("reshape", "reshape2")->AsIntermediate(); + auto* reshape_out = VarNode("reshape_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* reshape_xshape = VarNode("reshape_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* eltwise_mul = + OpNode("eltwise_mul", "elementwise_mul")->AsIntermediate(); + auto* eltwise_mul_out = VarNode("eltwise_mul_out") + ->assert_is_op_output("elementwise_mul", "Out") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + + auto* right_conv1_weight = VarNode("right_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv1 = OpNode("right_conv1", "conv2d")->AsIntermediate(); + auto* right_conv1_out = VarNode("right_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn1_scale = VarNode("right_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn1_bias = VarNode("right_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn1_mean = VarNode("right_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn1_var = VarNode("right_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate(); + auto* right_bn1_out = VarNode("right_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + auto* right_bn1_mean_out = + VarNode("right_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn1_var_out = + VarNode("right_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn1_saved_mean = + VarNode("right_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + 
->AsIntermediate(); + auto* right_bn1_saved_var = + VarNode("right_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + auto* add = OpNode("add", "elementwise_add")->AsIntermediate(); + auto* add_out = VarNode("add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* relu = OpNode("relu", "relu")->AsIntermediate(); + auto* relu_out = + VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput(); + + *input >> *left_conv1 >> *left_conv1_out >> *left_bn1 >> *left_bn1_out >> + *left_relu1 >> *left_relu1_out >> *left_conv2 >> *left_conv2_out >> + *left_bn2 >> *left_bn2_out >> *left_relu2 >> *left_relu2_out >> + *left_conv3 >> *left_conv3_out >> *left_bn3 >> + *left_bn3_out /* >> *add*/; + + *left_bn3_out >> *reduce_mean >> *reduce_mean_out >> *concat; + *left_bn3_out >> *reduce_max >> *reduce_max_out >> *concat; + *concat >> *concat_out >> *left_conv4 >> *left_conv4_out >> *sigmoid >> + *sigmoid_out >> *eltwise_mul; + *left_conv4_weight >> *left_conv4; + *left_bn3_out >> *reshape >> *reshape_out >> *eltwise_mul; + *reshape >> *reshape_xshape; + *eltwise_mul >> *eltwise_mul_out >> *add; + + *left_conv1_weight >> *left_conv1; + *left_bn1_scale >> *left_bn1; + *left_bn1_bias >> *left_bn1; + *left_bn1_mean >> *left_bn1; + *left_bn1_var >> *left_bn1; + *left_bn1 >> *left_bn1_mean_out; + *left_bn1 >> *left_bn1_var_out; + *left_bn1 >> *left_bn1_saved_mean; + *left_bn1 >> *left_bn1_saved_var; + + *left_conv2_weight >> *left_conv2; + *left_bn2_scale >> *left_bn2; + *left_bn2_bias >> *left_bn2; + *left_bn2_mean >> *left_bn2; + *left_bn2_var >> *left_bn2; + *left_bn2 >> *left_bn2_mean_out; + *left_bn2 >> *left_bn2_var_out; + *left_bn2 >> *left_bn2_saved_mean; + *left_bn2 >> *left_bn2_saved_var; + + *left_conv3_weight >> *left_conv3; + *left_bn3_scale >> *left_bn3; + *left_bn3_bias >> *left_bn3; + *left_bn3_mean >> *left_bn3; + *left_bn3_var >> *left_bn3; + *left_bn3 >> *left_bn3_mean_out; + *left_bn3 >> *left_bn3_var_out; + *left_bn3 >> *left_bn3_saved_mean; + *left_bn3 >> *left_bn3_saved_var; + + *input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >> + *right_bn1_out >> *add; + + *right_conv1_weight >> *right_conv1; + *right_bn1_scale >> *right_bn1; + *right_bn1_bias >> *right_bn1; + *right_bn1_mean >> *right_bn1; + *right_bn1_var >> *right_bn1; + *right_bn1 >> *right_bn1_mean_out; + *right_bn1 >> *right_bn1_var_out; + *right_bn1 >> *right_bn1_saved_mean; + *right_bn1 >> *right_bn1_saved_var; + + *add >> *add_out >> *relu >> *relu_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("resnet_cbam_block0"); + op_desc.SetInput("Inputs", {matched.at("input")->arg()->name}); + op_desc.SetInput("Filter", + { + matched.at("left_conv1_weight")->arg()->name, + matched.at("left_conv2_weight")->arg()->name, + matched.at("left_conv3_weight")->arg()->name, + matched.at("left_conv4_weight")->arg()->name, + matched.at("right_conv1_weight")->arg()->name, + }); + op_desc.SetInput("Scale", + { + matched.at("left_bn1_scale")->arg()->name, + matched.at("left_bn2_scale")->arg()->name, + matched.at("left_bn3_scale")->arg()->name, + "placeholder_sa_conv", + matched.at("right_bn1_scale")->arg()->name, + }); + op_desc.SetInput("Bias", + { + matched.at("left_bn1_bias")->arg()->name, + matched.at("left_bn2_bias")->arg()->name, + matched.at("left_bn3_bias")->arg()->name, + "placeholder_sa_conv", + 
matched.at("right_bn1_bias")->arg()->name, + }); + op_desc.SetInput("Mean", + { + matched.at("left_bn1_mean")->arg()->name, + matched.at("left_bn2_mean")->arg()->name, + matched.at("left_bn3_mean")->arg()->name, + "placeholder_sa_conv", + matched.at("right_bn1_mean")->arg()->name, + }); + op_desc.SetInput("Var", + { + matched.at("left_bn1_variance")->arg()->name, + matched.at("left_bn2_variance")->arg()->name, + matched.at("left_bn3_variance")->arg()->name, + "placeholder_sa_conv", + matched.at("right_bn1_variance")->arg()->name, + }); + op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name}); + // XXX: keep these to fool SubgraphOp::AttachImpl() + op_desc.SetAttr("sub_block", 0); + op_desc.SetAttr>("input_data_names", {}); + op_desc.SetAttr>("output_data_names", {}); + + auto block0_stmt = matched.at("left_conv1")->stmt(); + // block0_stmt->ResetOp(op_desc, graph->valid_places()); + auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + // XXX: memleak? + auto sub_block_desc = new cpp::BlockDesc(); + static_cast(fake_subgraph_op.get()) + ->SetSubBlock(sub_block_desc); + fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope()); + fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places()); + block0_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "left_conv2_weight", + "left_conv3_weight", + "left_conv4_weight", + "right_conv1_weight", + "left_bn1_bias", + "left_bn2_bias", + "left_bn3_bias", + "right_bn1_bias", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("left_conv1")); + } + IR_OP_VAR_LINK(matched.at("left_conv1"), matched.at("relu_out")); + } +}; + +class XPUResNetCbamBlock1Fuser : public FuseBase { + public: + XPUResNetCbamBlock1Fuser() {} + + void BuildPattern() override { + auto* input = VarNode("input") + ->assert_is_op_input("conv2d", "Input") + ->assert_is_op_input("elementwise_add") + ->AsInput(); + + auto* right_conv1_weight = VarNode("right_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv1 = OpNode("right_conv1", "conv2d"); + auto* right_conv1_out = VarNode("right_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn1_scale = VarNode("right_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn1_bias = VarNode("right_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn1_mean = VarNode("right_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn1_var = VarNode("right_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate(); + auto* right_bn1_out = VarNode("right_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* right_bn1_mean_out = + VarNode("right_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn1_var_out = + VarNode("right_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn1_saved_mean = + VarNode("right_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn1_saved_var = + VarNode("right_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* right_relu1 = OpNode("right_relu1", 
"relu")->AsIntermediate(); + auto* right_relu1_out = VarNode("right_relu1_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* right_conv2_weight = VarNode("right_conv2_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv2 = OpNode("right_conv2", "conv2d")->AsIntermediate(); + auto* right_conv2_out = VarNode("right_conv2_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn2_scale = VarNode("right_bn2_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn2_bias = VarNode("right_bn2_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn2_mean = VarNode("right_bn2_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn2_var = VarNode("right_bn2_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn2 = OpNode("right_bn2", "batch_norm")->AsIntermediate(); + auto* right_bn2_out = VarNode("right_bn2_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* right_bn2_mean_out = + VarNode("right_bn2_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn2_var_out = + VarNode("right_bn2_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn2_saved_mean = + VarNode("right_bn2_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn2_saved_var = + VarNode("right_bn2_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* right_relu2 = OpNode("right_relu2", "relu")->AsIntermediate(); + auto* right_relu2_out = VarNode("right_relu2_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* right_conv3_weight = VarNode("right_conv3_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv3 = OpNode("right_conv3", "conv2d")->AsIntermediate(); + auto* right_conv3_out = VarNode("right_conv3_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn3_scale = VarNode("right_bn3_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn3_bias = VarNode("right_bn3_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn3_mean = VarNode("right_bn3_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn3_var = VarNode("right_bn3_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn3 = OpNode("right_bn3", "batch_norm")->AsIntermediate(); + auto* right_bn3_out = VarNode("right_bn3_out") + ->assert_is_op_output("batch_norm", "Y") + ->AsIntermediate(); + auto* right_bn3_mean_out = + VarNode("right_bn3_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn3_var_out = + VarNode("right_bn3_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn3_saved_mean = + VarNode("right_bn3_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn3_saved_var = + VarNode("right_bn3_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + 
->AsIntermediate(); + + // cbam specific + auto* reduce_mean = OpNode("reduce_mean", "reduce_mean")->AsIntermediate(); + auto* reduce_mean_out = VarNode("reduce_mean_out") + ->assert_is_op_output("reduce_mean", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + auto* reduce_max = OpNode("reduce_max", "reduce_max")->AsIntermediate(); + auto* reduce_max_out = VarNode("reduce_max_out") + ->assert_is_op_output("reduce_max", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + auto* concat = OpNode("concat", "concat")->AsIntermediate(); + auto* concat_out = VarNode("concat_out") + ->assert_is_op_output("concat", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + auto* right_conv4_weight = VarNode("right_conv4_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv4 = OpNode("right_conv4", "conv2d")->AsIntermediate(); + auto* right_conv4_out = VarNode("right_conv4_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("sigmoid", "X") + ->AsIntermediate(); + auto* sigmoid = OpNode("sigmoid", "sigmoid")->AsIntermediate(); + auto* sigmoid_out = VarNode("sigmoid_out") + ->assert_is_op_output("sigmoid", "Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* reshape = OpNode("reshape", "reshape2")->AsIntermediate(); + auto* reshape_out = VarNode("reshape_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* reshape_xshape = VarNode("reshape_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* eltwise_mul = + OpNode("eltwise_mul", "elementwise_mul")->AsIntermediate(); + auto* eltwise_mul_out = VarNode("eltwise_mul_out") + ->assert_is_op_output("elementwise_mul", "Out") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + + auto* add = OpNode("add", "elementwise_add")->AsIntermediate(); + auto* add_out = VarNode("add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* relu = OpNode("relu", "relu")->AsIntermediate(); + auto* relu_out = + VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput(); + + *input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >> + *right_bn1_out >> *right_relu1 >> *right_relu1_out >> *right_conv2 >> + *right_conv2_out >> *right_bn2 >> *right_bn2_out >> *right_relu2 >> + *right_relu2_out >> *right_conv3 >> *right_conv3_out >> *right_bn3 >> + *right_bn3_out /* >> *add*/; + + *right_bn3_out >> *reduce_mean >> *reduce_mean_out >> *concat; + *right_bn3_out >> *reduce_max >> *reduce_max_out >> *concat; + *concat >> *concat_out >> *right_conv4 >> *right_conv4_out >> *sigmoid >> + *sigmoid_out >> *eltwise_mul; + *right_conv4_weight >> *right_conv4; + *right_bn3_out >> *reshape >> *reshape_out >> *eltwise_mul; + *reshape >> *reshape_xshape; + *eltwise_mul >> *eltwise_mul_out >> *add; + + *right_conv1_weight >> *right_conv1; + *right_bn1_scale >> *right_bn1; + *right_bn1_bias >> *right_bn1; + *right_bn1_mean >> *right_bn1; + *right_bn1_var >> *right_bn1; + *right_bn1 >> *right_bn1_mean_out; + *right_bn1 >> *right_bn1_var_out; + *right_bn1 >> *right_bn1_saved_mean; + *right_bn1 >> *right_bn1_saved_var; + + *right_conv2_weight >> *right_conv2; + *right_bn2_scale >> *right_bn2; + *right_bn2_bias >> *right_bn2; + *right_bn2_mean >> *right_bn2; + *right_bn2_var >> *right_bn2; + *right_bn2 >> *right_bn2_mean_out; + *right_bn2 >> *right_bn2_var_out; + *right_bn2 >> *right_bn2_saved_mean; 
+    *right_bn2 >> *right_bn2_saved_var;
+
+    *right_conv3_weight >> *right_conv3;
+    *right_bn3_scale >> *right_bn3;
+    *right_bn3_bias >> *right_bn3;
+    *right_bn3_mean >> *right_bn3;
+    *right_bn3_var >> *right_bn3;
+    *right_bn3 >> *right_bn3_mean_out;
+    *right_bn3 >> *right_bn3_var_out;
+    *right_bn3 >> *right_bn3_saved_mean;
+    *right_bn3 >> *right_bn3_saved_var;
+
+    *input >> *add;
+
+    *add >> *add_out >> *relu >> *relu_out;
+  }
+
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
+    cpp::OpDesc op_desc;
+    op_desc.SetType("resnet_cbam_block1");
+    op_desc.SetInput("Inputs", {matched.at("input")->arg()->name});
+    op_desc.SetInput("Filter",
+                     {
+                         matched.at("right_conv1_weight")->arg()->name,
+                         matched.at("right_conv2_weight")->arg()->name,
+                         matched.at("right_conv3_weight")->arg()->name,
+                         matched.at("right_conv4_weight")->arg()->name,
+                     });
+    op_desc.SetInput("Scale",
+                     {
+                         matched.at("right_bn1_scale")->arg()->name,
+                         matched.at("right_bn2_scale")->arg()->name,
+                         matched.at("right_bn3_scale")->arg()->name,
+                         "placeholder_sa_conv",
+                     });
+    op_desc.SetInput("Bias",
+                     {
+                         matched.at("right_bn1_bias")->arg()->name,
+                         matched.at("right_bn2_bias")->arg()->name,
+                         matched.at("right_bn3_bias")->arg()->name,
+                         "placeholder_sa_conv",
+                     });
+    op_desc.SetInput("Mean",
+                     {
+                         matched.at("right_bn1_mean")->arg()->name,
+                         matched.at("right_bn2_mean")->arg()->name,
+                         matched.at("right_bn3_mean")->arg()->name,
+                         "placeholder_sa_conv",
+                     });
+    op_desc.SetInput("Var",
+                     {
+                         matched.at("right_bn1_variance")->arg()->name,
+                         matched.at("right_bn2_variance")->arg()->name,
+                         matched.at("right_bn3_variance")->arg()->name,
+                         "placeholder_sa_conv",
+                     });
+    op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name});
+    // XXX: keep these to fool SubgraphOp::AttachImpl()
+    op_desc.SetAttr<int32_t>("sub_block", 0);
+    op_desc.SetAttr<std::vector<std::string>>("input_data_names", {});
+    op_desc.SetAttr<std::vector<std::string>>("output_data_names", {});
+
+    auto block1_stmt = matched.at("right_conv1")->stmt();
+    auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
+    // XXX: memleak?
+ auto sub_block_desc = new cpp::BlockDesc(); + static_cast(fake_subgraph_op.get()) + ->SetSubBlock(sub_block_desc); + fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope()); + fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places()); + block1_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "right_conv2_weight", + "right_conv3_weight", + "right_conv4_weight", + "right_bn1_bias", + "right_bn2_bias", + "right_bn3_bias", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("right_conv1")); + } + IR_OP_VAR_LINK(matched.at("right_conv1"), matched.at("relu_out")); + } +}; + +class XPUResNetCbamBlock2Fuser : public FuseBase { + public: + XPUResNetCbamBlock2Fuser() {} + + void BuildPattern() override { + auto* input = VarNode("input")->assert_is_op_input("clip", "X")->AsInput(); + + auto* clip = OpNode("clip", "clip"); + auto* clip_out = VarNode("clip_out") + ->assert_is_op_output("clip", "Out") + ->assert_is_op_input("elementwise_pow") + ->AsIntermediate(); + auto* eltwise_y = VarNode("eltwise_y") + ->assert_is_op_input("elementwise_pow") + ->assert_is_op_input("elementwise_div") + ->AsIntermediate(); + auto* eltwise_pow = + OpNode("eltwise_pow", "elementwise_pow")->AsIntermediate(); + auto* eltwise_pow_out = VarNode("eltwise_pow_out") + ->assert_is_op_output("elementwise_pow", "Out") + ->assert_is_op_input("pad2d", "X") + ->AsIntermediate(); + auto* pad2d = OpNode("pad2d", "pad2d")->AsIntermediate(); + auto* pad2d_out = VarNode("pad2d_out") + ->assert_is_op_output("pad2d", "Out") + ->assert_is_op_input("pool2d", "X") + ->AsIntermediate(); + auto* pool2d = OpNode("pool2d", "pool2d")->AsIntermediate(); + auto* pool2d_out = VarNode("pool2d_out") + ->assert_is_op_output("pool2d", "Out") + ->assert_is_op_input("elementwise_pow") + ->AsIntermediate(); + + auto* fill_const = OpNode("fill_const", "fill_constant")->AsIntermediate(); + auto* fill_const_out = VarNode("fill_const_out") + ->assert_is_op_output("fill_constant", "Out") + ->assert_is_op_input("elementwise_div") + ->AsIntermediate(); + auto* eltwise_div = + OpNode("eltwise_div", "elementwise_div")->AsIntermediate(); + auto* eltwise_div_out = VarNode("eltwise_div_out") + ->assert_is_op_output("elementwise_div", "Out") + ->assert_is_op_input("elementwise_pow") + ->AsIntermediate(); + + auto* eltwise_pow2 = + OpNode("eltwise_pow2", "elementwise_pow")->AsIntermediate(); + auto* eltwise_pow2_out = VarNode("eltwise_pow2_out") + ->assert_is_op_output("elementwise_pow", "Out") + ->AsIntermediate(); + + auto* shape = OpNode("shape", "shape")->AsIntermediate(); + auto* shape_out = VarNode("shape_out") + ->assert_is_op_output("shape", "Out") + ->assert_is_op_input("gather") + ->AsIntermediate(); + auto* fill_const2 = + OpNode("fill_const2", "fill_constant")->AsIntermediate(); + auto* fill_const2_out = VarNode("fill_const2_out") + ->assert_is_op_output("fill_constant", "Out") + ->assert_is_op_input("gather") + ->AsIntermediate(); + auto* gather = OpNode("gather", "gather")->AsIntermediate(); + auto* gather_out = VarNode("gather_out") + ->assert_is_op_output("gather", "Out") + ->assert_is_op_input("assign", "X") + ->AsIntermediate(); + auto* assign = OpNode("assign", "assign")->AsIntermediate(); + auto* assign_out = VarNode("assign_out") + ->assert_is_op_output("assign", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + + auto* fill_const3 = + OpNode("fill_const3", "fill_constant")->AsIntermediate(); + auto* fill_const3_out = VarNode("fill_const3_out") + 
->assert_is_op_output("fill_constant", "Out") + ->assert_is_op_input("assign") + ->AsIntermediate(); + auto* assign2 = OpNode("assign2", "assign")->AsIntermediate(); + auto* assign2_out = VarNode("assign2_out") + ->assert_is_op_output("assign", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + + auto* concat = OpNode("concat", "concat")->AsIntermediate(); + auto* concat_out = VarNode("concat_out") + ->assert_is_op_output("concat", "Out") + ->assert_is_op_input("cast", "X") + ->AsIntermediate(); + auto* cast = OpNode("cast", "cast")->AsIntermediate(); + auto* cast_out = VarNode("cast_out") + ->assert_is_op_output("cast", "Out") + ->assert_is_op_input("reshape2", "Shape") + ->AsIntermediate(); + + auto* reshape2 = OpNode("reshape2", "reshape2")->AsIntermediate(); + auto* reshape2_out = VarNode("reshape2_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("matmul", "X") + ->AsIntermediate(); + auto* reshape2_xshape = VarNode("reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* matmul_y = + VarNode("matmul_y")->assert_is_op_input("matmul", "Y")->AsInput(); + auto* matmul = OpNode("matmul", "matmul")->AsIntermediate(); + auto* matmul_out = VarNode("matmul_out") + ->assert_is_op_output("matmul", "Out") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + auto* eltwise_add_y = VarNode("eltwise_add_y") + ->assert_is_op_input("elementwise_add") + ->AsInput(); + auto* eltwise_add = + OpNode("eltwise_add", "elementwise_add")->AsIntermediate(); + auto* eltwise_add_out = VarNode("eltwise_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->AsIntermediate(); + + auto* norm = OpNode("norm", "norm")->AsIntermediate(); + auto* norm_out = VarNode("norm_out") + ->assert_is_op_output("norm", "Out") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + auto* norm_norm = VarNode("norm_norm") + ->assert_is_op_output("norm", "Norm") + ->AsIntermediate(); + auto* fill_const4 = + OpNode("fill_const4", "fill_constant")->AsIntermediate(); + auto* fill_const4_out = VarNode("fill_const4_out") + ->assert_is_op_output("fill_constant", "Out") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + auto* eltwise_add2 = + OpNode("eltwise_add2", "elementwise_add")->AsIntermediate(); + auto* eltwise_add2_out = VarNode("eltwise_add2_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* fill_const5 = + OpNode("fill_const5", "fill_constant")->AsIntermediate(); + auto* fill_const5_out = VarNode("fill_const5_out") + ->assert_is_op_output("fill_constant", "Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* eltwise_mul = + OpNode("eltwise_mul", "elementwise_mul")->AsIntermediate(); + auto* eltwise_mul_out = VarNode("eltwise_mul_out") + ->assert_is_op_output("elementwise_mul", "Out") + ->assert_is_op_input("elementwise_div") + ->AsIntermediate(); + + auto* eltwise_div2 = + OpNode("eltwise_div2", "elementwise_div")->AsIntermediate(); + auto* eltwise_div2_out = VarNode("eltwise_div2_out") + ->assert_is_op_output("elementwise_div", "Out") + ->AsOutput(); + + *input >> *clip >> *clip_out >> *eltwise_pow >> *eltwise_pow_out >> + *pad2d >> *pad2d_out >> *pool2d >> *pool2d_out >> *eltwise_pow2; + *eltwise_y >> *eltwise_pow; + + *fill_const >> *fill_const_out >> *eltwise_div >> *eltwise_div_out >> + *eltwise_pow2; + *eltwise_y >> *eltwise_div; + + *eltwise_pow2 >> *eltwise_pow2_out >> *shape >> *shape_out >> *gather >> + 
        *gather_out >> *assign >> *assign_out >> *concat >> *concat_out >>
+        *cast >> *cast_out >> *reshape2;
+    *fill_const2 >> *fill_const2_out >> *gather;
+    *fill_const3 >> *fill_const3_out >> *assign2 >> *assign2_out >> *concat;
+    *eltwise_pow2_out >> *reshape2;
+
+    *reshape2 >> *reshape2_out >> *matmul >> *matmul_out >> *eltwise_add >>
+        *eltwise_add_out;
+    *reshape2 >> *reshape2_xshape;
+    *matmul_y >> *matmul;
+    *eltwise_add_y >> *eltwise_add;
+
+    *eltwise_add_out >> *norm >> *norm_out >> *eltwise_add2 >>
+        *eltwise_add2_out >> *eltwise_mul >> *eltwise_mul_out >>
+        *eltwise_div2 >> *eltwise_div2_out;
+    *norm >> *norm_norm;
+    *fill_const4 >> *fill_const4_out >> *eltwise_add2;
+    *fill_const5 >> *fill_const5_out >> *eltwise_mul;
+    *eltwise_add_out >> *eltwise_div2;
+  }
+
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
+    cpp::OpDesc op_desc;
+    op_desc.SetType("resnet_cbam_block2");
+    op_desc.SetInput("Inputs", {matched.at("input")->arg()->name});
+    op_desc.SetInput("Filter", {matched.at("matmul_y")->arg()->name});
+    op_desc.SetInput("Scale", {"placeholder_last_fc"});
+    op_desc.SetInput("Bias", {matched.at("eltwise_add_y")->arg()->name});
+    op_desc.SetInput("Mean", {"placeholder_last_fc"});
+    op_desc.SetInput("Var", {"placeholder_last_fc"});
+    op_desc.SetOutput("Outputs", {matched.at("eltwise_div2_out")->arg()->name});
+    // XXX: keep these to fool SubgraphOp::AttachImpl()
+    op_desc.SetAttr<int32_t>("sub_block", 0);
+    op_desc.SetAttr<std::vector<std::string>>("input_data_names", {});
+    op_desc.SetAttr<std::vector<std::string>>("output_data_names", {});
+
+    // extra traits to distill
+    auto block2_stmt = matched.at("clip")->stmt();
+    auto* scope = block2_stmt->op()->scope();
+    auto pow_tensor_name = matched.at("eltwise_y")->arg()->name;
+    auto* pow_tensor = scope->FindTensor(pow_tensor_name);
+    float pool_p = pow_tensor->data<float>()[0];
+    op_desc.SetAttr<float>("pool_p", pool_p);
+    auto* matmul_op_info = matched.at("matmul")->stmt()->op_info();
+    CHECK(matmul_op_info->GetAttr<bool>("transpose_Y") == true)
+        << "Y of last fc must have been transposed";
+
+    auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
+    // XXX: memleak?
+ auto sub_block_desc = new cpp::BlockDesc(); + static_cast(fake_subgraph_op.get()) + ->SetSubBlock(sub_block_desc); + fake_subgraph_op->Attach(op_desc, scope); + fake_subgraph_op->SetValidPlaces(block2_stmt->op()->valid_places()); + block2_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "matmul_y", "eltwise_add_y", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("clip")); + } + IR_OP_VAR_LINK(matched.at("clip"), matched.at("eltwise_div2_out")); + } +}; + +class XPUResNetCbamFuser : public xpu::XPUFuseBase { + public: + XPUResNetCbamFuser() {} + + void BuildPattern() override { + auto* input = + VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput(); + + auto* top_conv_weight = VarNode("top_conv_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* top_conv = OpNode("top_conv", "conv2d"); + auto* top_conv_out = VarNode("top_conv_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* top_bn_scale = VarNode("top_bn_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* top_bn_bias = VarNode("top_bn_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* top_bn_mean = VarNode("top_bn_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* top_bn_var = VarNode("top_bn_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* top_bn = OpNode("top_bn", "batch_norm")->AsIntermediate(); + auto* top_bn_out = VarNode("top_bn_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* top_bn_mean_out = VarNode("top_bn_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* top_bn_var_out = + VarNode("top_bn_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* top_bn_saved_mean = + VarNode("top_bn_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* top_bn_saved_var = + VarNode("top_bn_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* top_relu = OpNode("top_relu", "relu")->AsIntermediate(); + auto* top_relu_out = VarNode("top_relu_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("pool2d", "X") + ->AsIntermediate(); + auto* top_pool = OpNode("top_pool", "pool2d")->AsIntermediate(); + auto* top_pool_out = + VarNode("top_pool_out") + ->assert_is_op_output("pool2d", "Out") + ->assert_is_op_input("resnet_cbam_block0", "Inputs") + ->AsIntermediate(); + + // args are left out + auto* resnet_block0_1 = + OpNode("resnet_block0_1", "resnet_cbam_block0")->AsIntermediate(); + auto* resnet_block0_1_out = + VarNode("resnet_block0_1_out") + ->assert_is_op_output("resnet_cbam_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_1_1 = + OpNode("resnet_block1_1_1", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_1_1_out = + VarNode("resnet_block1_1_1_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_1_2 = + OpNode("resnet_block1_1_2", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_1_2_out = + VarNode("resnet_block1_1_2_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_2 = + OpNode("resnet_block0_2", "resnet_cbam_block0")->AsIntermediate(); + auto* resnet_block0_2_out = + 
VarNode("resnet_block0_2_out") + ->assert_is_op_output("resnet_cbam_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_1 = + OpNode("resnet_block1_2_1", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_2_1_out = + VarNode("resnet_block1_2_1_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_2 = + OpNode("resnet_block1_2_2", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_2_2_out = + VarNode("resnet_block1_2_2_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_3 = + OpNode("resnet_block1_2_3", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_2_3_out = + VarNode("resnet_block1_2_3_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_3 = + OpNode("resnet_block0_3", "resnet_cbam_block0")->AsIntermediate(); + auto* resnet_block0_3_out = + VarNode("resnet_block0_3_out") + ->assert_is_op_output("resnet_cbam_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_1 = + OpNode("resnet_block1_3_1", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_3_1_out = + VarNode("resnet_block1_3_1_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_2 = + OpNode("resnet_block1_3_2", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_3_2_out = + VarNode("resnet_block1_3_2_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_3 = + OpNode("resnet_block1_3_3", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_3_3_out = + VarNode("resnet_block1_3_3_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_4 = + OpNode("resnet_block1_3_4", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_3_4_out = + VarNode("resnet_block1_3_4_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_5 = + OpNode("resnet_block1_3_5", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_3_5_out = + VarNode("resnet_block1_3_5_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_4 = + OpNode("resnet_block0_4", "resnet_cbam_block0")->AsIntermediate(); + auto* resnet_block0_4_out = + VarNode("resnet_block0_4_out") + ->assert_is_op_output("resnet_cbam_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_4_1 = + OpNode("resnet_block1_4_1", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_4_1_out = + VarNode("resnet_block1_4_1_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_4_2 = + OpNode("resnet_block1_4_2", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_4_2_out = + VarNode("resnet_block1_4_2_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block2 = + OpNode("resnet_block2", "resnet_cbam_block2")->AsIntermediate(); + auto* resnet_block2_out = + VarNode("resnet_block2_out") + ->assert_is_op_output("resnet_cbam_block2", "Outputs") + ->AsOutput(); + + *input >> *top_conv >> *top_conv_out >> *top_bn >> *top_bn_out >> + *top_relu >> *top_relu_out >> *top_pool >> *top_pool_out >> + *resnet_block0_1 >> *resnet_block0_1_out >> *resnet_block1_1_1 >> + *resnet_block1_1_1_out >> *resnet_block1_1_2 >> + *resnet_block1_1_2_out >> 
*resnet_block0_2 >> *resnet_block0_2_out >> + *resnet_block1_2_1 >> *resnet_block1_2_1_out >> *resnet_block1_2_2 >> + *resnet_block1_2_2_out >> *resnet_block1_2_3 >> + *resnet_block1_2_3_out >> *resnet_block0_3 >> *resnet_block0_3_out >> + *resnet_block1_3_1 >> *resnet_block1_3_1_out >> *resnet_block1_3_2 >> + *resnet_block1_3_2_out >> *resnet_block1_3_3 >> + *resnet_block1_3_3_out >> *resnet_block1_3_4 >> + *resnet_block1_3_4_out >> *resnet_block1_3_5 >> + *resnet_block1_3_5_out >> *resnet_block0_4 >> *resnet_block0_4_out >> + *resnet_block1_4_1 >> *resnet_block1_4_1_out >> *resnet_block1_4_2 >> + *resnet_block1_4_2_out >> *resnet_block2 >> *resnet_block2_out; + + *top_conv_weight >> *top_conv; + *top_bn_scale >> *top_bn; + *top_bn_bias >> *top_bn; + *top_bn_mean >> *top_bn; + *top_bn_var >> *top_bn; + *top_bn >> *top_bn_mean_out; + *top_bn >> *top_bn_var_out; + *top_bn >> *top_bn_saved_mean; + *top_bn >> *top_bn_saved_var; + } + + void handle_placeholder_sa_conv(SSAGraph* graph, + const key2nodes_t& matched, + paddle::lite::Scope* scope, + const std::string& filter_name, + std::vector* max_filter_name) { + auto* filter_t = scope->FindMutableTensor(filter_name); + int filter_len = filter_t->numel(); + float* filter_on_host = filter_t->mutable_data(); + + float max_f = + paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len); + std::unique_ptr filter_int16(new int16_t[filter_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + filter_on_host, filter_int16.get(), max_f, filter_len); + memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t)); + + // create new arg in graph and scope + std::string max_name = filter_name + "_max"; + max_filter_name->push_back(max_name); + auto* max_filter_node = graph->NewArgumentNode(max_name); + max_filter_node->arg()->is_weight = true; + max_filter_node->arg()->type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + DirectedLink(max_filter_node, matched.at("top_conv")); + auto* max_filter_t = scope->NewTensor(max_name); + max_filter_t->Resize({4}); + float* max_ptr = max_filter_t->mutable_data(); + max_ptr[0] = max_f; + max_ptr[1] = max_f; + max_ptr[2] = max_f; + max_ptr[3] = max_f; + } + + void handle_placeholder_last_fc(SSAGraph* graph, + const key2nodes_t& matched, + paddle::lite::Scope* scope, + const std::string& filter_name, + std::vector* max_filter_name) { + auto* filter_t = scope->FindMutableTensor(filter_name); + auto filter_dims = filter_t->dims(); + int filter_len = filter_t->numel(); + float* filter_on_host = filter_t->mutable_data(); + + // XXX(miaotianxiang): Y has already been transposed in model... 
+ float max_f = + paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len); + std::unique_ptr filter_int16(new int16_t[filter_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + filter_on_host, filter_int16.get(), max_f, filter_len); + memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t)); + + // create new arg in graph and scope + std::string max_name = filter_name + "_max"; + max_filter_name->push_back(max_name); + auto* max_filter_node = graph->NewArgumentNode(max_name); + max_filter_node->arg()->is_weight = true; + max_filter_node->arg()->type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + DirectedLink(max_filter_node, matched.at("top_conv")); + auto* max_filter_t = scope->NewTensor(max_name); + max_filter_t->Resize({4}); + float* max_ptr = max_filter_t->mutable_data(); + max_ptr[0] = max_f; + max_ptr[1] = max_f; + max_ptr[2] = max_f; + max_ptr[3] = max_f; + } + + void InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched, + const std::vector& extra_input_vars) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__resnet_cbam"); + op_desc.SetInput("Input", {matched.at("input")->arg()->name}); + std::vector filter_name = { + matched.at("top_conv_weight")->arg()->name}; + std::vector scale_name = { + matched.at("top_bn_scale")->arg()->name}; + std::vector bias_name = { + matched.at("top_bn_bias")->arg()->name}; + std::vector mean_name = { + matched.at("top_bn_mean")->arg()->name}; + std::vector var_name = { + matched.at("top_bn_variance")->arg()->name}; + std::vector max_filter_name; + std::vector resnet_block_vec = { + "resnet_block0_1", + "resnet_block1_1_1", + "resnet_block1_1_2", + "resnet_block0_2", + "resnet_block1_2_1", + "resnet_block1_2_2", + "resnet_block1_2_3", + "resnet_block0_3", + "resnet_block1_3_1", + "resnet_block1_3_2", + "resnet_block1_3_3", + "resnet_block1_3_4", + "resnet_block1_3_5", + "resnet_block0_4", + "resnet_block1_4_1", + "resnet_block1_4_2", + "resnet_block2", + }; + for (auto& block : resnet_block_vec) { + auto* block_op_info = matched.at(block)->stmt()->op_info(); + auto block_filter_name = block_op_info->Input("Filter"); + std::copy(block_filter_name.begin(), + block_filter_name.end(), + std::back_inserter(filter_name)); + auto block_scale_name = block_op_info->Input("Scale"); + std::copy(block_scale_name.begin(), + block_scale_name.end(), + std::back_inserter(scale_name)); + auto block_bias_name = block_op_info->Input("Bias"); + std::copy(block_bias_name.begin(), + block_bias_name.end(), + std::back_inserter(bias_name)); + auto block_mean_name = block_op_info->Input("Mean"); + std::copy(block_mean_name.begin(), + block_mean_name.end(), + std::back_inserter(mean_name)); + auto block_var_name = block_op_info->Input("Var"); + std::copy(block_var_name.begin(), + block_var_name.end(), + std::back_inserter(var_name)); + } + + auto* resnet_cbam_stmt = matched.at("top_conv")->stmt(); + auto* scope = resnet_cbam_stmt->op()->scope(); + for (size_t i = 0; i < filter_name.size(); ++i) { + if (scale_name[i] == "placeholder_sa_conv") { + handle_placeholder_sa_conv( + graph, matched, scope, filter_name[i], &max_filter_name); + continue; + } else if (scale_name[i] == "placeholder_last_fc") { + handle_placeholder_last_fc( + graph, matched, scope, filter_name[i], &max_filter_name); + continue; + } + + auto* filter_t = scope->FindMutableTensor(filter_name[i]); + auto* scale_t = scope->FindMutableTensor(scale_name[i]); + auto* bias_t = scope->FindMutableTensor(bias_name[i]); + auto* mean_t = 
          scope->FindMutableTensor(mean_name[i]);
+      auto* var_t = scope->FindMutableTensor(var_name[i]);
+
+      int mean_len = mean_t->numel();
+      int filter_len = filter_t->numel();
+      int filter_stride = filter_len / mean_len;
+
+      float* filter_on_host = filter_t->mutable_data<float>();
+      float* scale_on_host = scale_t->mutable_data<float>();
+      float* bias_on_host = bias_t->mutable_data<float>();
+      float* mean_on_host = mean_t->mutable_data<float>();
+      float* var_on_host = var_t->mutable_data<float>();
+
+      // Perform preprocess: fold batch_norm into the conv filter and bias
+      for (int i = 0; i < mean_len; ++i) {
+        scale_on_host[i] = scale_on_host[i] / sqrtf(var_on_host[i] + 0.00001f);
+      }
+      for (int i = 0; i < mean_len; ++i) {
+        for (int j = 0; j < filter_stride; ++j) {
+          filter_on_host[i * filter_stride + j] *= scale_on_host[i];
+        }
+      }
+      for (int i = 0; i < mean_len; ++i) {
+        bias_on_host[i] += -mean_on_host[i] * scale_on_host[i];
+      }
+
+      float max_f =
+          paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len);
+      std::unique_ptr<int16_t[]> filter_int16(new int16_t[filter_len]);
+      paddle::lite::xpu::math::ConvertFP32ToInt16(
+          filter_on_host, filter_int16.get(), max_f, filter_len);
+      memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t));
+
+      // create new arg in graph and scope
+      std::string max_name = filter_name[i] + "_max";
+      max_filter_name.push_back(max_name);
+      auto* max_filter_node = graph->NewArgumentNode(max_name);
+      max_filter_node->arg()->is_weight = true;
+      max_filter_node->arg()->type = LiteType::GetTensorTy(
+          TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
+      DirectedLink(max_filter_node, matched.at("top_conv"));
+      auto* max_filter_t = scope->NewTensor(max_name);
+      max_filter_t->Resize({4});
+      float* max_ptr = max_filter_t->mutable_data<float>();
+      max_ptr[0] = max_f;
+      max_ptr[1] = max_f;
+      max_ptr[2] = max_f;
+      max_ptr[3] = max_f;
+    }
+    op_desc.SetInput("Filter", filter_name);
+    op_desc.SetInput("Bias", bias_name);
+    op_desc.SetInput("MaxFilter", max_filter_name);
+    op_desc.SetOutput("Output", {matched.at("resnet_block2_out")->arg()->name});
+    op_desc.SetAttr("xpu", 1);
+    auto* block2_op_info = matched.at("resnet_block2")->stmt()->op_info();
+    op_desc.SetAttr("pool_p", block2_op_info->GetAttr<float>("pool_p"));
+
+    auto resnet_cbam_op = LiteOpRegistry::Global().Create(op_desc.Type());
+    resnet_cbam_op->Attach(op_desc, scope);
+    resnet_cbam_op->SetValidPlaces(resnet_cbam_stmt->op()->valid_places());
+    auto kernels =
+        resnet_cbam_op->CreateKernels(resnet_cbam_op->valid_places());
+    resnet_cbam_stmt->SetOp(resnet_cbam_op);
+    resnet_cbam_stmt->SetKernels(std::move(kernels));
+
+    IR_NODE_LINK_TO(matched.at("top_bn_bias"), matched.at("top_conv"));
+    for (auto* node : extra_input_vars) {
+      IR_NODE_LINK_TO(node, matched.at("top_conv"));
+    }
+    IR_OP_VAR_LINK(matched.at("top_conv"), matched.at("resnet_block2_out"));
+  }
+};
+
+}  // namespace fusion
+
+class XPUResNetCbamFusePass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
+    if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
+    fusion::XPUResNetCbamBlock0Fuser block0_fuser;
+    block0_fuser(graph.get());
+    fusion::XPUResNetCbamBlock1Fuser block1_fuser;
+    block1_fuser(graph.get());
+    fusion::XPUResNetCbamBlock2Fuser block2_fuser;
+    block2_fuser(graph.get());
+    fusion::XPUResNetCbamFuser resnet_fuser;
+    resnet_fuser(graph.get());
+  }
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(__xpu__resnet_cbam_fuse_pass,
+                  paddle::lite::mir::XPUResNetCbamFusePass)
+    .BindTargets({TARGET(kXPU)})
+    .BindKernel("__xpu__resnet_cbam");
diff --git 
a/lite/core/mir/fusion/conv_activation_fuse_pass.cc b/lite/core/mir/fusion/conv_activation_fuse_pass.cc index 68c07c0ffd0694aec0ff073082e1192213a0ef4a..20023830123939f1cf83706f69ca8a7a2703b646 100644 --- a/lite/core/mir/fusion/conv_activation_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_activation_fuse_pass.cc @@ -25,21 +25,21 @@ namespace mir { void ConvActivationFusePass::Apply(const std::unique_ptr& graph) { std::vector act_types{"relu"}; bool has_int8 = false; - bool has_arm_float = false; + bool has_arm = false; bool has_cuda = false; for (auto& place : graph->valid_places()) { if (place.precision == PRECISION(kInt8)) { has_int8 = true; } - if (place.target == TARGET(kARM) && place.precision == PRECISION(kFloat)) { - has_arm_float = true; + if (place.target == TARGET(kARM)) { + has_arm = true; } if (place.target == TARGET(kCUDA)) { has_cuda = true; } } - if (!has_int8 && has_arm_float) { + if (has_arm) { act_types.push_back("relu6"); act_types.push_back("leaky_relu"); } @@ -64,4 +64,5 @@ REGISTER_MIR_PASS(lite_conv_activation_fuse_pass, paddle::lite::mir::ConvActivationFusePass) .BindTargets({TARGET(kAny)}) .ExcludeTargets({TARGET(kXPU)}) + .ExcludeTargets({TARGET(kMLU)}) .BindKernel("conv2d"); diff --git a/lite/core/mir/fusion/conv_bn_fuser.cc b/lite/core/mir/fusion/conv_bn_fuser.cc index 69be8dab0a06c26d5ca2bcdfe8327634edb9637d..a8a5a5deb2a57982587d9db9f94cadb367af8595 100644 --- a/lite/core/mir/fusion/conv_bn_fuser.cc +++ b/lite/core/mir/fusion/conv_bn_fuser.cc @@ -156,12 +156,12 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { // little difference for int8 /////////////////////////////////////////////////////////////////////////////// if (enable_int8) { - PADDLE_ENFORCE(conv_op_desc->HasAttr("weight_scale"), - "INT8 mode: Conv should has weight_scale attr"); + std::string weight_name = conv_op_desc->Input("Filter").front(); + CHECK(conv_op_desc->HasInputScale(weight_name)) + << "INT8 mode: Conv should has weight_scale attr"; auto conv_weight_d = conv_weight_t->mutable_data(); // compute new conv_weight for int8 - auto weight_scale = - conv_op_desc->GetAttr>("weight_scale"); + auto weight_scale = conv_op_desc->GetInputScale(weight_name); if (conv_type_ == "conv2d_transpose" && !depthwise) { int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; @@ -188,7 +188,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { } } } - conv_op_desc->SetAttr("weight_scale", weight_scale); + conv_op_desc->SetInputScale(weight_name, weight_scale); } else if (is_weight_quantization) { std::string scale_name = conv_weight_name + "_quant_scale"; if (conv_op_desc->HasAttr(scale_name)) { diff --git a/lite/core/mir/fusion/conv_bn_fuser.h b/lite/core/mir/fusion/conv_bn_fuser.h index 8bd8c0ce0600bb68667d96d07d43fa3028b5a856..841566067ba6675271227adfa82c74defac35f2a 100644 --- a/lite/core/mir/fusion/conv_bn_fuser.h +++ b/lite/core/mir/fusion/conv_bn_fuser.h @@ -18,7 +18,7 @@ #include #include #include "lite/core/mir/pattern_matcher_high_api.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { diff --git a/lite/core/mir/fusion/fc_fuse_pass.cc b/lite/core/mir/fusion/fc_fuse_pass.cc index e2d8f96c53bd76d9495035c6ec56a5364b9bdcf5..d9bffffebfaabcca9c63700caf6e3ee91fa2eecb 100644 --- a/lite/core/mir/fusion/fc_fuse_pass.cc +++ b/lite/core/mir/fusion/fc_fuse_pass.cc @@ -24,8 +24,13 @@ namespace mir { void FcFusePass::Apply(const std::unique_ptr& graph) { 
#ifdef LITE_WITH_X86 +#ifdef LITE_WITH_MLU + fusion::FcFuser fuser(false); + fuser(graph.get()); +#else fusion::FcFuser fuser(true); fuser(graph.get()); +#endif #endif fusion::FcFuser fuser2(false); @@ -38,7 +43,9 @@ void FcFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass) .BindTargets({TARGET(kAny)}) - .ExcludeTargets({TARGET(kXPU), TARGET(kX86)}) + .ExcludeTargets({TARGET(kXPU)}) +#ifndef LITE_WITH_MLU + .ExcludeTargets({TARGET(kX86)}) +#endif .ExcludeTargets({TARGET(kBM)}) - .ExcludeTargets({TARGET(kCUDA)}) .BindKernel("fc"); diff --git a/lite/core/mir/fusion/fc_fuser.cc b/lite/core/mir/fusion/fc_fuser.cc index 3c99131083d37ea2c8511ed136bff17c891529af..8fdde50fc3015b411ee13fed15e92a93a1c722e5 100644 --- a/lite/core/mir/fusion/fc_fuser.cc +++ b/lite/core/mir/fusion/fc_fuser.cc @@ -71,7 +71,20 @@ void FcFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { } cpp::OpDesc FcFuser::GenOpDesc(const key2nodes_t& matched) { - cpp::OpDesc op_desc = *matched.at("mul")->stmt()->op_info(); + auto op_desc = *matched.at("mul")->stmt()->op_info(); + + // Get the input scale from mul + std::vector x_scale_vct; + std::vector y_scale_vct; + auto input_x_name = op_desc.Input("X").front(); + auto input_y_name = op_desc.Input("Y").front(); + bool is_quantized_op = op_desc.HasInputScale(input_x_name) && + op_desc.HasInputScale(input_y_name); + if (is_quantized_op) { + x_scale_vct = op_desc.GetInputScale(input_x_name); + y_scale_vct = op_desc.GetInputScale(op_desc.Input("Y").front()); + } + op_desc.mutable_inputs()->clear(); op_desc.mutable_outputs()->clear(); op_desc.SetType("fc"); @@ -85,6 +98,13 @@ cpp::OpDesc FcFuser::GenOpDesc(const key2nodes_t& matched) { if (with_relu_) { op_desc.SetAttr("activation_type", std::string{"relu"}); } + + // Set the input scale into fc + if (is_quantized_op) { + op_desc.SetInputScale(matched.at("x")->arg()->name, x_scale_vct); + op_desc.SetInputScale(matched.at("W")->arg()->name, y_scale_vct); + } + return op_desc; } diff --git a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc index 80a033c75f2e23efa091375ee2a9f78e3ff40d71..ea8400b0bb2cd1680e52d9a92ef79aca4e09887b 100644 --- a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc +++ b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc @@ -34,12 +34,13 @@ void QuantDequantFusePass::Apply(const std::unique_ptr& graph) { } // fuse quantized node and dequant node - for (auto& op_type : {"conv2d", "mul", "depthwise_conv2d"}) { + for (auto& op_type : + {"conv2d", "mul", "depthwise_conv2d", "conv2d_transpose"}) { fusion::DequantOpFuser fuser(op_type); fuser(graph.get()); } - for (auto& op_type : {"conv2d", "depthwise_conv2d"}) { + for (auto& op_type : {"conv2d", "depthwise_conv2d", "conv2d_transpose"}) { fusion::ChannelWiseDequantOpFuser fuser(op_type); fuser(graph.get()); } diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc index f6d03cc23d56f8ae25f22b5b2667ed451ef8afaa..1335518b00db5311b4605148817faed52164fd7a 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc @@ -23,6 +23,20 @@ namespace lite { namespace mir { namespace fusion { +static std::string GetWeightArgname(const std::string& op_type) { + std::string weight_argname{}; + std::vector conv_ops = { + "conv2d", "depthwise_conv2d", "conv2d_transpose"}; + std::vector mul_ops = {"mul", "matmul"}; + if (std::find(conv_ops.begin(), 
conv_ops.end(), op_type) != conv_ops.end()) { + weight_argname = "Filter"; + } else if (std::find(mul_ops.begin(), mul_ops.end(), op_type) != + mul_ops.end()) { + weight_argname = "Y"; + } + return weight_argname; +} + void DeleteQuantOpFuser::BuildPattern() { auto* input_scale_node = VarNode("input_scale_node") ->assert_is_op_input(quant_op_type_, "InScale"); @@ -64,13 +78,7 @@ void DeleteQuantOpFuser::InsertNewNode(SSAGraph* graph, for (auto* quantized_node : outlinks) { // save input scale in quantized op by input argname + index auto op_desc = *quantized_node->stmt()->mutable_op_info(); - std::string argname; - int index; - op_desc.GetInputArgname(out_act_name, &argname); - op_desc.GetInputIndex(out_act_name, &index); - op_desc.SetAttr(argname + std::to_string(index) + "_input_scale", - scale_value); - op_desc.SetAttr("input_scale", scale_value); // save it for now + op_desc.SetInputScale(out_act_name, {scale_value}); op_desc.SetAttr("bit_length", bit_length); op_desc.UpdateAllInputs(out_act_name, in_act_name); quantized_node->stmt()->ResetOp(op_desc, graph->valid_places()); @@ -89,20 +97,13 @@ cpp::OpDesc DeleteQuantOpFuser::GenOpDesc(const key2nodes_t& matched) { } void DequantOpFuser::BuildPattern() { - std::string weight_name = ""; - if (quantized_op_type_ == "conv2d" || - quantized_op_type_ == "depthwise_conv2d") { - weight_name = "Filter"; - } else { - weight_name = "Y"; - } - + std::string weight_argname = GetWeightArgname(quantized_op_type_); auto* quantized_op_input = VarNode("quantized_op_input") ->assert_is_op_input(quantized_op_type_) ->AsInput(); auto* quantized_op_weight = VarNode("quantized_op_weight") - ->assert_is_op_input(quantized_op_type_, weight_name) + ->assert_is_op_input(quantized_op_type_, weight_argname) ->AsInput(); auto* quantized_op = OpNode("quantized_op", quantized_op_type_) ->assert_is_op(quantized_op_type_) @@ -135,6 +136,7 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, auto* quantized_op = matched.at("quantized_op"); auto* dequant_op = matched.at("dequant_op"); auto* dequant_op_out = matched.at("dequant_op_out"); + auto weight_name = quantized_op_weight->arg()->name; // obtain weight_scale from max_range auto* scope = quantized_op->stmt()->op()->scope(); @@ -150,14 +152,15 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, // = max(abs(weight)) / range // set op desc - cpp::OpDesc op_desc = *quantized_op->stmt()->op_info(); + auto op_desc = *quantized_op->stmt()->op_info(); auto quantized_weight_var_name = quantized_op_weight->arg()->name; auto quantized_weight_t = scope->FindVar(quantized_weight_var_name)->GetMutable(); std::vector weight_scale; - int weight_scale_size; + int weight_scale_size = 0; if (quantized_op_type_ == "conv2d" || - quantized_op_type_ == "depthwise_conv2d") { + quantized_op_type_ == "depthwise_conv2d" || + quantized_op_type_ == "conv2d_transpose") { op_desc.SetInput("Input", {quantized_op_input->arg()->name}); op_desc.SetOutput("Output", {dequant_op_out->arg()->name}); // Conv weight shape: Cout * Cin * kh * hw, the weight_scale_size should @@ -173,7 +176,7 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, weight_scale.push_back(whole_weight_scale); } op_desc.SetAttr("enable_int8", true); - op_desc.SetAttr("weight_scale", weight_scale); + op_desc.SetInputScale(weight_name, weight_scale); // change the weight from the float type to int8 type. 
Tensor temp_tensor; @@ -204,12 +207,13 @@ cpp::OpDesc DequantOpFuser::GenOpDesc(const key2nodes_t& matched) { void ChannelWiseDequantOpFuser::BuildPattern() { std::string dequant_op_type = "fake_channel_wise_dequantize_max_abs"; + std::string weight_argname = GetWeightArgname(quantized_op_type_); auto* quantized_op_input = VarNode("quantized_op_input") ->assert_is_op_input(quantized_op_type_) ->AsInput(); auto* quantized_op_weight = VarNode("quantized_op_weight") - ->assert_is_op_input(quantized_op_type_, "Filter") + ->assert_is_op_input(quantized_op_type_, weight_argname) ->AsInput(); auto* quantized_op = OpNode("quantized_op", quantized_op_type_) ->assert_is_op(quantized_op_type_) @@ -246,6 +250,7 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph, auto* dequant_op_channel_scale = matched.at("dequant_op_channel_scale"); auto* dequant_op = matched.at("dequant_op"); auto* dequant_op_out = matched.at("dequant_op_out"); + auto weight_name = quantized_op_weight->arg()->name; // obtain input weight_scale from fake_dequant op auto* scope = quantized_op->stmt()->op()->scope(); @@ -265,17 +270,20 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph, } // set op desc - cpp::OpDesc op_desc = *quantized_op->stmt()->op_info(); + auto op_desc = *quantized_op->stmt()->op_info(); if (quantized_op_type_ == "conv2d" || - quantized_op_type_ == "depthwise_conv2d") { + quantized_op_type_ == "depthwise_conv2d" || + quantized_op_type_ == "conv2d_transpose") { op_desc.SetInput("Input", {quantized_op_input->arg()->name}); op_desc.SetOutput("Output", {dequant_op_out->arg()->name}); } else if (quantized_op_type_ == "mul" || quantized_op_type_ == "matmul") { op_desc.SetInput("X", {quantized_op_input->arg()->name}); op_desc.SetOutput("Out", {dequant_op_out->arg()->name}); } - op_desc.SetAttr("enable_int8", true); - op_desc.SetAttr("weight_scale", weight_scale); + if (quantized_op_type_ != "conv2d_transpose") { + op_desc.SetAttr("enable_int8", true); + } + op_desc.SetInputScale(weight_name, weight_scale); // change the weight from the float type to int8 type. auto quantized_weight_var_name = quantized_op_weight->arg()->name; @@ -352,22 +360,7 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, // Save quantization info in op_info attr auto op_info = *quantized_node->stmt()->op_info(); op_info.SetAttr("bit_length", bit_length); - - std::string argname; - int index; - op_info.GetInputArgname(output_act_name, &argname); - op_info.GetInputIndex(output_act_name, &index); - op_info.SetAttr(argname + std::to_string(index) + "_input_scale", - scale_value); - std::string op_type = op_info.Type(); - // Analyse the weight scale or input scale. 
- if (((op_type == "conv2d" || op_type == "depthwise_conv2d") && - argname == "Input") || - ((op_type == "mul" || op_type == "matmul") && argname == "Y")) { - op_info.SetAttr("weight_scale", scale_value); - } else { - op_info.SetAttr("input_scale", scale_value); - } + op_info.SetInputScale(output_act_name, {scale_value}); op_info.UpdateAllInputs(output_act_name, input_act_name); quantized_node->stmt()->ResetOp(op_info, graph->valid_places()); diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc index 5ad094fd4219bcbb3c59ec1c71f42af6cac5a11a..92804d6e72e7a2de6f3a6f3b47f338aecd25aa8c 100644 --- a/lite/core/mir/memory_optimize_pass.cc +++ b/lite/core/mir/memory_optimize_pass.cc @@ -314,4 +314,5 @@ REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) TARGET(kXPU), TARGET(kBM), TARGET(kRKNPU), - TARGET(kAPU)}); + TARGET(kAPU), + TARGET(kMLU)}); diff --git a/lite/core/mir/mlu_postprocess_pass.cc b/lite/core/mir/mlu_postprocess_pass.cc index ba48d5d4ead5ea922ded0bff3a87c2c127595790..46738dd49c16fd9736d61711b4baf56d51247699 100644 --- a/lite/core/mir/mlu_postprocess_pass.cc +++ b/lite/core/mir/mlu_postprocess_pass.cc @@ -14,18 +14,22 @@ #include "lite/core/mir/mlu_postprocess_pass.h" #include +#include #include #include #include #include #include "lite/core/mir/graph_visualize_pass.h" #include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/subgraph/subgraph_detector.h" #include "lite/operators/subgraph_op.h" namespace paddle { namespace lite { namespace mir { +static thread_local int g_stream_id = 0; + Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, const std::string& cast_arg_name, SSAGraph* graph, @@ -37,6 +41,10 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, cast_arg->AsArg().type = cast_type; inst_node->AsStmt().op()->scope()->Var(cast_arg_name); + VLOG(4) << "insert cast before subgraph"; + VLOG(4) << "curent node type: " << cur_node->AsArg().type->name() + << " cast to node type: " << cast_type->name(); + // create the stmt node auto* cast_inst = graph->NewInstructNode(); // create op @@ -60,14 +68,17 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, CHECK(0) << "Unsupport cast type"; } cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); + + auto v_places = graph->valid_places(); // create kernels - auto kernels = cast_op->CreateKernels(graph->valid_places()); + auto kernels = cast_op->CreateKernels(v_places); std::vector> selected_kernels; bool is_found = false; for (auto& kernel : kernels) { if (op_type == "cast") { const Type* in_arg_ty = kernel->GetInputDeclType("X"); - if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) { + if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type) && + DataLayoutCompatible(*in_arg_ty, *cur_node->AsArg().type)) { is_found = true; } } else if (op_type == "layout") { @@ -83,24 +94,22 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); if (TargetCompatibleTo(*in_arg_ty, *cur_node->AsArg().type) && - TargetCompatibleTo(*out_arg_ty, *cast_type)) { + TargetCompatibleTo(*out_arg_ty, *cast_type) && + PrecisionCompatible(*in_arg_ty, *cur_node->AsArg().type) && + PrecisionCompatible(*out_arg_ty, *cast_type)) { is_found = true; } } else { CHECK(0) << "Unsupport cast type"; } if (is_found) { + VLOG(4) << "insert kernel: " << kernel->name(); 
selected_kernels.emplace_back(std::move(kernel)); // we pick the kernel cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); auto& stmt = cast_inst->AsStmt(); - if (op_type == "layout") { - stmt.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(TARGET(kX86))); - } else { - stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( - stmt.picked_kernel().target())); - } + stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + stmt.picked_kernel().target(), g_stream_id)); break; } } @@ -124,6 +133,9 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name); // for CastAfter manully set the tensor's type var->GetMutable(); + VLOG(4) << "insert cast after subgraph"; + VLOG(4) << "curent node type: " << cur_node->AsArg().type->name() + << " cast to node type: " << cast_type->name(); // create the stmt node auto* cast_inst = graph->NewInstructNode(); @@ -133,8 +145,8 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, cpp::OpDesc op_desc; op_desc.SetType(op_type); if (op_type == "cast") { - op_desc.SetAttr("in_dtype", 4); // FP32 - op_desc.SetAttr("out_dtype", 5); // FP16 + op_desc.SetAttr("in_dtype", 4); // FP16 + op_desc.SetAttr("out_dtype", 5); // FP32 op_desc.SetInput("X", {cast_arg_name}); op_desc.SetOutput("Out", {cur_node->AsArg().name}); } else if (op_type == "layout") { @@ -150,8 +162,9 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); + auto v_places = graph->valid_places(); // create kernels - auto kernels = cast_op->CreateKernels(graph->valid_places()); + auto kernels = cast_op->CreateKernels(v_places); std::vector> selected_kernels; bool is_found = false; for (auto& kernel : kernels) { @@ -164,14 +177,17 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); if (DataLayoutCompatible(*in_arg_ty, *cast_type) && - DataLayoutCompatible(*out_arg_ty, *cur_node->AsArg().type)) { + DataLayoutCompatible(*out_arg_ty, *cur_node->AsArg().type) && + PrecisionCompatibleTo(*in_arg_ty, *cast_type)) { is_found = true; } } else if (op_type == "io_copy") { const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); if (TargetCompatibleTo(*in_arg_ty, *cast_type) && - TargetCompatibleTo(*out_arg_ty, *cur_node->AsArg().type)) { + TargetCompatibleTo(*out_arg_ty, *cur_node->AsArg().type) && + PrecisionCompatible(*in_arg_ty, *cur_node->AsArg().type) && + PrecisionCompatible(*out_arg_ty, *cast_type)) { is_found = true; } } else { @@ -182,13 +198,8 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, // we pick the kernel cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); auto& stmt = cast_inst->AsStmt(); - if (op_type == "layout") { - stmt.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(TARGET(kX86))); - } else { - stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( - stmt.picked_kernel().target())); - } + stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + stmt.picked_kernel().target(), g_stream_id)); break; } } @@ -203,7 +214,8 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, void MLUPostprocessPass::InsertBefore(SSAGraph* graph, Node* head_node, Node* 
inst_node, - const Type* inst_type) { + const Type* inst_type, + bool use_mlu_cast) { const auto* head_type = head_node->AsArg().type; // break original link @@ -218,39 +230,52 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph, head_node->AsArg().name) != first_conv_nodes_.end(); // precision cast node - if (head_type->precision() != inst_type->precision() && !is_first_conv_head) { + if (!use_mlu_cast) { + if (head_type->precision() != inst_type->precision() && + !is_first_conv_head) { + cur_node = InsertCastBefore("cast", + name_prefix + "cast", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy(head_type->target(), + inst_type->precision(), + head_type->layout())); + } + + // layout cast node + if (head_type->layout() != inst_type->layout()) { + cur_node = InsertCastBefore("layout", + name_prefix + "layout", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy(head_type->target(), + inst_type->precision(), + inst_type->layout())); + } + + // io copy cur_node = InsertCastBefore( - "cast", - name_prefix + "cast", + "io_copy", + name_prefix + "io_copy", graph, cur_node, inst_node, LiteType::GetTensorTy( - head_type->target(), inst_type->precision(), head_type->layout())); - } - - // layout cast node - if (head_type->layout() != inst_type->layout()) { + inst_type->target(), inst_type->precision(), inst_type->layout())); + } else { + // io copy cur_node = InsertCastBefore( - "layout", - name_prefix + "layout", + "io_copy", + name_prefix + "io_copy", graph, cur_node, inst_node, LiteType::GetTensorTy( - head_type->target(), inst_type->precision(), inst_type->layout())); + inst_type->target(), head_type->precision(), head_type->layout())); } - // io copy - cur_node = InsertCastBefore( - "io_copy", - name_prefix + "io_copy", - graph, - cur_node, - inst_node, - LiteType::GetTensorTy( - inst_type->target(), inst_type->precision(), inst_type->layout())); - // connect cur_node to inst_node DirectedLink(cur_node, inst_node); @@ -311,10 +336,9 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node, CHECK(subgraph_precision == PRECISION(kFloat) || subgraph_precision == PRECISION(kFP16)) << "Mlu node has unsupport precision"; - VLOG(4) << "picked kernel precision: " - << PrecisionToStr(subgraph_precision); *arg_type = LiteType::GetTensorTy( subgraph_target, subgraph_precision, subgraph_layout); + VLOG(4) << "picked subgraph kernel type: " << (*arg_type)->name(); break; } } @@ -356,7 +380,8 @@ bool MLUPostprocessPass::NeedInsert(Node* node, const Type* inst_type) { void MLUPostprocessPass::InsertAfter(SSAGraph* graph, Node* tail_node, Node* inst_node, - const Type* inst_type) { + const Type* inst_type, + bool use_mlu_cast) { const auto* tail_type = tail_node->AsArg().type; // break original link @@ -367,39 +392,50 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph, tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_"; // precision cast node - if (tail_type->precision() != inst_type->precision()) { + if (!use_mlu_cast) { + if (tail_type->precision() != inst_type->precision()) { + cur_node = InsertCastAfter("cast", + name_prefix + "cast", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy(tail_type->target(), + inst_type->precision(), + tail_type->layout())); + } + + // layout cast node + if (tail_type->layout() != inst_type->layout()) { + cur_node = InsertCastAfter("layout", + name_prefix + "layout", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy(tail_type->target(), + inst_type->precision(), + inst_type->layout())); + } + + // io copy 
cur_node = InsertCastAfter( - "cast", - name_prefix + "cast", + "io_copy", + name_prefix + "io_copy", graph, cur_node, inst_node, LiteType::GetTensorTy( - tail_type->target(), inst_type->precision(), tail_type->layout())); - } - - // layout cast node - if (tail_type->layout() != inst_type->layout()) { + inst_type->target(), inst_type->precision(), inst_type->layout())); + } else { cur_node = InsertCastAfter( - "layout", - name_prefix + "layout", + "io_copy", + name_prefix + "io_copy", graph, cur_node, inst_node, LiteType::GetTensorTy( - tail_type->target(), inst_type->precision(), inst_type->layout())); + inst_type->target(), tail_type->precision(), tail_type->layout())); } - // io copy - cur_node = InsertCastAfter( - "io_copy", - name_prefix + "io_copy", - graph, - cur_node, - inst_node, - LiteType::GetTensorTy( - inst_type->target(), inst_type->precision(), inst_type->layout())); - // connect cur_node to inst_node DirectedLink(inst_node, cur_node); @@ -496,6 +532,74 @@ void MLUPostprocessPass::GatherAndModifyFirstConvNodes(SSAGraph* graph) { } } +void MLUPostprocessPass::ModifyInputOutputDataType(SSAGraph* graph) { + for (auto& node : graph->mutable_nodes()) { + if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") { + const Type* subgraph_arg_type = nullptr; + GetSubgraphOpArgType(&node, &subgraph_arg_type, graph); + for (auto& in_node : node.inlinks) { + const auto* in_node_type = in_node->AsArg().type; + VLOG(4) << "MLU subgraph input type: " << in_node->AsArg().name + << *in_node_type; + if (in_node->AsArg().is_weight || in_node->AsArg().is_persist) { + CHECK(in_node_type->target() == TARGET(kHost) && + in_node_type->precision() == PRECISION(kAny) && + in_node_type->layout() == DATALAYOUT(kNCHW)) + << "MLU subgraph unexpected persistent input type!"; + in_node->AsArg().type = LiteType::GetTensorTy( + TARGET(kMLU), PRECISION(kAny), DATALAYOUT(kNHWC)); + } else { + CHECK((in_node_type->target() == TARGET(kHost) || + in_node_type->target() == TARGET(kX86)) && + in_node_type->precision() == PRECISION(kFloat) && + in_node_type->layout() == DATALAYOUT(kNCHW)) + << "MLU subgraph unexpected common input type!"; + } + } + for (auto& out_node : node.outlinks) { + const auto* out_node_type = out_node->AsArg().type; + auto& out_arg = out_node->AsArg(); + VLOG(4) << "MLU subgraph output type: " << out_node->AsArg().name + << *out_node_type; + if (out_node->AsArg().is_weight || out_node->AsArg().is_persist) { + CHECK(out_node_type->target() == TARGET(kHost) && + out_node_type->precision() == PRECISION(kAny) && + out_node_type->layout() == DATALAYOUT(kNCHW)) + << "MLU subgraph unexpected persistent input type!"; + out_node->AsArg().type = LiteType::GetTensorTy( + TARGET(kMLU), PRECISION(kAny), DATALAYOUT(kNHWC)); + } else if (out_node_type->precision() == PRECISION(kAny) && + out_node->outlinks.empty()) { + out_arg.is_persist = true; + out_arg.type = LiteType::GetTensorTy( + TARGET(kMLU), PRECISION(kAny), DATALAYOUT(kNHWC)); + } else { + CHECK(out_node_type->precision() == PRECISION(kFloat)) + << "MLU subgraph unexpected common output type!"; + if (out_node->outlinks.empty()) { + out_arg.type = LiteType::GetTensorTy(TARGET(kHost), + subgraph_arg_type->precision(), + DATALAYOUT(kNHWC)); + VLOG(4) << "unused output node type: " << out_arg.name + << out_node_type->name(); + } else { + out_arg.type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + VLOG(4) << "output node type: " << out_arg.name + << out_node_type->name(); + } + } + const auto target = 
out_node->AsArg().type->target(); + const auto precision = out_node->AsArg().type->precision(); + const auto layout = out_node->AsArg().type->layout(); + VLOG(4) << "arg name: " << out_node->AsArg().name + << " type: " << TargetToStr(target) << ", " + << PrecisionToStr(precision) << ", " << DataLayoutToStr(layout); + } + } + } +} + void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { for (auto& node : graph->mutable_nodes()) { if (!node.IsStmt()) continue; @@ -515,6 +619,16 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { old_type->precision(), paddle::lite_api::DataLayoutType::kNHWC, old_type->device()); + // modify inst feed to NHWC, while set_mlu_input_layout(kNHWC) + // invoked, to keep consistent with actual data layout + auto place = node.AsStmt().place(); + place.layout = DATALAYOUT(kNHWC); + std::vector valid_places = {place}; + auto updated_op_info = *node.AsStmt().op_info(); + node.AsStmt().ResetOp(updated_op_info, valid_places, nullptr); + auto kernel = &(node.AsStmt().picked_kernel()); + VLOG(4) << "kernel info: " << kernel->name(); + node.AsStmt().op()->AttachKernel(kernel); } } } @@ -540,6 +654,213 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { } } +std::pair CheckInputAndInsert(Scope* scope, + cpp::BlockDesc* block_desc, + const std::string& input_name, + const Type* tensor_type, + const Type* subgraph_type) { + auto cur_node = input_name; + bool do_insert = false; + if (!DataLayoutCompatible(*tensor_type, *subgraph_type)) { + auto layout_op = block_desc->AddOp(); + auto layout_arg_name = string_format("%s/layout", cur_node.c_str()); + scope->Var(layout_arg_name); + VLOG(4) << "insert layout for subgraph input, arg tensor name: " + << layout_arg_name; + layout_op->SetType("layout"); + layout_op->SetInput("Input", {cur_node}); + layout_op->SetOutput("Out", {layout_arg_name}); + cur_node = layout_arg_name; + do_insert = true; + } + + if (!PrecisionCompatible(*tensor_type, *subgraph_type) && + tensor_type->precision() != PRECISION(kInt8) && + tensor_type->precision() != PRECISION(kInt32)) { + auto cast_op = block_desc->AddOp(); + auto cast_arg_name = string_format("%s/cast", cur_node.c_str()); + scope->Var(cast_arg_name); + VLOG(4) << "insert cast for subgraph input, arg tensor name: " + << cast_arg_name; + cast_op->SetType("cast"); + cast_op->SetAttr("in_dtype", 5); // FP32 + cast_op->SetAttr("out_dtype", 4); // FP16 + cast_op->SetInput("X", {cur_node}); + cast_op->SetOutput("Out", {cast_arg_name}); + cur_node = cast_arg_name; + do_insert = true; + } + + return std::make_pair(do_insert, cur_node); +} + +std::pair CheckOutputAndInsert( + Scope* scope, + cpp::BlockDesc* block_desc, + const std::string& output_name, + const Type* tensor_type, + const Type* subgraph_type) { + auto cur_node = output_name; + bool do_insert = false; + cpp::OpDesc *layout_op = nullptr, *cast_op = nullptr; + size_t cast_idx = 0; + + // subgraph -> cast -> layout -> output + if (!PrecisionCompatible(*tensor_type, *subgraph_type)) { + cast_op = block_desc->AddOp(); + cast_idx = block_desc->OpsSize() - 1; + CHECK_EQ(cast_op, block_desc->GetOp(cast_idx)); + cast_op->SetType("cast"); + cast_op->SetAttr("in_dtype", 4); // FP16 + cast_op->SetAttr("out_dtype", 5); // FP32 + do_insert = true; + } + + if (!DataLayoutCompatible(*tensor_type, *subgraph_type)) { + auto layout_arg_name = string_format("%s/layout", cur_node.c_str()); + scope->Var(layout_arg_name); + VLOG(4) << "insert layout for subgraph output, arg tensor name: " + << layout_arg_name; + layout_op = block_desc->AddOp(); + 
layout_op->SetType("layout"); + layout_op->SetInput("Input", {layout_arg_name}); + layout_op->SetOutput("Out", {cur_node}); + cur_node = layout_arg_name; + do_insert = true; + } + + if (cast_op) { + cast_op = block_desc->GetOp(cast_idx); + auto cast_arg_name = string_format("%s/cast", cur_node.c_str()); + scope->Var(cast_arg_name); + VLOG(4) << "insert cast for subgraph output, arg tensor name: " + << cast_arg_name; + cast_op->SetInput("X", {cast_arg_name}); + cast_op->SetOutput("Out", {cur_node}); + cur_node = cast_arg_name; + } + + return std::make_pair(do_insert, cur_node); +} + +// insert cast op on mlu, to avoid cast on cpu +void MLUPostprocessPass::AdjustSubgraph(Node* subgraph_node, + const Type* subgraph_type) { + auto subgraph_op = subgraph_node->AsStmt().op(); + CHECK_EQ(subgraph_op->Type(), "subgraph"); + auto op = dynamic_cast(subgraph_op.get()); + CHECK(op); + auto block_desc = op->GetSubBlock(); + + // create a new block desc to keep op sequence correct + cpp::BlockDesc* new_block_desc = new cpp::BlockDesc(); + new_block_desc->ClearOps(); + new_block_desc->ClearVars(); + new_block_desc->SetIdx(block_desc->Idx()); + new_block_desc->SetParentIdx(block_desc->ParentIdx()); + new_block_desc->SetForwardBlockIdx(block_desc->ForwardBlockIdx()); + + // find all IO that is not weight or persist + std::list i_names, o_names; + std::map node_replace; + + // Insert cast op for iotensor which is not weight or persist + for (auto& input : subgraph_node->inlinks) { + auto input_name = input->AsArg().name; + if (!(input->AsArg().is_weight || input->AsArg().is_persist)) { + i_names.emplace_back(input_name); + auto ret = CheckInputAndInsert(op->scope(), + new_block_desc, + input_name, + input->AsArg().type, + subgraph_type); + if (ret.first) { + node_replace[input_name] = ret.second; + } + } + } + for (auto& output : subgraph_node->outlinks) { + auto output_name = output->AsArg().name; + if (!(output->AsArg().is_weight || output->AsArg().is_persist)) { + o_names.emplace_back(output_name); + auto ret = CheckOutputAndInsert(op->scope(), + block_desc, + output_name, + output->AsArg().type, + subgraph_type); + if (ret.first) { + node_replace[output_name] = ret.second; + } + } + } + + // update input and output + for (size_t op_idx = 0; op_idx < block_desc->OpsSize(); ++op_idx) { + auto desc = block_desc->GetOp(op_idx); + auto new_desc = new_block_desc->AddOp(); + *new_desc = *desc; + + if (desc->Type() != "layout" && desc->Type() != "cast") { + auto op_input_args = new_desc->InputArgumentNames(); + for (auto& input_arg : op_input_args) { + auto op_input = new_desc->Input(input_arg); + for (auto& it : i_names) { + auto index = std::find(op_input.begin(), op_input.end(), it); + if (index != op_input.end() && + node_replace.find(it) != node_replace.end()) { + index = op_input.erase(index); + op_input.emplace(index, node_replace.at(it)); + VLOG(4) << new_desc->Type() << "] change input from " << it + << " to " << node_replace.at(it); + } + } + new_desc->SetInput(input_arg, op_input); + } + + auto op_output_args = new_desc->OutputArgumentNames(); + for (auto& output_arg : op_output_args) { + auto op_output = new_desc->Output(output_arg); + for (auto& it : o_names) { + auto index = std::find(op_output.begin(), op_output.end(), it); + if (index != op_output.end() && + node_replace.find(it) != node_replace.end()) { + index = op_output.erase(index); + op_output.emplace(index, node_replace.at(it)); + VLOG(4) << new_desc->Type() << "] change output from " << it + << " to " << node_replace.at(it); + } + } + 
new_desc->SetOutput(output_arg, op_output); + } + } + } + op->SetSubBlock(new_block_desc); +} + +void ModifyValidPlaces(SSAGraph* graph, bool use_mlu_cast) { + // remove invalid places, since only support X86, host, MLU + auto v_places = graph->valid_places(); + for (auto it = v_places.begin(); it != v_places.end();) { + if (it->target != TARGET(kMLU) && it->target != TARGET(kHost) && + it->target != TARGET(kX86)) { + it = v_places.erase(it); + } else { + ++it; + } + } + + if (use_mlu_cast) { + // insert mlu float place for float io copy, no effect to subgraph type + v_places.emplace_back(TARGET(kMLU), PRECISION(kFloat), DATALAYOUT(kNHWC)); + } + + graph->SetValidPlaces(v_places); + VLOG(4) << "valid places after modified:"; + for (auto& p : v_places) { + VLOG(4) << p.DebugString(); + } +} + void MLUPostprocessPass::Apply(const std::unique_ptr& graph) { // currently for non-persistent input and output args, mlu subgraph op // only support float16/float32 data type @@ -549,35 +870,47 @@ void MLUPostprocessPass::Apply(const std::unique_ptr& graph) { // arg_in and arg_out are assumed to be NHWC which user should be aware of. // Thus here we change these args' layout to NHWC #ifdef LITE_WITH_MLU - if (lite::DeviceInfo::Global().InputLayout() == DATALAYOUT(kNHWC)) { + ModifyInputOutputDataType(graph.get()); + + if (lite::TargetWrapperMlu::InputLayout() == DATALAYOUT(kNHWC)) { ModifyLayout(graph.get()); } - if (lite::DeviceInfo::Global().UseFirstConv()) { + if (lite::TargetWrapperMlu::UseFirstConv()) { GatherAndModifyFirstConvNodes(graph.get()); } #endif + g_stream_id = static_cast(reinterpret_cast(graph.get())); + bool disable_mlu_cast = GetBoolFromEnv("LITE_DISABLE_MLU_CAST"); + ModifyValidPlaces(graph.get(), !disable_mlu_cast); // insert io_copy, layout and precision cast of subgraph's inputs and outputs for (auto& node : graph->mutable_nodes()) { if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") { const Type* subgraph_arg_type = nullptr; GetSubgraphOpArgType(&node, &subgraph_arg_type, graph.get()); + if (!disable_mlu_cast) { + AdjustSubgraph(&node, subgraph_arg_type); + } auto links_tmp = node.inlinks; for (auto p_in : links_tmp) { if (NeedInsert(p_in, subgraph_arg_type)) { - InsertBefore(graph.get(), p_in, &node, subgraph_arg_type); + InsertBefore( + graph.get(), p_in, &node, subgraph_arg_type, !disable_mlu_cast); } } links_tmp.assign(node.outlinks.begin(), node.outlinks.end()); for (auto p_out : links_tmp) { if (NeedInsert(p_out, subgraph_arg_type)) { - InsertAfter(graph.get(), p_out, &node, subgraph_arg_type); + InsertAfter( + graph.get(), p_out, &node, subgraph_arg_type, !disable_mlu_cast); } } } } + // std::vector> subgraphs({graph->NodeTopologicalOrder()}); + // SubgraphVisualizer(graph.get(), subgraphs)(); } } // namespace mir diff --git a/lite/core/mir/mlu_postprocess_pass.h b/lite/core/mir/mlu_postprocess_pass.h index 688dd06fb5fbec0c8e1c53acfe4215456ddb4192..5a31c1d8322db7bbc57de8dd18fdaf8ff4b0c885 100644 --- a/lite/core/mir/mlu_postprocess_pass.h +++ b/lite/core/mir/mlu_postprocess_pass.h @@ -79,6 +79,8 @@ class MLUPostprocessPass : public ProgramPass { const Type** arg_type, SSAGraph* graph); + void ModifyInputOutputDataType(SSAGraph* graph); + void ModifyLayout(SSAGraph* graph); bool NeedInsert(Node* node, const Type* inst_type); @@ -86,12 +88,14 @@ class MLUPostprocessPass : public ProgramPass { void InsertBefore(SSAGraph* graph, Node* head_node, Node* inst_node, - const Type* type); + const Type* type, + bool use_mlu_cast); void InsertAfter(SSAGraph* graph, Node* 
tail_node, Node* inst_node, - const Type* type); + const Type* type, + bool use_mlu_cast); Node* InsertCastBefore(const std::string& op_type, const std::string& cast_arg_name, @@ -115,6 +119,8 @@ class MLUPostprocessPass : public ProgramPass { bool IsFirstConvInSubgraph(Node* arg_node, Node* inst); + void AdjustSubgraph(Node* subgraph_node, const Type* op_type); + private: std::set first_conv_nodes_; }; diff --git a/lite/core/mir/quantized_op_attributes_inference_pass.cc b/lite/core/mir/quantized_op_attributes_inference_pass.cc index 66b37446a4cc6a33c09757266c9dd2cbc818325e..259447aa21b76261a266a243dcc9c2a7530c9dc5 100644 --- a/lite/core/mir/quantized_op_attributes_inference_pass.cc +++ b/lite/core/mir/quantized_op_attributes_inference_pass.cc @@ -37,34 +37,53 @@ void QuantizedOpAttributesInferencePass::Apply( auto& inst = op_node->AsStmt(); auto op_info = inst.op_info(); auto op_type = op_info->Type(); - if (!op_info->HasAttr("input_scale")) continue; - bool found = false; - float output_scale; + + // Check if any of the inputs of the op have scale value + bool has_input_scale = false; + for (auto in_var_node : op_node->inlinks) { + CHECK(in_var_node->IsArg()); + auto in_var_node_name = in_var_node->arg()->name; + has_input_scale |= op_info->HasInputScale(in_var_node_name); + } + if (!has_input_scale) continue; + + // Infer the output scale according to its out_threshold or the input scale + // of its adjacent ops + bool is_quantized = true; for (auto out_var_node : op_node->outlinks) { CHECK(out_var_node->IsArg()); + std::vector output_scale; + bool has_output_scale = false; + auto out_var_node_name = out_var_node->arg()->name; for (auto out_op_node : out_var_node->outlinks) { CHECK(out_op_node->IsStmt()); auto& out_inst = out_op_node->AsStmt(); auto out_op_info = out_inst.op_info(); - if (!out_op_info->HasAttr("input_scale")) continue; - auto input_scale = out_op_info->GetAttr("input_scale"); - if (!found) { - found = true; + if (!out_op_info->HasInputScale(out_var_node_name)) continue; + auto input_scale = out_op_info->GetInputScale(out_var_node_name); + if (!has_output_scale) { output_scale = input_scale; + has_output_scale = true; } else { - CHECK_EQ(output_scale, input_scale); + CHECK_EQ(output_scale.size(), input_scale.size()); } } + if (has_output_scale) { + inst.mutable_op_info()->SetOutputScale(out_var_node_name, output_scale); + } else if (op_info->HasAttr("out_threshold")) { + // Only consider one output, there are only one out_threshold + int bit_length = op_info->GetAttr("bit_length"); + int range = (1 << (bit_length - 1)) - 1; + output_scale = std::vector{ + op_info->GetAttr("out_threshold") / range}; + inst.mutable_op_info()->SetOutputScale(out_var_node_name, output_scale); + } else { + is_quantized = false; + } } - if (found) { - inst.mutable_op_info()->SetAttr("output_scale", output_scale); - } else if (op_info->HasAttr("output_scale")) { - int bit_length = op_info->GetAttr("bit_length"); - int range = (1 << (bit_length - 1)) - 1; - output_scale = op_info->GetAttr("output_scale"); - inst.mutable_op_info()->SetAttr("output_scale", output_scale / range); - } - if (op_info->HasAttr("output_scale")) { + + // Fix the missing of the attribute 'enable_int8'. 
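For the out_threshold fallback above, the output scale is simply the recorded threshold divided by the integer range implied by bit_length. A small worked example with illustrative values (bit_length = 8, out_threshold = 6.35):

#include <vector>

// range = (1 << (8 - 1)) - 1 = 127, so the scale is about 6.35 / 127 = 0.05.
std::vector<float> ScaleFromThreshold(float out_threshold, int bit_length) {
  int range = (1 << (bit_length - 1)) - 1;
  return {out_threshold / range};
}
// The pass stores the one-element vector via SetOutputScale(out_var_node_name, ...).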
+ if (is_quantized) { inst.mutable_op_info()->SetAttr("enable_int8", true); } } diff --git a/lite/core/mir/runtime_context_assign_pass.cc b/lite/core/mir/runtime_context_assign_pass.cc index 5b6f968484b7b49838a004c3edfd00ff9b7e5e5e..7ad833b22885204130b50a931dc2da7d040c654c 100644 --- a/lite/core/mir/runtime_context_assign_pass.cc +++ b/lite/core/mir/runtime_context_assign_pass.cc @@ -44,6 +44,10 @@ class RuntimeContextAssignPass : public StmtPass { inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext( inst.picked_kernel().target())); } +#elif LITE_WITH_MLU + inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + inst.picked_kernel().target(), + static_cast(reinterpret_cast(graph.get())))); #else int stream_id = inst.stream_id_; diff --git a/lite/core/mir/static_kernel_pick_pass.cc b/lite/core/mir/static_kernel_pick_pass.cc index 1de0d1a26577b31e1dfc5187562cc80bce6fe4d1..b5dd1f8b9c119f4647b72a35eb71df37f31fc6f8 100644 --- a/lite/core/mir/static_kernel_pick_pass.cc +++ b/lite/core/mir/static_kernel_pick_pass.cc @@ -110,15 +110,16 @@ void StaticKernelPickPass::Apply(const std::unique_ptr& graph) { if (out_type_int8) { auto out_node = node.outlinks.front(); CHECK(out_node->IsArg()); + auto out_node_name = out_node->arg()->name; auto one_adj_op_node = out_node->outlinks.front(); CHECK(one_adj_op_node->IsStmt()); auto& one_adj_instruct = one_adj_op_node->AsStmt(); CHECK(one_adj_instruct.op_info()->HasAttr("enable_int8")); - CHECK(one_adj_instruct.op_info()->HasAttr("input_scale")); + CHECK(one_adj_instruct.op_info()->HasInputScale(out_node_name)); - instruct.mutable_op_info()->SetAttr( - "output_scale", - one_adj_instruct.op_info()->GetAttr("input_scale")); + instruct.mutable_op_info()->SetOutputScale( + out_node_name, + one_adj_instruct.op_info()->GetInputScale(out_node_name)); auto update_desc = *instruct.mutable_op_info(); instruct.ResetOp(update_desc, graph->valid_places()); diff --git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc index 31a38280ff537d486f5fb3ba46dee5b025d3f1f1..4b9f34225f70e9050b2605b49e888ed323536b2f 100644 --- a/lite/core/mir/subgraph/subgraph_detector.cc +++ b/lite/core/mir/subgraph/subgraph_detector.cc @@ -425,20 +425,45 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, subgraph_op_desc.SetAttr("sub_block", sub_block_idx); // Extract input and output nodes from the target subgraph - std::set input_var_nodes; + std::set idata_var_nodes; std::set weight_var_nodes; - std::set output_var_nodes; + std::set odata_var_nodes; std::set local_var_nodes; std::set unused_var_nodes; ExtractInputsOutputs(subgraph_nodes, - &input_var_nodes, + &idata_var_nodes, &weight_var_nodes, - &output_var_nodes, + &odata_var_nodes, &local_var_nodes, &unused_var_nodes); - + // A simplified model without the original weight/local/unused nodes on the + // subgraph ops will be saved only if 'SUBGRAPH_DISABLE_ONLINE_MODE' is set to + // true and Predictor->Run(...), Predictor->Save(...) is called. 
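In the runtime_context_assign_pass hunk above, the MLU branch derives a stream id from the graph object's address so that every kernel of one program shares the same stream. The cast template arguments do not survive in this patch text, so the sketch below assumes the usual pointer-to-integer narrowing:

#include <cstdint>

// Hash a per-program object address into a small integer stream id.
int StreamIdFromGraph(const void* graph_ptr) {
  return static_cast<int>(reinterpret_cast<std::intptr_t>(graph_ptr));
}
// usage: NewContext(inst.picked_kernel().target(), StreamIdFromGraph(graph.get()))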
+ std::set input_var_nodes(idata_var_nodes.begin(), + idata_var_nodes.end()); + std::set output_var_nodes(odata_var_nodes.begin(), + odata_var_nodes.end()); + if (!GetBoolFromEnv(SUBGRAPH_DISABLE_ONLINE_MODE)) { + input_var_nodes.insert(weight_var_nodes.begin(), weight_var_nodes.end()); + output_var_nodes.insert(local_var_nodes.begin(), local_var_nodes.end()); + output_var_nodes.insert(unused_var_nodes.begin(), unused_var_nodes.end()); + } // Set input and output name mapping which stores the real inputs and // outputs + std::vector idata_var_names; + std::vector odata_var_names; + for (auto &var_node : idata_var_nodes) { + idata_var_names.push_back(var_node->AsArg().name); + } + for (auto &var_node : odata_var_nodes) { + odata_var_names.push_back(var_node->AsArg().name); + } + subgraph_op_desc.SetAttr>("input_data_names", + idata_var_names); + subgraph_op_desc.SetAttr>("output_data_names", + odata_var_names); + // Set all of the inputs and outputs to the target subgraph op + // To prevent vars are removed in RuntimeProgram::UpdateVarsOfProgram() std::vector input_var_names; std::vector output_var_names; for (auto &var_node : input_var_nodes) { @@ -447,60 +472,36 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, for (auto &var_node : output_var_nodes) { output_var_names.push_back(var_node->AsArg().name); } - subgraph_op_desc.SetAttr>("input_data_names", - input_var_names); - subgraph_op_desc.SetAttr>("output_data_names", - output_var_names); + subgraph_op_desc.SetInput("Inputs", input_var_names); + subgraph_op_desc.SetOutput("Outputs", output_var_names); + auto subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + static_cast(subgraph_op.get()) + ->SetSubBlock(sub_block_desc); + auto any_op = (*subgraph_nodes.begin())->AsStmt().op(); + subgraph_op->Attach(subgraph_op_desc, any_op->scope()); - // Set input/output scale values of input/output var nodes for - // type_precision_cast_pass. - std::vector input_data_scales; - std::vector output_data_scales; + // Export the scale values of the input/output var nodes of the inner op nodes + // only for type_precision_cast_pass. 
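To summarize the subgraph op wiring above: the input_data_names/output_data_names attributes carry only the real data tensors, while Inputs/Outputs may additionally list weights and local vars so their variables survive RuntimeProgram::UpdateVarsOfProgram(). A condensed fragment with illustrative tensor names; the attribute value type is assumed to be std::vector<std::string>, which the elided template arguments above suggest:

#include <string>
#include <vector>
#include "lite/model_parser/cpp_desc.h"  // cpp::OpDesc

void WireSubgraphDesc(paddle::lite::cpp::OpDesc* subgraph_op_desc) {
  // Only the true graph inputs/outputs, consumed by later passes such as
  // type_precision_cast_pass.
  subgraph_op_desc->SetAttr<std::vector<std::string>>("input_data_names", {"image"});
  subgraph_op_desc->SetAttr<std::vector<std::string>>("output_data_names", {"prediction"});
  // Everything the sub-block touches; in online mode this also includes weights.
  subgraph_op_desc->SetInput("Inputs", {"image", "conv1_weights"});
  subgraph_op_desc->SetOutput("Outputs", {"prediction", "pool1_out"});
}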
for (auto &var_node : input_var_nodes) { + auto var_node_name = var_node->arg()->name; auto any_op_node = var_node->outlinks.front(); CHECK(any_op_node->IsStmt()); auto &any_inst = any_op_node->AsStmt(); - if (any_inst.op_info()->HasAttr("input_scale")) { - input_data_scales.push_back( - any_inst.op_info()->GetAttr("input_scale")); + if (any_inst.op_info()->HasInputScale(var_node_name)) { + subgraph_op->mutable_op_info()->SetInputScale( + var_node_name, any_inst.op_info()->GetInputScale(var_node_name)); } } for (auto &var_node : output_var_nodes) { + auto var_node_name = var_node->arg()->name; auto any_op_node = var_node->inlinks.front(); CHECK(any_op_node->IsStmt()); auto &any_inst = any_op_node->AsStmt(); - if (any_inst.op_info()->HasAttr("output_scale")) { - output_data_scales.push_back( - any_inst.op_info()->GetAttr("output_scale")); + if (any_inst.op_info()->HasOutputScale(var_node_name)) { + subgraph_op->mutable_op_info()->SetOutputScale( + var_node_name, any_inst.op_info()->GetOutputScale(var_node_name)); } } - if (input_data_scales.size() > 0) { - subgraph_op_desc.SetAttr>("input_data_scales", - input_data_scales); - } - if (output_data_scales.size() > 0) { - subgraph_op_desc.SetAttr>("output_data_scales", - output_data_scales); - } - - // Set all of the inputs and outputs to the target subgraph op - // To prevent vars are removed in RuntimeProgram::UpdateVarsOfProgram() - for (auto &var_node : weight_var_nodes) { - input_var_names.push_back(var_node->AsArg().name); - } - for (auto &var_node : local_var_nodes) { - output_var_names.push_back(var_node->AsArg().name); - } - for (auto &var_node : unused_var_nodes) { - output_var_names.push_back(var_node->AsArg().name); - } - subgraph_op_desc.SetInput("Inputs", input_var_names); - subgraph_op_desc.SetOutput("Outputs", output_var_names); - auto subgraph_op = LiteOpRegistry::Global().Create("subgraph"); - static_cast(subgraph_op.get()) - ->SetSubBlock(sub_block_desc); - auto any_op = (*subgraph_nodes.begin())->AsStmt().op(); - subgraph_op->Attach(subgraph_op_desc, any_op->scope()); // Create and add a new subgraph node into the graph auto subgraph_op_node = @@ -508,26 +509,13 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, for (auto &var_node : input_var_nodes) { IR_NODE_LINK_TO(var_node, subgraph_op_node); } - for (auto &var_node : weight_var_nodes) { - IR_NODE_LINK_TO(var_node, subgraph_op_node); - } for (auto &var_node : output_var_nodes) { IR_OP_VAR_LINK(subgraph_op_node, var_node); } - for (auto &var_node : local_var_nodes) { - IR_OP_VAR_LINK(subgraph_op_node, var_node); - } - for (auto &var_node : unused_var_nodes) { - IR_OP_VAR_LINK(subgraph_op_node, var_node); - } // Remove subgraph nodes and unused var nodes - auto nodes2rm = GetNodes2RM(subgraph_nodes, - {input_var_nodes, - weight_var_nodes, - output_var_nodes, - local_var_nodes, - unused_var_nodes}); + auto nodes2rm = + GetNodes2RM(subgraph_nodes, {input_var_nodes, output_var_nodes}); GraphSafeRemoveNodes(graph, nodes2rm); } @@ -602,7 +590,17 @@ std::set GetNodes2RM( std::set nodes2rm(op_nodes.begin(), op_nodes.end()); for (auto &op_node : op_nodes) { for (auto &var_node : op_node->inlinks) { - if (!nodes2rm.count(var_node)) { + bool skip = false; + // skip the var node which is used by any other ops that doesn't belong to + // the subgraph ops. 
+ for (auto &out_op_node : var_node->outlinks) { + if (std::find(op_nodes.begin(), op_nodes.end(), out_op_node) != + op_nodes.end()) { + skip = true; + break; + } + } + if (!skip && !nodes2rm.count(var_node)) { nodes2rm.insert(var_node); } } diff --git a/lite/core/mir/subgraph/subgraph_detector_test.cc b/lite/core/mir/subgraph/subgraph_detector_test.cc index f52c0332fa3cfce904d2b7c8bf010bc3d3ac6ac9..06c9c4c78fedba7cfabcd4ff2dd3804b404f966d 100644 --- a/lite/core/mir/subgraph/subgraph_detector_test.cc +++ b/lite/core/mir/subgraph/subgraph_detector_test.cc @@ -20,7 +20,7 @@ #include "lite/api/paddle_use_passes.h" #include "lite/core/mir/ssa_graph.h" #include "lite/core/program.h" -#include "lite/model_parser/cpp/program_desc.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/model_parser/model_parser.h" DEFINE_string(model_dir, "", "model_dir"); diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc index 8fd3751f9ca1585af6b8b00f23acd6bacf5b7a51..104ad5b4fa819de5ff3501c08c60e9918c93cddf 100644 --- a/lite/core/mir/subgraph/subgraph_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_pass_test.cc @@ -13,8 +13,12 @@ // limitations under the License. #include + #include + #include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" #include "lite/api/test_helper.h" #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" diff --git a/lite/core/mir/type_layout_cast_pass.cc b/lite/core/mir/type_layout_cast_pass.cc index 1133e5ba8203ec9fea177844a6311c993f6b8ff7..44b6eaf1eb0c5c96630dd66d129919b40f3ea8c6 100644 --- a/lite/core/mir/type_layout_cast_pass.cc +++ b/lite/core/mir/type_layout_cast_pass.cc @@ -249,11 +249,13 @@ void OpenCLTypeLayoutTransformPass::Apply( REGISTER_MIR_PASS(type_layout_cast_pass, paddle::lite::mir::TypeLayoutTransformPass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kMLU)}) .BindKernel("layout_once") .BindKernel("layout"); REGISTER_MIR_PASS(type_layout_cast_preprocess_pass, paddle::lite::mir::OpenCLTypeLayoutTransformPass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kMLU)}) .BindKernel("layout_once") .BindKernel("layout"); diff --git a/lite/core/mir/type_precision_cast_pass.cc b/lite/core/mir/type_precision_cast_pass.cc index 25648877568f6427843f8ded6890450c265b4f06..39a94cbca6bd6222da5da1d314ea07475592bf0e 100644 --- a/lite/core/mir/type_precision_cast_pass.cc +++ b/lite/core/mir/type_precision_cast_pass.cc @@ -66,65 +66,30 @@ void UpdateInputs(OpLite* op, const std::string& from, const std::string& to) { } } -// Infer the scale value for the new calib op from the subgraph op -static bool InferScaleFromSubgraph(std::string var_name, - const OpInfo* op_info, - float* scale, - bool reverse = false) { - std::string attr_name = reverse ? "output_data_names" : "input_data_names"; - if (!op_info->HasAttr(attr_name)) return false; - auto input_or_output_names = - op_info->GetAttr>(attr_name); - attr_name = reverse ? "output_data_scales" : "input_data_scales"; - if (!op_info->HasAttr(attr_name)) return false; - auto input_or_output_scales = op_info->GetAttr>(attr_name); - auto size = input_or_output_names.size(); - CHECK(size == input_or_output_scales.size()); - for (size_t i = 0; i < size; i++) { - if (input_or_output_names[i] == var_name) { - *scale = input_or_output_scales[i]; - return true; - } - } - return false; -} - // Infer the scale value for the new calib op from the input_scale of the // current op and output_scale of the previous op. 
// case 1: prev_op->var_node->op_node(int8->any op, with input_scale). -// case 2: prev_op->var_node->op_node(subgraph op, int8->any, with -// input_data_scales). -// case 3: prev_op(any->int8, with output_scale)->var_node->op_node(fp32->any, +// case 2: prev_op(any->int8, with output_scale)->var_node->op_node(fp32->any, // without input_scale). -// case 4: prev_op(any->int8, subgraph_op, with -// output_data_scales)->var_node->op_node(fp32->any, without input_scale). static bool InferScale(Node* var_node, Node* op_node, float* scale) { bool found = false; auto& inst = op_node->AsStmt(); auto op_info = inst.op_info(); auto op_type = op_info->Type(); auto var_name = var_node->AsArg().name; - if (op_type == "subgraph") { - found = InferScaleFromSubgraph(var_name, op_info, scale, false); + if (op_info->HasInputScale(var_name)) { + *scale = op_info->GetInputScale(var_name)[0]; + found = true; } else { - if (op_info->HasAttr("input_scale")) { - *scale = op_info->GetAttr("input_scale"); + // Obtain the output_scale from one of its previous Ops + auto prev_op_node = var_node->inlinks.front(); + CHECK(prev_op_node->IsStmt()); + auto& prev_inst = prev_op_node->AsStmt(); + auto prev_op_info = prev_inst.op_info(); + auto prev_op_type = prev_op_info->Type(); + if (prev_op_info->HasOutputScale(var_name)) { + *scale = prev_op_info->GetOutputScale(var_name)[0]; found = true; - } else { - // Obtain the output_scale from one of its previous Ops - auto prev_op_node = var_node->inlinks.front(); - CHECK(prev_op_node->IsStmt()); - auto& prev_inst = prev_op_node->AsStmt(); - auto prev_op_info = prev_inst.op_info(); - auto prev_op_type = prev_op_info->Type(); - if (prev_op_type == "subgraph") { - found = InferScaleFromSubgraph(var_name, prev_op_info, scale, true); - } else { - if (prev_op_info->HasAttr("output_scale")) { - *scale = prev_op_info->GetAttr("output_scale"); - found = true; - } - } } } return found; diff --git a/lite/core/op_lite.cc b/lite/core/op_lite.cc index 537636065d6aeea67fd7c8c71fb00b183720fecc..585aaf3b703bca0a0a34030106dbf793e2a31d52 100644 --- a/lite/core/op_lite.cc +++ b/lite/core/op_lite.cc @@ -18,6 +18,7 @@ #include #include #include "lite/core/op_registry.h" +#include "lite/utils/string.h" namespace paddle { namespace lite { @@ -186,5 +187,114 @@ void OpLite::AttachOutput(const cpp::OpDesc &op_desc, } } +bool OpInfo::GetInputArgname(const std::string &value_name, + std::string *out) const { + for (auto &item : inputs()) { + auto it = std::find(item.second.begin(), item.second.end(), value_name); + if (it != item.second.end()) { + *out = item.first; + return true; + } + } + return false; +} + +bool OpInfo::GetOutputArgname(const std::string &value_name, + std::string *out) const { + for (auto &item : outputs()) { + auto it = std::find(item.second.begin(), item.second.end(), value_name); + if (it != item.second.end()) { + *out = item.first; + return true; + } + } + return false; +} + +bool OpInfo::GetInputIndex(const std::string &input_name, int *out) const { + for (auto &item : inputs()) { + auto it = std::find(item.second.begin(), item.second.end(), input_name); + if (it != item.second.end()) { + *out = it - item.second.begin(); + return true; + } + } + return false; +} + +bool OpInfo::GetOutputIndex(const std::string &output_name, int *out) const { + for (auto &item : outputs()) { + auto it = std::find(item.second.begin(), item.second.end(), output_name); + if (it != item.second.end()) { + *out = it - item.second.begin(); + return true; + } + } + return false; +} + +bool 
OpInfo::HasInputScale(const std::string &input_name) const { + std::string argname; + int index; + if (GetInputArgname(input_name, &argname) && + GetInputIndex(input_name, &index)) { + return HasAttr(argname + to_string(index) + "_scale"); + } else { + return false; + } +} + +bool OpInfo::HasOutputScale(const std::string &output_name) const { + std::string argname; + int index; + if (GetOutputArgname(output_name, &argname) && + GetOutputIndex(output_name, &index)) { + return HasAttr(argname + to_string(index) + "_scale"); + } else { + return false; + } +} + +void OpInfo::SetInputScale(const std::string &input_name, + const std::vector &scale_value) { + std::string argname; + int index; + CHECK(GetInputArgname(input_name, &argname)); + CHECK(GetInputIndex(input_name, &index)); + CHECK(scale_value.size() > 0) + << "Error in SetInputScale: the scales should not be empty"; + SetAttr>(argname + to_string(index) + "_scale", + scale_value); +} + +void OpInfo::SetOutputScale(const std::string &output_name, + const std::vector &scale_value) { + std::string argname; + int index; + CHECK(GetOutputArgname(output_name, &argname)); + CHECK(GetOutputIndex(output_name, &index)); + CHECK(scale_value.size() > 0) + << "Error in SetOutputScale: the scales should not be empty"; + SetAttr>(argname + to_string(index) + "_scale", + scale_value); +} + +std::vector OpInfo::GetInputScale(const std::string &input_name) const { + std::string argname; + int index; + CHECK(GetInputArgname(input_name, &argname)); + CHECK(GetInputIndex(input_name, &index)); + return GetAttr>(argname + to_string(index) + "_scale"); +} + +std::vector OpInfo::GetOutputScale( + const std::string &output_name) const { + std::string argname; + int index; + CHECK(GetOutputArgname(output_name, &argname)); + CHECK(GetOutputIndex(output_name, &index)); + return GetAttr>(argname + to_string(index) + "_scale"); +} + } // namespace lite } // namespace paddle diff --git a/lite/core/op_lite.h b/lite/core/op_lite.h index 301065d5b6bb5c4f41b19d9a9034985ca2f74d89..079586d5e0c00f261bfbf4c7658ccca97402f8ac 100644 --- a/lite/core/op_lite.h +++ b/lite/core/op_lite.h @@ -24,7 +24,7 @@ #include "lite/core/context.h" #include "lite/core/kernel.h" #include "lite/core/scope.h" -#include "lite/model_parser/cpp/op_desc.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/operators/op_params.h" namespace paddle { @@ -229,55 +229,8 @@ class OpInfo : public cpp::OpDesc { return OutputArgumentNames(); } - bool GetInputArgname(const std::string &value_name, std::string *out) const { - for (auto &item : inputs_) { - auto it = std::find(item.second.begin(), item.second.end(), value_name); - if (it != item.second.end()) { - *out = item.first; - return true; - } - } - return false; - } - bool GetOutputArgname(const std::string &value_name, std::string *out) const { - for (auto &item : outputs_) { - auto it = std::find(item.second.begin(), item.second.end(), value_name); - if (it != item.second.end()) { - *out = item.first; - return true; - } - } - return false; - } - - // For the input variable name, find the index of the corresponding - // input argname - bool GetInputIndex(const std::string &value_name, int *out) const { - for (auto &item : inputs_) { - auto it = std::find(item.second.begin(), item.second.end(), value_name); - if (it != item.second.end()) { - *out = it - item.second.begin(); - return true; - } - } - return false; - } - - // For the output variable name, find the index of the corresponding - // output argname - bool GetOutputIndex(const std::string 
&value_name, int *out) const { - for (auto &item : outputs_) { - auto it = std::find(item.second.begin(), item.second.end(), value_name); - if (it != item.second.end()) { - *out = it - item.second.begin(); - return true; - } - } - return false; - } - void UpdateAllInputs(const std::string &from, const std::string &to) { - for (auto &item : inputs_) { + for (auto &item : *mutable_inputs()) { for (auto &var : item.second) { if (var == from) var = to; } @@ -285,12 +238,32 @@ class OpInfo : public cpp::OpDesc { } void UpdateAllOutputs(const std::string &from, const std::string &to) { - for (auto &item : outputs_) { + for (auto &item : *mutable_outputs()) { for (auto &var : item.second) { if (var == from) var = to; } } } + + bool GetInputArgname(const std::string &value_name, std::string *out) const; + bool GetOutputArgname(const std::string &value_name, std::string *out) const; + + bool GetInputIndex(const std::string &input_name, int *out) const; + bool GetOutputIndex(const std::string &output_name, int *out) const; + + bool HasInputScale(const std::string &input_name) const; + bool HasOutputScale(const std::string &output_name) const; + + void SetInputScale(const std::string &input_name, + const std::vector &scale_value); + void SetOutputScale(const std::string &output_name, + const std::vector &scale_value); + + // For conv2d, depthwise_conv2d and mul, the scale of weight are a vector. + // Otherwise, all input and output scales are scalar, but we save these + // as vecotr. + std::vector GetInputScale(const std::string &input_name) const; + std::vector GetOutputScale(const std::string &output_name) const; }; } // namespace lite diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc index ef6d3cfaf001ea55cef23faee11d508920c49715..cb773edd18ee236a30cbfcf5d6b1ce5773f0269d 100644 --- a/lite/core/op_registry.cc +++ b/lite/core/op_registry.cc @@ -17,277 +17,5 @@ #include namespace paddle { -namespace lite { - -const std::map &GetOp2PathDict() { - return OpKernelInfoCollector::Global().GetOp2PathDict(); -} - -std::list> KernelRegistry::Create( - const std::string &op_type, - TargetType target, - PrecisionType precision, - DataLayoutType layout) { - Place place{target, precision, layout}; - VLOG(5) << "creating " << op_type << " kernel for " << place.DebugString(); -#define CREATE_KERNEL1(target__, precision__) \ - switch (layout) { \ - case DATALAYOUT(kNCHW): \ - return Create(op_type); \ - case DATALAYOUT(kAny): \ - return Create(op_type); \ - case DATALAYOUT(kNHWC): \ - return Create(op_type); \ - case DATALAYOUT(kImageDefault): \ - return Create(op_type); \ - case DATALAYOUT(kImageFolder): \ - return Create(op_type); \ - case DATALAYOUT(kImageNW): \ - return Create(op_type); \ - default: \ - LOG(FATAL) << "unsupported kernel layout " << DataLayoutToStr(layout); \ - } - -#define CREATE_KERNEL(target__) \ - switch (precision) { \ - case PRECISION(kFloat): \ - CREATE_KERNEL1(target__, kFloat); \ - case PRECISION(kInt8): \ - CREATE_KERNEL1(target__, kInt8); \ - case PRECISION(kFP16): \ - CREATE_KERNEL1(target__, kFP16); \ - case PRECISION(kAny): \ - CREATE_KERNEL1(target__, kAny); \ - case PRECISION(kInt32): \ - CREATE_KERNEL1(target__, kInt32); \ - case PRECISION(kInt64): \ - CREATE_KERNEL1(target__, kInt64); \ - default: \ - CHECK(false) << "not supported kernel precision " \ - << PrecisionToStr(precision); \ - } - - switch (target) { - case TARGET(kHost): { - CREATE_KERNEL(kHost); - } break; -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_X86) - case TARGET(kX86): { - 
CREATE_KERNEL(kX86); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_CUDA) - case TARGET(kCUDA): { - CREATE_KERNEL(kCUDA); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_ARM) - case TARGET(kARM): { - CREATE_KERNEL(kARM); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_OPENCL) - case TARGET(kOpenCL): { - CREATE_KERNEL(kOpenCL); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_NPU) - case TARGET(kNPU): { - CREATE_KERNEL(kNPU); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_APU) - case TARGET(kAPU): { - CREATE_KERNEL(kAPU); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_XPU) - case TARGET(kXPU): { - CREATE_KERNEL(kXPU); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_FPGA) - case TARGET(kFPGA): { - CREATE_KERNEL(kFPGA); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_BM) - case TARGET(kBM): { - CREATE_KERNEL(kBM); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_MLU) - case TARGET(kMLU): { - CREATE_KERNEL(kMLU); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_RKNPU) - case TARGET(kRKNPU): { - CREATE_KERNEL(kRKNPU); - } break; -#endif - default: - CHECK(false) << "not supported kernel target " << TargetToStr(target); - } - -#undef CREATE_KERNEL - return std::list>(); -} - -KernelRegistry::KernelRegistry() : registries_() { -#define INIT_FOR(target__, precision__, layout__) \ - registries_[std::make_tuple(TARGET(target__), \ - PRECISION(precision__), \ - DATALAYOUT(layout__))] \ - .set *>( \ - &KernelRegistryForTarget::Global()); -// Currently, just register 2 kernel targets. -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_CUDA) - INIT_FOR(kCUDA, kFloat, kNCHW); - INIT_FOR(kCUDA, kFloat, kNHWC); - INIT_FOR(kCUDA, kInt8, kNCHW); - INIT_FOR(kCUDA, kFP16, kNCHW); - INIT_FOR(kCUDA, kFP16, kNHWC); - INIT_FOR(kCUDA, kAny, kNCHW); - INIT_FOR(kCUDA, kAny, kAny); - INIT_FOR(kCUDA, kInt8, kNHWC); - INIT_FOR(kCUDA, kInt64, kNCHW); - INIT_FOR(kCUDA, kInt64, kNHWC); - INIT_FOR(kCUDA, kInt32, kNCHW); - INIT_FOR(kCUDA, kInt32, kNHWC); -#endif - -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_MLU) - INIT_FOR(kMLU, kFloat, kNHWC); - INIT_FOR(kMLU, kFloat, kNCHW); - INIT_FOR(kMLU, kFP16, kNHWC); - INIT_FOR(kMLU, kFP16, kNCHW); - INIT_FOR(kMLU, kInt8, kNHWC); - INIT_FOR(kMLU, kInt8, kNCHW); - INIT_FOR(kMLU, kInt16, kNHWC); - INIT_FOR(kMLU, kInt16, kNCHW); -#endif - - INIT_FOR(kHost, kAny, kNCHW); - INIT_FOR(kHost, kAny, kNHWC); - INIT_FOR(kHost, kAny, kAny); - INIT_FOR(kHost, kBool, kNCHW); - INIT_FOR(kHost, kBool, kNHWC); - INIT_FOR(kHost, kBool, kAny); - INIT_FOR(kHost, kFloat, kNCHW); - INIT_FOR(kHost, kFloat, kNHWC); - INIT_FOR(kHost, kFloat, kAny); - INIT_FOR(kHost, kFP16, kNCHW); - INIT_FOR(kHost, kFP16, kNHWC); - INIT_FOR(kHost, kFP16, kAny); - INIT_FOR(kHost, kInt8, kNCHW); - INIT_FOR(kHost, kInt8, kNHWC); - INIT_FOR(kHost, kInt8, kAny); - INIT_FOR(kHost, kInt16, kNCHW); - INIT_FOR(kHost, kInt16, kNHWC); - INIT_FOR(kHost, kInt16, kAny); - INIT_FOR(kHost, kInt32, kNCHW); - INIT_FOR(kHost, kInt32, kNHWC); - INIT_FOR(kHost, kInt32, kAny); - INIT_FOR(kHost, kInt64, kNCHW); - INIT_FOR(kHost, kInt64, kNHWC); - INIT_FOR(kHost, kInt64, kAny); - -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_X86) - INIT_FOR(kX86, kFloat, kNCHW); - INIT_FOR(kX86, kAny, kNCHW); - INIT_FOR(kX86, kAny, kAny); - INIT_FOR(kX86, 
kInt64, kNCHW); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_ARM) - INIT_FOR(kARM, kFloat, kNCHW); - INIT_FOR(kARM, kFloat, kNHWC); - INIT_FOR(kARM, kInt8, kNCHW); - INIT_FOR(kARM, kInt8, kNHWC); - INIT_FOR(kARM, kAny, kNCHW); - INIT_FOR(kARM, kAny, kAny); - INIT_FOR(kARM, kInt32, kNCHW); - INIT_FOR(kARM, kInt64, kNCHW); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_OPENCL) - INIT_FOR(kOpenCL, kFloat, kNCHW); - INIT_FOR(kOpenCL, kFloat, kNHWC); - INIT_FOR(kOpenCL, kAny, kNCHW); - INIT_FOR(kOpenCL, kAny, kNHWC); - INIT_FOR(kOpenCL, kFloat, kAny); - INIT_FOR(kOpenCL, kInt8, kNCHW); - INIT_FOR(kOpenCL, kAny, kAny); - INIT_FOR(kOpenCL, kFP16, kNCHW); - INIT_FOR(kOpenCL, kFP16, kNHWC); - INIT_FOR(kOpenCL, kFP16, kImageDefault); - INIT_FOR(kOpenCL, kFP16, kImageFolder); - INIT_FOR(kOpenCL, kFP16, kImageNW); - INIT_FOR(kOpenCL, kFloat, kImageDefault); - INIT_FOR(kOpenCL, kFloat, kImageFolder); - INIT_FOR(kOpenCL, kFloat, kImageNW); - INIT_FOR(kOpenCL, kAny, kImageDefault); - INIT_FOR(kOpenCL, kAny, kImageFolder); - INIT_FOR(kOpenCL, kAny, kImageNW); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_NPU) - INIT_FOR(kNPU, kFloat, kNCHW); - INIT_FOR(kNPU, kFloat, kNHWC); - INIT_FOR(kNPU, kInt8, kNCHW); - INIT_FOR(kNPU, kInt8, kNHWC); - INIT_FOR(kNPU, kAny, kNCHW); - INIT_FOR(kNPU, kAny, kNHWC); - INIT_FOR(kNPU, kAny, kAny); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_APU) - INIT_FOR(kAPU, kInt8, kNCHW); - INIT_FOR(kXPU, kFloat, kNCHW); - INIT_FOR(kXPU, kInt8, kNCHW); - INIT_FOR(kXPU, kAny, kNCHW); - INIT_FOR(kXPU, kAny, kAny); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_FPGA) - INIT_FOR(kFPGA, kFP16, kNHWC); - INIT_FOR(kFPGA, kFP16, kAny); - INIT_FOR(kFPGA, kFloat, kNHWC); - INIT_FOR(kFPGA, kAny, kNHWC); - INIT_FOR(kFPGA, kAny, kAny); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_BM) - INIT_FOR(kBM, kFloat, kNCHW); - INIT_FOR(kBM, kInt8, kNCHW); - INIT_FOR(kBM, kAny, kNCHW); - INIT_FOR(kBM, kAny, kAny); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_RKNPU) - INIT_FOR(kRKNPU, kFloat, kNCHW); - INIT_FOR(kRKNPU, kInt8, kNCHW); - INIT_FOR(kRKNPU, kAny, kNCHW); - INIT_FOR(kRKNPU, kAny, kAny); -#endif - -#undef INIT_FOR -} - -KernelRegistry &KernelRegistry::Global() { - static auto *x = new KernelRegistry; - return *x; -} - -} // namespace lite +namespace lite {} // namespace lite } // namespace paddle diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h index 2128e218554fb304474c14cfacd7867e491a4fe6..90a2b563af7e17a4806bd47cb883d9590cdab40f 100644 --- a/lite/core/op_registry.h +++ b/lite/core/op_registry.h @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -33,19 +32,19 @@ using LiteType = paddle::lite::Type; class OpKernelInfoCollector { public: - static OpKernelInfoCollector &Global() { - static auto *x = new OpKernelInfoCollector; + static OpKernelInfoCollector& Global() { + static auto* x = new OpKernelInfoCollector; return *x; } - void AddOp2path(const std::string &op_name, const std::string &op_path) { + void AddOp2path(const std::string& op_name, const std::string& op_path) { size_t index = op_path.find_last_of('/'); if (index != std::string::npos) { op2path_.insert(std::pair( op_name, op_path.substr(index + 1))); } } - void AddKernel2path(const std::string &kernel_name, - const std::string &kernel_path) { + void AddKernel2path(const std::string& kernel_name, + const std::string& kernel_path) { size_t index = 
kernel_path.find_last_of('/'); if (index != std::string::npos) { kernel2path_.insert(std::pair( @@ -53,13 +52,13 @@ class OpKernelInfoCollector { } } void SetKernel2path( - const std::map &kernel2path_map) { + const std::map& kernel2path_map) { kernel2path_ = kernel2path_map; } - const std::map &GetOp2PathDict() { + const std::map& GetOp2PathDict() { return op2path_; } - const std::map &GetKernel2PathDict() { + const std::map& GetKernel2PathDict() { return kernel2path_; } @@ -71,409 +70,185 @@ class OpKernelInfoCollector { namespace paddle { namespace lite { -const std::map &GetOp2PathDict(); - -using KernelFunc = std::function; -using KernelFuncCreator = std::function()>; -class LiteOpRegistry final : public Factory> { +class OpLiteFactory { public: - static LiteOpRegistry &Global() { - static auto *x = new LiteOpRegistry; - return *x; + // Register a function to create an op + void RegisterCreator(const std::string& op_type, + std::function()> fun) { + op_registry_[op_type] = fun; } - private: - LiteOpRegistry() = default; -}; - -template -class OpLiteRegistor : public Registor { - public: - explicit OpLiteRegistor(const std::string &op_type) - : Registor([&] { - LiteOpRegistry::Global().Register( - op_type, [op_type]() -> std::unique_ptr { - return std::unique_ptr(new OpClass(op_type)); - }); - }) {} -}; -template -using KernelRegistryForTarget = - Factory, std::unique_ptr>; - -class KernelRegistry final { - public: - using any_kernel_registor_t = - variant *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // + static OpLiteFactory& Global() { + static OpLiteFactory* x = new OpLiteFactory; + return *x; + } - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // + std::shared_ptr Create(const std::string& op_type) const { + auto it = op_registry_.find(op_type); + if (it == op_registry_.end()) return nullptr; + return it->second(); + } - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // + std::string DebugString() const { + STL::stringstream ss; + for (const auto& item : op_registry_) { + ss << " - " << item.first << "\n"; + } + return ss.str(); + } - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - 
KernelRegistryForTarget *, // + std::vector GetAllOps() const { + std::vector res; + for (const auto& op : op_registry_) { + res.push_back(op.first); + } + return res; + } - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // + protected: + std::map()>> op_registry_; +}; - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget * // - >; +using LiteOpRegistry = OpLiteFactory; - KernelRegistry(); +// Register OpLite by initializing a static OpLiteRegistrar instance +class OpLiteRegistrar { + public: + OpLiteRegistrar(const std::string& op_type, + std::function()> fun) { + OpLiteFactory::Global().RegisterCreator(op_type, fun); + } + // Touch function is used to guarantee registrar was initialized. + void touch() {} +}; - static KernelRegistry &Global(); +class KernelFactory { + public: + // Register a function to create kernels + void RegisterCreator(const std::string& op_type, + TargetType target, + PrecisionType precision, + DataLayoutType layout, + std::function()> fun) { + op_registry_[op_type][std::make_tuple(target, precision, layout)].push_back( + fun); + } - template - void Register( - const std::string &name, - typename KernelRegistryForTarget::creator_t - &&creator) { - using kernel_registor_t = - KernelRegistryForTarget; - auto &varient = registries_[std::make_tuple(Target, Precision, Layout)]; - auto *reg = varient.template get(); - CHECK(reg) << "Can not be empty of " << name; - reg->Register(name, std::move(creator)); -#ifdef LITE_ON_MODEL_OPTIMIZE_TOOL - kernel_info_map_[name].push_back( - std::make_tuple(Target, Precision, Layout)); -#endif // LITE_ON_MODEL_OPTIMIZE_TOOL + static KernelFactory& Global() { + static KernelFactory* x = new KernelFactory; + return *x; } - template - std::list> Create(const std::string &op_type) { - using kernel_registor_t = - KernelRegistryForTarget; - std::list> kernel_list; - std::tuple temp_tuple( - Target, Precision, Layout); - if (registries_[temp_tuple].valid()) { - kernel_list = - registries_[temp_tuple].template get()->Creates( - op_type); + /** + * Create all kernels belongs to an op. + */ + std::list> Create(const std::string& op_type) { + std::list> res; + if (op_registry_.find(op_type) == op_registry_.end()) return res; + auto& kernel_registry = op_registry_[op_type]; + for (auto it = kernel_registry.begin(); it != kernel_registry.end(); ++it) { + for (auto& fun : it->second) { + res.emplace_back(fun()); + } } - return kernel_list; + return res; } - std::list> Create(const std::string &op_type, + /** + * Create a specific kernel. Return a list for API compatible. 
+ */ + std::list> Create(const std::string& op_type, TargetType target, PrecisionType precision, - DataLayoutType layout); + DataLayoutType layout) { + std::list> res; + if (op_registry_.find(op_type) == op_registry_.end()) return res; + auto& kernel_registry = op_registry_[op_type]; + auto it = kernel_registry.find(std::make_tuple(target, precision, layout)); + if (it == kernel_registry.end()) return res; + for (auto& fun : it->second) { + res.emplace_back(fun()); + } + return res; + } std::string DebugString() const { -#ifndef LITE_ON_MODEL_OPTIMIZE_TOOL - return "No more debug info"; -#else // LITE_ON_MODEL_OPTIMIZE_TOOL STL::stringstream ss; - ss << "\n"; - ss << "Count of kernel kinds: "; - int count = 0; - for (auto &item : kernel_info_map_) { - count += item.second.size(); - } - ss << count << "\n"; - - ss << "Count of registered kernels: " << kernel_info_map_.size() << "\n"; - for (auto &item : kernel_info_map_) { - ss << "op: " << item.first << "\n"; - for (auto &kernel : item.second) { - ss << " - (" << TargetToStr(std::get<0>(kernel)) << ","; - ss << PrecisionToStr(std::get<1>(kernel)) << ","; - ss << DataLayoutToStr(std::get<2>(kernel)); - ss << ")"; - ss << "\n"; - } + for (const auto& item : op_registry_) { + ss << " - " << item.first << "\n"; } - return ss.str(); -#endif // LITE_ON_MODEL_OPTIMIZE_TOOL } - private: - mutable std::map, - any_kernel_registor_t> - registries_; -#ifndef LITE_ON_TINY_PUBLISH - mutable std::map< - std::string, - std::vector>> - kernel_info_map_; -#endif + protected: + // Outer map: op -> a map of kernel. + // Inner map: kernel -> creator function. + // Each kernel was represented by a combination of + std::map, + std::list()>>>> + op_registry_; }; -template -class KernelRegistor : public lite::Registor { +using KernelRegistry = KernelFactory; + +// Register Kernel by initializing a static KernelRegistrar instance +class KernelRegistrar { public: - KernelRegistor(const std::string &op_type, const std::string &alias) - : Registor([=] { - KernelRegistry::Global().Register( - op_type, [=]() -> std::unique_ptr { - std::unique_ptr x(new KernelType); - x->set_op_type(op_type); - x->set_alias(alias); - return x; - }); - }) {} + KernelRegistrar(const std::string& op_type, + TargetType target, + PrecisionType precision, + DataLayoutType layout, + std::function()> fun) { + KernelFactory::Global().RegisterCreator( + op_type, target, precision, layout, fun); + } + // Touch function is used to guarantee registrar was initialized. + void touch() {} }; } // namespace lite } // namespace paddle -// Operator registry -#define LITE_OP_REGISTER_INSTANCE(op_type__) op_type__##__registry__instance__ -#define REGISTER_LITE_OP(op_type__, OpClass) \ - static paddle::lite::OpLiteRegistor LITE_OP_REGISTER_INSTANCE( \ - op_type__)(#op_type__); \ - int touch_op_##op_type__() { \ - OpKernelInfoCollector::Global().AddOp2path(#op_type__, __FILE__); \ - return LITE_OP_REGISTER_INSTANCE(op_type__).Touch(); \ +// Register an op. 
+#define REGISTER_LITE_OP(op_type__, OpClass) \ + static paddle::lite::OpLiteRegistrar op_type__##__registry( \ + #op_type__, []() { \ + return std::unique_ptr(new OpClass(#op_type__)); \ + }); \ + int touch_op_##op_type__() { \ + op_type__##__registry.touch(); \ + OpKernelInfoCollector::Global().AddOp2path(#op_type__, __FILE__); \ + return 0; \ } -// Kernel registry -#define LITE_KERNEL_REGISTER(op_type__, target__, precision__) \ - op_type__##__##target__##__##precision__##__registor__ -#define LITE_KERNEL_REGISTER_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) \ - op_type__##__##target__##__##precision__##__##layout__##registor__instance__##alias__ // NOLINT - -#define LITE_KERNEL_REGISTER_FAKE(op_type__, target__, precision__, alias__) \ - LITE_KERNEL_REGISTER_INSTANCE(op_type__, target__, precision__, alias__) - +// Register a kernel. #define REGISTER_LITE_KERNEL( \ op_type__, target__, precision__, layout__, KernelClass, alias__) \ - static paddle::lite::KernelRegistor \ - LITE_KERNEL_REGISTER_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__)(#op_type__, \ - #alias__); \ - static KernelClass LITE_KERNEL_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__); \ + static paddle::lite::KernelRegistrar \ + op_type__##target__##precision__##layout__##alias__##_kernel_registry( \ + #op_type__, \ + TARGET(target__), \ + PRECISION(precision__), \ + DATALAYOUT(layout__), \ + []() { \ + std::unique_ptr x(new KernelClass); \ + x->set_op_type(#op_type__); \ + x->set_alias(#alias__); \ + return x; \ + }); \ int touch_##op_type__##target__##precision__##layout__##alias__() { \ + op_type__##target__##precision__##layout__##alias__##_kernel_registry \ + .touch(); \ OpKernelInfoCollector::Global().AddKernel2path( \ #op_type__ "," #target__ "," #precision__ "," #layout__ "," #alias__, \ __FILE__); \ - LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \ - .Touch(); \ return 0; \ } \ - static bool LITE_KERNEL_PARAM_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) UNUSED = \ - paddle::lite::ParamTypeRegistry::NewInstance( \ - #op_type__ "/" #alias__) - -#define LITE_KERNEL_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) \ - op_type__##target__##precision__##layout__##alias__ -#define LITE_KERNEL_PARAM_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) \ - op_type__##target__##precision__##layout__##alias__##param_register + static auto \ + op_type__##target__##precision__##layout__##alias__##param_register \ + UNUSED = paddle::lite::ParamTypeRegistry::NewInstance< \ + TARGET(target__), \ + PRECISION(precision__), \ + DATALAYOUT(layout__)>(#op_type__ "/" #alias__) diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 579f7690d7b73bb400d68cbcaf138b32bb23a6ce..70905c96f08d74fc5e27c85c7ccf3d395420a5e9 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -94,6 +94,8 @@ class Optimizer { #endif "identity_dropout_eliminate_pass", "__xpu__resnet_fuse_pass", + "__xpu__resnet_cbam_fuse_pass", + "__xpu__mmdnn_fuse_pass", "__xpu__multi_encoder_fuse_pass", "__xpu__embedding_with_eltwise_add_fuse_pass", "__xpu__fc_fuse_pass", @@ -108,9 +110,13 @@ class Optimizer { "bm_subgraph_pass", "apu_subgraph_pass", "rknpu_subgraph_pass", + "mlu_subgraph_pass", "static_kernel_pick_pass", // pick original kernel from graph + "remove_tf_redundant_ops_pass", "variable_place_inference_pass", // inference arg/var's + + "mlu_postprocess_pass", // info(target/precision/layout/device) // 
using kernel info "argument_type_display_pass", // debug pass: show arg-type-node's @@ -140,13 +146,9 @@ class Optimizer { "variable_place_inference_pass", // "argument_type_display_pass", - "mlu_subgraph_pass", - "runtime_context_assign_pass", "argument_type_display_pass", - "mlu_postprocess_pass", - "memory_optimize_pass"}}; if (passes.size() == 1) { diff --git a/lite/core/program.cc b/lite/core/program.cc index c911d4bba888901aec8a535b1a78528876ca03d3..f9ce00446e936871241405d39c51a2fcab91db32 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -15,9 +15,7 @@ #include "lite/core/program.h" #include #include -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/var_desc.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/operators/conditional_block_op.h" #include "lite/operators/subgraph_op.h" #include "lite/operators/while_op.h" diff --git a/lite/core/program.h b/lite/core/program.h index 46d66759a5ae516725fcab90e9c36c39d1683b17..5dff631c70d4f4353b2487df8e37e62143306e85 100644 --- a/lite/core/program.h +++ b/lite/core/program.h @@ -22,7 +22,7 @@ #include "lite/core/kernel.h" #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" -#include "lite/model_parser/cpp/program_desc.h" +#include "lite/model_parser/cpp_desc.h" #ifdef LITE_WITH_PROFILE #include "lite/core/profile/profiler.h" #endif diff --git a/lite/core/scope.h b/lite/core/scope.h index 57e4e3a5e058000f963ff369cbd25e69b9c981c6..41d6ee8f4f55268e3389cd4cada7e48fb8f922d7 100644 --- a/lite/core/scope.h +++ b/lite/core/scope.h @@ -62,19 +62,36 @@ class Scope final { // Create a Tensor variable. This will create a new Variable called `name`. Tensor* NewTensor(const std::string& name) { auto* var = Var(name); - return var->GetMutable(); + return var->GetMutable(); } const Tensor* FindTensor(const std::string& name) { auto* var = FindVar(name); if (!var) return nullptr; - return &var->Get(); + return &var->Get(); } Tensor* FindMutableTensor(const std::string& name) { auto* var = FindVar(name); if (!var) return nullptr; - return var->GetMutable(); + return var->GetMutable(); + } + + std::vector* NewTensorList(const std::string& name) { + auto* var = Var(name); + return var->GetMutable>(); + } + + const std::vector* FindTensorList(const std::string& name) { + auto* var = FindVar(name); + if (!var) return nullptr; + return &var->Get>(); + } + + std::vector* FindMutableTensorList(const std::string& name) { + auto* var = FindVar(name); + if (!var) return nullptr; + return var->GetMutable>(); } private: diff --git a/lite/fluid/data_type.cc b/lite/fluid/data_type.cc index 0dab71ed26c1b4ee438f52e088614bb577a9eade..3ad02a9c53c311a9253bbdf481c9aa6288685654 100644 --- a/lite/fluid/data_type.cc +++ b/lite/fluid/data_type.cc @@ -67,7 +67,7 @@ framework::proto::VarType::Type ToDataType(std::type_index type) { if (it != gDataTypeMap().cpp_to_proto_.end()) { return it->second; } - PADDLE_THROW("Not support %s as tensor type", type.name()); + LOG(FATAL) << "Not support " << type.name() << " as tensor type"; return static_cast(-1); } @@ -76,8 +76,8 @@ std::type_index ToTypeIndex(framework::proto::VarType::Type type) { if (it != gDataTypeMap().proto_to_cpp_.end()) { return it->second; } - PADDLE_THROW("Not support framework::proto::VarType::Type(%d) as tensor type", - static_cast(type)); + LOG(FATAL) << "Not support framework::proto::VarType::Type(" + << static_cast(type) << ") as tensor type"; return std::type_index(typeid(void)); } @@ -86,8 +86,8 @@ 
std::string DataTypeToString(const framework::proto::VarType::Type type) { if (it != gDataTypeMap().proto_to_str_.end()) { return it->second; } - PADDLE_THROW("Not support framework::proto::VarType::Type(%d) as tensor type", - static_cast(type)); + LOG(FATAL) << "Not support framework::proto::VarType::Type(" + << static_cast(type) << ") as tensor type"; return std::string(); } @@ -96,7 +96,8 @@ size_t SizeOfType(framework::proto::VarType::Type type) { if (it != gDataTypeMap().proto_to_size_.end()) { return it->second; } - PADDLE_THROW("Not support %s as tensor type", DataTypeToString(type).c_str()); + LOG(FATAL) << "Not support " << DataTypeToString(type).c_str() + << " as tensor type"; return 0; } diff --git a/lite/fluid/data_type.h b/lite/fluid/data_type.h index a8b11ec465e00356561c95b56f63e3c56cbe8a5b..9896c0d54844b99748e1a7c8bddc5e178f84fb51 100644 --- a/lite/fluid/data_type.h +++ b/lite/fluid/data_type.h @@ -17,7 +17,7 @@ limitations under the License. */ #include #include "lite/core/framework.pb.h" #include "lite/fluid/float16.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -72,7 +72,7 @@ inline void VisitDataType(framework::proto::VarType::Type type, _ForEachDataType_(VisitDataTypeCallback); #undef VisitDataTypeCallback - PADDLE_THROW("Not supported %d", type); + LOG(FATAL) << "Not supported " << type; } extern std::string DataTypeToString(const framework::proto::VarType::Type type); diff --git a/lite/fluid/eigen.h b/lite/fluid/eigen.h index c3af7e9f6c3588f404c614430bf01f7ab5e099e5..3312c9c39eaad4fc0a4225d9734b3f80790b2979 100644 --- a/lite/fluid/eigen.h +++ b/lite/fluid/eigen.h @@ -17,7 +17,7 @@ limitations under the License. */ #include #include "lite/core/tensor.h" #include "lite/fluid/float16.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" #include "unsupported/Eigen/CXX11/Tensor" namespace paddle { @@ -30,7 +30,7 @@ struct EigenDim { using Type = Eigen::DSizes; static Type From(const lite::DDim& dims) { - PADDLE_ENFORCE_EQ(dims.size(), D, "D must match DDim::size"); + CHECK_EQ(dims.size(), D) << "D must match DDim::size"; Type ret; for (size_t d = 0; d < dims.size(); d++) { ret[d] = dims[d]; @@ -39,7 +39,7 @@ struct EigenDim { } static Type From(const DDim::value_type length) { - PADDLE_ENFORCE_EQ(D, 1, "D must be 1."); + CHECK_EQ(D, 1) << "D must be 1."; Type ret; ret[0] = length; return ret; @@ -84,16 +84,16 @@ struct EigenMatrix : public EigenTensor { static typename EigenMatrix::Type Reshape(Tensor& tensor, // NOLINT int num_col_dims) { int rank = tensor.dims().size(); - PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, - "`num_col_dims` must be between (0, rank_of_tensor)."); + CHECK(num_col_dims > 0 && num_col_dims < rank) + << "`num_col_dims` must be between (0, rank_of_tensor)."; return EigenMatrix::From(tensor, tensor.dims().Flatten2D(num_col_dims)); } static typename EigenMatrix::ConstType Reshape(const Tensor& tensor, int num_col_dims) { int rank = tensor.dims().size(); - PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, - "`num_col_dims` must be between (0, rank_of_tensor)."); + CHECK(num_col_dims > 0 && num_col_dims < rank) + << "`num_col_dims` must be between (0, rank_of_tensor)."; return EigenMatrix::From(tensor, tensor.dims().Flatten2D(num_col_dims)); } }; diff --git a/lite/fluid/rw_lock.h b/lite/fluid/rw_lock.h index eb9829425eca9d8bd363a45961302a7f3818e513..f68a21502073ccde6d27c46793d3f8cfa0751af3 100644 --- a/lite/fluid/rw_lock.h +++ 
b/lite/fluid/rw_lock.h @@ -20,7 +20,7 @@ limitations under the License. */ #include // NOLINT #endif // !_WIN32 -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -33,17 +33,15 @@ struct RWLock { ~RWLock() { pthread_rwlock_destroy(&lock_); } inline void RDLock() { - PADDLE_ENFORCE_EQ( - pthread_rwlock_rdlock(&lock_), 0, "acquire read lock failed"); + CHECK_EQ(pthread_rwlock_rdlock(&lock_), 0) << "acquire read lock failed"; } inline void WRLock() { - PADDLE_ENFORCE_EQ( - pthread_rwlock_wrlock(&lock_), 0, "acquire write lock failed"); + CHECK_EQ(pthread_rwlock_wrlock(&lock_), 0) << "acquire write lock failed"; } inline void UNLock() { - PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed"); + CHECK_EQ(pthread_rwlock_unlock(&lock_), 0) << "unlock failed"; } private: diff --git a/lite/fluid/selected_rows.cc b/lite/fluid/selected_rows.cc index 98e9325ca2f8fab3f8aa77a0bb074ae5d1be7670..361d63cf5dfd9cd21db47917047a7e2f3758ec96 100644 --- a/lite/fluid/selected_rows.cc +++ b/lite/fluid/selected_rows.cc @@ -119,7 +119,7 @@ void DeserializeFromStream( // the 1st field, unit32_t version for SelectedRows uint32_t version; is.read(reinterpret_cast(&version), sizeof(version)); - PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + CHECK_EQ(version, 0U) << "Only version 0 is supported"; } { // the 2st field, rows information @@ -163,24 +163,22 @@ int64_t SelectedRows::AutoGrownIndex(int64_t key, if (iter == id_to_index_.end()) { rwlock_->UNLock(); if (!auto_grown) { - PADDLE_THROW("key %ld not found", key); + LOG(FATAL) << "key " << key << " not found"; } rwlock_->WRLock(); auto map_size = id_to_index_.size(); auto vector_size = rows_.size(); if (map_size != vector_size) { rwlock_->UNLock(); - PADDLE_THROW( - "id_to_index_ size %lu should have the same size with rows_ %lu", - map_size, - vector_size); + LOG(FATAL) << "id_to_index_ size " << map_size + << " should have the same size with rows_ " << vector_size; } auto write_iter = id_to_index_.find(key); if (write_iter == id_to_index_.end()) { int row_num = rows_.size(); if (row_num == value_->dims()[0]) { rwlock_->UNLock(); - PADDLE_THROW("selected rows is full, then length exceed %d", row_num); + LOG(FATAL) << "selected rows is full, then length exceed " << row_num; } // key logic to put a key into id_to_index_ rows_.push_back(key); @@ -213,16 +211,14 @@ void SelectedRows::Get(const lite::Tensor& ids, lite::Tensor* value, bool auto_grown, bool is_test) { - PADDLE_ENFORCE(value->IsInitialized(), - "The value tensor should be initialized."); + CHECK(value->IsInitialized()) << "The value tensor should be initialized."; if (ids.numel() == 0) { VLOG(3) << "keys is empty, please check data!"; } else { int64_t value_width = value_->numel() / value_->dims()[0]; - PADDLE_ENFORCE_EQ(value_width, - value->numel() / value->dims()[0], - "output tensor should have the same shape with table " - "except the dims[0]."); + CHECK_EQ(value_width, value->numel() / value->dims()[0]) + << "output tensor should have the same shape with table " + "except the dims[0]."; for (int i = 0; i < ids.numel(); ++i) { auto id = ids.data()[i]; int64_t index = AutoGrownIndex(id, auto_grown, is_test); diff --git a/lite/fluid/selected_rows.h b/lite/fluid/selected_rows.h index 5db322f8592f4518d9e1ccc996ffb1e847e7b964..aad93552ebef5d67c77e554b29bf593f5cd176f7 100644 --- a/lite/fluid/selected_rows.h +++ b/lite/fluid/selected_rows.h @@ -82,7 +82,7 @@ class SelectedRows { int64_t Index(int64_t key) const { 
auto it = std::find(rows_.begin(), rows_.end(), key); if (it == rows_.end()) { - PADDLE_THROW("id %ld not in table", key); + LOG(FATAL) << "id " << key << " not in table"; } return static_cast(std::distance(rows_.begin(), it)); } diff --git a/lite/gen_code/gen_code.cc b/lite/gen_code/gen_code.cc index 6c43f6e0116d9adfc4fc6f315d5653b2634dfe7b..a1e69b624a600719121926fc3a4f58391fa63ce6 100644 --- a/lite/gen_code/gen_code.cc +++ b/lite/gen_code/gen_code.cc @@ -59,7 +59,7 @@ void Module::AddHeaderIncludeGenCode() { Line("#include \"lite/gen_code/paddle_infer.h\""); Line("#include \"lite/core/op_registry.h\""); Line("#include \"lite/core/scope.h\""); - Line("#include \"lite/model_parser/cpp/op_desc.h\""); + Line("#include \"lite/model_parser/cpp_desc.h\""); Line(""); Line(""); } diff --git a/lite/gen_code/gen_code.h b/lite/gen_code/gen_code.h index d316eac43f99664fa71cba54b3ab5360852300a0..e100904a7fe4f9c3e489c056ceeeba21657b4944 100644 --- a/lite/gen_code/gen_code.h +++ b/lite/gen_code/gen_code.h @@ -20,9 +20,9 @@ #include "lite/core/program.h" #include "lite/core/target_wrapper.h" #include "lite/core/tensor.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/compatible_pb.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/model_parser/pb/op_desc.h" #include "lite/utils/all.h" diff --git a/lite/gen_code/gen_code_test.cc b/lite/gen_code/gen_code_test.cc index d0b1c1f8b23f90976f4b315a1a4e13069b2136f1..5b3db0de8342f312dcb4443ebcd1fd72e857eea0 100644 --- a/lite/gen_code/gen_code_test.cc +++ b/lite/gen_code/gen_code_test.cc @@ -25,7 +25,7 @@ #include "lite/core/scope.h" #include "lite/core/tensor.h" #include "lite/model_parser/compatible_pb.h" -#include "lite/model_parser/cpp/op_desc.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/model_parser/model_parser.h" #include "lite/model_parser/pb/program_desc.h" diff --git a/lite/kernels/apu/bridges/conv_op.cc b/lite/kernels/apu/bridges/conv_op.cc index ca6e0ff2ac3930fe5cab9230dbbefa0af0a864ab..bf5e313180d9d8089b29f993384bd243b2a5ed05 100644 --- a/lite/kernels/apu/bridges/conv_op.cc +++ b/lite/kernels/apu/bridges/conv_op.cc @@ -35,6 +35,9 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { int neuron_errCode; VLOG(3) << "[APU] Converting [" << op_type << "]"; + CHECK(op_info->HasAttr("enable_int8") && + op_info->GetAttr("enable_int8")); + // Get input and output vars and op attributes auto input_name = op_info->Input("Input").front(); auto input = scope->FindMutableTensor(input_name); @@ -94,30 +97,18 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { input_dims, filter_dims); - float input_scale; - float output_scale; - std::vector weight_scale; - if (op_info->HasAttr("enable_int8")) { - if (op_info->GetAttr("enable_int8")) { - if (op_info->HasAttr("input_scale")) - input_scale = op_info->GetAttr("input_scale"); - if (op_info->HasAttr("weight_scale")) - weight_scale = op_info->GetAttr>("weight_scale"); - if (op_info->HasAttr("output_scale")) - output_scale = op_info->GetAttr("output_scale"); - VLOG(3) << "has output scale:" << output_scale; - } else { - return FAILED; - } - } else { - return FAILED; - } + CHECK(op_info->HasInputScale(input_name)); + auto input_scale = op_info->GetInputScale(input_name)[0]; + CHECK(op_info->HasInputScale(filter_name)); + auto filter_scale = op_info->GetInputScale(filter_name); + CHECK(op_info->HasOutputScale(output_name)); + auto output_scale = 
op_info->GetOutputScale(output_name)[0]; VLOG(3) << "strides.size(): " << strides.size() << " ,groups: " << groups << " ,dilations: " << dilations[0] << ":" << dilations[1]; VLOG(3) << "with_act: " << with_act << " ,act_type:" << act_type; VLOG(3) << "input_dims: " << input_dims << " ,output_dims: " << output_dims - << " ,weight_scale size: " << weight_scale.size(); + << " ,filter_scale size: " << filter_scale.size(); VLOG(3) << "filter_dims: " << filter_dims << " ,memory_size: " << filter->memory_size() << " ,data_size: " << filter->data_size(); @@ -216,10 +207,10 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { NeuronOperandType filterType; NeuronOperandType channelFilterType; NeuronSymmPerChannelQuantParams symmPerChannelQuantParams; - if (1 == weight_scale.size()) { + if (1 == filter_scale.size()) { // Per layer type filterType.type = NEURON_TENSOR_QUANT8_ASYMM; - filterType.scale = weight_scale[0]; + filterType.scale = filter_scale[0]; filterType.zeroPoint = 128; filterType.dimensionCount = filter_dims.size(); filterType.dimensions = &dims_filter[0]; @@ -237,17 +228,17 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { symmPerChannelQuantParams.channelDim = 3; else symmPerChannelQuantParams.channelDim = 0; - symmPerChannelQuantParams.scaleCount = weight_scale.size(); - symmPerChannelQuantParams.scales = weight_scale.data(); + symmPerChannelQuantParams.scaleCount = filter_scale.size(); + symmPerChannelQuantParams.scales = filter_scale.data(); biasType.scale = 0; } std::shared_ptr filter_node = nullptr; - if (1 == weight_scale.size()) { + if (1 == filter_scale.size()) { NeuronModel_addOperand(model, &filterType); // 1: filter filter_node = graph->Add(filter_name, dims_filter); - VLOG(3) << "filter node idx: " << filter_node->index() << "w_scale[0]" - << weight_scale[0] << ": filterType: " << filterType.dimensions[0] + VLOG(3) << "filter node idx: " << filter_node->index() << "filter_scale[0]" + << filter_scale[0] << ": filterType: " << filterType.dimensions[0] << ":" << filterType.dimensions[1] << ":" << filterType.dimensions[2] << ":" << filterType.dimensions[3]; memcpy(filter->mutable_data(), @@ -263,8 +254,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { NeuronModel_addOperand(model, &channelFilterType); // 1: filter filter_node = graph->Add(filter_name, dims_filter); VLOG(3) << "chennel filter node idx: " << filter_node->index() - << " ,scale_count:" << weight_scale.size() - << " weight_scale[0]:" << weight_scale.data()[0] + << " ,scale_count:" << filter_scale.size() + << " filter_scale[0]:" << filter_scale.data()[0] << " ,channelFilterType: " << channelFilterType.dimensions[0] << ":" << channelFilterType.dimensions[1] << ":" << channelFilterType.dimensions[2] << ":" @@ -298,7 +289,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { std::shared_ptr bias_node = nullptr; if (HasInputArg(op_info, scope, "Bias")) { auto bias_name = op_info->Input("Bias").front(); - auto bias_type = kernel->GetInputDeclType("Bias"); auto bias = scope->FindMutableTensor(bias_name); auto bias_dims = bias->dims(); @@ -364,10 +354,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Add output tensor type NeuronOperandType outType; outType.type = NEURON_TENSOR_QUANT8_ASYMM; - if (graph->IsOutput(output_name)) - outType.scale = output_scale / 127; - else - outType.scale = output_scale; + outType.scale = output_scale; outType.zeroPoint = 128; outType.dimensionCount = output_dims.size(); std::vector dims_out = 
{(uint32_t)output_dims[0], @@ -401,7 +388,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { int32_t* int32_bias_data = reinterpret_cast(bias->mutable_data()); float2int32( - bias->data(), input_scale, weight_scale, int32_bias_data); + bias->data(), input_scale, filter_scale, int32_bias_data); VLOG(3) << "int32_bias_data: " << int32_bias_data[0] << " : " << int32_bias_data[1] << " : " << int32_bias_data[2] << " : " diff --git a/lite/kernels/apu/bridges/fc_op.cc b/lite/kernels/apu/bridges/fc_op.cc index a00a35f9a0766b4fb4f02d05419a0ae42354ca37..106ce2c16f3fd287a27c92179fa3a429c7be57c8 100644 --- a/lite/kernels/apu/bridges/fc_op.cc +++ b/lite/kernels/apu/bridges/fc_op.cc @@ -31,6 +31,10 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto scope = op->scope(); VLOG(3) << "[APU] Converting [" + op_type + "]"; + CHECK(op_info->HasAttr("enable_int8") && + op_info->GetAttr("enable_int8")); + + // Get input and output vars and op attributes auto input_name = op_info->Input("Input").front(); auto input = scope->FindMutableTensor(input_name); auto input_dims = input->dims(); @@ -52,23 +56,12 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { << " out_dims: " << out_dims << " m: " << m << " k: " << k << " n: " << n; - float input_scale = 1.0f; - float out_scale = 1.0f; - std::vector w_scale; - if (op_info->HasAttr("enable_int8")) { - if (op_info->GetAttr("enable_int8")) { - if (op_info->HasAttr("input_scale")) - input_scale = op_info->GetAttr("input_scale"); - if (op_info->HasAttr("weight_scale")) - w_scale = op_info->GetAttr>("weight_scale"); - if (op_info->HasAttr("output_scale")) - out_scale = op_info->GetAttr("output_scale"); - } else { - return FAILED; - } - } else { - return FAILED; - } + CHECK(op_info->HasInputScale(input_name)); + auto input_scale = op_info->GetInputScale(input_name)[0]; + CHECK(op_info->HasInputScale(w_name)); + auto w_scale = op_info->GetInputScale(w_name); + CHECK(op_info->HasOutputScale(out_name)); + auto out_scale = op_info->GetOutputScale(out_name)[0]; // Add input tensor type NeuronOperandType inType; diff --git a/lite/kernels/apu/bridges/pool_op.cc b/lite/kernels/apu/bridges/pool_op.cc index 2bda76ab99af727276102e884f84534b77a59586..b82f23beaf715e8c720ffc22792b804ff6c2c225 100644 --- a/lite/kernels/apu/bridges/pool_op.cc +++ b/lite/kernels/apu/bridges/pool_op.cc @@ -32,6 +32,9 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto scope = op->scope(); VLOG(3) << "[APU] Converting [" + op_type + "] "; + CHECK(op_info->HasAttr("enable_int8") && + op_info->GetAttr("enable_int8")); + // Get input and output vars and op attributes auto x_name = op_info->Input("X").front(); auto x = scope->FindMutableTensor(x_name); @@ -87,22 +90,10 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { ksize); // Add x tensor type - float x_scale = 1.0f; - float out_scale = 1.0f; - if (op_info->HasAttr("enable_int8")) { - if (op_info->GetAttr("enable_int8")) { - if (op_info->HasAttr("input_scale")) - x_scale = op_info->GetAttr("input_scale"); - if (op_info->HasAttr("output_scale")) - out_scale = op_info->GetAttr("output_scale"); - } else { - LOG(WARNING) << "Do not enable_int8"; - return FAILED; - } - } else { - LOG(WARNING) << "Do not enable_int8"; - return FAILED; - } + CHECK(op_info->HasInputScale(x_name)); + auto x_scale = op_info->GetInputScale(x_name)[0]; + CHECK(op_info->HasOutputScale(out_name)); + auto out_scale = op_info->GetOutputScale(out_name)[0]; NeuronOperandType xType; xType.type = 
NEURON_TENSOR_QUANT8_ASYMM; diff --git a/lite/kernels/apu/bridges/softmax_op.cc b/lite/kernels/apu/bridges/softmax_op.cc index 6a289ac987b9fa300cb548d190b6e46b67f24c44..dec6d12307b50798d04f743064360aa6870acfa3 100644 --- a/lite/kernels/apu/bridges/softmax_op.cc +++ b/lite/kernels/apu/bridges/softmax_op.cc @@ -31,6 +31,9 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto scope = op->scope(); VLOG(3) << "[APU] Converting [" + op_type + "]"; + CHECK(op_info->HasAttr("enable_int8") && + op_info->GetAttr("enable_int8")); + // Get input and output vars and op attributes auto x_name = op_info->Input("X").front(); auto x = scope->FindMutableTensor(x_name); @@ -45,22 +48,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { axis += x_rank; } - float input_scale = 1.0f; - float out_scale = 1.0f; - if (op_info->HasAttr("enable_int8")) { - if (op_info->GetAttr("enable_int8")) { - if (op_info->HasAttr("input_scale")) - input_scale = op_info->GetAttr("input_scale"); - if (op_info->HasAttr("output_scale")) - out_scale = op_info->GetAttr("output_scale"); - } else { - LOG(WARNING) << "Do not enable_int8"; - return FAILED; - } - } else { - LOG(WARNING) << "Do not enable_int8"; - return FAILED; - } + CHECK(op_info->HasInputScale(x_name)); + auto input_scale = op_info->GetInputScale(x_name)[0]; + CHECK(op_info->HasOutputScale(out_name)); + auto out_scale = op_info->GetOutputScale(out_name)[0]; // Check output scale NeuronOperandType xType; @@ -104,14 +95,14 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Add out operand NeuronOperandType outType; outType.type = NEURON_TENSOR_QUANT8_ASYMM; - outType.scale = out_scale / 127; + outType.scale = out_scale; outType.zeroPoint = 128; outType.dimensionCount = x_dims.size(); outType.dimensions = &dims_x[0]; NeuronModel_addOperand(model, &outType); // 3: output std::shared_ptr out_node = nullptr; out_node = graph->Add(out_name, dims_x); - VLOG(3) << "output_scale: " << out_scale; + VLOG(3) << "out_scale: " << out_scale; float beta_val[] = {1.0f}; NeuronModel_setOperandValue( diff --git a/lite/kernels/apu/subgraph_compute.cc b/lite/kernels/apu/subgraph_compute.cc index 6009e71e05c33f6dedfd995020612e112c888d36..21373811dd91d009d834a16d2c437bc722cd676a 100644 --- a/lite/kernels/apu/subgraph_compute.cc +++ b/lite/kernels/apu/subgraph_compute.cc @@ -28,7 +28,7 @@ namespace lite { namespace kernels { namespace apu { -int SubgraphEngine::BuildDeviceProgram() { +bool SubgraphEngine::BuildDeviceProgram() { unsigned int version; Neuron_getVersion(&version); VLOG(3) << "Neuron Adapter version: " << version; @@ -38,7 +38,7 @@ int SubgraphEngine::BuildDeviceProgram() { int neuron_errCode = NeuronModel_create(&model_); if (NEURON_NO_ERROR != neuron_errCode) { LOG(WARNING) << "Fail to create model"; - return subgraph::FAILED; + return false; } graph.set_model(model_); graph.set_input_names(input_names_); @@ -46,6 +46,9 @@ int SubgraphEngine::BuildDeviceProgram() { // Convert all of ops and their input vars and weights and added into the APU // NIR graph + if (origin_program_.empty()) { + BuildOriginProgram(); + } const auto& bridges = subgraph::Registry::Instance(); for (auto& inst : origin_program_) { auto op = const_cast(inst.op()); @@ -54,7 +57,7 @@ int SubgraphEngine::BuildDeviceProgram() { op->InferShape(); std::string op_type = op->op_info()->Type(); if (!bridges.Exists(op_type, TARGET(kAPU))) { - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); @@ -63,7 +66,7 @@ int 
SubgraphEngine::BuildDeviceProgram() { const_cast(op), const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } @@ -84,7 +87,7 @@ int SubgraphEngine::BuildDeviceProgram() { VLOG(3) << "input idx: " << graph.Get(input_names_[i])->index(); } else { LOG(WARNING) << "Fail to find input: " << input_names_[i]; - return subgraph::FAILED; + return false; } } @@ -105,7 +108,7 @@ int SubgraphEngine::BuildDeviceProgram() { VLOG(3) << "output idx: " << graph.Get(output_names_[i])->index(); } else { LOG(WARNING) << "Fail to find output: " << output_names_[i]; - return subgraph::FAILED; + return false; } } @@ -116,7 +119,7 @@ int SubgraphEngine::BuildDeviceProgram() { neuron_errCode = NeuronModel_finish(model_); if (NEURON_NO_ERROR != neuron_errCode) { LOG(WARNING) << "Fail to create NIR model:" << neuron_errCode; - return subgraph::FAILED; + return false; } VLOG(3) << "[APU] APU NIR model created!"; @@ -129,15 +132,14 @@ int SubgraphEngine::BuildDeviceProgram() { compilation_ = lite::apu::Device::Global().Build(model_); if (compilation_ == nullptr) { LOG(WARNING) << "[APU] Build APU DLA model failed!"; - return subgraph::FAILED; + return false; } VLOG(3) << "[APU] APU DLA model created, Build cost " << GetCurrentUS() - start_time << " us"; - - return status; + return true; } -int SubgraphEngine::LaunchDeviceProgram() { +bool SubgraphEngine::LaunchDeviceProgram() { auto GetCurrentUS = []() -> double { struct timeval time; gettimeofday(&time, NULL); @@ -149,22 +151,19 @@ int SubgraphEngine::LaunchDeviceProgram() { int neuron_errCode = NeuronExecution_create(compilation_, &run); if (NEURON_NO_ERROR != neuron_errCode) { LOG(WARNING) << "[APU] Build APU runtime failed!"; - return subgraph::FAILED; + return false; } // Set input buffer - Tensor input_temp; for (size_t i = 0; i < origin_itensors_.size(); i++) { - input_temp.Resize({origin_idims_[i]}); - uint8_t* input_data = input_temp.mutable_data(); - memcpy(input_data, - origin_itensors_[i]->raw_data(), - origin_itensors_[i]->memory_size()); + auto origin_data = origin_itensors_[i]->mutable_data(); + auto converted_data = reinterpret_cast(origin_data); for (int j = 0; j < origin_itensors_[i]->data_size(); j++) { - input_data[j] += (uint8_t)128; + converted_data[j] = + static_cast(static_cast(origin_data[j]) + 128); } NeuronExecution_setInput( - run, i, NULL, input_data, origin_itensors_[i]->memory_size()); + run, i, NULL, converted_data, origin_itensors_[i]->memory_size()); } // Set output buffer @@ -180,19 +179,20 @@ int SubgraphEngine::LaunchDeviceProgram() { neuron_errCode = NeuronExecution_compute(run); if (NEURON_NO_ERROR != neuron_errCode) { LOG(WARNING) << "Fail to run execution!" 
<< neuron_errCode; - return subgraph::FAILED; + return false; } for (size_t i = 0; i < origin_otensors_.size(); i++) { - int8_t* output_data = origin_otensors_[i]->mutable_data(); - VLOG(3) << "output size:" << origin_otensors_[i]->memory_size(); + auto converted_data = origin_otensors_[i]->mutable_data(); + auto origin_data = reinterpret_cast(converted_data); for (int j = 0; j < origin_otensors_[i]->data_size(); j++) { - output_data[j] -= (int8_t)128; + converted_data[j] = + static_cast(static_cast(origin_data[j]) - 128); } } NeuronExecution_free(run); VLOG(3) << "[APU] Process cost " << GetCurrentUS() - start_time << " us"; - return 0; + return true; } SubgraphEngine::~SubgraphEngine() { @@ -213,12 +213,11 @@ void SubgraphCompute::PrepareForRun() { param.output_data_names, param.scope)); CHECK(engine_); - engine_->Build(); } void SubgraphCompute::Run() { CHECK(engine_); - engine_->Launch(); + engine_->Run(); } } // namespace apu diff --git a/lite/kernels/apu/subgraph_compute.h b/lite/kernels/apu/subgraph_compute.h index ecd8a38343cd1f62bb5a3bf8e948384b90cfe826..beb582b8cc16e456491c28ace5e2d1695143216a 100644 --- a/lite/kernels/apu/subgraph_compute.h +++ b/lite/kernels/apu/subgraph_compute.h @@ -41,8 +41,8 @@ class SubgraphEngine : public subgraph::Engine { ~SubgraphEngine(); protected: - int BuildDeviceProgram() override; - int LaunchDeviceProgram() override; + bool BuildDeviceProgram() override; + bool LaunchDeviceProgram() override; NeuronModel *model_; NeuronCompilation *compilation_; diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index 218ee3f053fcf49f6a08ffbe0d780509f9b2cc03..6d1d24adcb4cf74b3c6bb991a33316e974dc0110 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -42,6 +42,7 @@ add_kernel(cast_compute_arm ARM basic SRCS cast_compute.cc DEPS ${lite_kernel_de add_kernel(reduce_mean_compute_arm ARM basic SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(stack_compute_arm ARM basic SRCS stack_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(affine_channel_compute_arm ARM basic SRCS affine_channel_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(affine_grid_compute_arm ARM basic SRCS affine_grid_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(range_compute_arm ARM basic SRCS range_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(dropout_compute_arm ARM basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(layout_compute_arm ARM basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -77,6 +78,7 @@ add_kernel(assign_value_compute_arm ARM basic SRCS assign_value_compute.cc DEPS add_kernel(conditional_block_compute_arm ARM extra SRCS conditional_block_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(collect_fpn_proposals_compute_arm ARM extra SRCS collect_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(distribute_fpn_proposals_compute_arm ARM extra SRCS distribute_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(clip_compute_arm ARM extra SRCS clip_compute.cc DEPS ${lite_kernel_deps} math_arm) # for OCR specific add_kernel(gru_unit_compute_arm ARM extra SRCS gru_unit_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -101,7 +103,6 @@ add_kernel(deformable_conv_compute_arm ARM extra SRCS deformable_conv_compute.cc add_kernel(mean_compute_arm ARM extra SRCS mean_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(mean_grad_compute_arm ARM train SRCS mean_grad_compute.cc DEPS 
${lite_kernel_deps} math_arm) -add_kernel(activation_grad_compute_arm ARM train SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(elementwise_grad_compute_arm ARM train SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(mul_grad_compute_arm ARM train SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sgd_compute_arm ARM train SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm) diff --git a/lite/kernels/arm/activation_grad_compute.cc b/lite/kernels/arm/activation_grad_compute.cc deleted file mode 100644 index 137668fa5e0d1bd07e838b3040a31e084a7475c8..0000000000000000000000000000000000000000 --- a/lite/kernels/arm/activation_grad_compute.cc +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/activation_grad_compute.h" -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -void SquareGradCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto out_grad_dims = param.Out_grad->dims(); - auto out_grad_data = param.Out_grad->data(); - - auto x_data = param.X->data(); - auto x_grad_data = param.X_grad->mutable_data(); - lite::arm::math::act_square_grad(x_data, - out_grad_data, - x_grad_data, - out_grad_dims.production(), - ctx.threads()); -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(square_grad, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::SquareGradCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/affine_grid_compute.cc b/lite/kernels/arm/affine_grid_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..1ea5512bf3a3e9944855b36277784b6a06e050bb --- /dev/null +++ b/lite/kernels/arm/affine_grid_compute.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/arm/affine_grid_compute.h" +#include +#include +#include "lite/backends/arm/math/funcs.h" +#include "lite/backends/arm/math/sgemm.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void AffineGridCompute::PrepareForRun() { + auto& param = Param(); + auto& ctx = this->ctx_->template As(); + + const lite::Tensor* x = param.X; + const float* din = x->data(); + lite::Tensor* out = param.Out; + float* dout = param.Out->mutable_data(); + int N = x->dims()[0]; + int H = param.output_shape[2]; + int W = param.output_shape[3]; + + vh = reinterpret_cast(malloc(sizeof(float) * H)); + vw = reinterpret_cast(malloc(sizeof(float) * W)); + int out_size = H * W * 3; + float scale = 2 / (static_cast(H) - 1); + for (int i = 0; i < H; i++) { + vh[i] = -1 + scale * i; + } + scale = 2 / (static_cast(W) - 1); + for (int i = 0; i < W; i++) { + vw[i] = -1 + i * scale; + } + return; +} +void AffineGridCompute::Run() { + auto& param = Param(); + auto& ctx = this->ctx_->template As(); + + const lite::Tensor* x = param.X; + int N = x->dims()[0]; + + int H = param.output_shape[2]; + int W = param.output_shape[3]; + int out_size = H * W * 3; + float* hw3 = ctx.workspace_data() + ctx.llc_size() / sizeof(float); + + for (int i = 0; i < out_size; i += 3) { + hw3[i] = 1; + hw3[i + 1] = 1; + hw3[i + 2] = 1; + } + + for (int i = 0; i < H * W; i++) { + hw3[i * 3 + 1] = vh[i / W]; + } + for (int i = 0; i < H * W; i++) { + hw3[i * 3] = vw[i % W]; + } + + const float* din = x->data(); + float* dout = param.Out->mutable_data(); + float* tmp = dout; + operators::ActivationParam act_param; + act_param.has_active = false; + for (int i = 0; i < N; i++) { + lite::arm::math::sgemm(false, + true, + H * W, + 2, + 3, + 1.f, + hw3, + 3, + din, + 3, + 0.f, + dout, + 2, + nullptr, + false, + act_param, + &ctx); + + din += 6; + dout += H * W * 2; + } + + return; +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(affine_grid, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::AffineGridCompute, + def) + .BindInput("Theta", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/affine_grid_compute.h b/lite/kernels/arm/affine_grid_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..09f0e1f85c88acc2f70f0ca12f942c560b61a722 --- /dev/null +++ b/lite/kernels/arm/affine_grid_compute.h @@ -0,0 +1,39 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +class AffineGridCompute : public KernelLite { + public: + using param_t = operators::AffineGridParam; + void PrepareForRun() override; + + void Run() override; + + virtual ~AffineGridCompute() = default; + float* vh; + float* vw; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/arm/argmax_compute_test.cc b/lite/kernels/arm/argmax_compute_test.cc index 034d57cdaba77130b319d203c3ae0616720c9d31..5e511264a855ac86a9fb12ede56d51fb1ea83010 100644 --- a/lite/kernels/arm/argmax_compute_test.cc +++ b/lite/kernels/arm/argmax_compute_test.cc @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/argmax_compute.h" #include + #include #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/argmax_compute.h" namespace paddle { namespace lite { @@ -66,9 +68,7 @@ void argmax_compute_ref(const operators::ArgmaxParam& param) { } TEST(argmax_arm, retrive_op) { - auto argmax = - KernelRegistry::Global().Create( - "arg_max"); + auto argmax = KernelRegistry::Global().Create("arg_max"); ASSERT_FALSE(argmax.empty()); ASSERT_TRUE(argmax.front()); } diff --git a/lite/kernels/arm/axpy_compute_test.cc b/lite/kernels/arm/axpy_compute_test.cc index af145435ebe2c5bd0c1d1b78b112e8a8572d36ec..7348630e776155cd421bc78a9da7494d42e84c3f 100644 --- a/lite/kernels/arm/axpy_compute_test.cc +++ b/lite/kernels/arm/axpy_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/axpy_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/axpy_compute.h" namespace paddle { namespace lite { @@ -61,8 +63,7 @@ void axpy_compute_ref(const operators::AxpyParam& param) { } TEST(axpy_arm, retrive_op) { - auto axpy = - KernelRegistry::Global().Create("axpy"); + auto axpy = KernelRegistry::Global().Create("axpy"); ASSERT_FALSE(axpy.empty()); ASSERT_TRUE(axpy.front()); } diff --git a/lite/kernels/arm/batch_norm_compute_test.cc b/lite/kernels/arm/batch_norm_compute_test.cc index bf690f88a5e776709a3988cc843762db3bf684e6..a3ef9bda4a17ebfdb5468c911cc6c9aa6a5d4fd7 100644 --- a/lite/kernels/arm/batch_norm_compute_test.cc +++ b/lite/kernels/arm/batch_norm_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/arm/batch_norm_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/batch_norm_compute.h" namespace paddle { namespace lite { @@ -78,9 +80,7 @@ void batch_norm_compute_ref(const operators::BatchNormParam& param) { } TEST(batch_norm_arm, retrive_op) { - auto batch_norm = - KernelRegistry::Global().Create( - "batch_norm"); + auto batch_norm = KernelRegistry::Global().Create("batch_norm"); ASSERT_FALSE(batch_norm.empty()); ASSERT_TRUE(batch_norm.front()); } diff --git a/lite/kernels/arm/calib_compute.cc b/lite/kernels/arm/calib_compute.cc index 6dac97dcbc59991d4680ab1a98a54a900573f631..383e868843b43f4081e1eac330b1422b79307d9c 100644 --- a/lite/kernels/arm/calib_compute.cc +++ b/lite/kernels/arm/calib_compute.cc @@ -33,6 +33,17 @@ void CalibComputeFp32ToInt8::Run() { din, dout, scale.data(), 1, 1, param.input->numel()); } +template +void CalibComputeInt64ToInt32::Run() { + auto& param = this->template Param(); + const auto* din = param.input->template data(); + std::vector scale = {param.scale}; + auto* dout = param.output->template mutable_data(); + for (auto i = 0; i < param.input->numel(); ++i) { + dout[i] = din[i]; + } +} + template void CalibComputeInt8ToFp32::Run() { auto& param = this->template Param(); @@ -105,6 +116,23 @@ REGISTER_LITE_KERNEL( DATALAYOUT(kNHWC))}) .Finalize(); +REGISTER_LITE_KERNEL( + calib, + kARM, + kInt64, + kNCHW, + paddle::lite::kernels::arm::CalibComputeInt64ToInt32, + int64_to_int32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt64), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt32), + DATALAYOUT(kNCHW))}) + .Finalize(); + REGISTER_LITE_KERNEL( calib_once, kARM, @@ -161,3 +189,20 @@ REGISTER_LITE_KERNEL( PRECISION(kFloat), DATALAYOUT(kNHWC))}) .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, + kARM, + kInt64, + kNCHW, + paddle::lite::kernels::arm::CalibComputeInt64ToInt32, + int64_to_int32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt64), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt32), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/arm/calib_compute.h b/lite/kernels/arm/calib_compute.h index a4c8b4c1232101416e95171d70ab629f6a37177b..f10bb931df9b276bc3bb01da16906f3e5b5a7dce 100644 --- a/lite/kernels/arm/calib_compute.h +++ b/lite/kernels/arm/calib_compute.h @@ -34,6 +34,19 @@ class CalibComputeFp32ToInt8 private: }; +template +class CalibComputeInt64ToInt32 + : public KernelLite { + public: + using param_t = operators::CalibParam; + + void Run() override; + + ~CalibComputeInt64ToInt32() override{}; + + private: +}; + template class CalibComputeInt8ToFp32 : public KernelLite { diff --git a/lite/kernels/arm/cast_compute.cc b/lite/kernels/arm/cast_compute.cc index 3b3ef07e105c583b7e3eb8b64b14610ca0f9e41a..919e9c603edff4383f086ac795c3dff4ed856c4f 100644 --- a/lite/kernels/arm/cast_compute.cc +++ b/lite/kernels/arm/cast_compute.cc @@ -62,8 +62,19 @@ void CastCompute::Run() { int32_t* out_data = param.Out->mutable_data(); std::transform( x_data_begin, x_data_end, out_data, TransOp); + } else if (param.in_dtype == 0 && param.out_dtype == 5) { // bool->fp32 + const bool* x_data_begin = param.X->data(); + const bool* x_data_end = x_data_begin + param.X->numel(); + float* out_data = param.Out->mutable_data(); + std::transform(x_data_begin, x_data_end, out_data, TransOp); + } else if 
(param.in_dtype == 3 && param.out_dtype == 5) { // int64->fp32 + const int64_t* x_data_begin = param.X->data(); + const int64_t* x_data_end = x_data_begin + param.X->numel(); + float* out_data = param.Out->mutable_data(); + std::transform(x_data_begin, x_data_end, out_data, TransOp); } else { - LOG(FATAL) << "other has not been implemented"; + LOG(FATAL) << "other has not been implemented transform with dtype" + << param.in_dtype << " X, dtype" << param.out_dtype << " Out"; } } diff --git a/lite/kernels/arm/clip_compute.cc b/lite/kernels/arm/clip_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..2d71eaef9e5b3e68d571a48e1a9772b8870c29b7 --- /dev/null +++ b/lite/kernels/arm/clip_compute.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/arm/clip_compute.h" +#include +#include +#include "lite/backends/arm/math/funcs.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void ClipCompute::Run() { + auto& param = Param(); + lite::Tensor* x = param.x; + lite::Tensor* min_tensor = param.min_tensor; + lite::Tensor* max_tensor = param.max_tensor; + lite::Tensor* out = param.out; + float min = param.min; + float max = param.max; + + if (min_tensor != nullptr) { + min = min_tensor->data()[0]; + } + if (max_tensor != nullptr) { + max = max_tensor->data()[0]; + } + + const float* x_ptr = x->data(); + float* out_ptr = out->mutable_data(); + int64_t num = x->numel(); + lite::arm::math::clip_kernel_fp32(x_ptr, num, min, max, out_ptr); + return; +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + clip, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ClipCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Min", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Max", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/activation_grad_compute.h b/lite/kernels/arm/clip_compute.h similarity index 81% rename from lite/kernels/arm/activation_grad_compute.h rename to lite/kernels/arm/clip_compute.h index ef03f58fa8cd499192aa6edfe3a7c51b49b14f65..94c2b3a32ea2fc0847d8e223ecd61856fa8e3ed2 100644 --- a/lite/kernels/arm/activation_grad_compute.h +++ b/lite/kernels/arm/clip_compute.h @@ -15,20 +15,20 @@ #pragma once #include #include "lite/core/kernel.h" -#include "lite/core/op_registry.h" +#include "lite/operators/clip_op.h" namespace paddle { namespace lite { namespace kernels { namespace arm { -class SquareGradCompute : public KernelLite { +class ClipCompute : public KernelLite { public: - using param_t = operators::ActivationGradParam; + using param_t = operators::ClipParam; void Run() override; - virtual ~SquareGradCompute() = default; + virtual 
~ClipCompute() = default; }; } // namespace arm diff --git a/lite/kernels/arm/concat_compute.cc b/lite/kernels/arm/concat_compute.cc index dc78e1b955c29b261b2103479ea00bb836c0a31f..9ab4ca54bb909876bc823ac25cb67764eab12e47 100644 --- a/lite/kernels/arm/concat_compute.cc +++ b/lite/kernels/arm/concat_compute.cc @@ -52,11 +52,7 @@ void ConcatFunc(const std::vector inputs, output_offset += in_stride[0]; } } else { - std::vector inputs_concat(inputs.size()); - for (int j = 0; j < inputs.size(); ++j) { - inputs_concat[j] = inputs[j]; - } - lite::arm::math::concat_func(inputs_concat, axis, out); + lite::arm::math::concat_func(inputs, axis, out); } } @@ -71,6 +67,9 @@ void ConcatCompute::Run() { auto* axis_tensor_data = axis_tensor->data(); axis = axis_tensor_data[0]; } + if (axis < 0) { + axis += inputs[0]->dims().size(); + } switch (inputs.front()->precision()) { case PRECISION(kFloat): diff --git a/lite/kernels/arm/concat_compute_test.cc b/lite/kernels/arm/concat_compute_test.cc index 44c6dedd44ad4509a3f5a9c13fc04d6f1ffbdc64..862094fd23aa339bba0b06c4200e71f06402c645 100644 --- a/lite/kernels/arm/concat_compute_test.cc +++ b/lite/kernels/arm/concat_compute_test.cc @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/concat_compute.h" #include + #include #include #include + #include "lite/backends/arm/math/funcs.h" #include "lite/core/op_registry.h" #include "lite/core/tensor.h" +#include "lite/kernels/arm/concat_compute.h" namespace paddle { namespace lite { @@ -221,8 +223,7 @@ TEST(concat_arm, compute_input_multi) { } TEST(concat, retrive_op) { - auto concat = - KernelRegistry::Global().Create("concat"); + auto concat = KernelRegistry::Global().Create("concat"); ASSERT_FALSE(concat.empty()); ASSERT_TRUE(concat.front()); } diff --git a/lite/kernels/arm/conv_compute.cc b/lite/kernels/arm/conv_compute.cc index ef174814ced73d4b2ec20580e06c63d39693ce57..54e67de5abbfc88f64a50b07335d2527d9738206 100644 --- a/lite/kernels/arm/conv_compute.cc +++ b/lite/kernels/arm/conv_compute.cc @@ -121,10 +121,14 @@ void ConvCompute::PrepareForRun() { no_dilation && flag_dw) { impl_ = new DepthwiseConv; // VLOG(3) << "Run DepthwiseConv Int8"; + } else if (param.groups == 1 && kw == 3 && sw == 2 && no_dilation && + pads_equal) { + impl_ = new DirectConv; + // VLOG(3) << "Run DirectConv Int8"; } else if (param.groups == 1 && kw == 3 && sw == 1 && no_dilation && pads_equal) { impl_ = new WinogradConv; - // VLOG(3) << "Run DirectConv Int8"; + // VLOG(3) << "Run WinogradConv Int8"; } else { impl_ = new GemmLikeConv; // VLOG(3) << "Run GemmLikeConvInt8"; @@ -168,10 +172,14 @@ void ConvCompute::PrepareForRun() { no_dilation && flag_dw) { impl_ = new DepthwiseConv; // VLOG(3) << "Run DepthwiseConv Int8"; + } else if (param.groups == 1 && kw == 3 && sw == 2 && no_dilation && + pads_equal) { + impl_ = new DirectConv; + // VLOG(3) << "Run DirectConv Int8"; } else if (param.groups == 1 && kw == 3 && sw == 1 && no_dilation && pads_equal) { impl_ = new WinogradConv; - // VLOG(3) << "Run DirectConv Int8"; + // VLOG(3) << "Run WinogradConv Int8"; } else { impl_ = new GemmLikeConv; // VLOG(3) << "Run GemmLikeConvInt8"; diff --git a/lite/kernels/arm/conv_winograd.cc b/lite/kernels/arm/conv_winograd.cc index c6e06a243cc1d1f1c8dc35338d8183352c4f679a..f61c6109cdfd57b30c2b57390d21dec7c3bb3aa2 100644 --- a/lite/kernels/arm/conv_winograd.cc +++ b/lite/kernels/arm/conv_winograd.cc @@ -358,6 +358,9 @@ void WinogradConv::Run() { param, &ctx); } 
+#ifdef LITE_WITH_PROFILE + kernel_func_name_ = "conv_compute_2x2_3x3_int8"; +#endif } template class WinogradConv; template class WinogradConv; diff --git a/lite/kernels/arm/conv_winograd.h b/lite/kernels/arm/conv_winograd.h index 69835a74b40b4f08d78cb11f3b9415eef7bc89d6..b93a719f7dbb13aa9888ea943fa81b6ea2b38c00 100644 --- a/lite/kernels/arm/conv_winograd.h +++ b/lite/kernels/arm/conv_winograd.h @@ -61,6 +61,13 @@ class WinogradConv virtual void PrepareForRun(); virtual void ReInitWhenNeeded(); virtual void Run(); +#ifdef LITE_WITH_PROFILE + virtual void SetProfileRuntimeKernelInfo( + paddle::lite::profile::OpCharacter* ch) { + ch->kernel_func_name = kernel_func_name_; + } + std::string kernel_func_name_{"NotImplForConvWino"}; +#endif protected: using param_t = operators::ConvParam; diff --git a/lite/kernels/arm/decode_bboxes_compute_test.cc b/lite/kernels/arm/decode_bboxes_compute_test.cc index 271a99c29b61063877b7d1c0d2e50bc65d135d72..ef9da0f1e2c53a021c82f19d3151a2fe8fba8af4 100644 --- a/lite/kernels/arm/decode_bboxes_compute_test.cc +++ b/lite/kernels/arm/decode_bboxes_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/decode_bboxes_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/decode_bboxes_compute.h" namespace paddle { namespace lite { @@ -115,9 +117,7 @@ void decode_bboxes_compute_ref(const operators::DecodeBboxesParam& param) { } TEST(decode_bboxes_arm, retrive_op) { - auto decode_bboxes = - KernelRegistry::Global().Create( - "decode_bboxes"); + auto decode_bboxes = KernelRegistry::Global().Create("decode_bboxes"); ASSERT_FALSE(decode_bboxes.empty()); ASSERT_TRUE(decode_bboxes.front()); } diff --git a/lite/kernels/arm/deformable_conv_compute.h b/lite/kernels/arm/deformable_conv_compute.h index 6c8995ddd447a4382ee40e00f3b31832566ad9e9..17fae957619b7754637023a21169da9641686e59 100644 --- a/lite/kernels/arm/deformable_conv_compute.h +++ b/lite/kernels/arm/deformable_conv_compute.h @@ -17,6 +17,7 @@ #include "lite/backends/arm/math/funcs.h" #include "lite/core/kernel.h" #ifdef LITE_WITH_PROFILE +#include #include "lite/core/profile/profiler.h" #endif @@ -56,8 +57,9 @@ class DeformableConvCompute : public KernelLite { #ifdef LITE_WITH_PROFILE virtual void SetProfileRuntimeKernelInfo( paddle::lite::profile::OpCharacter* ch) { - impl_->SetProfileRuntimeKernelInfo(ch); + ch->kernel_func_name = kernel_func_name_; } + std::string kernel_func_name_{"NotImplForDeformableConv"}; #endif ~DeformableConvCompute() = default; diff --git a/lite/kernels/arm/dropout_compute_test.cc b/lite/kernels/arm/dropout_compute_test.cc index 1c0f8db347304076caee23ee3d295bcfacbe2a1f..0aa16b8d348d7b8415120051df0e9732fada4495 100644 --- a/lite/kernels/arm/dropout_compute_test.cc +++ b/lite/kernels/arm/dropout_compute_test.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
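Both the Winograd and deformable-conv kernels gain the same LITE_WITH_PROFILE hook: the kernel records which low-level routine it actually executed and copies that string into the profiler's OpCharacter. A minimal sketch of the pattern, using hypothetical stand-ins for OpCharacter and the kernel class (the real types live in lite/core/profile):

#include <iostream>
#include <string>

// Hypothetical stand-in for lite::profile::OpCharacter.
struct OpCharacter {
  std::string kernel_func_name;
};

class WinogradLikeKernel {
 public:
  void Run() {
    // A real kernel picks an implementation here and records which
    // low-level routine actually ran.
    kernel_func_name_ = "conv_compute_2x2_3x3_int8";
  }
  void SetProfileRuntimeKernelInfo(OpCharacter* ch) {
    ch->kernel_func_name = kernel_func_name_;
  }

 private:
  std::string kernel_func_name_{"NotImplForConvWino"};
};

int main() {
  WinogradLikeKernel k;
  OpCharacter ch;
  k.SetProfileRuntimeKernelInfo(&ch);
  std::cout << ch.kernel_func_name << "\n";  // NotImplForConvWino
  k.Run();
  k.SetProfileRuntimeKernelInfo(&ch);
  std::cout << ch.kernel_func_name << "\n";  // conv_compute_2x2_3x3_int8
}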
-#include "lite/kernels/arm/dropout_compute.h" #include + #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/dropout_compute.h" namespace paddle { namespace lite { @@ -30,9 +32,7 @@ TEST(dropout_arm, init) { } TEST(dropout, retrive_op) { - auto dropout = - KernelRegistry::Global().Create( - "dropout"); + auto dropout = KernelRegistry::Global().Create("dropout"); ASSERT_FALSE(dropout.empty()); ASSERT_TRUE(dropout.front()); } diff --git a/lite/kernels/arm/elementwise_compute.cc b/lite/kernels/arm/elementwise_compute.cc index 8115700f5950ddfcb71df49e6a21528563f23d95..28082785e1c726097a8bfd2165f0d09b9962a5e7 100644 --- a/lite/kernels/arm/elementwise_compute.cc +++ b/lite/kernels/arm/elementwise_compute.cc @@ -300,11 +300,12 @@ void ElementwiseMaxActivationCompute::Run() { } } -void ElementwiseDivCompute::Run() { - auto& param = Param(); - const float* x_data = param.X->data(); - const float* y_data = param.Y->data(); - float* out_data = param.Out->mutable_data(); +template +void ElementwiseDivCompute::Run() { + auto& param = this->template Param(); + auto* x_data = param.X->template data(); + auto* y_data = param.Y->template data(); + auto* out_data = param.Out->template mutable_data(); int axis = param.axis; auto x_dims = param.X->dims(); auto y_dims = param.Y->dims(); @@ -313,10 +314,10 @@ void ElementwiseDivCompute::Run() { LOG(FATAL) << "elewise div don't support x_dims size < y_dims size"; } if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) { - lite::arm::math::elementwise_div_broadcast( + lite::arm::math::elementwise_div_broadcast( x_data, y_data, out_data, pre, n, post); } else { - lite::arm::math::elementwise_div( + lite::arm::math::elementwise_div( x_data, y_data, out_data, x_dims.production()); } } @@ -351,6 +352,29 @@ void ElementwiseDivActivationCompute::Run() { } } +template +void ElementwiseModCompute::Run() { + auto& param = this->template Param(); + auto* x_data = param.X->template data(); + auto* y_data = param.Y->template data(); + auto* out_data = param.Out->template mutable_data(); + int axis = param.axis; + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int pre, n, post; + if (x_dims.size() < y_dims.size() && + is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) { + lite::arm::math::elementwise_mod_broadcast( + y_data, x_data, out_data, pre, n, post); + } else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) { + lite::arm::math::elementwise_mod_broadcast( + x_data, y_data, out_data, pre, n, post); + } else { + lite::arm::math::elementwise_mod( + x_data, y_data, out_data, x_dims.production()); + } +} + } // namespace arm } // namespace kernels } // namespace lite @@ -465,17 +489,27 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); -REGISTER_LITE_KERNEL(elementwise_div, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::ElementwiseDivCompute, - def) +using elementwise_div_fp32 = + paddle::lite::kernels::arm::ElementwiseDivCompute; + +REGISTER_LITE_KERNEL( + elementwise_div, kARM, kFloat, kNCHW, elementwise_div_fp32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); +using elementwise_div_int64 = + paddle::lite::kernels::arm::ElementwiseDivCompute; + +REGISTER_LITE_KERNEL( + elementwise_div, kARM, kInt64, kNCHW, elementwise_div_int64, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + 
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .Finalize(); + REGISTER_LITE_KERNEL( fusion_elementwise_div_activation, kARM, @@ -487,3 +521,13 @@ REGISTER_LITE_KERNEL( .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); + +using elementwise_mod_int64 = + paddle::lite::kernels::arm::ElementwiseModCompute; +REGISTER_LITE_KERNEL( + elementwise_mod, kARM, kInt64, kNCHW, elementwise_mod_int64, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/arm/elementwise_compute.h b/lite/kernels/arm/elementwise_compute.h index 731010a0d189c08f031363e6df95652c000a237b..7d7a93bf6954de9bbcd1b44061e614cd041fafe8 100644 --- a/lite/kernels/arm/elementwise_compute.h +++ b/lite/kernels/arm/elementwise_compute.h @@ -86,8 +86,8 @@ class ElementwiseMaxActivationCompute virtual ~ElementwiseMaxActivationCompute() = default; }; -class ElementwiseDivCompute - : public KernelLite { +template +class ElementwiseDivCompute : public KernelLite { public: void Run() override; @@ -102,6 +102,22 @@ class ElementwiseDivActivationCompute virtual ~ElementwiseDivActivationCompute() = default; }; +template +class ElementwiseModCompute : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseModCompute() = default; +}; + +// class ElementwiseModActivationCompute +// : public KernelLite { +// public: +// void Run() override; + +// virtual ~ElementwiseModActivationCompute() = default; +// }; + } // namespace arm } // namespace kernels } // namespace lite diff --git a/lite/kernels/arm/elementwise_compute_test.cc b/lite/kernels/arm/elementwise_compute_test.cc index b0ac3a7d33d92239c83147a3fe7615cd2fbf0249..62a5bc423ca6e72098332963713e8baffb366325 100644 --- a/lite/kernels/arm/elementwise_compute_test.cc +++ b/lite/kernels/arm/elementwise_compute_test.cc @@ -12,11 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/arm/elementwise_compute.h" #include + +#include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/elementwise_compute.h" namespace paddle { namespace lite { @@ -24,9 +27,7 @@ namespace kernels { namespace arm { TEST(elementwise_add_arm, retrive_op) { - auto elementwise_add = - KernelRegistry::Global().Create( - "elementwise_add"); + auto elementwise_add = KernelRegistry::Global().Create("elementwise_add"); ASSERT_FALSE(elementwise_add.empty()); ASSERT_TRUE(elementwise_add.front()); } @@ -140,6 +141,119 @@ void elementwise_compute_ref(const operators::ElementwiseParam& param, } } +template +void elementwise_fmod_compute_ref(const operators::ElementwiseParam& param, + const std::string act_type) { + const dtype* x_data = param.X->data(); + const dtype* y_data = param.Y->data(); + dtype* out_data = param.Out->mutable_data(); + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int axis = param.axis; + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + int batch = 1; + int channels = 1; + int num = 1; + for (int i = 0; i < axis; ++i) { + batch *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + channels *= y_dims[i]; + } + for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { + num *= x_dims[i]; + } + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = fmod(diny_data + fmod(*din_ptr, diny_data), diny_data); + dout_ptr++; + din_ptr++; + } + } + } + // do activation relu + if (act_type.size() > 0) { + if (act_type == "relu") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + dtype* dout_ptr = out_data + (i * channels + j) * num; + for (int k = 0; k < num; ++k) { + *dout_ptr = *dout_ptr > 0.0f ? *dout_ptr : 0.0f; + dout_ptr++; + } + } + } + } + } +} + +template +void elementwise_imod_compute_ref(const operators::ElementwiseParam& param, + const std::string act_type) { + const dtype* x_data = param.X->data(); + const dtype* y_data = param.Y->data(); + dtype* out_data = param.Out->mutable_data(); + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int axis = param.axis; + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + int batch = 1; + int channels = 1; + int num = 1; + for (int i = 0; i < axis; ++i) { + batch *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + channels *= y_dims[i]; + } + for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { + num *= x_dims[i]; + } + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = (*din_ptr) % diny_data; + dout_ptr++; + din_ptr++; + } + } + } + // do activation relu + if (act_type.size() > 0) { + if (act_type == "relu") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + dtype* dout_ptr = out_data + (i * channels + j) * num; + for (int k = 0; k < num; ++k) { + *dout_ptr = *dout_ptr > 0.0f ? 
*dout_ptr : 0.0f; + dout_ptr++; + } + } + } + } + } +} + +template void elementwise_fmod_compute_ref( + const operators::ElementwiseParam& param, const std::string act_type); +template void elementwise_imod_compute_ref( + const operators::ElementwiseParam& param, const std::string act_type); +template void elementwise_imod_compute_ref( + const operators::ElementwiseParam& param, const std::string act_type); + TEST(elementwise_add, compute) { ElementwiseAddCompute elementwise_add; operators::ElementwiseParam param; @@ -222,8 +336,7 @@ TEST(elementwise_add, compute) { TEST(fusion_elementwise_add_activation_arm, retrive_op) { auto fusion_elementwise_add_activation = - KernelRegistry::Global().Create( - "fusion_elementwise_add_activation"); + KernelRegistry::Global().Create("fusion_elementwise_add_activation"); ASSERT_FALSE(fusion_elementwise_add_activation.empty()); ASSERT_TRUE(fusion_elementwise_add_activation.front()); } @@ -321,9 +434,7 @@ TEST(fusion_elementwise_add_activation_arm, compute) { } TEST(elementwise_mul_arm, retrive_op) { - auto elementwise_mul = - KernelRegistry::Global().Create( - "elementwise_mul"); + auto elementwise_mul = KernelRegistry::Global().Create("elementwise_mul"); ASSERT_FALSE(elementwise_mul.empty()); ASSERT_TRUE(elementwise_mul.front()); } @@ -416,8 +527,7 @@ TEST(elementwise_mul, compute) { TEST(fusion_elementwise_mul_activation_arm, retrive_op) { auto fusion_elementwise_mul_activation = - KernelRegistry::Global().Create( - "fusion_elementwise_mul_activation"); + KernelRegistry::Global().Create("fusion_elementwise_mul_activation"); ASSERT_FALSE(fusion_elementwise_mul_activation.empty()); ASSERT_TRUE(fusion_elementwise_mul_activation.front()); } @@ -515,9 +625,7 @@ TEST(fusion_elementwise_mul_activation_arm, compute) { } TEST(elementwise_max_arm, retrive_op) { - auto elementwise_max = - KernelRegistry::Global().Create( - "elementwise_max"); + auto elementwise_max = KernelRegistry::Global().Create("elementwise_max"); ASSERT_FALSE(elementwise_max.empty()); ASSERT_TRUE(elementwise_max.front()); } @@ -610,8 +718,7 @@ TEST(elementwise_max, compute) { TEST(fusion_elementwise_max_activation_arm, retrive_op) { auto fusion_elementwise_max_activation = - KernelRegistry::Global().Create( - "fusion_elementwise_max_activation"); + KernelRegistry::Global().Create("fusion_elementwise_max_activation"); ASSERT_FALSE(fusion_elementwise_max_activation.empty()); ASSERT_TRUE(fusion_elementwise_max_activation.front()); } @@ -685,7 +792,7 @@ TEST(fusion_elementwise_max_activation_arm, compute) { } for (int i = 0; i < y_dim.production(); i++) { float sign = i % 2 == 0 ? 
0.5f : -0.5f; - y_data[i] = i * sign; + y_data[i] = (i + 1) * sign; } param.X = &x; param.Y = &y; @@ -708,6 +815,106 @@ TEST(fusion_elementwise_max_activation_arm, compute) { } } +TEST(elementwise_mod_int64_arm, retrive_op) { + auto elementwise_mod = KernelRegistry::Global().Create("elementwise_mod"); + ASSERT_FALSE(elementwise_mod.empty()); + ASSERT_TRUE(elementwise_mod.front()); +} + +TEST(elementwise_mod_int64_arm, init) { + ElementwiseModCompute elementwise_mod; + ASSERT_EQ(elementwise_mod.precision(), PRECISION(kInt64)); + ASSERT_EQ(elementwise_mod.target(), TARGET(kARM)); +} + +TEST(elementwise_mod_int64_arm, compute) { + ElementwiseModCompute elementwise_mod; + operators::ElementwiseParam param; + lite::Tensor x, y, output, output_ref; + +#if 1 + for (auto n : {1, 3, 4}) { + for (auto c : {1, 3, 4}) { + for (auto h : {1, 3, 4}) { + for (auto w : {1, 3, 4}) { + for (auto axis : {-1, 0, 1, 3}) { + for (auto yd : {std::vector({n}), + std::vector({c}), + std::vector({h}), + std::vector({w}), + std::vector({n, c}), + std::vector({c, h}), + std::vector({c, h, w}), + std::vector({n, c, h, w})}) { +#else + for (auto n : {1, 3, 4, 11}) { + for (auto c : {1, 3, 4, 11}) { + for (auto h : {1, 3, 4, 11}) { + for (auto w : {1, 3, 4, 11}) { + for (auto axis : {-1, 0, 1, 2, 3}) { + for (auto yd : {std::vector({n}), + std::vector({c}), + std::vector({h}), + std::vector({w}), + std::vector({n, c}), + std::vector({c, h}), + std::vector({h, w}), + std::vector({n, c, h}), + std::vector({c, h, w}), + std::vector({n, c, h, w})}) { +#endif + auto x_dim = DDim(std::vector({n, c, h, w})); + auto y_dim = DDim(yd); + int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis; + + if (axis_t + y_dim.size() > 4) continue; + bool flag = false; + for (int i = 0; i < y_dim.size(); i++) { + if (x_dim[i + axis_t] != y_dim[i]) flag = true; + } + if (flag) continue; + + x.Resize(x_dim); + y.Resize(y_dim); + output.Resize(x_dim); + output_ref.Resize(x_dim); + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + auto* output_data = output.mutable_data(); + auto* output_ref_data = output_ref.mutable_data(); + for (int i = 0; i < x_dim.production(); i++) { + x_data[i] = i + 1; + } + for (int i = 0; i < y_dim.production(); i++) { + y_data[i] = y_dim.production() - i; + } + param.X = &x; + param.Y = &y; + param.axis = axis; + param.Out = &output; + elementwise_mod.SetParam(param); + elementwise_mod.Run(); + param.Out = &output_ref; + elementwise_imod_compute_ref(param, ""); + for (int i = 0; i < output.dims().production(); i++) { + if (std::abs(output_data[i] - output_ref_data[i]) > 1e-5 || + std::isnan(output_data[i]) || + std::isnan(output_ref_data[i])) { + LOG(FATAL) << "elementwise mod cmp error, i: " << i + << ", x_data: " << x_data[i] + << ", y_data: " << y_data[i] + << ", output_data: " << output_data[i] + << ", output_ref_data: " << output_ref_data[i]; + } + } + } + } + } + } + } + } +} + } // namespace arm } // namespace kernels } // namespace lite @@ -719,3 +926,4 @@ USE_LITE_KERNEL(elementwise_mul, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(fusion_elementwise_mul_activation, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(elementwise_max, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(fusion_elementwise_max_activation, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(elementwise_mod, kARM, kInt64, kNCHW, def); diff --git a/lite/kernels/arm/gather_compute.cc b/lite/kernels/arm/gather_compute.cc index 3efacc4aacefcb150d53738c950ec9e797ed78c7..2a9c70aede7475b36f70c628ff6ccaa823f030b2 100644 --- 
a/lite/kernels/arm/gather_compute.cc +++ b/lite/kernels/arm/gather_compute.cc @@ -73,7 +73,6 @@ void GatherCompute::Run() { REGISTER_LITE_KERNEL( gather, kARM, kAny, kNCHW, paddle::lite::kernels::arm::GatherCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) - .BindInput("Index", - {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Index", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .Finalize(); diff --git a/lite/kernels/arm/layer_norm_compute_test.cc b/lite/kernels/arm/layer_norm_compute_test.cc index 22fe3d06569fac424ab797712142b4d088dc7d3a..e84f9f133ce0cdecb714dc535c0f5833597105c6 100644 --- a/lite/kernels/arm/layer_norm_compute_test.cc +++ b/lite/kernels/arm/layer_norm_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/layer_norm_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/layer_norm_compute.h" namespace paddle { namespace lite { @@ -181,9 +183,7 @@ TEST(layer_norm_arm, compute) { } TEST(layer_norm, retrive_op) { - auto layer_norm = - KernelRegistry::Global().Create( - "layer_norm"); + auto layer_norm = KernelRegistry::Global().Create("layer_norm"); ASSERT_FALSE(layer_norm.empty()); ASSERT_TRUE(layer_norm.front()); } diff --git a/lite/kernels/arm/lrn_compute_test.cc b/lite/kernels/arm/lrn_compute_test.cc index e7030d00427e55c7faf333997cd90cba46260cd4..9afd05b80aaffdc4be2ae1deaa5993b8fd21dce4 100644 --- a/lite/kernels/arm/lrn_compute_test.cc +++ b/lite/kernels/arm/lrn_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/lrn_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/lrn_compute.h" namespace paddle { namespace lite { @@ -133,8 +135,7 @@ void lrn_compute_ref(const operators::LrnParam& param) { } TEST(lrn_arm, retrive_op) { - auto lrn = - KernelRegistry::Global().Create("lrn"); + auto lrn = KernelRegistry::Global().Create("lrn"); ASSERT_FALSE(lrn.empty()); ASSERT_TRUE(lrn.front()); } diff --git a/lite/kernels/arm/merge_lod_tensor_compute_test.cc b/lite/kernels/arm/merge_lod_tensor_compute_test.cc index 914a58308bdf0d5c6d374d5f81ca38224941c85d..f8d92dfdc740988733ad26d5385b17050b490635 100644 --- a/lite/kernels/arm/merge_lod_tensor_compute_test.cc +++ b/lite/kernels/arm/merge_lod_tensor_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
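Relaxing the gather Index input from kInt32 to kAny lets the kernel accept either int32 or int64 index tensors, so the implementation has to branch on the runtime index type. A minimal sketch of a dtype-agnostic row gather (hypothetical helper, not the registered kernel):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Gather rows of `src` (row_len values each) selected by `index`.
template <typename IndexT>
void GatherRows(const float* src, const IndexT* index, int index_num,
                int row_len, float* out) {
  for (int i = 0; i < index_num; ++i) {
    const float* row = src + static_cast<int64_t>(index[i]) * row_len;
    std::copy(row, row + row_len, out + i * row_len);
  }
}

int main() {
  std::vector<float> src = {0, 0, 1, 1, 2, 2, 3, 3};  // 4 rows x 2 cols
  std::vector<int64_t> idx64 = {3, 0};
  std::vector<int32_t> idx32 = {1, 2};
  std::vector<float> out(4);

  GatherRows(src.data(), idx64.data(), 2, 2, out.data());
  std::cout << out[0] << " " << out[2] << "\n";  // 3 0

  GatherRows(src.data(), idx32.data(), 2, 2, out.data());
  std::cout << out[0] << " " << out[2] << "\n";  // 1 2
}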
-#include "lite/kernels/arm/merge_lod_tensor_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/merge_lod_tensor_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace arm { TEST(merge_lod_tensor_arm, retrive_op) { - auto kernel = - KernelRegistry::Global().Create( - "merge_lod_tensor"); + auto kernel = KernelRegistry::Global().Create("merge_lod_tensor"); ASSERT_FALSE(kernel.empty()); ASSERT_TRUE(kernel.front()); } diff --git a/lite/kernels/arm/mul_compute_test.cc b/lite/kernels/arm/mul_compute_test.cc index cddee81fe22897dbe91721ed172b144539e0852c..76ab95b93485b3e6701dca6224ce2a5f7a8b3df7 100644 --- a/lite/kernels/arm/mul_compute_test.cc +++ b/lite/kernels/arm/mul_compute_test.cc @@ -12,16 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/mul_compute.h" #include + #include #include #include #include #include #include + #include "lite/backends/arm/math/funcs.h" #include "lite/core/op_registry.h" +#include "lite/kernels/arm/mul_compute.h" namespace paddle { namespace lite { @@ -69,8 +71,7 @@ void FillData(T* a, } TEST(mul_arm, retrive_op) { - auto mul = - KernelRegistry::Global().Create("mul"); + auto mul = KernelRegistry::Global().Create("mul"); ASSERT_FALSE(mul.empty()); ASSERT_TRUE(mul.front()); } diff --git a/lite/kernels/arm/pool_compute_test.cc b/lite/kernels/arm/pool_compute_test.cc index acdaf0d0131621c1c2403b8a071d6cb1134f4565..c4aeb20a5bf53d80be4b407698a51ead46f6b8f5 100644 --- a/lite/kernels/arm/pool_compute_test.cc +++ b/lite/kernels/arm/pool_compute_test.cc @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/pool_compute.h" #include + #include #include #include #include + #include "lite/backends/arm/math/funcs.h" #include "lite/core/op_registry.h" +#include "lite/kernels/arm/pool_compute.h" namespace paddle { namespace lite { @@ -341,8 +343,7 @@ TEST(pool_arm, compute) { } TEST(pool_arm, retrive_op) { - auto pool = KernelRegistry::Global().Create( - "pool2d"); + auto pool = KernelRegistry::Global().Create("pool2d"); ASSERT_FALSE(pool.empty()); ASSERT_TRUE(pool.front()); } diff --git a/lite/kernels/arm/scale_compute_test.cc b/lite/kernels/arm/scale_compute_test.cc index 0d327b9807d306770850b09ed1ed2a0337104c92..fe5e1911d0cc2c012876731f50bd04b3125b8fa2 100644 --- a/lite/kernels/arm/scale_compute_test.cc +++ b/lite/kernels/arm/scale_compute_test.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/arm/scale_compute.h" #include + #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/scale_compute.h" namespace paddle { namespace lite { @@ -103,8 +105,7 @@ TEST(scale_arm, compute) { } TEST(scale, retrive_op) { - auto scale = - KernelRegistry::Global().Create("scale"); + auto scale = KernelRegistry::Global().Create("scale"); ASSERT_FALSE(scale.empty()); ASSERT_TRUE(scale.front()); } diff --git a/lite/kernels/arm/sequence_conv_compute.cc b/lite/kernels/arm/sequence_conv_compute.cc index a70b6717097ec0ffdaa24ba257bfdf8dbd536f3f..69740a258be165f9ceec6829a81497e842b5a697 100644 --- a/lite/kernels/arm/sequence_conv_compute.cc +++ b/lite/kernels/arm/sequence_conv_compute.cc @@ -88,7 +88,7 @@ void SequenceConvCompute::Run() { paddle::lite::arm::math::im2col( sub_in_data, 1, - sequence_len, + input_row_end - input_row_begin, hidden_dim, // C H W -> 1, seq_len, hidden_dim kernel_size, hidden_dim, // kernel_h, kernel_w diff --git a/lite/kernels/arm/softmax_compute.cc b/lite/kernels/arm/softmax_compute.cc index 3409d0f5c5bd6e7ce1ea77809f7715b62bb10ca2..79ea23ab3fad3340c63846ea11cc89b371f5c6c9 100644 --- a/lite/kernels/arm/softmax_compute.cc +++ b/lite/kernels/arm/softmax_compute.cc @@ -34,7 +34,7 @@ void SoftmaxCompute::Run() { int inner_num = x_dims.Slice(axis + 1, x_rank).production(); int axis_size = x_dims[axis]; if (inner_num == 1) { - if (axis_size >= 4) { + if (axis_size > 4) { lite::arm::math::softmax_inner1_large_axis( din, dout, outer_num, axis_size); } else { diff --git a/lite/kernels/arm/softmax_compute_test.cc b/lite/kernels/arm/softmax_compute_test.cc index 459112d8c0169375584baf0cb983037682e47a3d..486ccf2cedd1af3ce0d7cc2f7d0aeecaadf15ca9 100644 --- a/lite/kernels/arm/softmax_compute_test.cc +++ b/lite/kernels/arm/softmax_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/softmax_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/softmax_compute.h" namespace paddle { namespace lite { @@ -121,9 +123,7 @@ TEST(softmax_arm, compute) { } TEST(softmax, retrive_op) { - auto softmax = - KernelRegistry::Global().Create( - "softmax"); + auto softmax = KernelRegistry::Global().Create("softmax"); ASSERT_FALSE(softmax.empty()); ASSERT_TRUE(softmax.front()); } diff --git a/lite/kernels/arm/split_compute_test.cc b/lite/kernels/arm/split_compute_test.cc index 034fbb85c487df6159a6a22b9958cc9e64d9e1c6..c51ea186b52a77abec5c7560b0a028079bea4aba 100644 --- a/lite/kernels/arm/split_compute_test.cc +++ b/lite/kernels/arm/split_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/arm/split_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/split_compute.h" namespace paddle { namespace lite { @@ -165,8 +167,7 @@ TEST(split_arm, compute) { } TEST(split, retrive_op) { - auto split = - KernelRegistry::Global().Create("split"); + auto split = KernelRegistry::Global().Create("split"); ASSERT_FALSE(split.empty()); ASSERT_TRUE(split.front()); } diff --git a/lite/kernels/arm/split_lod_tensor_compute_test.cc b/lite/kernels/arm/split_lod_tensor_compute_test.cc index 3b2004c786698b70b4c54b68d696a9cf5f5221fd..03f5a21890ffd515e83de7895c2be886b15b8967 100644 --- a/lite/kernels/arm/split_lod_tensor_compute_test.cc +++ b/lite/kernels/arm/split_lod_tensor_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/split_lod_tensor_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/split_lod_tensor_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace arm { TEST(split_lod_tensor_arm, retrive_op) { - auto kernel = - KernelRegistry::Global().Create( - "split_lod_tensor"); + auto kernel = KernelRegistry::Global().Create("split_lod_tensor"); ASSERT_FALSE(kernel.empty()); ASSERT_TRUE(kernel.front()); } diff --git a/lite/kernels/arm/transpose_compute_test.cc b/lite/kernels/arm/transpose_compute_test.cc index aaf3f138a54db2c7ff766325cfd61bc51ec8b1d2..74fd14754637427277a6b19b820bb5d3de66c418 100644 --- a/lite/kernels/arm/transpose_compute_test.cc +++ b/lite/kernels/arm/transpose_compute_test.cc @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/transpose_compute.h" #include + #include #include #include + #include "lite/backends/arm/math/funcs.h" #include "lite/core/op_registry.h" #include "lite/core/tensor.h" +#include "lite/kernels/arm/transpose_compute.h" namespace paddle { namespace lite { @@ -121,9 +123,7 @@ TEST(transpose_arm, compute_shape_nchw) { } TEST(transpose, retrive_op) { - auto transpose = - KernelRegistry::Global().Create( - "transpose"); + auto transpose = KernelRegistry::Global().Create("transpose"); ASSERT_FALSE(transpose.empty()); ASSERT_TRUE(transpose.front()); } @@ -189,9 +189,7 @@ TEST(transpose2_arm, compute_shape_nchw) { } TEST(transpose2, retrive_op) { - auto transpose2 = - KernelRegistry::Global().Create( - "transpose2"); + auto transpose2 = KernelRegistry::Global().Create("transpose2"); ASSERT_FALSE(transpose2.empty()); ASSERT_TRUE(transpose2.front()); } diff --git a/lite/kernels/bm/bridges/batch_norm_op.cc b/lite/kernels/bm/bridges/batch_norm_op.cc index fbf70178fdd971edce34b3253b02febfa3e3b85c..f5ecc0825a17f26b1cf65605ea2e8c0c93338f39 100644 --- a/lite/kernels/bm/bridges/batch_norm_op.cc +++ b/lite/kernels/bm/bridges/batch_norm_op.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include +#include #include "lite/kernels/bm/bridges/graph.h" #include "lite/kernels/bm/bridges/utility.h" #include "lite/kernels/npu/bridges/registry.h" @@ -64,10 +65,16 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto* bias_data = bias->mutable_data(); auto* mean_data = mean->mutable_data(); auto* variance_data = variance->mutable_data(); + + float* new_bias = static_cast(malloc(bias->memory_size())); + float* new_scale = static_cast(malloc(scale->memory_size())); + CHECK(new_bias != nullptr); + CHECK(new_scale != nullptr); + for (int c = 0; c < channel_size; c++) { float inv_scale = 1.f / (std::sqrt(variance_data[c] + epsilon)); - bias_data[c] = bias_data[c] - inv_scale * scale_data[c] * mean_data[c]; - scale_data[c] = inv_scale * scale_data[c]; + new_bias[c] = bias_data[c] - inv_scale * scale_data[c] * mean_data[c]; + new_scale[c] = inv_scale * scale_data[c]; } const int input_num = 1; @@ -86,11 +93,13 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { output_dims.size(), static_cast(output_var_name.c_str()), static_cast(unique_op_name.c_str()), - static_cast(scale->mutable_data()), - static_cast(bias->mutable_data()), + static_cast(new_scale), + static_cast(new_bias), 1, 1, 1); + free(new_scale); + free(new_bias); delete[] shape; delete[] name; delete[] dim; diff --git a/lite/kernels/bm/bridges/density_prior_box_op.cc b/lite/kernels/bm/bridges/density_prior_box_op.cc index 137c5142d5ae544226dbe5d6cd7c872fc272b71a..895901d94e2b2077f530e196ef8f30d4f57df793 100644 --- a/lite/kernels/bm/bridges/density_prior_box_op.cc +++ b/lite/kernels/bm/bridges/density_prior_box_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include "lite/kernels/bm/bridges/graph.h" #include "lite/kernels/bm/bridges/utility.h" #include "lite/kernels/npu/bridges/registry.h" diff --git a/lite/kernels/bm/bridges/interpolate_op.cc b/lite/kernels/bm/bridges/interpolate_op.cc index 8c2d39b16ac0206d83199fdeac6c30a0a352856e..a77ec4e8f788e581d9d226369210a449ec50840c 100644 --- a/lite/kernels/bm/bridges/interpolate_op.cc +++ b/lite/kernels/bm/bridges/interpolate_op.cc @@ -76,6 +76,8 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { static_cast(output_var_name.c_str()), 0, 0, + 0, + 0, type); } graph->AddNode(output_var_name); diff --git a/lite/kernels/bm/subgraph_compute.cc b/lite/kernels/bm/subgraph_compute.cc index d7640e1ac7326d9764380469dc97a7806b044437..664198cf9fb45664fdc088df382b9b94a1924e9b 100644 --- a/lite/kernels/bm/subgraph_compute.cc +++ b/lite/kernels/bm/subgraph_compute.cc @@ -28,12 +28,35 @@ namespace lite { namespace kernels { namespace bm { -int SubgraphEngine::BuildDeviceProgram() { +bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() { + // Obtain the origin input tensors, and create the origin output + // tensors(Don't try to access them before launch the device program or the + // origin program) + PrepareWorkspaceForOriginProgram(); + // Create the device input and output tensors, but don't initialize them + // with the dimensions + device_inputs_.resize(input_names_.size()); + for (int i = 0; i < input_names_.size(); i++) { + device_inputs_[i].reset(new hiai::AiTensor); + CHECK(device_inputs_[i]); + } + device_outputs_.resize(output_names_.size()); + for (int i = 0; i < output_names_.size(); i++) { + device_outputs_[i].reset(new hiai::AiTensor); + CHECK(device_outputs_[i]); + } + return true; +} + +bool SubgraphEngine::BuildDeviceProgram() { int status = 0; subgraph::bm::Graph graph; const auto& 
bridges = subgraph::Registry::Instance(); graph.CreateCompilerHandle(); auto& ctx = this->ctx_->template As(); + if (origin_program_.empty()) { + BuildOriginProgram(); + } for (auto& inst : origin_program_) { auto op = const_cast(inst.op()); CHECK(op); @@ -42,7 +65,7 @@ int SubgraphEngine::BuildDeviceProgram() { std::string op_type = op->op_info()->Type(); LOG(INFO) << op_type; if (!bridges.Exists(op_type, TARGET(kBM))) { - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); status |= @@ -50,12 +73,13 @@ int SubgraphEngine::BuildDeviceProgram() { const_cast(op), const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } - std::string net_name = "bmnetc_f32umodel"; + std::string net_name = "bmnet_f32bmodel"; + auto unique_net_name = lite::subgraph::bm::UniqueName(net_name); __bmcompile_opt( - graph.GetCompilerHandle(), const_cast(net_name.c_str()), 1); + graph.GetCompilerHandle(), const_cast(unique_net_name.c_str()), 2); void* bmodel_data = nullptr; unsigned int data_size = 0; bm_hd_ = static_cast(ctx.GetHandle()); @@ -63,7 +87,7 @@ int SubgraphEngine::BuildDeviceProgram() { graph.UnlockCompilerMutex(); bmrt_hd_ = bmrt_create(bm_hd_); if (false == bmrt_load_bmodel_data(bmrt_hd_, bmodel_data, data_size)) { - return subgraph::FAILED; + return false; } bmrt_get_network_names(bmrt_hd_, &net_names_); net_info_ = bmrt_get_network_info(bmrt_hd_, net_names_[0]); @@ -116,10 +140,10 @@ int SubgraphEngine::BuildDeviceProgram() { net_info_->output_dtypes[i], stage.output_shapes[i]); } - return status; + return true; } -int SubgraphEngine::LaunchDeviceProgram() { +bool SubgraphEngine::LaunchDeviceProgram() { for (size_t i = 0; i < device_inputs_.size(); i++) { bm_memcpy_s2d(bm_hd_, device_inputs_[i].device_mem, @@ -143,7 +167,7 @@ int SubgraphEngine::LaunchDeviceProgram() { out_index++; } } - return 0; + return true; } void SubgraphCompute::PrepareForRun() { @@ -155,12 +179,11 @@ void SubgraphCompute::PrepareForRun() { param.output_data_names, param.scope)); CHECK(engine_); - engine_->Build(); } void SubgraphCompute::Run() { CHECK(engine_); - engine_->Launch(); + engine_->Run(); } } // namespace bm diff --git a/lite/kernels/bm/subgraph_compute.h b/lite/kernels/bm/subgraph_compute.h index 60f7661c7990d90020dbfc7ec3a6e0d178dceb70..7a5b2552ff95681da09346ba11f40f1a6acb7f01 100644 --- a/lite/kernels/bm/subgraph_compute.h +++ b/lite/kernels/bm/subgraph_compute.h @@ -44,8 +44,9 @@ class SubgraphEngine : public subgraph::Engine { ctx, block_idx, block_desc, input_names, output_names, scope) {} protected: - int BuildDeviceProgram() override; - int LaunchDeviceProgram() override; + bool PrepareWorkspaceForDeviceProgram() override; + bool BuildDeviceProgram() override; + bool LaunchDeviceProgram() override; private: void *bmrt_hd_; diff --git a/lite/kernels/cuda/CMakeLists.txt b/lite/kernels/cuda/CMakeLists.txt index 1a58a51c36a1ccbb21bb2830a197c096e7ddac51..22bb4345fe744df9a06997d366310e2cc24a7a12 100644 --- a/lite/kernels/cuda/CMakeLists.txt +++ b/lite/kernels/cuda/CMakeLists.txt @@ -6,6 +6,8 @@ message(STATUS "compile with lite CUDA kernels") # basic kernels add_kernel(mul_compute_cuda CUDA basic SRCS mul_compute.cc DEPS ${lite_kernel_deps} ${math_cuda}) +add_kernel(fc_compute_cuda CUDA basic SRCS fc_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) +add_kernel(matmul_compute_cuda CUDA basic SRCS matmul_compute.cc DEPS ${lite_kernel_deps} ${math_cuda}) add_kernel(search_group_padding_compute_cuda CUDA basic SRCS search_group_padding_compute.cu 
DEPS ${lite_kernel_deps}) add_kernel(io_copy_compute_cuda CUDA basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps}) add_kernel(leaky_relu_compute_cuda CUDA basic SRCS leaky_relu_compute.cu DEPS ${lite_kernel_deps}) @@ -34,7 +36,10 @@ add_kernel(bilinear_interp_compute_cuda CUDA basic SRCS bilinear_interp_compute. add_kernel(search_seq_depadding_compute_cuda CUDA extra SRCS search_seq_depadding_compute.cu DEPS ${lite_kernel_deps}) add_kernel(search_grnn_compute_cuda CUDA extra SRCS search_grnn_compute.cu DEPS ${lite_kernel_deps} cuda_gemm ${math_cuda}) add_kernel(sequence_reverse_compute_cuda CUDA extra SRCS sequence_reverse_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(sequence_pad_compute_cuda CUDA extra SRCS sequence_pad_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) +add_kernel(sequence_unpad_compute_cuda CUDA extra SRCS sequence_unpad_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) add_kernel(sequence_concat_compute_cuda CUDA extra SRCS sequence_concat_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(sequence_mask_compute_cuda CUDA extra SRCS sequence_mask_compute.cu DEPS ${lite_kernel_deps}) add_kernel(sequence_arithmetic_compute_cuda CUDA extra SRCS sequence_arithmetic_compute.cu DEPS ${lite_kernel_deps}) add_kernel(lookup_table_compute_cuda CUDA extra SRCS lookup_table_compute.cu DEPS ${lite_kernel_deps}) add_kernel(attention_padding_mask_compute_cuda CUDA extra SRCS attention_padding_mask_compute.cu DEPS ${lite_kernel_deps}) @@ -44,6 +49,8 @@ add_kernel(match_matrix_tensor_compute_cuda CUDA extra SRCS match_matrix_tensor_ add_kernel(search_aligned_mat_mul_compute_cuda CUDA extra SRCS search_aligned_mat_mul_compute.cc DEPS ${lite_kernel_deps} cuda_batched_gemm) add_kernel(search_seq_fc_compute_cuda CUDA extra SRCS search_seq_fc_compute.cu DEPS ${lite_kernel_deps} cuda_gemm) add_kernel(var_conv_2d_compute_cuda CUDA extra SRCS var_conv_2d_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) +add_kernel(topk_pooling_compute_cuda CUDA extra SRCS topk_pooling_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(assign_value_compute_cuda CUDA extra SRCS assign_value_compute.cu DEPS ${lite_kernel_deps}) # unit test lite_cc_test(calib_compute_cuda_test SRCS calib_compute_cuda_test.cc DEPS calib_compute_cuda) @@ -60,7 +67,9 @@ nv_test(concat_compute_cuda_test SRCS concat_compute_test.cc DEPS concat_compute nv_test(elementwise_compute_cuda_test SRCS elementwise_compute_test.cc DEPS elementwise_compute_cuda) nv_test(softmax_compute_cuda_test SRCS softmax_compute_test.cc DEPS softmax_compute_cuda) #nv_test(layout_cuda_test SRCS layout_compute_test.cc DEPS layout_compute_cuda) -nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda) +nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda) +nv_test(fc_compute_cuda_test SRCS fc_compute_test.cc DEPS fc_compute_cuda) +nv_test(matmul_compute_cuda_test SRCS matmul_compute_test.cc DEPS matmul_compute_cuda) nv_test(dropout_compute_cuda_test SRCS dropout_compute_test.cc DEPS dropout_compute_cuda ) nv_test(bilinear_interp_compute_cuda_test SRCS bilinear_interp_compute_test.cc DEPS bilinear_interp_compute_cuda) #nv_test(pool_compute_cuda_test SRCS pool_compute_test.cc DEPS pool_compute_cuda) @@ -74,9 +83,14 @@ if(LITE_BUILD_EXTRA) nv_test(search_aligned_mat_mul_compute_cuda_test SRCS search_aligned_mat_mul_compute_test.cc DEPS search_aligned_mat_mul_compute_cuda) nv_test(search_seq_fc_compute_cuda_test SRCS search_seq_fc_compute_test.cc DEPS search_seq_fc_compute_cuda) 
nv_test(sequence_reverse_compute_cuda_test SRCS sequence_reverse_compute_test.cc DEPS sequence_reverse_compute_cuda) + nv_test(sequence_pad_compute_cuda_test SRCS sequence_pad_compute_test.cc DEPS sequence_pad_compute_cuda) + nv_test(sequence_unpad_compute_cuda_test SRCS sequence_unpad_compute_test.cc DEPS sequence_unpad_compute_cuda) + nv_test(sequence_mask_compute_cuda_test SRCS sequence_mask_compute_test.cc DEPS sequence_mask_compute_cuda) nv_test(var_conv_2d_compute_cuda_test SRCS var_conv_2d_compute_test.cc DEPS var_conv_2d_compute_cuda) #nv_test(sequence_concat_compute_cuda_test SRCS sequence_concat_compute_test.cc DEPS sequence_concat_compute_cuda) #nv_test(attention_padding_mask_compute_cuda_test SRCS attention_padding_mask_compute_test.cc DEPS attention_padding_mask_compute_cuda) nv_test(sequence_arithmetic_compute_cuda_test SRCS sequence_arithmetic_compute_test.cc DEPS sequence_arithmetic_compute_cuda) #nv_test(search_fc_cuda_test SRCS search_fc_compute_test.cc DEPS search_fc_compute_cuda) + nv_test(topk_pooling_compute_cuda_test SRCS topk_pooling_compute_test.cc DEPS topk_pooling_compute_cuda) + nv_test(assign_value_compute_cuda_test SRCS assign_value_compute_test.cc DEPS assign_value_compute_cuda) endif() diff --git a/lite/kernels/cuda/assign_value_compute.cu b/lite/kernels/cuda/assign_value_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..89f2937f10399361951c3c8deb47e3700f93e288 --- /dev/null +++ b/lite/kernels/cuda/assign_value_compute.cu @@ -0,0 +1,76 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
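The new assign_value kernel materializes one of the op's host-side attribute vectors (fp32/int32/int64/bool, selected by the dtype attribute) into the output tensor; on CUDA this becomes an async host-to-device copy. A plain C++ sketch of the dtype dispatch, with std::memcpy standing in for the device copy and a local stand-in for lite::core::FluidType:

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Stand-in for the fluid proto dtype codes used by assign_value.
enum class FluidType : int { BOOL = 0, INT32 = 2, INT64 = 3, FP32 = 5 };

template <typename T>
void CopyToOutput(const std::vector<T>& src, void* dst) {
  std::memcpy(dst, src.data(), src.size() * sizeof(T));
}

int main() {
  int dtype = static_cast<int>(FluidType::FP32);
  std::vector<float> fp32_values = {5.f, 6.f, 7.f};
  std::vector<int32_t> int32_values = {0, 1, 2};

  std::vector<unsigned char> out(fp32_values.size() * sizeof(float));
  if (dtype == static_cast<int>(FluidType::FP32)) {
    CopyToOutput(fp32_values, out.data());
  } else if (dtype == static_cast<int>(FluidType::INT32)) {
    CopyToOutput(int32_values, out.data());
  } else {
    std::cerr << "Unsupported dtype for assign_value: " << dtype << "\n";
    return 1;
  }
  std::cout << reinterpret_cast<float*>(out.data())[0] << "\n";  // 5
}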
+ +#include +#include + +#include "lite/backends/cuda/target_wrapper.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +#include "lite/kernels/cuda/assign_value_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void TensorFromVector(const std::vector& src, + lite::Tensor* dst, + cudaStream_t* stream) { + auto* src_ptr = static_cast(src.data()); + auto* dst_ptr = static_cast(dst->mutable_data(TARGET(kCUDA))); + auto size = src.size() * sizeof(T); + TargetWrapperCuda::MemcpyAsync( + dst_ptr, src_ptr, size, IoDirection::HtoD, *stream); +} + +void AssignValueCompute::Run() { + auto& param = Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + int dtype = param.dtype; + std::vector fp32_values = param.fp32_values; + std::vector int32_values = param.int32_values; + std::vector int64_values = param.int64_values; + std::vector bool_values = param.bool_values; + auto* out = param.Out; + + if (dtype == static_cast(lite::core::FluidType::INT32)) { + TensorFromVector(int32_values, out, &stream); + } else if (dtype == static_cast(lite::core::FluidType::FP32)) { + TensorFromVector(fp32_values, out, &stream); + } else if (dtype == static_cast(lite::core::FluidType::INT64)) { + TensorFromVector(int64_values, out, &stream); + } else if (dtype == static_cast(lite::core::FluidType::BOOL)) { + TensorFromVector(bool_values, out, &stream); + } else { + LOG(FATAL) << "Unsupported dtype for assign_value_op:" << dtype; + } + return; +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(assign_value, + kCUDA, + kAny, + kNCHW, + paddle::lite::kernels::cuda::AssignValueCompute, + def) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kAny))}) + .Finalize(); diff --git a/lite/kernels/cuda/assign_value_compute.h b/lite/kernels/cuda/assign_value_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..c334e36d8061437881a4ea67d960f87b7ffb3ceb --- /dev/null +++ b/lite/kernels/cuda/assign_value_compute.h @@ -0,0 +1,34 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class AssignValueCompute : public KernelLite { + public: + using param_t = operators::AssignValueParam; + + void Run() override; + virtual ~AssignValueCompute() = default; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/assign_value_compute_test.cc b/lite/kernels/cuda/assign_value_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c29426b745e92f71bcfeca6a8fc2890cd1908b4 --- /dev/null +++ b/lite/kernels/cuda/assign_value_compute_test.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/assign_value_compute.h" + +#include + +#include +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class AssignValueTest : public ::testing::Test { + protected: + AssignValueTest() : dtype_(5), shape_({1}) { + int num = std::accumulate( + shape_.begin(), shape_.end(), 1, std::multiplies()); + fp32_values_.resize(num); + int32_values_.resize(num); + int64_values_.resize(num); + bool_values_.resize(num); + for (int i = 0; i < num; ++i) { + fp32_values_[i] = i + 5; + int32_values_[i] = i; + int64_values_[i] = i; + bool_values_[i] = i; + } + std::vector out_shape(shape_.size(), 0); + for (size_t i = 0; i < shape_.size(); ++i) out_shape[i] = shape_[i]; + out_ref_.Resize(lite::DDim(out_shape)); + out_gpu_.Resize(out_ref_.dims()); + out_cpu_.Resize(out_ref_.dims()); + + RunBaseLine(&out_ref_); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.shape = shape_; + param_.dtype = dtype_; + param_.fp32_values = fp32_values_; + param_.int32_values = int32_values_; + param_.int64_values = int64_values_; + param_.bool_values = bool_values_; + param_.Out = &out_gpu_; + } + + void InitFloatInput() {} + + void InitHalfInput() {} + + void RunBaseLine(lite::Tensor* out) { + if (dtype_ == static_cast(lite::core::FluidType::INT32)) { + for (size_t i = 0; i < int32_values_.size(); ++i) { + out->mutable_data()[i] = int32_values_[i]; + } + } else if (dtype_ == static_cast(lite::core::FluidType::FP32)) { + for (size_t i = 0; i < fp32_values_.size(); ++i) { + out->mutable_data()[i] = fp32_values_[i]; + } + } else if (dtype_ == static_cast(lite::core::FluidType::INT64)) { + for (size_t i = 0; i < int64_values_.size(); ++i) { + out->mutable_data()[i] = int64_values_[i]; + } + } else if (dtype_ == static_cast(lite::core::FluidType::BOOL)) { + for (size_t i = 0; i < bool_values_.size(); ++i) { + out->mutable_data()[i] = bool_values_[i]; + } + } else { + LOG(FATAL) << "Unsupported dtype_ for assign_value_op:" << dtype_; + } + } + + int dtype_; + std::vector shape_; + std::vector fp32_values_; + std::vector int32_values_; + std::vector int64_values_; + std::vector bool_values_; + + lite::Tensor out_ref_; + lite::Tensor out_gpu_; + lite::Tensor out_cpu_; + + operators::AssignValueParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(AssignValueTest, fp32) { + InitFloatInput(); + AssignValueCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + 
cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + EXPECT_NEAR(out_cpu_.data()[i], out_ref_.data()[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/fc_compute.cu b/lite/kernels/cuda/fc_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..0ad376577b133540b782e2726564302a95ddf216 --- /dev/null +++ b/lite/kernels/cuda/fc_compute.cu @@ -0,0 +1,353 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "lite/kernels/cuda/fc_compute.h" + +#include + +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +struct FcTypeTraits; + +template <> +struct FcTypeTraits { + typedef float4 Type; +}; + +template +__global__ void AddBiasV2(const int num, const T* bias, T* data, int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const T bias_ptr = bias[bias_idx]; + const T in_ptr = data[index]; + T packed_val; + packed_val.x = in_ptr.x + bias_ptr.x; + packed_val.y = in_ptr.y + bias_ptr.y; + data[index] = packed_val; + } +} + +template <> +__global__ void AddBiasV2(const int num, + const half2* bias, + half2* data, + int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const half2 bias_ptr = bias[bias_idx]; + const half2 in_ptr = data[index]; +#if __CUDA_ARCH__ >= 530 + data[index] = __hadd2(in_ptr, bias_ptr); +#else + half2 packed_val; + packed_val.x = __hadd(in_ptr.x, bias_ptr.x); + packed_val.y = __hadd(in_ptr.y, bias_ptr.y); + data[index] = packed_val; +#endif + } +} + +template +__global__ void AddBiasReluV2(const int num, const T* bias, T* data, int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const T bias_ptr = bias[bias_idx]; + const T in_ptr = data[index]; + T packed_val; + packed_val.x = fmaxf(0.f, in_ptr.x + bias_ptr.x); + packed_val.y = fmaxf(0.f, in_ptr.y + bias_ptr.y); + data[index] = packed_val; + } +} + +template <> +__global__ void AddBiasReluV2(const int num, + const half2* bias, + half2* data, + int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const half2 bias_ptr = bias[bias_idx]; + const half2 in_ptr = data[index]; +#if __CUDA_ARCH__ >= 530 + data[index] = __hmul2(__hgt2(in_ptr + bias_ptr, __float2half2_rn(0.f)), + in_ptr + bias_ptr); +#else + const float2 bias = __half22float2(bias_ptr); + const float2 in = __half22float2(in_ptr); + data[index] = __floats2half2_rn( + bias.x + in.x > 0.0f ? static_cast(bias.x + in.x) : 0.0f, + bias.y + in.y > 0.0f ? 
static_cast(bias.y + in.y) : 0.0f); +#endif + } +} + +template +__global__ void AddBiasV4(const int num, const T* bias, T* data, int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const T bias_ptr = bias[bias_idx]; + const T in_ptr = data[index]; + T packed_val; + packed_val.x = in_ptr.x + bias_ptr.x; + packed_val.y = in_ptr.y + bias_ptr.y; + packed_val.z = in_ptr.z + bias_ptr.z; + packed_val.w = in_ptr.w + bias_ptr.w; + data[index] = packed_val; + } +} + +template +__global__ void AddBiasReluV4(const int num, const T* bias, T* data, int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const T bias_ptr = bias[bias_idx]; + const T in_ptr = data[index]; + T packed_val; + packed_val.x = fmaxf(0.f, in_ptr.x + bias_ptr.x); + packed_val.y = fmaxf(0.f, in_ptr.y + bias_ptr.y); + packed_val.z = fmaxf(0.f, in_ptr.z + bias_ptr.z); + packed_val.w = fmaxf(0.f, in_ptr.w + bias_ptr.w); + data[index] = packed_val; + } +} + +template +__global__ void AddBias(const int num, const T* bias, T* data) { + int offset = blockIdx.x * num; + + for (int i = threadIdx.x; i < num; i += blockDim.x) { + T temp; +#if __CUDA_ARCH__ >= 350 + temp = __ldg(data + offset + i) + __ldg(bias + i); +#else + temp = data[offset + i] + bias[i]; +#endif + data[offset + i] = temp; + } +} + +template <> +__global__ void AddBias(const int num, const half* bias, half* data) { + int offset = blockIdx.x * num; + + for (int i = threadIdx.x; i < num; i += blockDim.x) { + half temp; +#if __CUDA_ARCH__ >= 350 + temp = __hadd(__ldg(data + offset + i), __ldg(bias + i)); +#else + temp = __hadd(data[offset + i], bias[i]); +#endif + data[offset + i] = temp; + } +} + +template +__global__ void AddBiasRelu(const int num, const T* bias, T* data) { + int offset = blockIdx.x * num; + + for (int i = threadIdx.x; i < num; i += blockDim.x) { + T temp; +#if __CUDA_ARCH__ >= 350 + temp = __ldg(data + offset + i) + __ldg(bias + i); +#else + temp = data[offset + i] + bias[i]; +#endif + data[offset + i] = static_cast(temp > 0) * temp; + } +} + +template <> +__global__ void AddBiasRelu(const int num, const half* bias, half* data) { + int offset = blockIdx.x * num; + + for (int i = threadIdx.x; i < num; i += blockDim.x) { + half temp; +#if __CUDA_ARCH__ >= 350 + temp = __hadd(__ldg(data + offset + i), __ldg(bias + i)); +#else + temp = __hadd(data[offset + i], bias[i]); +#endif + +#if __CUDA_ARCH__ >= 530 + data[offset + i] = + __hgt(temp, __float2half(0.0f)) ? temp : __float2half(0.0f); +#else + data[offset + i] = + __float2half(__half2float(temp) > 0.f ? __half2float(temp) : 0.f); +#endif + } +} + +template +void FcCompute::PrepareForRun() { + gemm_impl_.reset(new lite::cuda::math::Gemm); +} + +template +void FcCompute::Run() { + auto& context = this->ctx_->template As(); + auto stream = context.exec_stream(); + auto& param = this->template Param(); + + const auto* x_data = param.input->template data(); + const auto* w_data = param.w->template data(); + const auto* b_data = param.bias ? 
param.bias->template data() : nullptr; + + auto out_vec = param.output->dims().Vectorize(); + out_vec.back() = param.w->dims()[1]; + param.output->Resize(out_vec); + auto* out_data = param.output->template mutable_data(TARGET(kCUDA)); + + int in_num_col_dims = param.in_num_col_dims; + + int M = static_cast( + param.input->dims().Slice(0, param.in_num_col_dims).production()); + int K = static_cast( + param.input->dims() + .Slice(param.in_num_col_dims, param.input->dims().size()) + .production()); + int K2 = static_cast(param.w->dims()[0]); + int N = static_cast(param.w->dims()[1]); + CHECK_EQ(K, K2) << "x_w must be equal with y_h"; + + CHECK(gemm_impl_->init(false, false, M, N, K, &context)); + gemm_impl_->run(1.0f, 0.0f, x_data, w_data, out_data, &context); + + if (b_data == nullptr) { + return; + } + + std::string activation_type = param.activation_type; + if (N % 4 == 0) { + const int threads = 256; + const int num = M * N / 4; + const int blocks = (num + threads - 1) / threads; + typedef typename FcTypeTraits::Type trans_type; + const auto* bias_ptr_v4 = reinterpret_cast(b_data); + auto* data_ptr_v4 = reinterpret_cast(out_data); + if (activation_type == "relu") { + AddBiasReluV4<<>>( + num, bias_ptr_v4, data_ptr_v4, N / 4); + } else if (activation_type == "") { + AddBiasV4<<>>( + num, bias_ptr_v4, data_ptr_v4, N / 4); + } else { + LOG(FATAL) << "not supported activation type: " << activation_type; + } + } else { + const int threads = 256; + const int blocks = M; + if (activation_type == "relu") { + AddBiasRelu<<>>(N, b_data, out_data); + } else if (activation_type == "") { + AddBias<<>>(N, b_data, out_data); + } else { + LOG(FATAL) << "not supported activation type: " << activation_type; + } + } +} + +template <> +void FcCompute::Run() { + auto& context = this->ctx_->template As(); + auto stream = context.exec_stream(); + auto& param = this->template Param(); + + const auto* x_data = param.input->template data(); + const auto* w_data = param.w->template data(); + const auto* b_data = param.bias ? 
param.bias->template data() : nullptr; + + auto out_vec = param.output->dims().Vectorize(); + out_vec.back() = param.w->dims()[1]; + param.output->Resize(out_vec); + auto* out_data = param.output->template mutable_data(TARGET(kCUDA)); + + int in_num_col_dims = param.in_num_col_dims; + + int M = static_cast( + param.input->dims().Slice(0, param.in_num_col_dims).production()); + int K = static_cast( + param.input->dims() + .Slice(param.in_num_col_dims, param.input->dims().size()) + .production()); + int K2 = static_cast(param.w->dims()[0]); + int N = static_cast(param.w->dims()[1]); + CHECK_EQ(K, K2) << "x_w must be equal with y_h"; + + CHECK(gemm_impl_->init(false, false, M, N, K, &context)); + gemm_impl_->run(1.0f, 0.0f, x_data, w_data, out_data, &context); + + if (b_data == nullptr) { + return; + } + + std::string activation_type = param.activation_type; + if (N % 2 == 0) { + const int threads = 256; + const int num = M * N / 2; + const int blocks = (num + threads - 1) / threads; + const auto* bias_ptr_v2 = reinterpret_cast(b_data); + auto* data_ptr_v2 = reinterpret_cast(out_data); + if (activation_type == "relu") { + AddBiasReluV2<<>>( + num, bias_ptr_v2, data_ptr_v2, N / 2); + } else if (activation_type == "") { + AddBiasV2<<>>( + num, bias_ptr_v2, data_ptr_v2, N / 2); + } else { + LOG(FATAL) << "not supported activation type: " << activation_type; + } + } else { + const int threads = 256; + const int blocks = M; + if (activation_type == "relu") { + AddBiasRelu<<>>(N, b_data, out_data); + } else if (activation_type == "") { + AddBias<<>>(N, b_data, out_data); + } else { + LOG(FATAL) << "not supported activation type: " << activation_type; + } + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using FcFp32 = paddle::lite::kernels::cuda::FcCompute; + +using FcFp16 = paddle::lite::kernels::cuda::FcCompute; + +REGISTER_LITE_KERNEL(fc, kCUDA, kFloat, kNCHW, FcFp32, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(fc, kCUDA, kFP16, kNCHW, FcFp16, def) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/fc_compute.h b/lite/kernels/cuda/fc_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..700194c115824762411e952c77d06cb01a754bc0 --- /dev/null +++ b/lite/kernels/cuda/fc_compute.h @@ -0,0 +1,45 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
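The fp32 and fp16 paths above share one structure: a plain GEMM (out = X·W) followed by a fused bias/activation epilogue, where the packed AddBias[Relu]V4 / AddBias[Relu]V2 kernels are used only when N is divisible by 4 (float4) or 2 (half2), and one block per output row otherwise falls back to the scalar AddBias[Relu] kernels. The following host-side sketch is not part of the patch; the function name is illustrative, and it only mirrors the index arithmetic of the epilogue (bias broadcast over the M rows, optional clamp at zero).

```cpp
// Host reference for the bias + ReLU epilogue. The device version covers the
// same M*N elements with 256-thread blocks ((M*N/4 + 255)/256 blocks in the
// packed float4 case), selecting the bias with (index % K) where K = N/4.
#include <algorithm>
#include <cassert>
#include <vector>

void AddBiasReluRef(int M, int N, const std::vector<float>& bias,
                    std::vector<float>* out) {  // out holds the GEMM result
  assert(static_cast<int>(bias.size()) == N);
  assert(static_cast<int>(out->size()) == M * N);
  for (int i = 0; i < M * N; ++i) {
    const int col = i % N;  // broadcast the bias across rows
    (*out)[i] = std::max(0.f, (*out)[i] + bias[col]);
  }
}
```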
+ +#pragma once +#include + +#include "lite/backends/cuda/math/gemm.h" +#include "lite/core/kernel.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class FcCompute : public KernelLite { + public: + using param_t = operators::FcParam; + + void PrepareForRun() override; + + void Run() override; + + virtual ~FcCompute() = default; + + private: + std::unique_ptr> gemm_impl_{nullptr}; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/fc_compute_test.cc b/lite/kernels/cuda/fc_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..fa0dada729ca01cb1a4176ca585ce8f921f3aa42 --- /dev/null +++ b/lite/kernels/cuda/fc_compute_test.cc @@ -0,0 +1,231 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/fc_compute.h" + +#include +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class FcTest : public ::testing::Test { + protected: + FcTest() + : m_(8), + k_(16), + n_(64), + in_num_col_dims_(1), + act_type_("relu"), + x_shape_({m_, k_}), + w_shape_({k_, n_}), + b_shape_({n_}), + out_shape_({m_, n_}) { + x_ref_.Resize(lite::DDim(x_shape_)); + x_gpu_.Resize(lite::DDim(x_shape_)); + + w_ref_.Resize(lite::DDim(w_shape_)); + w_gpu_.Resize(lite::DDim(w_shape_)); + + b_ref_.Resize(lite::DDim(b_shape_)); + b_gpu_.Resize(lite::DDim(b_shape_)); + + auto x_ref_data = x_ref_.mutable_data(); + auto w_ref_data = w_ref_.mutable_data(); + auto b_ref_data = b_ref_.mutable_data(); + + // prepare input + for (int64_t i = 0; i < x_ref_.numel(); i++) { + x_ref_data[i] = static_cast(i % 10 * 0.2); + } + for (int64_t i = 0; i < w_ref_.numel(); i++) { + w_ref_data[i] = static_cast(i % 10 * 0.2); + } + for (int64_t i = 0; i < b_ref_.numel(); i++) { + b_ref_data[i] = static_cast(i % 10 * 0.2); + } + + out_ref_.Resize(lite::DDim(out_shape_)); + out_cpu_.Resize(out_ref_.dims()); + out_gpu_.Resize(out_ref_.dims()); + RunBaseLine(&x_ref_, &w_ref_, &b_ref_, &out_ref_); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.input = &x_gpu_; + param_.w = &w_gpu_; + param_.bias = &b_gpu_; + param_.in_num_col_dims = in_num_col_dims_; + param_.activation_type = act_type_; + param_.output = &out_gpu_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + w_gpu_.Assign(w_ref_.data(), + w_gpu_.dims()); + b_gpu_.Assign(b_ref_.data(), + b_gpu_.dims()); + } + + void InitHalfInput() { + x_half_.Resize(lite::DDim(x_shape_)); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); i++) { + x_half_data[i] = 
half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + w_half_.Resize(w_ref_.dims()); + auto w_half_data = w_half_.mutable_data(); + for (int64_t i = 0; i < w_half_.numel(); i++) { + w_half_data[i] = half(lite::float16(w_ref_.data()[i])); + } + w_gpu_.Assign(w_half_data, w_gpu_.dims()); + b_half_.Resize(b_ref_.dims()); + auto b_half_data = b_half_.mutable_data(); + for (int64_t i = 0; i < b_half_.numel(); i++) { + b_half_data[i] = half(lite::float16(b_ref_.data()[i])); + } + b_gpu_.Assign(b_half_data, b_gpu_.dims()); + } + + void RunBaseLine(const lite::Tensor* x, + const lite::Tensor* w, + const lite::Tensor* b, + lite::Tensor* out) { + const float* data_in = x->data(); + const float* bias = b->data(); + const float* weights = w->data(); + float* data_out = out->mutable_data(); + int out_rows = x->dims()[0]; + int in_cols = x->numel() / out_rows; + int out_cols = w->numel() / in_cols; + int index_out; + for (int i = 0; i < out_rows; i++) { + for (int j = 0; j < out_cols; j++) { + index_out = i * out_cols + j; + data_out[index_out] = bias ? bias[j] : 0; + for (int k = 0; k < in_cols; k++) { + data_out[index_out] += + data_in[i * in_cols + k] * weights[k * out_cols + j]; + } + if (act_type_ == "relu") { + data_out[index_out] *= static_cast(data_out[index_out] > 0); + } + } + } + } + + int m_, k_, n_, in_num_col_dims_; + std::string act_type_; + std::vector x_shape_, w_shape_, b_shape_, out_shape_; + lite::Tensor x_ref_, w_ref_, b_ref_, out_ref_; + lite::Tensor x_gpu_, w_gpu_, b_gpu_; + lite::Tensor x_half_, w_half_, b_half_; + lite::Tensor out_cpu_, out_gpu_; + + operators::FcParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(FcTest, TestFP32) { + InitFloatInput(); + FcCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = out_cpu_.data()[i]; + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / ref, 0.f, 1e-5); + } +} + +TEST_F(FcTest, TestFP16) { + InitHalfInput(); + FcCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 2e-2); + } 
+} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/lookup_table_compute_test.cc b/lite/kernels/cuda/lookup_table_compute_test.cc index 9323de14eb168fb55a68640350b87bf7040f5729..89050ea97f160b2fddb479966f59c05aafd8c268 100644 --- a/lite/kernels/cuda/lookup_table_compute_test.cc +++ b/lite/kernels/cuda/lookup_table_compute_test.cc @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/cuda/lookup_table_compute.h" #include + #include #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/cuda/lookup_table_compute.h" namespace paddle { namespace lite { @@ -56,9 +58,7 @@ void LookupTableComputeRef(const operators::LookupTableParam& param) { } TEST(lookup_table_cuda, retrieve_op) { - auto lookup_table = - KernelRegistry::Global().Create( - "lookup_table"); + auto lookup_table = KernelRegistry::Global().Create("lookup_table"); ASSERT_FALSE(lookup_table.empty()); ASSERT_TRUE(lookup_table.front()); } diff --git a/lite/kernels/cuda/matmul_compute.cc b/lite/kernels/cuda/matmul_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..3b80b673dfabdccc7c728fa3081a81a870531acf --- /dev/null +++ b/lite/kernels/cuda/matmul_compute.cc @@ -0,0 +1,156 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/matmul_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void MatMulCompute::Run() { + auto& context = this->ctx_->template As(); + auto& param = this->template Param(); + + const auto* x_data = param.X->template data(); + const auto* y_data = param.Y->template data(); + auto* out_data = param.Out->template mutable_data(TARGET(kCUDA)); + bool transpose_x = param.transpose_X; + bool transpose_y = param.transpose_Y; + float alpha = param.alpha; + + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + + int m = 0; + int k = 0; + int n = 0; + int batch = 0; + int64_t stride_x = 0; + int64_t stride_y = 0; + + if (x_dims.size() >= 2 && y_dims.size() >= 2 && + (x_dims.size() != 2 || y_dims.size() != 2)) { + // x: [B, ..., M, K], y: [B, ..., K, N], out: [B, ..., M, N] + // x: [B, M, K], y: [K, N], out: [B, M, N] + // or + // x: [M, K], y: [B, ..., K, N], out: [B, ..., M, N] + // x: [M, K], y: [B, K, N], out: [B, M, N] + strided_gemm_impl_->init(transpose_x, transpose_y, &context); + m = transpose_x ? x_dims[x_dims.size() - 1] : x_dims[x_dims.size() - 2]; + k = transpose_x ? x_dims[x_dims.size() - 2] : x_dims[x_dims.size() - 1]; + n = transpose_y ? y_dims[y_dims.size() - 2] : y_dims[y_dims.size() - 1]; + int batch_x = x_dims.size() == 2 ? 0 : x_dims.count(0, x_dims.size() - 2); + int batch_y = y_dims.size() == 2 ? 
0 : y_dims.count(0, y_dims.size() - 2); + CHECK(batch_x == batch_y || batch_x == 0 || batch_y == 0) + << "batch_size x should be equal to batch_size y, or " + "one of batch_size x and batch_size y should be 0. " + "But got batch_size x = " + << batch_x << ", batch_size y = " << batch_y; + batch = batch_x == 0 ? batch_y : batch_x; + stride_x = x_dims.size() == 2 ? 0 : m * k; + stride_y = y_dims.size() == 2 ? 0 : k * n; + strided_gemm_impl_->run(alpha, + 0.f, + m, + n, + k, + x_data, + y_data, + out_data, + batch, + stride_x, + stride_y); + } else if (x_dims.size() == 2 && y_dims.size() == 2) { + // x: [M, K], y: [K, N], out: [M, N] + m = transpose_x ? x_dims[1] : x_dims[0]; + k = transpose_x ? x_dims[0] : x_dims[1]; + n = transpose_y ? y_dims[0] : y_dims[1]; + gemm_impl_->init(transpose_x, transpose_y, m, n, k, &context); + gemm_impl_->run(alpha, 0.0f, x_data, y_data, out_data, &context); + } else if (x_dims.size() > 2 && y_dims.size() == 1) { + // x: [B, M, K], y: [K], out: [B, M] + strided_gemm_impl_->init(transpose_x, transpose_y, &context); + m = transpose_x ? x_dims[x_dims.size() - 1] : x_dims[x_dims.size() - 2]; + k = transpose_x ? x_dims[x_dims.size() - 2] : x_dims[x_dims.size() - 1]; + n = 1; + batch = x_dims.count(0, x_dims.size() - 2); + stride_x = m * k; + stride_y = 0; + strided_gemm_impl_->run(alpha, + 0.f, + m, + n, + k, + x_data, + y_data, + out_data, + batch, + stride_x, + stride_y); + } else if (x_dims.size() == 1 && y_dims.size() == 1) { + if (!transpose_x && !transpose_y) { + // x: [K], y: [K], out: [1] + m = 1; + k = x_dims[0]; + n = 1; + CHECK_EQ(x_dims[0], y_dims[0]) + << "x_dims[0] should be equal to y_dims[0]"; + gemm_impl_->init(false, false, m, n, k, &context); + gemm_impl_->run(alpha, 0.0f, x_data, y_data, out_data, &context); + } else if (transpose_x && transpose_y) { + // x: [M], y: [N], x_transpose: true, y_transpose: true, out: [M, N] + m = x_dims[0]; + k = 1; + n = y_dims[0]; + gemm_impl_->init(false, false, m, n, k, &context); + gemm_impl_->run(alpha, 0.0f, x_data, y_data, out_data, &context); + } else { + LOG(FATAL) << "not supported x_dims(" << x_dims << ") and y_dims(" + << y_dims << "), transpose_x(" << transpose_x + << "), transpose_y(" << transpose_y << ")"; + } + } else { + LOG(FATAL) << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims + << ")"; + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using MatMulFp32 = + paddle::lite::kernels::cuda::MatMulCompute; + +using MatMulFp16 = + paddle::lite::kernels::cuda::MatMulCompute; + +REGISTER_LITE_KERNEL(matmul, kCUDA, kFloat, kNCHW, MatMulFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(matmul, kCUDA, kFP16, kNCHW, MatMulFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/matmul_compute.h b/lite/kernels/cuda/matmul_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..69ad178d9184b7c3893f49a23024a14d7466115b --- /dev/null +++ b/lite/kernels/cuda/matmul_compute.h @@ -0,0 +1,50 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/backends/cuda/math/gemm.h" +#include "lite/backends/cuda/math/strided_gemm.h" +#include "lite/core/kernel.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class MatMulCompute : public KernelLite { + public: + using param_t = operators::MatMulParam; + + void PrepareForRun() override { + strided_gemm_impl_.reset(new lite::cuda::math::StridedGemm); + gemm_impl_.reset(new lite::cuda::math::Gemm); + } + + void Run() override; + + virtual ~MatMulCompute() = default; + + private: + std::unique_ptr> strided_gemm_impl_{ + nullptr}; + std::unique_ptr> gemm_impl_{nullptr}; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/matmul_compute_test.cc b/lite/kernels/cuda/matmul_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..89f40af3920ba0d3e36781955ffbf5eaba093257 --- /dev/null +++ b/lite/kernels/cuda/matmul_compute_test.cc @@ -0,0 +1,193 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
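MatMulCompute::Run above picks between a strided (batched) GEMM and a plain GEMM purely from the operand ranks and transpose flags. Below is a compact sketch of that shape resolution for the batched branch; it is not part of the patch, uses plain std::vector<int64_t> dims instead of lite::DDim, an illustrative function name, and omits the batch-size consistency CHECK.

```cpp
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

struct GemmShape { int m, n, k, batch; int64_t stride_x, stride_y; };

// Mirrors the ">= 2-D" branch: x is [B, ..., M, K], y is [B, ..., K, N];
// a rank-2 operand gets batch 0 and stride 0, i.e. it is broadcast.
GemmShape ResolveBatchedShape(const std::vector<int64_t>& x_dims,
                              const std::vector<int64_t>& y_dims,
                              bool trans_x, bool trans_y) {
  const size_t xr = x_dims.size(), yr = y_dims.size();
  auto count = [](const std::vector<int64_t>& d, size_t end) {
    return static_cast<int>(std::accumulate(
        d.begin(), d.begin() + end, int64_t{1}, std::multiplies<int64_t>()));
  };
  GemmShape s{};
  s.m = static_cast<int>(trans_x ? x_dims[xr - 1] : x_dims[xr - 2]);
  s.k = static_cast<int>(trans_x ? x_dims[xr - 2] : x_dims[xr - 1]);
  s.n = static_cast<int>(trans_y ? y_dims[yr - 2] : y_dims[yr - 1]);
  const int batch_x = xr == 2 ? 0 : count(x_dims, xr - 2);
  const int batch_y = yr == 2 ? 0 : count(y_dims, yr - 2);
  s.batch = batch_x == 0 ? batch_y : batch_x;
  s.stride_x = xr == 2 ? 0 : static_cast<int64_t>(s.m) * s.k;
  s.stride_y = yr == 2 ? 0 : static_cast<int64_t>(s.k) * s.n;
  return s;
}
```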
+ +#include "lite/kernels/cuda/matmul_compute.h" + +#include +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class MatMulTest : public ::testing::Test { + protected: + MatMulTest() + : x_trans_(false), + y_trans_(true), + alpha_(1.0f), + x_shape_({4, 1, 2}), + y_shape_({4, 1, 2}), + out_shape_({4, 1, 1}) { + x_ref_.Resize(lite::DDim(x_shape_)); + x_gpu_.Resize(x_ref_.dims()); + + y_ref_.Resize(lite::DDim(y_shape_)); + y_gpu_.Resize(y_ref_.dims()); + + auto x_ref_data = x_ref_.mutable_data(); + auto y_ref_data = y_ref_.mutable_data(); + + // prepare input + for (int64_t i = 0; i < x_ref_.numel(); i++) { + x_ref_data[i] = static_cast(1); + } + for (int64_t i = 0; i < y_ref_.numel(); i++) { + y_ref_data[i] = static_cast(1); + } + + out_ref_.Resize(lite::DDim(out_shape_)); + out_cpu_.Resize(out_ref_.dims()); + out_gpu_.Resize(out_ref_.dims()); + RunBaseLine(); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.X = &x_gpu_; + param_.Y = &y_gpu_; + param_.transpose_X = x_trans_; + param_.transpose_Y = y_trans_; + param_.alpha = alpha_; + param_.Out = &out_gpu_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + y_gpu_.Assign(y_ref_.data(), + y_gpu_.dims()); + } + + void InitHalfInput() { + x_half_.Resize(x_ref_.dims()); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); ++i) { + x_half_data[i] = half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + y_half_.Resize(y_ref_.dims()); + auto y_half_data = y_half_.mutable_data(); + for (int64_t i = 0; i < y_half_.numel(); i++) { + y_half_data[i] = half(lite::float16(y_ref_.data()[i])); + } + y_gpu_.Assign(y_half_data, y_gpu_.dims()); + } + + void RunBaseLine() { + auto* out_data = out_ref_.mutable_data(); + for (int64_t i = 0; i < out_ref_.numel(); ++i) { + out_data[i] = 2; + } + } + + bool x_trans_, y_trans_; + float alpha_; + std::vector x_shape_, y_shape_, out_shape_; + lite::Tensor x_ref_, y_ref_, out_ref_; + lite::Tensor x_gpu_, y_gpu_; + lite::Tensor x_half_, y_half_; + lite::Tensor out_cpu_, out_gpu_; + + operators::MatMulParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(MatMulTest, TestFP32) { + InitFloatInput(); + MatMulCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = out_cpu_.data()[i]; + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / ref, 0.f, 1e-5); + } +} + +TEST_F(MatMulTest, TestFP16) { + InitHalfInput(); + MatMulCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + 
cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_mask_compute.cu b/lite/kernels/cuda/sequence_mask_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..8a8f292c103b8fb7b55940cf075d4b80b3fb328d --- /dev/null +++ b/lite/kernels/cuda/sequence_mask_compute.cu @@ -0,0 +1,102 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/sequence_mask_compute.h" + +#include +#include + +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +__global__ void SequenceMaskKernel(T* dst, + const int64_t* src, + int count, + int maxlen) { + CUDA_KERNEL_LOOP(index, count) { + int src_idx = index / maxlen; + int inner_idx = index % maxlen; + dst[index] = static_cast(inner_idx < src[src_idx] ? 
1 : 0); + } +} + +template +void SequenceMaskCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + const auto* x = param.X; + auto* x_data = x->template data(); + auto* y = param.Y; + int maxlen = param.maxlen; + + if (param.MaxLenTensor) { + auto* len_tensor_data = param.MaxLenTensor->template data(); + int32_t len_data{0}; + TargetWrapperCuda::MemcpySync( + &len_data, len_tensor_data, sizeof(int32_t), IoDirection::DtoH); + maxlen = len_data; + } + + if (maxlen < 0) { + maxlen = thrust::reduce( + x_data, x_data + x->numel(), 0, thrust::maximum()); + } + + auto y_dim = x->dims().Vectorize(); + y_dim.push_back(maxlen); + y->Resize(y_dim); + const int count = y->numel(); + auto* dst_data = y->template mutable_data(TARGET(kCUDA)); + if (param.out_dtype == 5) { + SequenceMaskKernel< + T><<>>( + dst_data, x_data, count, maxlen); + } else { + LOG(FATAL) << "not supported out_dtype: " << param.out_dtype; + } + CUDA_POST_KERNEL_CHECK; +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using SeqMaskFp32 = + paddle::lite::kernels::cuda::SequenceMaskCompute; + +using SeqMaskFp16 = + paddle::lite::kernels::cuda::SequenceMaskCompute; + +REGISTER_LITE_KERNEL(sequence_mask, kCUDA, kFloat, kNCHW, SeqMaskFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindInput("MaxLenTensor", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt32))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sequence_mask, kCUDA, kFP16, kNCHW, SeqMaskFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindInput("MaxLenTensor", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt32))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_mask_compute.h b/lite/kernels/cuda/sequence_mask_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..3611587f0ce7daef1a88f5b6a916e2d30d33bcc1 --- /dev/null +++ b/lite/kernels/cuda/sequence_mask_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
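For clarity, the kernel above implements the usual sequence_mask semantics: an int64 length vector x of shape [n] produces an [n, maxlen] mask with out[i][j] = (j < x[i]), and a negative maxlen is replaced by the maximum length (computed with thrust::reduce). A minimal CPU reference, not part of the patch and with an illustrative name:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<float> SequenceMaskRef(const std::vector<int64_t>& x, int maxlen) {
  if (maxlen < 0) {  // same fallback as the kernel: use the longest sequence
    maxlen = static_cast<int>(*std::max_element(x.begin(), x.end()));
  }
  std::vector<float> out(x.size() * maxlen, 0.f);
  for (size_t i = 0; i < x.size(); ++i) {
    for (int j = 0; j < maxlen; ++j) {
      out[i * maxlen + j] = j < x[i] ? 1.f : 0.f;
    }
  }
  return out;
}
// e.g. x = {3, 2, 1, 0}, maxlen = 4 yields rows 1110 / 1100 / 1000 / 0000,
// the fixture checked in sequence_mask_compute_test.cc below.
```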
+ +#pragma once +#include +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class SequenceMaskCompute : public KernelLite { + public: + using param_t = operators::SequenceMaskParam; + + void Run() override; + virtual ~SequenceMaskCompute() = default; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_mask_compute_test.cc b/lite/kernels/cuda/sequence_mask_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..efbdf2ae00b6d1d9353831e94a202e5e42228b62 --- /dev/null +++ b/lite/kernels/cuda/sequence_mask_compute_test.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/sequence_mask_compute.h" + +#include + +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequenceMaskTest : public ::testing::Test { + protected: + SequenceMaskTest() + : maxlen_(4), + out_dtype_(5), + x_data_({3, 2, 1, 0}), + out_shape_({static_cast(x_data_.size()), maxlen_}) { + x_ref_.Resize(lite::DDim({static_cast(x_data_.size())})); + x_gpu_.Resize(x_ref_.dims()); + + auto* x_ref_data = x_ref_.mutable_data(); + + // prepare input + for (size_t i = 0; i < x_data_.size(); i++) { + x_ref_data[i] = x_data_[i]; + } + + out_ref_.Resize(lite::DDim(out_shape_)); + out_gpu_.Resize(out_ref_.dims()); + out_cpu_.Resize(out_ref_.dims()); + RunBaseLine(&x_ref_, &out_ref_); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.X = &x_gpu_; + param_.Y = &out_gpu_; + param_.maxlen = maxlen_; + param_.out_dtype = out_dtype_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + } + + void InitHalfInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + } + + void RunBaseLine(const lite::Tensor* x, lite::Tensor* out) { + auto* out_data = out->mutable_data(); + + for (size_t i = 0; i < x_data_.size(); ++i) { + for (int j = 0; j < maxlen_; ++j) { + out_data[i * maxlen_ + j] = j < x_data_[i] ? 
1 : 0; + } + } + } + + int maxlen_, out_dtype_; + std::vector x_data_, out_shape_; + + lite::Tensor x_ref_, out_ref_; + lite::Tensor x_gpu_, out_gpu_; + lite::Tensor out_cpu_; + + operators::SequenceMaskParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(SequenceMaskTest, fp32) { + InitFloatInput(); + SequenceMaskCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + EXPECT_NEAR(out_cpu_.data()[i], out_ref_.data()[i], 1e-5); + } +} + +TEST_F(SequenceMaskTest, TestFP16) { + InitHalfInput(); + SequenceMaskCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_pad_compute.cu b/lite/kernels/cuda/sequence_pad_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..1e304f00633794dcac5d8ebfcd9d79defb4980f7 --- /dev/null +++ b/lite/kernels/cuda/sequence_pad_compute.cu @@ -0,0 +1,106 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/backends/cuda/math/sequence_padding.h" +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_pad_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void SequencePadCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + const auto* x = param.X; + const auto* pad_value = param.PadValue; + auto* out = param.Out; + auto* len_t = param.Length; + int padded_length = param.padded_length; + + int seq_num = x->lod()[0].size() - 1; + int max_seq_len = 0; + int step_width = x->numel() / x->dims()[0]; + + // calc for param.Lenght + seq_len_.resize(seq_num); + seq_offsets_vec_.resize(x->lod()[0].size()); + for (size_t i = 0; i < seq_num; ++i) { + max_seq_len = std::max( + max_seq_len, static_cast(x->lod()[0][i + 1] - x->lod()[0][i])); + seq_len_[i] = x->lod()[0][i + 1] - x->lod()[0][i]; + seq_offsets_vec_[i] = x->lod()[0][i]; + } + seq_offsets_vec_[seq_num] = x->lod()[0][seq_num]; + TargetWrapperCuda::MemcpyAsync( + len_t->template mutable_data(TARGET(kCUDA)), + seq_len_.data(), + sizeof(int64_t) * seq_len_.size(), + IoDirection::HtoD, + stream); + seq_offsets_.Resize({static_cast(x->lod()[0].size())}); + TargetWrapperCuda::MemcpyAsync( + seq_offsets_.mutable_data(TARGET(kCUDA)), + seq_offsets_vec_.data(), + sizeof(size_t) * seq_offsets_vec_.size(), + IoDirection::HtoD, + stream); + + const T* seq_data = x->template data(); + T* pad_data = out->template mutable_data(TARGET(kCUDA)); + const T* pad_value_data = pad_value->template data(); + + lite::cuda::math::SequencePadding(pad_data, + seq_data, + pad_value_data, + pad_value->numel() == 1, + seq_offsets_.data(), + seq_num, + padded_length, + step_width, + &stream); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using SeqPadFp32 = + paddle::lite::kernels::cuda::SequencePadCompute; + +using SeqPadFp16 = + paddle::lite::kernels::cuda::SequencePadCompute; + +REGISTER_LITE_KERNEL(sequence_pad, kCUDA, kFloat, kNCHW, SeqPadFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("PadValue", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Length", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sequence_pad, kCUDA, kFP16, kNCHW, SeqPadFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("PadValue", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Length", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_pad_compute.h b/lite/kernels/cuda/sequence_pad_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..c494fe127d4eb5a7c0ba77a5c76ab1d1d0c1f2f2 --- /dev/null +++ b/lite/kernels/cuda/sequence_pad_compute.h @@ -0,0 +1,41 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class SequencePadCompute : public KernelLite { + public: + using param_t = operators::SequencePadParam; + + void Run() override; + virtual ~SequencePadCompute() = default; + + private: + lite::Tensor seq_offsets_; + std::vector seq_len_; + std::vector seq_offsets_vec_; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_pad_compute_test.cc b/lite/kernels/cuda/sequence_pad_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..91141984c98d5d105f51d0acc247aa878ff219a7 --- /dev/null +++ b/lite/kernels/cuda/sequence_pad_compute_test.cc @@ -0,0 +1,233 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
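SequencePadCompute above copies each LoD sub-sequence of X into a fixed-length slot of the output and fills the remainder with PadValue, while also emitting the per-sequence lengths. The sketch below is not part of the patch: it uses an illustrative name, flat std::vector buffers instead of lite::Tensor, and a scalar pad value (the kernel additionally supports a per-feature pad vector, selected by the pad_value->numel() == 1 flag).

```cpp
#include <cstddef>
#include <vector>

// x is a flattened LoD tensor with row offsets `lod` (e.g. {0, 2, 5}) and
// `step` features per row; the result has shape [seq_num, padded_len, step].
std::vector<float> SequencePadRef(const std::vector<float>& x,
                                  const std::vector<size_t>& lod,
                                  int padded_len, int step, float pad_value) {
  const size_t seq_num = lod.size() - 1;
  std::vector<float> out(seq_num * padded_len * step, pad_value);
  for (size_t i = 0; i < seq_num; ++i) {
    const size_t len = lod[i + 1] - lod[i];
    for (size_t r = 0; r < len; ++r) {
      for (int c = 0; c < step; ++c) {
        out[(i * padded_len + r) * step + c] = x[(lod[i] + r) * step + c];
      }
    }
  }
  return out;
}
```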
+ +#include "lite/kernels/cuda/sequence_pad_compute.h" + +#include + +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequencePadTest : public ::testing::Test { + protected: + SequencePadTest() + : batch_(5), + features_(2), + padded_length_(3), + x_lod_({{0, 2, 5}}), + x_shape_({batch_, features_}), + pad_value_shape_({features_}), + out_shape_({static_cast(x_lod_[0].size() - 1), + padded_length_, + features_}) { + x_ref_.Resize(lite::DDim(x_shape_)); + x_ref_.set_lod(x_lod_); + x_gpu_.Resize(x_ref_.dims()); + + pad_value_ref_.Resize(lite::DDim(pad_value_shape_)); + pad_value_gpu_.Resize(pad_value_ref_.dims()); + + length_ref_.Resize( + lite::DDim({static_cast(x_lod_[0].size() - 1)})); + length_gpu_.Resize(length_ref_.dims()); + length_cpu_.Resize(length_ref_.dims()); + + auto x_ref_data = x_ref_.mutable_data(); + auto pad_value_ref_data = pad_value_ref_.mutable_data(); + + for (int64_t i = 0; i < x_ref_.numel(); i++) { + x_ref_data[i] = static_cast(i); + } + for (int64_t i = 0; i < pad_value_ref_.numel(); i++) { + pad_value_ref_data[i] = static_cast(i); + } + + out_ref_.Resize(lite::DDim(out_shape_)); + out_gpu_.Resize(out_ref_.dims()); + out_cpu_.Resize(out_ref_.dims()); + RunBaseLine(&x_ref_, &pad_value_ref_, &out_ref_, &length_ref_); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.X = &x_gpu_; + param_.PadValue = &pad_value_gpu_; + param_.Length = &length_gpu_; + param_.Out = &out_gpu_; + param_.padded_length = padded_length_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + x_gpu_.set_lod(x_ref_.lod()); + pad_value_gpu_.Assign( + pad_value_ref_.data(), pad_value_gpu_.dims()); + } + + void InitHalfInput() { + x_half_.Resize(lite::DDim(x_shape_)); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); i++) { + x_half_data[i] = half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + x_gpu_.set_lod(x_ref_.lod()); + pad_value_half_.Resize(pad_value_ref_.dims()); + auto pad_value_half_data = pad_value_half_.mutable_data(); + for (int64_t i = 0; i < pad_value_half_.numel(); i++) { + pad_value_half_data[i] = + half(lite::float16(pad_value_ref_.data()[i])); + } + pad_value_gpu_.Assign( + pad_value_half_data, pad_value_gpu_.dims()); + } + + void RunBaseLine(const lite::Tensor* x, + const lite::Tensor* pad_value, + lite::Tensor* out, + lite::Tensor* length) { + auto* length_data = length->mutable_data(); + auto* out_data = out->mutable_data(); + length_data[0] = 2; + length_data[1] = 3; + + for (size_t i = 0; i < 4; ++i) { + out_data[i] = i; + } + out_data[4] = 0; + out_data[5] = 1; + for (size_t i = 4; i < 10; ++i) { + out_data[2 + i] = i; + } + } + + int batch_, features_, padded_length_; + LoD x_lod_; + std::vector x_shape_, pad_value_shape_, out_shape_; + + lite::Tensor x_ref_, pad_value_ref_, out_ref_, length_ref_; + lite::Tensor x_gpu_, pad_value_gpu_, out_gpu_, length_gpu_; + lite::Tensor x_half_, pad_value_half_; + lite::Tensor out_cpu_, length_cpu_; + + operators::SequencePadParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(SequencePadTest, fp32) { + InitFloatInput(); + SequencePadCompute kernel; + kernel.SetParam(param_); 
+ kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + CopySync(length_cpu_.mutable_data(), + length_gpu_.data(), + sizeof(int64_t) * length_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + EXPECT_NEAR(out_cpu_.data()[i], out_ref_.data()[i], 1e-5); + } + for (int i = 0; i < length_gpu_.numel(); ++i) { + EXPECT_NEAR( + length_cpu_.data()[i], length_ref_.data()[i], 1e-5); + } +} + +TEST_F(SequencePadTest, TestFP16) { + InitHalfInput(); + SequencePadCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + const int64_t* length_gpu_data = length_gpu_.data(); + int64_t* length_cpu_data = length_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + CopySync(length_cpu_data, + length_gpu_data, + sizeof(int64_t) * length_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2); + } + for (int i = 0; i < length_gpu_.numel(); ++i) { + EXPECT_NEAR( + length_cpu_.data()[i], length_ref_.data()[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_unpad_compute.cu b/lite/kernels/cuda/sequence_unpad_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..bdedd74588884aa1e4b7f7c7ae3f414810b0826a --- /dev/null +++ b/lite/kernels/cuda/sequence_unpad_compute.cu @@ -0,0 +1,92 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "lite/backends/cuda/math/sequence_padding.h" +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_unpad_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void SequenceUnpadCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + const auto* pad_tensor = param.X; + const auto* len_t = param.Length; + auto* seq_tensor = param.Out; + + int padded_length = pad_tensor->dims()[1]; + int seq_num = seq_tensor->lod()[0].size() - 1; + int max_seq_len = 0; + int step_width = seq_tensor->numel() / seq_tensor->dims()[0]; + + seq_offsets_vec_.resize(seq_tensor->lod()[0].size()); + for (size_t i = 0; i < seq_num; ++i) { + max_seq_len = std::max(max_seq_len, + static_cast(seq_tensor->lod()[0][i + 1] - + seq_tensor->lod()[0][i])); + seq_offsets_vec_[i] = seq_tensor->lod()[0][i]; + } + seq_offsets_vec_[seq_num] = seq_tensor->lod()[0][seq_num]; + seq_offsets_.Resize({static_cast(seq_tensor->lod()[0].size())}); + TargetWrapperCuda::MemcpyAsync( + seq_offsets_.mutable_data(TARGET(kCUDA)), + seq_offsets_vec_.data(), + sizeof(size_t) * seq_offsets_vec_.size(), + IoDirection::HtoD, + stream); + + const T* pad_data = pad_tensor->template data(); + T* seq_data = seq_tensor->template mutable_data(TARGET(kCUDA)); + + lite::cuda::math::SequenceUnpadding(seq_data, + pad_data, + seq_offsets_.data(), + seq_num, + padded_length, + step_width, + &stream); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using SeqUnadFp32 = + paddle::lite::kernels::cuda::SequenceUnpadCompute; + +using SeqUnadFp16 = + paddle::lite::kernels::cuda::SequenceUnpadCompute; + +REGISTER_LITE_KERNEL(sequence_unpad, kCUDA, kFloat, kNCHW, SeqUnadFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Length", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sequence_unpad, kCUDA, kFP16, kNCHW, SeqUnadFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("Length", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_unpad_compute.h b/lite/kernels/cuda/sequence_unpad_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..f36520ea15c4ad504b2fd357d8729d6d0dbc2615 --- /dev/null +++ b/lite/kernels/cuda/sequence_unpad_compute.h @@ -0,0 +1,40 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
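SequenceUnpadCompute above is the inverse operation: it reads the padded [seq_num, padded_length, step_width] tensor and writes back only the rows covered by the output LoD. A matching CPU sketch, again with an illustrative name and flat buffers, following the same conventions as the pad sketch earlier:

```cpp
#include <cstddef>
#include <vector>

std::vector<float> SequenceUnpadRef(const std::vector<float>& padded,
                                    const std::vector<size_t>& lod,
                                    int padded_len, int step) {
  std::vector<float> out(lod.back() * step, 0.f);
  const size_t seq_num = lod.size() - 1;
  for (size_t i = 0; i < seq_num; ++i) {
    const size_t len = lod[i + 1] - lod[i];
    for (size_t r = 0; r < len; ++r) {
      for (int c = 0; c < step; ++c) {
        out[(lod[i] + r) * step + c] = padded[(i * padded_len + r) * step + c];
      }
    }
  }
  return out;
}
```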
+ +#pragma once +#include +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class SequenceUnpadCompute : public KernelLite { + public: + using param_t = operators::SequenceUnpadParam; + + void Run() override; + virtual ~SequenceUnpadCompute() = default; + + private: + lite::Tensor seq_offsets_; + std::vector seq_offsets_vec_; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_unpad_compute_test.cc b/lite/kernels/cuda/sequence_unpad_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..417115a50b6d086bd628a0b93a7d45c688ea18af --- /dev/null +++ b/lite/kernels/cuda/sequence_unpad_compute_test.cc @@ -0,0 +1,198 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/sequence_unpad_compute.h" + +#include + +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequenceUnpadTest : public ::testing::Test { + protected: + SequenceUnpadTest() + : batch_(5), + features_(2), + padded_length_(3), + out_lod_({{0, 2, 5}}), + x_shape_({static_cast(out_lod_[0].size() - 1), + padded_length_, + features_}), + out_shape_({batch_, features_}) { + x_ref_.Resize(lite::DDim(x_shape_)); + x_gpu_.Resize(x_ref_.dims()); + + length_ref_.Resize( + lite::DDim({static_cast(out_lod_[0].size() - 1)})); + length_gpu_.Resize(length_ref_.dims()); + + auto* x_ref_data = x_ref_.mutable_data(); + auto* length_ref_data = length_ref_.mutable_data(); + + // prepare input + for (int64_t i = 0; i < x_ref_.numel(); i++) { + x_ref_data[i] = static_cast(i); + } + for (size_t i = 0; i < out_lod_[0].size() - 1; ++i) { + length_ref_data[i] = out_lod_[0][i + 1] - out_lod_[0][i]; + } + + out_ref_.Resize(lite::DDim(out_shape_)); + out_ref_.set_lod(out_lod_); + out_gpu_.Resize(out_ref_.dims()); + out_gpu_.set_lod(out_ref_.lod()); + out_cpu_.Resize(out_ref_.dims()); + out_cpu_.set_lod(out_ref_.lod()); + + RunBaseLine(&x_ref_, &length_ref_, &out_ref_); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.X = &x_gpu_; + param_.Length = &length_gpu_; + param_.Out = &out_gpu_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + length_gpu_.Assign( + length_ref_.data(), length_gpu_.dims()); + } + + void InitHalfInput() { + x_half_.Resize(lite::DDim(x_shape_)); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); i++) { + x_half_data[i] = half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + length_gpu_.Assign( + length_ref_.data(), 
length_gpu_.dims()); + } + + void RunBaseLine(const lite::Tensor* X, + const lite::Tensor* Length, + lite::Tensor* Out) { + auto* out_data = Out->mutable_data(); + + for (size_t i = 0; i < 4; ++i) { + out_data[i] = i; + } + for (size_t i = 6; i < 12; ++i) { + out_data[i - 2] = i; + } + } + + int batch_, features_, padded_length_; + LoD out_lod_; + std::vector x_shape_, out_shape_; + + lite::Tensor x_ref_, out_ref_, length_ref_; + lite::Tensor x_gpu_, out_gpu_, length_gpu_; + lite::Tensor x_half_; + lite::Tensor out_cpu_, length_cpu_; + + operators::SequencePadParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(SequenceUnpadTest, fp32) { + InitFloatInput(); + SequenceUnpadCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + EXPECT_NEAR(out_cpu_.data()[i], out_ref_.data()[i], 1e-5); + } +} + +TEST_F(SequenceUnpadTest, TestFP16) { + InitHalfInput(); + SequenceUnpadCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/topk_pooling_compute.cu b/lite/kernels/cuda/topk_pooling_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..bb4499b637a1435dec2dc913bf8141edd60130fc --- /dev/null +++ b/lite/kernels/cuda/topk_pooling_compute.cu @@ -0,0 +1,200 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
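The topk_pooling kernel introduced next reduces each (batch, channel) window, whose valid height and width come from the LoD offsets of Y and X, to its k largest values in descending order, using shared memory for the repeated max-reduction. A CPU sketch of the intended result follows; the name is illustrative, and zero-filling the tail when the window holds fewer than k elements is an assumption here, since the corresponding loop in the CUDA kernel is commented out and the output is relied on to be zero-initialised.

```cpp
#include <algorithm>
#include <functional>
#include <vector>

// `in` points at one (batch, channel) plane; rows are width_stride apart.
void TopkPoolRef(const float* in, int width_stride, int height, int width,
                 int k, float* out /* k values, descending */) {
  std::vector<float> vals;
  vals.reserve(height * width);
  for (int h = 0; h < height; ++h) {
    for (int w = 0; w < width; ++w) {
      vals.push_back(in[h * width_stride + w]);
    }
  }
  const int real_k = std::min<int>(k, static_cast<int>(vals.size()));
  std::partial_sort(vals.begin(), vals.begin() + real_k, vals.end(),
                    std::greater<float>());
  for (int i = 0; i < k; ++i) {
    out[i] = i < real_k ? vals[i] : 0.f;
  }
}
```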
+ +#include "lite/kernels/cuda/topk_pooling_compute.h" + +#include +#include + +#include "lite/backends/cuda/target_wrapper.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +__global__ void top_k_pooling_batch_kernel_reduction(Dtype *output_data, + const Dtype *input, + const int *height_offset, + const int *width_offset, + const int batch_size, + const int channel_num, + const int height_stride, + const int width_stride, + const int k) { + const Dtype *input_start = + input + + (blockIdx.x * channel_num + blockIdx.y) * height_stride * width_stride; + Dtype *output_start = + output_data + (blockIdx.x * channel_num + blockIdx.y) * k; + + int width = width_offset[blockIdx.x + 1] - width_offset[blockIdx.x]; + int height = height_offset[blockIdx.x + 1] - height_offset[blockIdx.x]; + int real_k = k < height * width ? k : height * width; + + extern __shared__ Dtype smem[]; + + Dtype min_val = -100000.0f; + for (int j = threadIdx.x; j < height * width; j += blockDim.x) { + int index_tmp = (j / width) * width_stride + j % width; + smem[j] = input_start[index_tmp]; + } + __syncthreads(); + + // get max val + int t = 0; + for (; t < real_k; ++t) { + // reduction + for (int gap = height * width; gap > 1;) { + if (threadIdx.x == 0) { // edge cond + if (gap % 2 != 0) { + Dtype value_first = smem[0]; + Dtype value_gap = smem[gap - 1]; + if (value_first < value_gap) { + smem[0] = value_gap; + smem[gap - 1] = value_first; + } + } + } + gap >>= 1; + for (int j = threadIdx.x; j < gap; j += blockDim.x) { + Dtype value_first = smem[j]; + Dtype value_gap = smem[j + gap]; + if (value_first < value_gap) { + smem[j] = value_gap; + smem[j + gap] = value_first; + } + } + __syncthreads(); + } + if (threadIdx.x == 0) { + output_start[t] = smem[0]; + smem[0] = min_val; + } + __syncthreads(); + } + for (int i = threadIdx.x; i < (k - t); i += blockDim.x) { + // output_start[t + i] = 0.0f; + } +} + +template +void TopkPoolingCompute::PrepareForRun() { + int device_id = lite::TargetWrapperCuda::GetCurDevice(); + cudaDeviceProp deviceProp; + CUDA_CALL(cudaGetDeviceProperties(&deviceProp, device_id)); + _shared_mem_size = deviceProp.sharedMemPerBlock; +} + +template +void TopkPoolingCompute::Run() { + auto ¶m = this->Param(); + auto &ctx = this->ctx_->template As(); + auto cuda_stream = ctx.exec_stream(); + + CHECK(param.X->lod().size() > 0 && param.X->lod()[0].size() > 0) + << "X sequence offset is not valid"; + CHECK(param.Y->lod().size() > 0 && param.Y->lod()[0].size() > 0) + << "Y sequence offset is not valid"; + + int width_offset_len = param.X->lod()[0].size(); + lite::DDim width_offset_shape(std::vector{width_offset_len}); + _width_offset.Resize(width_offset_shape); + std::vector width_lod_0(width_offset_len, 0); + for (size_t i = 0; i < param.X->lod()[0].size(); ++i) { + width_lod_0[i] = static_cast(param.X->lod()[0][i]); + } + lite::TargetWrapperCuda::MemcpyAsync( + _width_offset.mutable_data(TARGET(kCUDA)), + width_lod_0.data(), + sizeof(int) * width_offset_len, + lite::IoDirection::HtoD, + cuda_stream); + + int height_offset_len = param.Y->lod()[0].size(); + lite::DDim height_offset_shape(std::vector{height_offset_len}); + _height_offset.Resize(height_offset_shape); + std::vector height_lod_0(height_offset_len, 0); + for (size_t i = 0; i < param.Y->lod()[0].size(); ++i) { + height_lod_0[i] = static_cast(param.Y->lod()[0][i]); + } + lite::TargetWrapperCuda::MemcpyAsync( + _height_offset.mutable_data(TARGET(kCUDA)), + 
height_lod_0.data(), + sizeof(int) * height_offset_len, + lite::IoDirection::HtoD, + cuda_stream); + + const Tensor *x_tensor = param.X; + Tensor *out_tensor = param.Out; + const T *in_data = x_tensor->data(); + T *out_data = out_tensor->mutable_data(TARGET(kCUDA)); + + int num = x_tensor->dims()[0]; + int channel = x_tensor->dims()[1]; + int height = x_tensor->dims()[2]; + int width = x_tensor->dims()[3]; + + const int *height_offset = _height_offset.data(); + const int *width_offset = _width_offset.data(); + + int feat_map_size = height * width; + + if (feat_map_size * sizeof(T) <= _shared_mem_size) { + dim3 blocks(num, channel); + dim3 threads(32, 1); + + top_k_pooling_batch_kernel_reduction< + T><<>>( + out_data, + in_data, + height_offset, + width_offset, + num, + channel, + height, + width, + param.top_k); + } else { + LOG(FATAL) << "Not implemented. Exceeded the shared memory limit."; + } + CUDA_POST_KERNEL_CHECK; +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(topk_pooling, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::TopkPoolingCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/topk_pooling_compute.h b/lite/kernels/cuda/topk_pooling_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..abf16163812a74de8ebb8cce0dd7d80469e0a7d8 --- /dev/null +++ b/lite/kernels/cuda/topk_pooling_compute.h @@ -0,0 +1,45 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/kernel.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class TopkPoolingCompute + : public KernelLite { + public: + using param_t = operators::TopkPoolingParam; + + void Run() override; + + void PrepareForRun() override; + + virtual ~TopkPoolingCompute() = default; + + protected: + lite::Tensor _height_offset; + lite::Tensor _width_offset; + int _shared_mem_size; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/topk_pooling_compute_test.cc b/lite/kernels/cuda/topk_pooling_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0fb5c29f25bba0b4cc00f3eb58fc1c0726e6b23b --- /dev/null +++ b/lite/kernels/cuda/topk_pooling_compute_test.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/topk_pooling_compute.h" + +#include + +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class TopkPooingTest : public ::testing::Test { + protected: + TopkPooingTest() + : num(2), + channels(4), + height(4), + width(4), + top_k(2), + feat_map_num(height * width), + x_lod({{0, 4, 7}}), + y_lod({{0, 4, 7}}), + x_shape({num, channels, height, width}), + out_shape({num, channels * top_k}) { + CHECK_EQ(x_lod[0].size(), num + 1) << "invalid input."; + for (size_t i = 1; i < x_lod[0].size(); ++i) { + CHECK_LE(x_lod[0][i] - x_lod[0][i - 1], height) << "invalid input."; + } + + X_gpu.Resize(lite::DDim(x_shape)); + X_ref.Resize(lite::DDim(x_shape)); + X_ref.set_lod(x_lod); + Y_gpu.Resize(lite::DDim(x_shape)); + Y_ref.Resize(lite::DDim(x_shape)); + Y_ref.set_lod(y_lod); + auto x_ref_data = X_ref.mutable_data(); + auto y_ref_data = Y_ref.mutable_data(); + + // prepare input + for (int64_t i = 0; i < X_ref.numel(); i++) { + x_ref_data[i] = static_cast(i % 16); + } + for (int64_t i = 0; i < Y_ref.numel(); i++) { + y_ref_data[i] = static_cast(i % 16); + } + + Out_ref.Resize(lite::DDim(out_shape)); + Out_gpu.Resize(lite::DDim(out_shape)); + Out_cpu.Resize(lite::DDim(out_shape)); + + device_init(); + } + + void device_init() { + ctx.reset(new KernelContext); + cudaStreamCreate(&stream); + param.X = &X_gpu; + param.Y = &Y_gpu; + param.Out = &Out_gpu; + param.top_k = top_k; + param.feat_map_num = feat_map_num; + } + + void float_data_init() { + X_gpu.Assign(X_ref.data(), + X_gpu.dims()); + X_gpu.set_lod(X_ref.lod()); + Y_gpu.Assign(Y_ref.data(), + Y_gpu.dims()); + Y_gpu.set_lod(Y_ref.lod()); + } + + void half_data_init() {} + + void cpu_base(const lite::Tensor* X, + const lite::Tensor* Y, + lite::Tensor* Out) {} + + int num, channels, height, width; + int top_k, feat_map_num; + std::vector> x_lod, y_lod; + std::vector x_shape, out_shape; + lite::Tensor X_ref, Y_ref, Out_ref; + lite::Tensor X_gpu, Y_gpu; + lite::Tensor Out_cpu, Out_gpu; + + operators::TopkPoolingParam param; + std::unique_ptr ctx; + cudaStream_t stream; +}; + +TEST_F(TopkPooingTest, fp32) { + float_data_init(); + auto& context = ctx->As(); + context.SetExecStream(stream); + TopkPoolingCompute kernel; + kernel.SetParam(param); + kernel.SetContext(std::move(ctx)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(Out_cpu.mutable_data(), + Out_gpu.data(), + sizeof(float) * Out_gpu.numel(), + IoDirection::DtoH); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/transpose_compute.cu 
b/lite/kernels/cuda/transpose_compute.cu index c5693c674c573d7c9f59034dd3c0985c9d94a22f..ec7ecd16e0daa9f9cb696224ae498825fe75c5b4 100644 --- a/lite/kernels/cuda/transpose_compute.cu +++ b/lite/kernels/cuda/transpose_compute.cu @@ -13,17 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "lite/kernels/cuda/transpose_compute.h" + #include + #include "lite/core/op_registry.h" -#include "lite/kernels/cuda/transpose_compute.h" namespace paddle { namespace lite { namespace kernels { namespace cuda { -void TransposeCompute::Run() { - auto& param = this->Param(); +template +void TransposeCompute::Run() { + auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); auto stream = ctx.exec_stream(); @@ -31,8 +34,8 @@ void TransposeCompute::Run() { lite::Tensor* Out = param.output; std::vector axes = param.axis; - const float* in = X->data(); - float* out = Out->mutable_data(TARGET(kCUDA)); + const T* in = X->template data(); + T* out = Out->mutable_data(TARGET(kCUDA)); int ndim = X->dims().size(); std::vector dims = X->dims().data(); @@ -40,7 +43,7 @@ void TransposeCompute::Run() { // NCHW -> NHWC if (axes.size() == 4 && axes[0] == 0 && axes[1] == 2 && axes[2] == 3 && axes[3] == 1) { - trans.NCHW2NHWC(dims[0], dims[1], dims[2] * dims[3], in, out, &stream); + trans_.NCHW2NHWC(dims[0], dims[1], dims[2] * dims[3], in, out, &stream); cudaError_t error = cudaGetLastError(); if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); return; @@ -49,13 +52,13 @@ void TransposeCompute::Run() { // NHWC -> NCHW if (axes.size() == 4 && axes[0] == 0 && axes[1] == 3 && axes[2] == 1 && axes[3] == 2) { - trans.NHWC2NCHW(dims[0], dims[3], dims[1] * dims[2], in, out, &stream); + trans_.NHWC2NCHW(dims[0], dims[3], dims[1] * dims[2], in, out, &stream); cudaError_t error = cudaGetLastError(); if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); return; } - trans.transpose(out, in, dims, axes, &stream); + trans_.transpose(out, in, dims, axes, &stream); cudaError_t error = cudaGetLastError(); if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); } @@ -65,34 +68,31 @@ void TransposeCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL(transpose, - kCUDA, - kFloat, - kNCHW, - paddle::lite::kernels::cuda::TransposeCompute, - def) +using TransFp32 = + paddle::lite::kernels::cuda::TransposeCompute; + +using TransFp16 = + paddle::lite::kernels::cuda::TransposeCompute; + +REGISTER_LITE_KERNEL(transpose, kCUDA, kFloat, kNCHW, TransFp32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) .Finalize(); -REGISTER_LITE_KERNEL(transpose2, - kCUDA, - kFloat, - kNCHW, - paddle::lite::kernels::cuda::TransposeCompute, - def) +REGISTER_LITE_KERNEL(transpose2, kCUDA, kFloat, kNCHW, TransFp32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kCUDA))}) .Finalize(); -// REGISTER_LITE_KERNEL(transpose2, -// kCUDA, -// kFloat, -// kNCHW, -// paddle::lite::kernels::cuda::TransposeCompute, -// def) -// .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) -// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) -// .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kCUDA))}) -// .Finalize(); +REGISTER_LITE_KERNEL(transpose, kCUDA, kFP16, kNCHW, TransFp16, def) + .BindInput("X", 
{LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL(transpose2, kCUDA, kFP16, kNCHW, TransFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("XShape", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/transpose_compute.h b/lite/kernels/cuda/transpose_compute.h index 273d072231fb0608deb9ed729bdf153395ee983f..7e373c3b26c1701cd467148a06466a86f04e0c95 100644 --- a/lite/kernels/cuda/transpose_compute.h +++ b/lite/kernels/cuda/transpose_compute.h @@ -21,7 +21,8 @@ namespace lite { namespace kernels { namespace cuda { -class TransposeCompute : public KernelLite { +template +class TransposeCompute : public KernelLite { public: using param_t = operators::TransposeParam; @@ -29,7 +30,7 @@ class TransposeCompute : public KernelLite { virtual ~TransposeCompute() = default; private: - lite::cuda::math::Transpose trans; + lite::cuda::math::Transpose trans_; }; } // namespace cuda diff --git a/lite/kernels/cuda/transpose_compute_test.cc b/lite/kernels/cuda/transpose_compute_test.cc index bf0d803a14a5f0e47c96128b953ae72a18798205..89654dd9c8a200f5672f23bd08c32b40b9b6f99e 100644 --- a/lite/kernels/cuda/transpose_compute_test.cc +++ b/lite/kernels/cuda/transpose_compute_test.cc @@ -13,11 +13,16 @@ // limitations under the License. #include "lite/kernels/cuda/transpose_compute.h" + #include #include #include #include +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/utils/float16.h" + namespace paddle { namespace lite { namespace kernels { @@ -31,9 +36,9 @@ namespace { #define OUT(n, c, h, w) \ output_data[w + h * output_w + c * output_h * output_w + \ n * output_c * output_h * output_w] -void nchw2nhwc_ref(lite::Tensor* input, - lite::Tensor* output, - const std::vector axies) { +void Nchw2nhwcBaseLine(lite::Tensor* input, + lite::Tensor* output, + const std::vector axies) { auto* input_data = input->data(); auto* output_data = output->mutable_data(); @@ -64,9 +69,9 @@ void nchw2nhwc_ref(lite::Tensor* input, #define OUT(n, h, w, c) \ output_data[c + w * output_c + h * output_w * output_c + \ n * output_h * output_w * output_c] -void nhwc2nchw_ref(lite::Tensor* input, - lite::Tensor* output, - const std::vector axies) { +void Nhwc2nchwBaseLine(lite::Tensor* input, + lite::Tensor* output, + const std::vector& axies) { auto* input_data = input->data(); auto* output_data = output->mutable_data(); @@ -89,7 +94,7 @@ void nhwc2nchw_ref(lite::Tensor* input, } } -void transpose_ref(lite::Tensor* input, +void TransBaseLine(const lite::Tensor* input, lite::Tensor* output, const std::vector axes) { auto* input_data = input->data(); @@ -123,7 +128,7 @@ void transpose_ref(lite::Tensor* input, } // namespace TEST(transpose_nchw, normal) { - TransposeCompute transpose_kernel; + TransposeCompute transpose_kernel; std::unique_ptr ctx(new KernelContext); auto& context = ctx->As(); @@ -168,16 +173,15 @@ TEST(transpose_nchw, normal) { auto* out_data = out.mutable_data(TARGET(kCUDA)); CopySync( out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); - nchw2nhwc_ref(&x_ref, &out_ref, axes); + Nchw2nhwcBaseLine(&x_ref, &out_ref, axes); auto* out_ref_data = out_ref.mutable_data(); - // transpose_ref(&x_ref, &out_ref, axes); for (int i = 0; i < out.numel(); i++) { 
EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); } } TEST(transpose_nhwc, normal) { - TransposeCompute transpose_kernel; + TransposeCompute transpose_kernel; std::unique_ptr ctx(new KernelContext); auto& context = ctx->As(); @@ -220,62 +224,146 @@ TEST(transpose_nhwc, normal) { auto* out_data = out.mutable_data(TARGET(kCUDA)); CopySync( out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); - nhwc2nchw_ref(&x_ref, &out_ref, axes); - // transpose_ref(&x_ref, &out_ref, axes); + Nhwc2nchwBaseLine(&x_ref, &out_ref, axes); auto* out_ref_data = out_ref.mutable_data(); for (int i = 0; i < out.numel(); i++) { EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); } } -TEST(transpose, normal) { - TransposeCompute transpose_kernel; - std::unique_ptr ctx(new KernelContext); - auto& context = ctx->As(); +class TransposeTest : public ::testing::Test { + protected: + TransposeTest() + : C_(3), + H_(128), + W_(64), + axes_({1, 2, 0}), + x_shape_({C_, H_, W_}), + out_shape_({H_, W_, C_}) { + x_ref_.Resize(lite::DDim(x_shape_)); + x_gpu_.Resize(x_ref_.dims()); + + auto X_ref__data = x_ref_.mutable_data(); + + // prepare input + for (int64_t i = 0; i < x_ref_.numel(); i++) { + X_ref__data[i] = static_cast(i); + } - operators::TransposeParam param; + out_ref_.Resize(lite::DDim(out_shape_)); + out_gpu_.Resize(out_ref_.dims()); + out_cpu_.Resize(out_ref_.dims()); + RunBaseLine(&x_ref_, &out_ref_); - lite::Tensor x, x_cpu, x_ref; - lite::Tensor out, out_cpu, out_ref; + InitParamAndContext(); + } - int C = 3, H = 128, W = 128; - std::vector axes({2, 0, 1}); - x.Resize({C, H, W}); - out.Resize({W, C, H}); + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.x = &x_gpu_; + param_.output = &out_gpu_; + param_.axis = axes_; + } - x_cpu.Resize({C, H, W}); - out_cpu.Resize({W, C, H}); + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + } - x_ref.Resize({C, H, W}); - out_ref.Resize({W, C, H}); + void InitHalfInput() { + x_half_.Resize(lite::DDim(x_ref_.dims())); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); i++) { + x_half_data[i] = half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + } - auto* x_cpu_data = x_cpu.mutable_data(); - auto* out_cpu_data = out_cpu.mutable_data(); - auto* x_ref_data = x_ref.mutable_data(); + void RunBaseLine(const lite::Tensor* x, lite::Tensor* out) { + TransBaseLine(x, out, axes_); + } - for (int i = 0; i < x_cpu.numel(); ++i) { - x_cpu_data[i] = i + 1; - x_ref_data[i] = i + 1; + int C_, H_, W_; + std::vector axes_; + std::vector x_shape_, out_shape_; + + lite::Tensor x_ref_, out_ref_; + lite::Tensor x_gpu_, out_gpu_; + lite::Tensor x_half_; + lite::Tensor out_cpu_; + + operators::TransposeParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(TransposeTest, fp32) { + InitFloatInput(); + TransposeCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); } - x.Assign(x_cpu_data, x_cpu.dims()); - param.x = &x; - param.output = &out; - param.axis = axes; - transpose_kernel.SetParam(param); - cudaStream_t stream; - cudaStreamCreate(&stream); - context.SetExecStream(stream); - transpose_kernel.SetContext(std::move(ctx)); - transpose_kernel.Launch(); + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < 
FLAGS_repeats; ++i) { + kernel.Run(); + } cudaDeviceSynchronize(); - auto* out_data = out.mutable_data(TARGET(kCUDA)); - CopySync( - out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); - transpose_ref(&x_ref, &out_ref, axes); - auto* out_ref_data = out_ref.mutable_data(); - for (int i = 0; i < out.numel(); i++) { - EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + EXPECT_NEAR(out_cpu_.data()[i], out_ref_.data()[i], 1e-5); + } +} + +TEST_F(TransposeTest, TestFP16) { + InitHalfInput(); + TransposeCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_cpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2); } } diff --git a/lite/kernels/cuda/yolo_box_compute.cu b/lite/kernels/cuda/yolo_box_compute.cu index 6b4b2875f39c479f3ddd387230dbdf8e3d24ce3c..23f5639a9ddbafa38cc575ac5ca068916956a075 100644 --- a/lite/kernels/cuda/yolo_box_compute.cu +++ b/lite/kernels/cuda/yolo_box_compute.cu @@ -185,15 +185,11 @@ void YoloBoxCompute::Run() { anchors_.Resize({static_cast(anchors.size())}); int* d_anchors = anchors_.mutable_data(TARGET(kCUDA)); - // TargetWrapperCuda::MemcpyAsync(d_anchors, - // anchors.data(), - // sizeof(int) * anchors.size(), - // IoDirection::HtoD, - // stream); - CopySync(d_anchors, - anchors.data(), - sizeof(int) * anchors.size(), - IoDirection::HtoD); + TargetWrapperCuda::MemcpyAsync(d_anchors, + anchors.data(), + sizeof(int) * anchors.size(), + IoDirection::HtoD, + stream); int threads = 512; int blocks = (n * box_num + threads - 1) / threads; diff --git a/lite/kernels/fpga/activation_compute_test.cc b/lite/kernels/fpga/activation_compute_test.cc index cef87afffca65ee82ca63e58191d3877f62824f2..99f702b84b3439814433e7c416151b43772dfb0e 100644 --- a/lite/kernels/fpga/activation_compute_test.cc +++ b/lite/kernels/fpga/activation_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/fpga/activation_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/fpga/activation_compute.h" namespace paddle { namespace lite { @@ -37,8 +39,7 @@ void activation_compute_ref(const operators::ActivationParam& param) { } TEST(activation_fpga, retrive_op) { - auto activation = - KernelRegistry::Global().Create("relu"); + auto activation = KernelRegistry::Global().Create("relu"); ASSERT_FALSE(activation.empty()); ASSERT_TRUE(activation.front()); } diff --git a/lite/kernels/fpga/fc_compute_test.cc b/lite/kernels/fpga/fc_compute_test.cc index 6ef8c02ed06dd89876dcab8c14fe389039bda614..08daecda314c771d0597951162d043f34d6316c9 100644 --- a/lite/kernels/fpga/fc_compute_test.cc +++ b/lite/kernels/fpga/fc_compute_test.cc @@ -12,15 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/fpga/fc_compute.h" #include + #include #include #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/fpga/fc_compute.h" namespace paddle { namespace lite { @@ -76,8 +78,7 @@ void FillData(T* a, } TEST(fc_fpga, retrive_op) { - auto fc = - KernelRegistry::Global().Create("fc"); + auto fc = KernelRegistry::Global().Create("fc"); ASSERT_FALSE(fc.empty()); ASSERT_TRUE(fc.front()); } diff --git a/lite/kernels/fpga/pooling_compute_test.cc b/lite/kernels/fpga/pooling_compute_test.cc old mode 100755 new mode 100644 index 9248289fe9353705e7a2d84831b9f3de5d8ee7d7..ff93f1a6e1c30d006065deb04576255c24baed25 --- a/lite/kernels/fpga/pooling_compute_test.cc +++ b/lite/kernels/fpga/pooling_compute_test.cc @@ -12,14 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/fpga/pooling_compute.h" #include + #include #include #include -#include "lite/core/op_registry.h" #include "lite/backends/fpga/KD/float16.hpp" +#include "lite/core/op_registry.h" +#include "lite/kernels/fpga/pooling_compute.h" namespace paddle { namespace lite { @@ -277,8 +278,7 @@ TEST(pool_fpga, compute) { } TEST(pool_fpga, retrive_op) { - auto pool = KernelRegistry::Global().Create( - "pool2d"); + auto pool = KernelRegistry::Global().Create("pool2d"); ASSERT_FALSE(pool.empty()); ASSERT_TRUE(pool.front()); } diff --git a/lite/kernels/fpga/softmax_compute_test.cc b/lite/kernels/fpga/softmax_compute_test.cc index f92139d0f49b3d149531f11cb422e44ded6e7e64..a6f456ba1f140d07ccfcea0d7746c1061586611e 100644 --- a/lite/kernels/fpga/softmax_compute_test.cc +++ b/lite/kernels/fpga/softmax_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/fpga/softmax_compute.h" #include + #include #include + #include "lite/backends/fpga/KD/float16.hpp" #include "lite/core/op_registry.h" +#include "lite/kernels/fpga/softmax_compute.h" namespace paddle { namespace lite { @@ -121,9 +123,7 @@ TEST(softmax_arm, compute) { } TEST(softmax, retrive_op) { - auto softmax = - KernelRegistry::Global().Create( - "softmax"); + auto softmax = KernelRegistry::Global().Create("softmax"); ASSERT_FALSE(softmax.empty()); ASSERT_TRUE(softmax.front()); } diff --git a/lite/kernels/host/CMakeLists.txt b/lite/kernels/host/CMakeLists.txt index a70345708cce678b52e288a1f3eaf4ee1a23f541..cd91d2dc90f9f48668e1d5ab9fbe5d065cb0e191 100644 --- a/lite/kernels/host/CMakeLists.txt +++ b/lite/kernels/host/CMakeLists.txt @@ -16,3 +16,10 @@ add_kernel(ctc_align_compute_host Host extra SRCS ctc_align_compute.cc DEPS ${li add_kernel(write_to_array_compute_host Host extra SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps}) add_kernel(read_from_array_compute_host Host extra SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps}) add_kernel(assign_compute_host Host extra SRCS assign_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(retinanet_detection_output_compute_host Host extra SRCS retinanet_detection_output_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(where_index_compute_host Host extra SRCS where_index_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(activation_grad_compute_host Host train SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps}) + +if(LITE_BUILD_EXTRA) + lite_cc_test(test_where_index_compute_host SRCS where_index_compute.cc DEPS where_index_compute_host) +endif() diff --git a/lite/kernels/host/activation_grad_compute.cc b/lite/kernels/host/activation_grad_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..4b837cfda4572fa106a1ba1d015ffd5163b08340 --- /dev/null +++ b/lite/kernels/host/activation_grad_compute.cc @@ -0,0 +1,98 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/host/activation_grad_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +void SquareGradCompute::Run() { + auto& param = this->Param(); + CHECK(param.X); + auto out_grad_dims = param.Out_grad->dims(); + auto out_grad_data = param.Out_grad->data(); + + auto x_data = param.X->data(); + auto x_grad_data = param.X_grad->mutable_data(); + for (int i = 0; i < out_grad_dims.production(); i++) { + x_grad_data[i] = out_grad_data[i] * 2.0 * x_data[i]; + } +} + +void ReluGradCompute::Run() { + auto& param = this->Param(); + CHECK(param.X); + auto out_grad_dims = param.Out_grad->dims(); + auto out_grad_data = param.Out_grad->data(); + + auto x_data = param.X->data(); + auto x_grad_data = param.X_grad->mutable_data(); + for (int i = 0; i < out_grad_dims.production(); i++) { + x_grad_data[i] = x_data[i] > 0 ? 
out_grad_data[i] : 0.0;
+  }
+}
+
+void TanhGradCompute::Run() {
+  auto& param = this->Param<param_t>();
+  CHECK(param.Out);
+  auto out_grad_dims = param.Out_grad->dims();
+  auto out_grad_data = param.Out_grad->data<float>();
+
+  auto out_data = param.Out->data<float>();
+  auto x_grad_data = param.X_grad->mutable_data<float>();
+  for (int i = 0; i < out_grad_dims.production(); i++) {
+    x_grad_data[i] = out_grad_data[i] *
+                     (static_cast<float>(1.0) - out_data[i] * out_data[i]);
+  }
+}
+
+} // namespace host
+} // namespace kernels
+} // namespace lite
+} // namespace paddle
+
+REGISTER_LITE_KERNEL(square_grad,
+                     kHost,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::host::SquareGradCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kHost))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(relu_grad,
+                     kHost,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::host::ReluGradCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kHost))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(tanh_grad,
+                     kHost,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::host::TanhGradCompute,
+                     def)
+    .BindInput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kHost))})
+    .Finalize();
diff --git a/lite/kernels/host/activation_grad_compute.h b/lite/kernels/host/activation_grad_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..d942b901c448ee87410a2030ea0f9f10aca0e493
--- /dev/null
+++ b/lite/kernels/host/activation_grad_compute.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
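+
+// Element-wise backward kernels for activations. Each kernel reads Out@GRAD
+// (dout) plus the forward X or Out and writes X@GRAD (dx):
+//   square_grad: dx = 2 * x * dout
+//   relu_grad:   dx = dout if x > 0, else 0
+//   tanh_grad:   dx = dout * (1 - out * out)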
+ +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +class SquareGradCompute : public KernelLite { + public: + using param_t = operators::ActivationGradParam; + + void Run() override; + + virtual ~SquareGradCompute() = default; +}; + +class ReluGradCompute : public KernelLite { + public: + using param_t = operators::ActivationGradParam; + + void Run() override; + + virtual ~ReluGradCompute() = default; +}; + +class TanhGradCompute : public KernelLite { + public: + using param_t = operators::ActivationGradParam; + + void Run() override; + + virtual ~TanhGradCompute() = default; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/host/retinanet_detection_output_compute.cc b/lite/kernels/host/retinanet_detection_output_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..95a4bf708e7f03aee9d9ac99323b173287260b13 --- /dev/null +++ b/lite/kernels/host/retinanet_detection_output_compute.cc @@ -0,0 +1,435 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/host/retinanet_detection_output_compute.h" +#include +#include +#include +#include +#include "lite/operators/retinanet_detection_output_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +template +bool SortScoreTwoPairDescend(const std::pair>& pair1, + const std::pair>& pair2) { + return pair1.first > pair2.first; +} + +template +static inline void GetMaxScoreIndex( + const std::vector& scores, + const T threshold, + int top_k, + std::vector>* sorted_indices) { + for (size_t i = 0; i < scores.size(); ++i) { + if (scores[i] > threshold) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), + sorted_indices->end(), + SortScorePairDescend); + // Keep top_k scores if needed. + if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { + sorted_indices->resize(top_k); + } +} + +template +static inline T BBoxArea(const std::vector& box, const bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. 
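+      // In pixel coordinates the box bounds are inclusive, so width and
+      // height are each (max - min + 1).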
+ return (w + 1) * (h + 1); + } + } +} + +template +static inline T JaccardOverlap(const std::vector& box1, + const std::vector& box2, + const bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + T norm = normalized ? static_cast(0.) : static_cast(1.); + T inter_w = inter_xmax - inter_xmin + norm; + T inter_h = inter_ymax - inter_ymin + norm; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +void NMSFast(const std::vector>& cls_dets, + const T nms_threshold, + const T eta, + std::vector* selected_indices) { + int64_t num_boxes = cls_dets.size(); + std::vector> sorted_indices; + for (int64_t i = 0; i < num_boxes; ++i) { + sorted_indices.push_back(std::make_pair(cls_dets[i][4], i)); + } + // Sort the score pair according to the scores in descending order + std::stable_sort( + sorted_indices.begin(), sorted_indices.end(), SortScorePairDescend); + selected_indices->clear(); + T adaptive_threshold = nms_threshold; + + while (sorted_indices.size() != 0) { + const int idx = sorted_indices.front().second; + bool keep = true; + for (size_t k = 0; k < selected_indices->size(); ++k) { + if (keep) { + const int kept_idx = (*selected_indices)[k]; + T overlap = T(0.); + + overlap = JaccardOverlap(cls_dets[idx], cls_dets[kept_idx], false); + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + if (keep) { + selected_indices->push_back(idx); + } + sorted_indices.erase(sorted_indices.begin()); + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } +} + +template +void DeltaScoreToPrediction( + const std::vector& bboxes_data, + const std::vector& anchors_data, + T im_height, + T im_width, + T im_scale, + int class_num, + const std::vector>& sorted_indices, + std::map>>* preds) { + im_height = static_cast(std::round(im_height / im_scale)); + im_width = static_cast(std::round(im_width / im_scale)); + T zero(0); + int i = 0; + for (const auto& it : sorted_indices) { + T score = it.first; + int idx = it.second; + int a = idx / class_num; + int c = idx % class_num; + + int box_offset = a * 4; + T anchor_box_width = + anchors_data[box_offset + 2] - anchors_data[box_offset] + 1; + T anchor_box_height = + anchors_data[box_offset + 3] - anchors_data[box_offset + 1] + 1; + T anchor_box_center_x = anchors_data[box_offset] + anchor_box_width / 2; + T anchor_box_center_y = + anchors_data[box_offset + 1] + anchor_box_height / 2; + T target_box_center_x = 0, target_box_center_y = 0; + T target_box_width = 0, target_box_height = 0; + target_box_center_x = + bboxes_data[box_offset] * anchor_box_width + anchor_box_center_x; + target_box_center_y = + bboxes_data[box_offset + 1] * anchor_box_height + anchor_box_center_y; + target_box_width = std::exp(bboxes_data[box_offset + 2]) * anchor_box_width; + target_box_height = + std::exp(bboxes_data[box_offset + 3]) * anchor_box_height; + T pred_box_xmin = target_box_center_x - target_box_width / 2; + T pred_box_ymin = target_box_center_y - target_box_height / 2; + T pred_box_xmax = target_box_center_x + target_box_width / 2 - 1; + T pred_box_ymax = 
target_box_center_y + target_box_height / 2 - 1; + pred_box_xmin = pred_box_xmin / im_scale; + pred_box_ymin = pred_box_ymin / im_scale; + pred_box_xmax = pred_box_xmax / im_scale; + pred_box_ymax = pred_box_ymax / im_scale; + + pred_box_xmin = std::max(std::min(pred_box_xmin, im_width - 1), zero); + pred_box_ymin = std::max(std::min(pred_box_ymin, im_height - 1), zero); + pred_box_xmax = std::max(std::min(pred_box_xmax, im_width - 1), zero); + pred_box_ymax = std::max(std::min(pred_box_ymax, im_height - 1), zero); + + std::vector one_pred; + one_pred.push_back(pred_box_xmin); + one_pred.push_back(pred_box_ymin); + one_pred.push_back(pred_box_xmax); + one_pred.push_back(pred_box_ymax); + one_pred.push_back(score); + (*preds)[c].push_back(one_pred); + i++; + } +} + +template +void MultiClassNMS(const std::map>>& preds, + int class_num, + const int keep_top_k, + const T nms_threshold, + const T nms_eta, + std::vector>* nmsed_out, + int* num_nmsed_out) { + std::map> indices; + int num_det = 0; + for (int c = 0; c < class_num; ++c) { + if (static_cast(preds.count(c))) { + const std::vector> cls_dets = preds.at(c); + NMSFast(cls_dets, nms_threshold, nms_eta, &(indices[c])); + num_det += indices[c].size(); + } + } + + std::vector>> score_index_pairs; + for (const auto& it : indices) { + int label = it.first; + const std::vector& label_indices = it.second; + for (size_t j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + score_index_pairs.push_back( + std::make_pair(preds.at(label)[idx][4], std::make_pair(label, idx))); + } + } + // Keep top k results per image. + std::stable_sort(score_index_pairs.begin(), + score_index_pairs.end(), + SortScoreTwoPairDescend); + if (num_det > keep_top_k) { + score_index_pairs.resize(keep_top_k); + } + + // Store the new indices. + std::map> new_indices; + for (const auto& it : score_index_pairs) { + int label = it.second.first; + int idx = it.second.second; + std::vector one_pred; + one_pred.push_back(label); + one_pred.push_back(preds.at(label)[idx][4]); + one_pred.push_back(preds.at(label)[idx][0]); + one_pred.push_back(preds.at(label)[idx][1]); + one_pred.push_back(preds.at(label)[idx][2]); + one_pred.push_back(preds.at(label)[idx][3]); + nmsed_out->push_back(one_pred); + } + + *num_nmsed_out = (num_det > keep_top_k ? 
keep_top_k : num_det); +} + +template +void RetinanetDetectionOutput( + const operators::RetinanetDetectionOutputParam& param, + const std::vector& scores, + const std::vector& bboxes, + const std::vector& anchors, + const Tensor& im_info, + std::vector>* nmsed_out, + int* num_nmsed_out) { + int64_t nms_top_k = param.nms_top_k; + int64_t keep_top_k = param.keep_top_k; + T nms_threshold = static_cast(param.nms_threshold); + T nms_eta = static_cast(param.nms_eta); + T score_threshold = static_cast(param.score_threshold); + + int64_t class_num = scores[0].dims()[1]; + std::map>> preds; + for (size_t l = 0; l < scores.size(); ++l) { + // Fetch per level score + Tensor scores_per_level = scores[l]; + // Fetch per level bbox + Tensor bboxes_per_level = bboxes[l]; + // Fetch per level anchor + Tensor anchors_per_level = anchors[l]; + + int64_t scores_num = scores_per_level.numel(); + int64_t bboxes_num = bboxes_per_level.numel(); + std::vector scores_data(scores_num); + std::vector bboxes_data(bboxes_num); + std::vector anchors_data(bboxes_num); + std::copy_n(scores_per_level.data(), scores_num, scores_data.begin()); + std::copy_n(bboxes_per_level.data(), bboxes_num, bboxes_data.begin()); + std::copy_n(anchors_per_level.data(), bboxes_num, anchors_data.begin()); + std::vector> sorted_indices; + + // For the highest level, we take the threshold 0.0 + T threshold = (l < (scores.size() - 1) ? score_threshold : 0.0); + GetMaxScoreIndex(scores_data, threshold, nms_top_k, &sorted_indices); + auto* im_info_data = im_info.data(); + auto im_height = im_info_data[0]; + auto im_width = im_info_data[1]; + auto im_scale = im_info_data[2]; + DeltaScoreToPrediction(bboxes_data, + anchors_data, + im_height, + im_width, + im_scale, + class_num, + sorted_indices, + &preds); + } + + MultiClassNMS(preds, + class_num, + keep_top_k, + nms_threshold, + nms_eta, + nmsed_out, + num_nmsed_out); +} + +template +void MultiClassOutput(const std::vector>& nmsed_out, + Tensor* outs) { + auto* odata = outs->mutable_data(); + int count = 0; + int64_t out_dim = 6; + for (size_t i = 0; i < nmsed_out.size(); ++i) { + odata[count * out_dim] = nmsed_out[i][0] + 1; // label + odata[count * out_dim + 1] = nmsed_out[i][1]; // score + odata[count * out_dim + 2] = nmsed_out[i][2]; // xmin + odata[count * out_dim + 3] = nmsed_out[i][3]; // xmin + odata[count * out_dim + 4] = nmsed_out[i][4]; // xmin + odata[count * out_dim + 5] = nmsed_out[i][5]; // xmin + count++; + } +} + +void RetinanetDetectionOutputCompute::Run() { + auto& param = Param(); + auto& boxes = param.bboxes; + auto& scores = param.scores; + auto& anchors = param.anchors; + auto* im_info = param.im_info; + auto* outs = param.out; + + std::vector boxes_list(boxes.size()); + std::vector scores_list(scores.size()); + std::vector anchors_list(anchors.size()); + for (size_t j = 0; j < boxes_list.size(); ++j) { + boxes_list[j] = *boxes[j]; + scores_list[j] = *scores[j]; + anchors_list[j] = *anchors[j]; + } + auto score_dims = scores_list[0].dims(); + int64_t batch_size = score_dims[0]; + auto box_dims = boxes_list[0].dims(); + int64_t box_dim = box_dims[2]; + int64_t out_dim = box_dim + 2; + + std::vector>> all_nmsed_out; + std::vector batch_starts = {0}; + for (int i = 0; i < batch_size; ++i) { + int num_nmsed_out = 0; + std::vector box_per_batch_list(boxes_list.size()); + std::vector score_per_batch_list(scores_list.size()); + for (size_t j = 0; j < boxes_list.size(); ++j) { + auto score_dims = scores_list[j].dims(); + score_per_batch_list[j] = scores_list[j].Slice(i, i + 1); + 
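// For image i, slice each level's scores/boxes and drop the batch dim: scores become [num_priors, class_num], boxes [num_priors, box_dim]. +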
score_per_batch_list[j].Resize({score_dims[1], score_dims[2]}); + box_per_batch_list[j] = boxes_list[j].Slice(i, i + 1); + box_per_batch_list[j].Resize({score_dims[1], box_dim}); + } + Tensor im_info_slice = im_info->Slice(i, i + 1); + + std::vector> nmsed_out; + RetinanetDetectionOutput(param, + score_per_batch_list, + box_per_batch_list, + anchors_list, + im_info_slice, + &nmsed_out, + &num_nmsed_out); + all_nmsed_out.push_back(nmsed_out); + batch_starts.push_back(batch_starts.back() + num_nmsed_out); + } + + uint64_t num_kept = batch_starts.back(); + if (num_kept == 0) { + outs->Resize({0, out_dim}); + } else { + outs->Resize({static_cast(num_kept), out_dim}); + for (int i = 0; i < batch_size; ++i) { + int64_t s = static_cast(batch_starts[i]); + int64_t e = static_cast(batch_starts[i + 1]); + if (e > s) { + Tensor out = outs->Slice(s, e); + MultiClassOutput(all_nmsed_out[i], &out); + } + } + } + + LoD lod; + lod.emplace_back(batch_starts); + outs->set_lod(lod); +} + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + retinanet_detection_output, + kHost, + kFloat, + kNCHW, + paddle::lite::kernels::host::RetinanetDetectionOutputCompute, + def) + .BindInput("BBoxes", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Scores", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Anchors", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("ImInfo", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/host/retinanet_detection_output_compute.h b/lite/kernels/host/retinanet_detection_output_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..612ea7105e2728b856f02d71e9fcfaea2a1ef680 --- /dev/null +++ b/lite/kernels/host/retinanet_detection_output_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +class RetinanetDetectionOutputCompute + : public KernelLite { + public: + void Run() override; + + virtual ~RetinanetDetectionOutputCompute() = default; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/host/where_index_compute.cc b/lite/kernels/host/where_index_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..d06be8d332734f3e41b0414e891c8810a117d8a6 --- /dev/null +++ b/lite/kernels/host/where_index_compute.cc @@ -0,0 +1,173 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/host/where_index_compute.h" +#include +#include +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +static void where_index_rank4(const int64_t* true_index, + int true_num, + const int64_t* stride, + int64_t* out) { + int cnt = true_num >> 1; + register int64_t stride0 = stride[0]; + register int64_t stride1 = stride[1]; + register int64_t stride2 = stride[2]; + register int64_t stride3 = stride[3]; + for (int i = 0; i < cnt; ++i) { + int64_t index0 = true_index[i * 2]; + int64_t index1 = true_index[i * 2 + 1]; + int out_index = i * 8; + // rank0 + register int64_t oindex0 = index0 / stride0; + register int64_t oindex1 = index1 / stride0; + out[out_index] = oindex0; + index0 -= oindex0 * stride0; + index1 -= oindex1 * stride0; + out[out_index + 4] = oindex1; + out_index++; + // rank1 + oindex0 = index0 / stride1; + oindex1 = index1 / stride1; + out[out_index] = oindex0; + index0 -= oindex0 * stride1; + index1 -= oindex1 * stride1; + out[out_index + 4] = oindex1; + out_index++; + // rank2 + oindex0 = index0 / stride2; + oindex1 = index1 / stride2; + out[out_index] = oindex0; + index0 -= oindex0 * stride2; + index1 -= oindex1 * stride2; + out[out_index + 4] = oindex1; + out_index++; + // rank3 + oindex0 = index0 / stride3; + oindex1 = index1 / stride3; + out[out_index] = oindex0; + out[out_index + 4] = oindex1; + } + // remain + for (int r = cnt * 2; r < true_num; ++r) { + int out_index = r * 4; + int64_t index = true_index[r]; + for (int i = 0; i < 4; ++i) { + out[out_index + i] = index / stride[i]; + index -= out[out_index + i] * stride[i]; + } + } +} + +inline void where_index_rank1(const int64_t* true_index, + int true_num, + int64_t* out) { + memcpy(out, true_index, true_num * sizeof(int64_t)); +} + +static void where_index_rankn(const int64_t* true_index, + int true_num, + const int64_t* stride, + int rank, + int64_t* out) { + int out_index = 0; + for (int i = 0; i < true_num; ++i) { + int64_t index = true_index[i]; + for (int r = 0; r < rank; ++r) { + out[out_index] = index / stride[r]; + index -= out[out_index++] * stride[r]; + } + } +} + +template +void WhereIndexKernel(const operators::WhereIndexParam& param) { + auto* input = param.input; + auto* output = param.output; + auto dims = input->dims(); + auto numel = dims.production(); + int64_t rank = static_cast(dims.size()); + const T* cond_data = input->template data(); + int64_t true_num = 0; + std::vector true_index(numel); + for (auto i = 0; i < numel; i++) { + if (static_cast(cond_data[i])) { + true_index[true_num] = i; + true_num++; + } + } + output->Resize({true_num, rank}); + if (true_num == 0) { + return; + } + auto* out_ptr = output->template mutable_data(); + std::vector stride(rank); + stride[rank - 1] = 1; + for (int i = rank - 2; i >= 0; i--) { + stride[i] = stride[i + 1] * dims[i + 1]; + } + if (rank == 1) { + 
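// Rank-1 input: the flat indices are already the output coordinates, so they are copied through unchanged (see where_index_rank1 above). +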
where_index_rank1(true_index.data(), true_num, out_ptr); + } else if (rank == 4) { + where_index_rank4(true_index.data(), true_num, stride.data(), out_ptr); + } else { + where_index_rankn( + true_index.data(), true_num, stride.data(), rank, out_ptr); + } +} + +void WhereIndexCompute::Run() { + auto& param = this->Param(); + switch (param.input->precision()) { + case PRECISION(kFloat): + WhereIndexKernel(param); + break; + case PRECISION(kInt32): + WhereIndexKernel(param); + break; + case PRECISION(kInt64): + WhereIndexKernel(param); + break; + case PRECISION(kInt8): + WhereIndexKernel(param); + break; + case PRECISION(kBool): + WhereIndexKernel(param); + break; + default: + LOG(FATAL) << "WhereIndex does not implement for the " + << "input type:" << static_cast(param.input->precision()); + } +} + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle + +using whereindex = paddle::lite::kernels::host::WhereIndexCompute; + +REGISTER_LITE_KERNEL(where_index, kHost, kAny, kAny, whereindex, def) + .BindInput("Condition", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/host/where_index_compute.h b/lite/kernels/host/where_index_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..6936e3ed8f0ee16bf0e41095bbcbd0c18169d62f --- /dev/null +++ b/lite/kernels/host/where_index_compute.h @@ -0,0 +1,37 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/operators/where_index_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +class WhereIndexCompute : public KernelLite { + public: + using param_t = operators::WhereIndexParam; + + void Run() override; + + virtual ~WhereIndexCompute() = default; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/host/where_index_compute_test.cc b/lite/kernels/host/where_index_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..7097bdcae2bb319331af72c390a9d5de4fc23a9f --- /dev/null +++ b/lite/kernels/host/where_index_compute_test.cc @@ -0,0 +1,174 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "lite/kernels/host/where_index_compute.h"
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <random>
+#include <vector>
+#include "lite/core/context.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace host {
+
+template <typename T>
+void where_index_compute_ref(lite::Tensor* condition, lite::Tensor* out) {
+  auto dims = condition->dims();
+  auto numel = condition->numel();
+  const int64_t rank = static_cast<int64_t>(dims.size());
+  const T* cond_data = condition->data<T>();
+  std::vector<int64_t> true_index;
+  for (int64_t i = 0; i < numel; i++) {
+    if (static_cast<bool>(cond_data[i])) {
+      true_index.push_back(i);
+    }
+  }
+  int64_t true_num = static_cast<int64_t>(true_index.size());
+  out->Resize({true_num, rank});
+  int64_t* out_ptr = out->mutable_data<int64_t>();
+  if (true_num == 0) {
+    return;
+  }
+
+  // Row-major strides, used to unravel a flat offset into per-dim indices.
+  std::vector<int64_t> stride(rank);
+  stride[rank - 1] = 1;
+  for (int i = rank - 2; i >= 0; i--) {
+    stride[i] = stride[i + 1] * dims[i + 1];
+  }
+  for (int64_t i = 0; i < true_num; ++i) {
+    int64_t index = true_index[i];
+    for (int64_t j = 0; j < rank; ++j) {
+      out_ptr[i * rank + j] = index / stride[j];
+      index -= out_ptr[i * rank + j] * stride[j];
+    }
+  }
+}
+
+TEST(where_index, init) {
+  WhereIndexCompute where_index;
+  ASSERT_EQ(where_index.precision(), PRECISION(kAny));
+  ASSERT_EQ(where_index.target(), TARGET(kHost));
+}
+
+TEST(where_index, retrive_op) {
+  auto where_index =
+      KernelRegistry::Global()
+          .Create<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)>(
+              "where_index");
+  ASSERT_FALSE(where_index.empty());
+  ASSERT_TRUE(where_index.front());
+}
+
+TEST(where_index, compute) {
+  paddle::lite::DeviceInfo::Init();
+  WhereIndexCompute where_index;
+  operators::WhereIndexParam param;
+
+  lite::Tensor input;
+  lite::Tensor output;
+  lite::Tensor output_ref;
+  param.input = &input;
+  param.output = &output;
+  where_index.SetParam(param);
+  for (auto& n : {1, 2, 4}) {
+    for (auto& c : {1, 3, 21, 32}) {
+      for (auto& h : {1, 5, 63}) {
+        for (auto& w : {1, 5, 64}) {
+          for (auto& dim_size : {1, 2, 3, 4}) {
+            // Cycle through int32, int64, int8, bool and float inputs.
+            for (int i = 0; i < 5; ++i) {
+              std::vector<int64_t> in_shape;
+              in_shape.push_back(n);
+              in_shape.push_back(c);
+              in_shape.push_back(h);
+              in_shape.push_back(w);
+              // Fold the trailing dimensions into the last kept axis so the
+              // tensor has dim_size dimensions but the same element count.
+              int64_t outer = 1;
+              for (size_t j = dim_size - 1; j < in_shape.size(); ++j) {
+                outer *= in_shape[j];
+              }
+              in_shape.resize(dim_size);
+              in_shape[dim_size - 1] = outer;
+
+              DDim indim(in_shape);
+              LOG(INFO) << "in dims: ";
+              for (int j = 0; j < dim_size; ++j) {
+                LOG(INFO) << in_shape[j];
+              }
+              input.Resize(indim);
+              std::default_random_engine engine;
+              std::uniform_real_distribution<float> dist(-1, 1);
+              if (i == 0) {
+                int* indata = input.mutable_data<int>();
+                for (int64_t j = 0; j < indim.production(); ++j) {
+                  indata[j] = static_cast<int>(dist(engine) > 0);
+                }
+                where_index_compute_ref<int>(&input, &output_ref);
+              } else if (i == 1) {
+                int64_t* indata = input.mutable_data<int64_t>();
+                for (int64_t j = 0; j < indim.production(); ++j) {
+                  indata[j] = static_cast<int64_t>(dist(engine) > 0);
+                }
+                where_index_compute_ref<int64_t>(&input, &output_ref);
+              } else if (i == 2) {
+                int8_t* indata = input.mutable_data<int8_t>();
+                for (int64_t j = 0; j < indim.production(); ++j) {
+                  indata[j] = static_cast<int8_t>(dist(engine) > 0);
+                }
+                where_index_compute_ref<int8_t>(&input, &output_ref);
+              } else if (i == 3) {
+                bool* indata = input.mutable_data<bool>();
+                for (int64_t j = 0; j < indim.production(); ++j) {
+                  indata[j] = dist(engine) > 0;
+                }
+                where_index_compute_ref<bool>(&input, &output_ref);
+              } else {
+                float* indata = input.mutable_data<float>();
+                for (int64_t j = 0; j < indim.production(); ++j) {
+                  indata[j] = dist(engine) > 0;
+                }
+                where_index_compute_ref<float>(&input, &output_ref);
+              }
+              where_index.Run();
+              const int64_t* outdata = output.data<int64_t>();
+              const int64_t* outdata_ref = output_ref.data<int64_t>();
+              CHECK_EQ(output.dims(), output_ref.dims())
+                  << "where_index out shape error! out_dim is not equal "
+                     "to out_ref dim";
+              for (int i = 0; i < output.numel(); i++) {
+                if (std::abs(outdata[i] - outdata_ref[i]) > 0) {
+                  LOG(FATAL) << "where_index cmp error, i: " << i
+                             << ", output_data: " << outdata[i]
+                             << ", output_ref_data: " << outdata_ref[i]
+                             << ", input precision: "
+                             << static_cast<int>(input.precision());
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace host
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_KERNEL(where_index, kHost, kAny, kAny, def);
diff --git a/lite/kernels/mlu/CMakeLists.txt b/lite/kernels/mlu/CMakeLists.txt
index f9395d45ccecccaf3f873797d0c2d71eda266319..634a0afc551d83be58487d7393e092196e0f6cc5 100644
--- a/lite/kernels/mlu/CMakeLists.txt
+++ b/lite/kernels/mlu/CMakeLists.txt
@@ -4,6 +4,7 @@ endif()
 add_subdirectory(bridges)
 add_kernel(subgraph_compute_mlu MLU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} ${mlu_subgraph_bridges})
-add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
-add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
-add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
+add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${target_wrapper_mlu})
+add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps})
+# depend on transpose function in backend/x86/math/math_function
+add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_function})
diff --git a/lite/kernels/mlu/bridges/CMakeLists.txt b/lite/kernels/mlu/bridges/CMakeLists.txt
index 82510ab9b6a794f5c6b1ffb43d2d3c55db3a5514..91323925e1ef49462c180fd96392d638e273fd69 100644
--- a/lite/kernels/mlu/bridges/CMakeLists.txt
+++ b/lite/kernels/mlu/bridges/CMakeLists.txt
@@ -3,7 +3,7 @@ if(NOT LITE_WITH_MLU)
 endif()
 lite_cc_library(subgraph_bridge_utility_mlu SRCS utility.cc DEPS ${mlu_builder_libs} tensor)
-lite_cc_library(subgraph_bridge_tensor_mlu SRCS tensor.cc DEPS ${mlu_builder_libs})
+lite_cc_library(subgraph_bridge_tensor_mlu SRCS tensor.cc DEPS ${mlu_builder_libs} subgraph_bridge_utility_mlu)
 lite_cc_library(subgraph_bridge_graph_mlu SRCS graph.cc DEPS subgraph_bridge_utility_mlu subgraph_bridge_tensor_mlu)
 set(mlu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_engine subgraph_bridge_utility_mlu subgraph_bridge_graph_mlu)
@@ -18,6 +18,16 @@ lite_cc_library(subgraph_bridge_fc_op_mlu SRCS fc_op.cc DEPS ${subgraph_bridge_d
 lite_cc_library(subgraph_bridge_scale_op_mlu SRCS scale_op.cc DEPS ${subgraph_bridge_deps_mlu})
 lite_cc_library(subgraph_bridge_interp_op_mlu SRCS interpolate_op.cc DEPS ${subgraph_bridge_deps_mlu})
 lite_cc_library(subgraph_bridge_concat_op_mlu SRCS concat_op.cc DEPS ${subgraph_bridge_deps_mlu})
+lite_cc_library(subgraph_bridge_transpose_op_mlu SRCS transpose_op.cc DEPS ${subgraph_bridge_deps_mlu})
+lite_cc_library(subgraph_bridge_dropout_op_mlu SRCS dropout_op.cc DEPS ${subgraph_bridge_deps_mlu})
+lite_cc_library(subgraph_bridge_slice_op_mlu SRCS slice_op.cc DEPS ${subgraph_bridge_deps_mlu})
+lite_cc_library(subgraph_bridge_split_op_mlu SRCS split_op.cc DEPS ${subgraph_bridge_deps_mlu})
+lite_cc_library(subgraph_bridge_cast_op_mlu SRCS cast_op.cc DEPS
${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_layout_op_mlu SRCS layout_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_argmax_op_mlu SRCS argmax_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_squeeze_op_mlu SRCS squeeze_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_reshape_op_mlu SRCS reshape_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_flatten_op_mlu SRCS flatten_op.cc DEPS ${subgraph_bridge_deps_mlu}) set(mlu_subgraph_bridges subgraph_bridge_registry subgraph_bridge_utility_mlu @@ -28,12 +38,35 @@ set(mlu_subgraph_bridges subgraph_bridge_pool_op_mlu subgraph_bridge_softmax_op_mlu subgraph_bridge_fc_op_mlu + subgraph_bridge_transpose_op_mlu subgraph_bridge_batch_norm_op_mlu subgraph_bridge_scale_op_mlu subgraph_bridge_interp_op_mlu subgraph_bridge_concat_op_mlu + subgraph_bridge_dropout_op_mlu + subgraph_bridge_slice_op_mlu + subgraph_bridge_split_op_mlu + subgraph_bridge_cast_op_mlu + subgraph_bridge_layout_op_mlu + subgraph_bridge_argmax_op_mlu + subgraph_bridge_squeeze_op_mlu + subgraph_bridge_reshape_op_mlu + subgraph_bridge_flatten_op_mlu CACHE INTERNAL "mlu_subgraph_bridges") + +if (LITE_BUILD_EXTRA) + lite_cc_library(subgraph_bridge_lrn_op_mlu SRCS lrn_op.cc DEPS ${subgraph_bridge_deps_mlu}) + lite_cc_library(subgraph_bridge_gather_op_mlu SRCS gather_op.cc DEPS ${subgraph_bridge_deps_mlu}) + lite_cc_library(subgraph_bridge_norm_op_mlu SRCS norm_op.cc DEPS ${subgraph_bridge_deps_mlu}) + set(mlu_subgraph_bridges + "${mlu_subgraph_bridges}" + subgraph_bridge_lrn_op_mlu + subgraph_bridge_gather_op_mlu + subgraph_bridge_norm_op_mlu + CACHE INTERNAL "mlu_subgraph_bridges") +endif() + lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges}) lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) @@ -45,4 +78,21 @@ lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer targe lite_cc_test(test_scale_converter_mlu SRCS scale_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_interp_converter_mlu SRCS interpolate_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_concat_converter_mlu SRCS concat_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_transpose_converter_mlu SRCS transpose_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_dropout_converter_mlu SRCS dropout_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_slice_converter_mlu SRCS slice_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_split_converter_mlu SRCS split_op_test.cc DEPS 
scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_layout_converter_mlu SRCS layout_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_cast_converter_mlu SRCS cast_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_argmax_converter_mlu SRCS argmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_squeeze_converter_mlu SRCS squeeze_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_reshape_converter_mlu SRCS reshape_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_flatten_converter_mlu SRCS flatten_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) + +if (LITE_BUILD_EXTRA) + lite_cc_test(test_norm_converter_mlu SRCS norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) + lite_cc_test(test_lrn_converter_mlu SRCS lrn_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) + lite_cc_test(test_gather_converter_mlu SRCS gather_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +endif() + message(STATUS "+++++ mlu_subgraph_bridges: ${mlu_subgraph_bridges}") diff --git a/lite/kernels/mlu/bridges/act_op.cc b/lite/kernels/mlu/bridges/act_op.cc index 286195d9d5f961288dd0156db31ff8aacae58227..d24c7fac216ed0ba213a4fd95365132a693281c3 100644 --- a/lite/kernels/mlu/bridges/act_op.cc +++ b/lite/kernels/mlu/bridges/act_op.cc @@ -60,6 +60,7 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { output_tensor->mlu_tensor())); } graph->FuseOp(activation_op); + CNML_CALL(cnmlDestroyBaseOp(&activation_op)); return SUCCESS; } @@ -72,6 +73,9 @@ REGISTER_SUBGRAPH_BRIDGE(sigmoid, kMLU, paddle::lite::subgraph::mlu::ActConverter); REGISTER_SUBGRAPH_BRIDGE(relu, kMLU, paddle::lite::subgraph::mlu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(relu6, + kMLU, + paddle::lite::subgraph::mlu::ActConverter); REGISTER_SUBGRAPH_BRIDGE(tanh, kMLU, paddle::lite::subgraph::mlu::ActConverter); REGISTER_SUBGRAPH_BRIDGE(leaky_relu, kMLU, diff --git a/lite/kernels/mlu/bridges/act_op_test.cc b/lite/kernels/mlu/bridges/act_op_test.cc index 2b7747f4d8b647b8cb621876907f6178ebf9fe88..11c0c3f732c4c29fff3aedc6cfdcf55760128b5d 100644 --- a/lite/kernels/mlu/bridges/act_op_test.cc +++ b/lite/kernels/mlu/bridges/act_op_test.cc @@ -13,7 +13,9 @@ // limitations under the License. 
#include + #include + #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" #include "lite/kernels/mlu/bridges/test_helper.h" @@ -116,7 +118,7 @@ void test_act(std::vector x_shape, std::string op_type) { opdesc.SetAttr("offset", 0.5f); } - // create and convert op to NPU model, then run it on NPU + // create and convert op to MLU model, then run it on MLU auto op = CreateOp(opdesc, &scope); // execute reference implementation and save to output tensor act_ref(op); @@ -134,7 +136,8 @@ void test_act(std::vector x_shape, std::string op_type) { TEST(MLUBridges, activation) { std::vector> shapes{{1}, {2, 3}, {1, 2, 3, 4}}; - std::vector types{"sigmoid", "relu", "tanh", "leaky_relu"}; + std::vector types{ + "sigmoid", "relu", "relu6", "tanh", "leaky_relu"}; for (auto x_shape : shapes) { for (auto op_type : types) { test_act(x_shape, op_type); @@ -149,5 +152,6 @@ TEST(MLUBridges, activation) { USE_SUBGRAPH_BRIDGE(sigmoid, kMLU) USE_SUBGRAPH_BRIDGE(relu, kMLU) +USE_SUBGRAPH_BRIDGE(relu6, kMLU) USE_SUBGRAPH_BRIDGE(tanh, kMLU) USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU) diff --git a/lite/kernels/mlu/bridges/argmax_op.cc b/lite/kernels/mlu/bridges/argmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b004639f07c79e5cc414e2d60bc1f32ec522f0f5 --- /dev/null +++ b/lite/kernels/mlu/bridges/argmax_op.cc @@ -0,0 +1,107 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
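+//
+// MLU bridge for the arg_max op: the converter keeps the output shape equal
+// to the input shape with the reduced axis set to 1, casts the normalized
+// axis index to cnmlDimension_t, and, when the graph runs in FP16 with an
+// axis other than C, first casts the input back to FP32 so that
+// cnmlCreateArgmaxOp produces an INT32 index tensor.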
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int ArgmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Get input vars and op attributes + auto x_var_name = op_info->Input("X").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto x_dims = x->dims().Vectorize(); + + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + + int axis = op_info->GetAttr("axis"); + if (axis < 0) { + axis = axis + x_dims.size(); + } + cnmlDimension_t argmax_mode = static_cast(axis); + auto mlu_output_dim = x->dims().Vectorize(); + // shape is NCHW, layout is NHWC + mlu_output_dim[axis] = 1; + auto input_tensor = graph->GetNode(x_var_name); + // if use_fp16 and axis is not c, cast input datatype from fp16 to fp32, so + // output datatype is int32 + bool cast_to_fp32 = + graph->FPType() == CNML_DATA_FLOAT16 && argmax_mode != CNML_DIM_C; + cnmlBaseOp_t cast_op{nullptr}; + std::shared_ptr fp32_input_tensor; + if (cast_to_fp32) { + fp32_input_tensor = graph->AddNode(x_var_name + ".fp32", + x_dims, + CNML_TENSOR, + CNML_NCHW, + CNML_DATA_FLOAT32); + cnmlCreateCastOp(&cast_op, + CNML_CAST_FLOAT16_TO_FLOAT32, + input_tensor->mlu_tensor(), + fp32_input_tensor->mlu_tensor()); + } + auto output_tensor = graph->AddNode( + out_var_name, mlu_output_dim, CNML_TENSOR, CNML_NCHW, CNML_DATA_INT32); + + CHECK(graph->HasNode(x_var_name)); + cnmlBaseOp_t argmax_op{nullptr}; + // ======================= DEBUG INFO ===================== + VLOG(6) << "x_var_name: " << x_var_name; + VLOG(6) << "out_var_name: " << out_var_name; + VLOG(6) << "x dims: " << x->dims(); + VLOG(6) << "output dims: " << output->dims(); + VLOG(6) << "axis: " << axis; + VLOG(6) << "cast_to_fp32: " << cast_to_fp32; + cnmlPrintTensor(input_tensor->mlu_tensor(), CNML_TENSOR); + cnmlPrintTensor(output_tensor->mlu_tensor(), CNML_TENSOR); + // ======================= DEBUG END ===================== + + CNML_CALL(cnmlCreateArgmaxOp(&argmax_op, + argmax_mode, + cast_to_fp32 ? fp32_input_tensor->mlu_tensor() + : input_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + if (cast_to_fp32) { + graph->FuseOp(cast_op); + } + graph->FuseOp(argmax_op); + CNML_CALL(cnmlDestroyBaseOp(&argmax_op)); + if (cast_op) { + CNML_CALL(cnmlDestroyBaseOp(&cast_op)); + } + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(arg_max, + kMLU, + paddle::lite::subgraph::mlu::ArgmaxConverter); diff --git a/lite/kernels/mlu/bridges/argmax_op_test.cc b/lite/kernels/mlu/bridges/argmax_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9eeb172812b8deecd6a8f1f2eb321ade4289fa9b --- /dev/null +++ b/lite/kernels/mlu/bridges/argmax_op_test.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/argmax_op.h" + +#include + +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +template +void argmax_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + int axis = op_info->GetAttr("axis"); + auto x_dims = x->dims(); + if (axis < 0) { + axis += x_dims.size(); + } + auto y_shape = x_dims.Vectorize(); + y_shape.erase(y_shape.begin() + axis); + out->Resize(y_shape); + auto out_dims = out->dims(); + + auto* x_data = x->mutable_data(); + auto* out_data = out->mutable_data(); + + const int size = x_dims[axis]; + const int in_channel = x_dims.count(axis, x_dims.size()); + const int out_channel = out_dims.count(axis, out_dims.size()); + const int in_stride = x_dims.count(axis + 1, x_dims.size()); + const int out_stride = x_dims.count(0, axis); + // int index = 0; + for (int n = 0; n < out_stride; n++) { + for (int k = 0; k < in_stride; k++) { + const float* in_ptr = x_data + n * in_channel + k; + std::vector> vec; + vec.resize(size); + for (int i = 0; i < size; i++) { + vec[i] = std::make_pair(in_ptr[i * in_stride], i); + } + // sort + std::partial_sort(vec.begin(), + vec.begin() + 1, + vec.end(), + std::greater>()); + + out_dtype* out_ptr = out_data + n * out_channel + k; + *out_ptr = vec[0].second; + } + } +} + +void test_argmax(const std::vector& input_shape, int axis) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize(input_shape); + // initialize input&output data + FillTensor(x, -9, 9); + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("arg_max"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axis", static_cast(axis)); + + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + argmax_ref(op); + out_ref->CopyDataFrom(*out); + Tensor input_x; + input_x.Resize(DDim(input_shape)); + // change input layout from NCHW to NHWC + transpose(x->mutable_data(), + input_x.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_x); + + LaunchOp(op, {x_var_name}, {out_var_name}); + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + std::vector out_shape = input_shape; + out_shape[axis] = 1; + Tensor output_trans; + output_trans.Resize(out_shape); + // Change output layout from NHWC to 
NCHW + transpose(out_data, + output_trans.mutable_data(), + {static_cast(out_shape[0]), + static_cast(out_shape[2]), + static_cast(out_shape[3]), + static_cast(out_shape[1])}, + {0, 3, 1, 2}); + out_data = output_trans.mutable_data(); + + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +TEST(MLUBridges, arg_max) { + test_argmax({1, 2, 3, 4}, 1); + test_argmax({1, 2, 3, 4}, 2); + test_argmax({1, 2, 3, 4}, 3); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(arg_max, kMLU); diff --git a/lite/kernels/mlu/bridges/batch_norm_op.cc b/lite/kernels/mlu/bridges/batch_norm_op.cc index 7353a685dd5fd3a5bcc8c88def8ffb8b96fdde55..ceac1ac696d788869e77a1b173cc0bb4d10a4e21 100644 --- a/lite/kernels/mlu/bridges/batch_norm_op.cc +++ b/lite/kernels/mlu/bridges/batch_norm_op.cc @@ -48,25 +48,32 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto mean = scope->FindVar(mean_var_name)->GetMutable(); auto mean_dims = mean->dims().Vectorize(); + if (mean_dims.size() < 4) { + mean_dims.insert(mean_dims.begin(), 4 - mean_dims.size(), 1); + } auto mean_tensor = graph->AddNode( - mean_var_name, mean_dims, CNML_CONST, CNML_CNHW, graph->FPType()); + mean_var_name, mean_dims, CNML_CONST, CNML_NHWC, graph->FPType()); auto variance = scope->FindVar(variance_var_name)->GetMutable(); auto variance_dims = variance->dims().Vectorize(); + if (variance_dims.size() < 4) { + variance_dims.insert(variance_dims.begin(), 4 - variance_dims.size(), 1); + } auto variance_tensor = graph->AddNode( - variance_var_name, variance_dims, CNML_CONST, CNML_CNHW, graph->FPType()); + variance_var_name, variance_dims, CNML_CONST, CNML_NHWC, graph->FPType()); auto scale = scope->FindVar(scale_var_name)->GetMutable(); auto bias = scope->FindVar(bias_var_name)->GetMutable(); - int co = static_cast(mean_dims[0]); + int co = static_cast(mean_dims[3]); + std::vector variance_trans(co); + std::vector mean_trans(co); for (int i = 0; i < co; ++i) { - variance->mutable_data()[i] = + variance_trans[i] = scale->data()[i] / sqrtf(variance->data()[i] + epsilon); - mean->mutable_data()[i] = - mean->data()[i] - - bias->data()[i] / variance->data()[i]; + mean_trans[i] = + mean->data()[i] - bias->data()[i] / variance_trans[i]; } auto input_tensor = graph->GetNode(x_var_name); @@ -77,10 +84,14 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { mean_tensor->mlu_tensor(), variance_tensor->mlu_tensor())); - graph->BindConstData(variance_var_name, variance); - graph->BindConstData(mean_var_name, mean); + graph->BindConstRawData( + variance_var_name, variance_trans.data(), variance_trans.size(), true); + graph->BindConstRawData( + mean_var_name, mean_trans.data(), mean_trans.size(), true); graph->FuseOp(bn_op); + CNML_CALL(cnmlDestroyBaseOp(&bn_op)); + return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/cast_op.cc b/lite/kernels/mlu/bridges/cast_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..25d988ce5aee519dfb00574343956022b30a89e7 --- /dev/null +++ b/lite/kernels/mlu/bridges/cast_op.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int CastConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + auto in_dtype = op_info->GetAttr("in_dtype"); + auto out_dtype = op_info->GetAttr("out_dtype"); + + CHECK(graph->HasNode(x_var_name)); + auto x_tensor = graph->GetNode(x_var_name); + + cnmlDataType_t out_type; + cnmlCastType_t cast_type; + if (in_dtype == 4 && out_dtype == 5) { + cast_type = CNML_CAST_FLOAT16_TO_FLOAT32; + out_type = CNML_DATA_FLOAT32; + } else if (in_dtype == 5 && out_dtype == 4) { + cast_type = CNML_CAST_FLOAT32_TO_FLOAT16; + out_type = CNML_DATA_FLOAT16; + } else { + CHECK(0) << "Unsupported cast type"; + } + + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, out_type); + + cnmlBaseOp_t cast_op; + CNML_CALL(cnmlCreateCastOp(&cast_op, + cast_type, + x_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + graph->FuseOp(cast_op); + CNML_CALL(cnmlDestroyBaseOp(&cast_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(cast, + kMLU, + paddle::lite::subgraph::mlu::CastConverter); diff --git a/lite/kernels/mlu/bridges/cast_op_test.cc b/lite/kernels/mlu/bridges/cast_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..2389ad5560cd2ede710626cfd40f8db8bff56351 --- /dev/null +++ b/lite/kernels/mlu/bridges/cast_op_test.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
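+//
+// Round-trip tests for the MLU cast bridge. The bridge treats in_dtype 4 as
+// FP16 and 5 as FP32, so the two helpers below exercise the FP16 -> FP32 and
+// FP32 -> FP16 paths and compare each element against the original buffer
+// with a 5e-4 tolerance.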
+ +#include "lite/operators/cast_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void test_cast_FP16_to_FP32(std::vector shape) { + // prepare input&output variables + std::string x_var_name = "x"; + std::string out_var_name = "out"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(DDim(shape)); + auto* x_data = x->mutable_data(); + + // initialize input&output data + for (int i = 0; i < x->dims().production(); i++) { + x_data[i] = static_cast(i); + } + // initialize op desc + int in_dtype = 4, out_dtype = 5; + cpp::OpDesc opdesc; + opdesc.SetType("cast"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("in_dtype", in_dtype); + opdesc.SetAttr("out_dtype", out_dtype); + + auto op = CreateOp(opdesc, &scope); + + Tensor data; + data.Resize(DDim(shape)); + auto* copy_data = data.mutable_data(); + data.CopyDataFrom(*x); + x->set_precision(paddle::lite_api::PrecisionType::kFP16); + LaunchOp(op, {x_var_name}, {out_var_name}); + + // compare results + auto* out_data = out->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], static_cast(copy_data[i]), 5e-4); + } +} + +void test_cast_FP32_to_FP16(std::vector shape) { + // prepare input&output variables + std::string x_var_name = "x"; + std::string out_var_name = "out"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(DDim(shape)); + auto* x_data = x->mutable_data(); + + // initialize input&output data + for (int i = 0; i < x->dims().production(); i++) { + x_data[i] = static_cast(i); + } + // initialize op desc + int in_dtype = 5, out_dtype = 4; + cpp::OpDesc opdesc; + opdesc.SetType("cast"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("in_dtype", in_dtype); + opdesc.SetAttr("out_dtype", out_dtype); + + auto op = CreateOp(opdesc, &scope); + + Tensor data; + data.Resize(DDim(shape)); + auto* copy_data = data.mutable_data(); + data.CopyDataFrom(*x); + x->set_precision(paddle::lite_api::PrecisionType::kFloat); + LaunchOp(op, {x_var_name}, {out_var_name}); + + // compare results + auto* out_data = out->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(static_cast(out_data[i]), copy_data[i], 5e-4); + } +} + +TEST(MLUBridges, cast) { + test_cast_FP16_to_FP32({2, 3, 4, 5}); + test_cast_FP16_to_FP32({6, 3, 2, 5}); + test_cast_FP32_to_FP16({2, 3, 4, 5}); + test_cast_FP32_to_FP16({6, 3, 2, 5}); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(cast, kMLU); diff --git a/lite/kernels/mlu/bridges/concat_op.cc b/lite/kernels/mlu/bridges/concat_op.cc index 14f0da746a00c1ea10ffae824217dbb2df84df55..1d566639937d79cf1c98c70bfc1294d874fb89c4 100644 --- a/lite/kernels/mlu/bridges/concat_op.cc +++ b/lite/kernels/mlu/bridges/concat_op.cc @@ -44,9 +44,10 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto dims = output_dims.size(); int axis = (param_axis < 0) ? 
(param_axis + dims) : param_axis; - CHECK_LE(axis, 4) << "Unsupport dims in mlu concat"; - int nchw_to_nhwc_axis_map[4] = {0, 3, 1, 2}; - int nhwc_axis = nchw_to_nhwc_axis_map[axis]; + CHECK_LT(axis, dims) << "Unsupport dims in mlu concat"; + // value of nhwc2nchw_axis is index of nhwc + // order of nhwc2nchw_axis is nchw + int nhwc_axis = GetAxisNHWC2NCHW(dims)[axis]; auto output_tensor = graph->AddNode( out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); @@ -60,6 +61,7 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) { &outputs, 1)); graph->FuseOp(concat_op); + CNML_CALL(cnmlDestroyBaseOp(&concat_op)); return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/conv_op.cc b/lite/kernels/mlu/bridges/conv_op.cc index e7e21f7ad2f64275746e015289c9372368e46f5c..84c5bd5638585a5b5e1e22308c9ddf3c06acd9e9 100644 --- a/lite/kernels/mlu/bridges/conv_op.cc +++ b/lite/kernels/mlu/bridges/conv_op.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "lite/operators/conv_op.h" + #include + #include "lite/kernels/mlu/bridges/graph.h" #include "lite/kernels/mlu/bridges/utility.h" #include "lite/kernels/npu/bridges/registry.h" @@ -30,6 +32,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { const auto* op_info = op->op_info(); const auto* scope = op->scope(); VLOG(3) << "[MLU] Converting " << op_info->Type() << "... "; + CHECK(!op_info->HasAttr("act_type")); // get input, filter and op attributes const auto input_var_name = op_info->Input("Input").front(); @@ -43,8 +46,13 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { const auto output_shape = output->dims().Vectorize(); const auto bs = input_dims[0]; const auto oc = filter_dims[0]; + const auto groups = op_info->GetAttr("groups"); + CHECK_EQ(input_dims.size(), 4u); CHECK_EQ(filter_dims.size(), 4u); + CHECK(!(op_info->HasAttr("fuse_relu") && + (op_info->GetAttr("fuse_relu") == true))) + << "UnSupported param fuse_relu is true!"; const auto strides = op_info->GetAttr>("strides"); auto dilations = op_info->GetAttr>("dilations"); auto paddings = op_info->GetAttr>("paddings"); @@ -70,13 +78,32 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { padding_algorithm, input_dims, filter_dims); + bool is_group_mode = groups > 1; + bool is_depthwise_mode = false; + if (filter_dims[0] == groups && filter_dims[1] == 1 && dilations[0] == 1 && + dilations[1] == 1) { // depthwise filter shape = {1, ic ,kh ,kw} + is_depthwise_mode = true; + is_group_mode = false; + } + + auto input_tensor = graph->GetNode(input_var_name); const auto output_tensor = graph->AddNode( output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType()); + std::vector cnml_filter_shape = { + filter_dims[0], filter_dims[1], filter_dims[2], filter_dims[3]}; + if (is_depthwise_mode) { + /*paddle filter shape is {oc , ic / groups == 1, kh, kw} while + cnml depthwise conv filter expect shape {oc / groups == 1 , ic , kh, kw} + so we should shape filter shape + */ + cnml_filter_shape = { + filter_dims[1], filter_dims[0], filter_dims[2], filter_dims[3]}; + } // Create filter node const auto filter_tensor = graph->AddNode(filter_var_name, - filter_dims.Vectorize(), + cnml_filter_shape, CNML_FILTER, CNML_NCHW, graph->FPType()); @@ -89,15 +116,15 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { dequant(filter_dequant.data(), filter->mutable_data(), 1, - filter_dims[0], - filter_dims[1] * filter_dims[2] * filter_dims[3], + cnml_filter_shape[0], + cnml_filter_shape[1] * cnml_filter_shape[2] * 
cnml_filter_shape[3], weight_scale); transpose(filter_dequant.data(), filter->mutable_data(), - {static_cast(filter_dims[0]), - static_cast(filter_dims[1]), - static_cast(filter_dims[2]), - static_cast(filter_dims[3])}, + {static_cast(cnml_filter_shape[0]), + static_cast(cnml_filter_shape[1]), + static_cast(cnml_filter_shape[2]), + static_cast(cnml_filter_shape[3])}, {0, 2, 3, 1}); filter->set_precision(PrecisionType::kFloat); } else if (filter->precision() != PrecisionType::kFloat) { @@ -116,7 +143,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { std::vector bias_shape; if (bias_data_size == oc) { // 0: {oc} - bias_shape = {oc}; + bias_shape = {1, 1, 1, oc}; } else if (bias_data_size == output_data_size / bs) { LOG(FATAL) << "Unsupported ... ..."; // 1: {1, oc, oh, ow} @@ -130,18 +157,15 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { << " isn't supported in conv2d Op when output dimension is " << output_dims; } - bias_tensor = graph->AddNode(bias_var_name, - bias_dims.Vectorize(), - CNML_CONST, - CNML_CNHW, - graph->FPType()); + bias_tensor = graph->AddNode( + bias_var_name, bias_shape, CNML_CONST, CNML_NHWC, graph->FPType()); graph->BindConstData(bias_var_name, bias); } const auto input_scale = op_info->GetAttr("input_scale"); bool use_first_conv = false; - if (lite::DeviceInfo::Global().UseFirstConv() && input_dims[1] == 3) { + if (lite::TargetWrapperMlu::UseFirstConv() && input_dims[1] == 3) { use_first_conv = true; } @@ -158,38 +182,75 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { paddings[0], paddings[0])); const auto mean_tensor = graph->AddNode("first_conv_mean_tensor", - std::vector{3}, + std::vector{1, 1, 1, 3}, CNML_CONST, - CNML_CNHW, + CNML_NHWC, graph->FPType()); const auto std_tensor = graph->AddNode("first_conv_std_tensor", - std::vector{3}, + std::vector{1, 1, 1, 3}, CNML_CONST, - CNML_CNHW, + CNML_NHWC, graph->FPType()); graph->BindConstRawData("first_conv_mean_tensor", - lite::DeviceInfo::Global().MeanVec().data(), + lite::TargetWrapperMlu::MeanVec().data(), 3, false); graph->BindConstRawData("first_conv_std_tensor", - lite::DeviceInfo::Global().StdVec().data(), + lite::TargetWrapperMlu::StdVec().data(), 3, false); - graph->GetNode(input_var_name)->set_mlu_dtype(CNML_DATA_UINT8); + input_tensor->set_mlu_dtype(CNML_DATA_UINT8); CNML_CALL(cnmlCreateConvFirstOpForward( &conv_op, conv_param, - graph->GetNode(input_var_name)->mlu_tensor(), + input_tensor->mlu_tensor(), mean_tensor->mlu_tensor(), output_tensor->mlu_tensor(), filter_tensor->mlu_tensor(), bias_tensor ? bias_tensor->mlu_tensor() : nullptr, std_tensor->mlu_tensor())); CNML_CALL(cnmlDestroyConvFirstOpParam(&conv_param)); + } else if (is_depthwise_mode) { + cnmlConvDepthwiseOpParam_t conv_depthwise_param; + cnmlCreateConvDepthwiseOpParam_V2(&conv_depthwise_param, + strides[0], + strides[1], + paddings[0] * 2, + paddings[2] * 2); + CNML_CALL(cnmlCreateConvDepthwiseOpForward( + &conv_op, + conv_depthwise_param, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + filter_tensor->mlu_tensor(), + bias_tensor ? 
bias_tensor->mlu_tensor() : nullptr)); + CNML_CALL(cnmlDestroyConvDepthwiseOpParam(&conv_depthwise_param)); + } else if (is_group_mode) { + cnmlConvOpParam_t conv_param; + CNML_CALL(cnmlCreateConvOpParam(&conv_param, + strides[0], + strides[1], + dilations[0], + dilations[1], + paddings[0] * 2, + paddings[2] * 2)); + CNML_CALL(cnmlCreateConvGroupOpForward( + &conv_op, + conv_param, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + filter_tensor->mlu_tensor(), + bias_tensor ? bias_tensor->mlu_tensor() : nullptr, + groups)); + CNML_CALL(cnmlDestroyConvOpParam(&conv_param)); } else { cnmlConvOpParam_t conv_param; + VLOG(5) << "conv param (" << input_var_name << ")" + << "stride: " << strides[0] << ',' << strides[1] << '\t' + << "dilations: " << dilations[0] << ',' << dilations[1] << '\t' + << "paddings: " << paddings[0] << ',' << paddings[2] << std::endl; CNML_CALL(cnmlCreateConvOpParam(&conv_param, strides[0], strides[1], @@ -200,19 +261,21 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { CNML_CALL(cnmlCreateConvOpForward( &conv_op, conv_param, - graph->GetNode(input_var_name)->mlu_tensor(), + input_tensor->mlu_tensor(), output_tensor->mlu_tensor(), filter_tensor->mlu_tensor(), bias_tensor ? bias_tensor->mlu_tensor() : nullptr)); CNML_CALL(cnmlDestroyConvOpParam(&conv_param)); } - graph->SetComputingDataType( - conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale); - graph->SetComputingDataType( - conv_op, - filter_tensor->mlu_tensor(), - 1 / *min_element(weight_scale.begin(), weight_scale.end())); + if (!is_depthwise_mode) { + graph->SetComputingDataType( + conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale); + graph->SetComputingDataType( + conv_op, + filter_tensor->mlu_tensor(), + 1 / *max_element(weight_scale.begin(), weight_scale.end())); + } CNML_CALL(cnmlSetOperationComputingLayout(conv_op, CNML_NHWC)); if (HasInputArg(op_info, scope, "Bias")) { auto* bias = scope->FindVar(bias_var_name)->GetMutable(); @@ -220,6 +283,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { } graph->BindConstData(filter_var_name, filter); graph->FuseOp(conv_op); + CNML_CALL(cnmlDestroyBaseOp(&conv_op)); return REBUILD_WHEN_SHAPE_CHANGED; } diff --git a/lite/kernels/mlu/bridges/conv_op_test.cc b/lite/kernels/mlu/bridges/conv_op_test.cc index 1b04814d7d88d227d0bb3e0b58aef26d62f06966..ddaf5b321ffd2af1fbd91af6cf15b5c7789cbba3 100644 --- a/lite/kernels/mlu/bridges/conv_op_test.cc +++ b/lite/kernels/mlu/bridges/conv_op_test.cc @@ -13,8 +13,11 @@ // limitations under the License. #include "lite/operators/conv_op.h" + #include + #include + #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" #include "lite/kernels/mlu/bridges/test_helper.h" @@ -331,6 +334,10 @@ TEST(MLUBridges, conv) { #endif } +TEST(MLUBridges, depthwise_conv2d) { + test_conv(1, 8, 8, 14, 14, false, false, false, true, 1, 1, 2, 3); +} + } // namespace mlu } // namespace subgraph } // namespace lite diff --git a/lite/kernels/mlu/bridges/dropout_op.cc b/lite/kernels/mlu/bridges/dropout_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..9aa296236e05a0c80ed9b7001f940cce99b019f7 --- /dev/null +++ b/lite/kernels/mlu/bridges/dropout_op.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int DropoutConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Create act node and set params from op + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + /* auto mask_var_name = op_info->Output("Mask").front(); */ + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + /* auto mask = scope->FindVar(mask_var_name)->GetMutable(); */ + /* auto mask_dims = mask->dims().Vectorize(); */ + /* auto mask_tensor = graph->AddNode( */ + /* mask_var_name, mask_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); */ + + // is_test is true by default + // if(op_info->HasAttr("is_test")){ + // auto is_test = op_info->GetAttr("is_test"); + // CHECK(is_test != true); + // } + + // Param fix_seed and seed is useless in MLU + + auto dropout_implementation = + op_info->GetAttr("dropout_implementation"); + auto dropout_prob = op_info->GetAttr("dropout_prob"); + float alpha = 1.0f - dropout_prob; + if (dropout_implementation == "upscale_in_train") { + alpha = 1.; + } + float beta = 0.; + + std::vector shape = {1, 1, 1, 1}; + std::string alpha_var_name = string_format("dropout_alpha_%p", op); + std::string beta_var_name = string_format("dropout_beta_%p", op); + auto alpha_tensor = graph->AddNode( + alpha_var_name, shape, CNML_CONST, CNML_NHWC, graph->FPType()); + auto beta_tensor = graph->AddNode( + beta_var_name, shape, CNML_CONST, CNML_NHWC, graph->FPType()); + + graph->BindConstRawData(alpha_var_name, &alpha, 1); + graph->BindConstRawData(beta_var_name, &beta, 1); + + auto input_tensor = graph->GetNode(x_var_name); + cnmlBaseOp_t scale_op; + CNML_CALL(cnmlCreateScaleOp(&scale_op, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + alpha_tensor->mlu_tensor(), + beta_tensor->mlu_tensor())); + graph->FuseOp(scale_op); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(dropout, + kMLU, + paddle::lite::subgraph::mlu::DropoutConverter); diff --git a/lite/kernels/mlu/bridges/dropout_op_test.cc b/lite/kernels/mlu/bridges/dropout_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..44f03e3051a6c568d541b98b64808e27470d8916 --- /dev/null +++ b/lite/kernels/mlu/bridges/dropout_op_test.cc @@ -0,0 +1,158 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/dropout_op.h" +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void dropout_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + auto dropout_implementation = + op_info->GetAttr("dropout_implementation"); + auto dropout_prob = op_info->GetAttr("dropout_prob"); + float alpha = 1.0f - dropout_prob; + if (dropout_implementation == "upscale_in_train") { + alpha = 1.; + } + float beta = 0.; + + auto x_data = x->data(); + auto out_data = out->mutable_data(); + DDim x_dims = x->dims(); + DDim out_dims = out->dims(); + CHECK_EQ(x_dims.production(), out_dims.production()); + for (int i = 0; i < out_dims.production(); i++) { + out_data[i] = x_data[i] * alpha + beta; + } +} + +void test_dropout(int bs, + int ic, + int ih, + int iw, + std::string dropout_implementation, + float dropout_prob, + float bias) { + // prepare input&output variables + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + std::string mask_var_name("mask"); + std::string out_ref_var_name("out_ref"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* mask = scope.Var(mask_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize({bs, ic, ih, iw}); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + bool is_test = true; + bool fix_seed = false; + int seed = 0; + cpp::OpDesc opdesc; + opdesc.SetType("dropout"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetOutput("Mask", {mask_var_name}); + opdesc.SetAttr("is_test", is_test); + opdesc.SetAttr("fix_seed", fix_seed); + opdesc.SetAttr("seed", seed); + opdesc.SetAttr("dropout_implementation", dropout_implementation); + opdesc.SetAttr("dropout_prob", dropout_prob); + VLOG(6) << "mask: " << mask->dims()[0] << std::endl; + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + dropout_ref(op); + out_ref->CopyDataFrom(*out); + + Tensor input_trans; + input_trans.Resize({bs, ic, ih, iw}); + transpose(x->mutable_data(), + input_trans.mutable_data(), + {bs, ic, ih, iw}, + {0, 2, 3, 1}); + auto os = out->dims(); + out->Resize({static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}); + x->CopyDataFrom(input_trans); + x->Resize({bs, ih, iw, ic}); + + LaunchOp(op, {x_var_name}, {out_var_name}); + + // execute reference implementation and save to output tensor('out') + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + Tensor output_trans; + output_trans.Resize(os); + transpose(out_data, + output_trans.mutable_data(), + {static_cast(os[0]), + 
static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}, + {0, 3, 1, 2}); + out_data = output_trans.mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(MLUBridges, dropout) { + for (auto bs : {1, 3}) { + for (auto ic : {1, 3}) { + for (auto ih : {3, 4}) { + for (auto iw : {4, 3}) { + for (auto dropout_implementation : + {"downgrade_in_infer", "upscale_in_train"}) { + for (auto dropout_prob : {0.f, 1.0f}) { + VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih + << " iw: " << iw + << " dropout_implementation: " << dropout_implementation + << " dropout_prob: " << dropout_prob; + test_dropout( + bs, ic, ih, iw, dropout_implementation, dropout_prob, 0.); + } + } + } + } + } + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(dropout, kMLU); diff --git a/lite/kernels/mlu/bridges/elementwise_ops.cc b/lite/kernels/mlu/bridges/elementwise_ops.cc index 41526a0100ba71be9eda25983cb96aa888d6cf4d..5f7192a0628a7887dbca15d63f1ba22799d7ee4b 100644 --- a/lite/kernels/mlu/bridges/elementwise_ops.cc +++ b/lite/kernels/mlu/bridges/elementwise_ops.cc @@ -23,7 +23,7 @@ namespace mlu { std::vector CvtYShape(const Tensor& x, Tensor* y, int axis) { auto x_dims = x.dims(); - CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x"; + // CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x"; auto y_dims = y->dims(); CHECK_GE(x_dims.size(), y_dims.size()); @@ -117,6 +117,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { } graph->FuseOp(elementwise_op); + CNML_CALL(cnmlDestroyBaseOp(&elementwise_op)); cnmlBaseOp_t act_op; if (op_type == "fusion_elementwise_add_activation") { auto mid_tensor = graph->GetNode(out_var_name + "_mid"); @@ -127,6 +128,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { mid_tensor->mlu_tensor(), output_tensor->mlu_tensor())); graph->FuseOp(act_op); + CNML_CALL(cnmlDestroyBaseOp(&act_op)); } return REBUILD_WHEN_SHAPE_CHANGED; } diff --git a/lite/kernels/mlu/bridges/elementwise_ops_test.cc b/lite/kernels/mlu/bridges/elementwise_ops_test.cc index e5087dd708eee3ba255fbfa0383d31b12a6b6870..7844e5b1b57567f72750b21ba288547cb165eb54 100644 --- a/lite/kernels/mlu/bridges/elementwise_ops_test.cc +++ b/lite/kernels/mlu/bridges/elementwise_ops_test.cc @@ -153,7 +153,7 @@ void test_elementwise_add(const std::vector& x_shape, opdesc.SetOutput("Out", {out_var_name}); opdesc.SetAttr("axis", axis); - // create and convert op to NPU model, then run it on NPU + // create and convert op to MLU model, then run it on MLU auto op = CreateOp(opdesc, &scope); // execute reference implementation and save to output tensor diff --git a/lite/kernels/mlu/bridges/fc_op.cc b/lite/kernels/mlu/bridges/fc_op.cc index 286feec8d4d44eaa025f333d559c32ca72f042ff..ed9ef7edd002ad0476efb84b34239ce07641538a 100644 --- a/lite/kernels/mlu/bridges/fc_op.cc +++ b/lite/kernels/mlu/bridges/fc_op.cc @@ -34,7 +34,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto w_var_name = op_info->Input("W").front(); auto output_var_name = op_info->Output("Out").front(); - // int in_num_col_dims = op_info->GetAttr("in_num_col_dims"); + CHECK(!op_info->HasAttr("activation_type")); auto x = scope->FindVar(x_var_name)->GetMutable(); auto w = scope->FindVar(w_var_name)->GetMutable(); auto output = scope->FindVar(output_var_name)->GetMutable(); @@ -45,9 +45,28 @@ int FCConverter(void* ctx, 
OpLite* op, KernelBase* kernel) { CHECK_EQ(w_dims.size(), 2UL); // Create w node - std::vector w_shape{w_dims[1], w_dims[0]}; + std::vector cnml_w_shape; + if (x_dims.size() == 4) { + if (x_dims[1] * x_dims[2] * x_dims[3] == w_dims[0]) { + cnml_w_shape = { + static_cast(w_dims[1]), + static_cast(x_dims[1]), // input_c + static_cast(x_dims[2]), // input_h + static_cast(x_dims[3]), // input_w + }; + } else { + LOG(FATAL) + << "in fc op, we expect input_h * input_w * input_c == filter_c" + << " but we got input_c = " << x_dims[1] << " input_h = " << x_dims[2] + << " input_w = " << x_dims[3] << " filter_c = " << w_dims[0] + << std::endl; + } + } else { + cnml_w_shape = {w_dims[1], w_dims[0]}; + } + auto w_tensor = graph->AddNode( - w_var_name, w_shape, CNML_FILTER, CNML_NCHW, graph->FPType()); + w_var_name, cnml_w_shape, CNML_FILTER, CNML_NCHW, graph->FPType()); auto input_scale = op_info->GetAttr("input_scale"); @@ -63,15 +82,15 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { if (HasInputArg(op_info, scope, "Bias")) { bias_var_name = op_info->Input("Bias").front(); auto bias = scope->FindVar(bias_var_name)->GetMutable(); - auto bias_dims = bias->dims(); + auto bias_dims = bias->dims().Vectorize(); CHECK(!graph->HasNode(bias_var_name)); + if (bias_dims.size() < 4u) { + bias_dims.insert(bias_dims.begin(), 4 - bias_dims.size(), 1); + } // CHECK_EQ(bias_dims.production(), n); - bias_tensor = graph->AddNode(bias_var_name, - bias_dims.Vectorize(), - CNML_CONST, - CNML_CNHW, - graph->FPType()); + bias_tensor = graph->AddNode( + bias_var_name, bias_dims, CNML_CONST, CNML_NHWC, graph->FPType()); graph->BindConstData(bias_var_name, bias); } cnmlBaseOp_t fc_op; @@ -88,18 +107,46 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { if (w->precision() == PrecisionType::kUnk || w->precision() == PrecisionType::kInt8) { std::vector w_dequant(w->data_size()); - dequant(w_dequant.data(), - w->mutable_data(), - 1, - w_dims[1], - w_dims[0], - weight_scale); - for (int i = 0; i < w_dims[1]; i++) { - for (int j = 0; j < w_dims[0]; j++) { - w->mutable_data()[i * w_dims[0] + j] = - w_dequant[i + j * w_dims[1]]; - } + if (cnml_w_shape.size() == 2) { + dequant(w_dequant.data(), + w->mutable_data(), + 1, + cnml_w_shape[0], + cnml_w_shape[1], + weight_scale); + transpose2d(w_dequant.data(), + w->mutable_data(), + {static_cast(cnml_w_shape[0]), + static_cast(cnml_w_shape[1])}); + } else if (cnml_w_shape.size() == 4) { + dequant(w_dequant.data(), + w->mutable_data(), + 1, + cnml_w_shape[0], + cnml_w_shape[1] * cnml_w_shape[2] * cnml_w_shape[3], + weight_scale); + + int c_o_num = cnml_w_shape[0]; + int c_i_num = cnml_w_shape[1]; + int h_i_num = cnml_w_shape[2]; + int w_i_num = cnml_w_shape[3]; + + // chw == ci * hi * wi == w_dim[0] + // first trans [chw, co] -> [co,chw] + std::vector first_trans_output(w_dequant.size()); + int chw = c_i_num * h_i_num * w_i_num; + transpose2d(w_dequant.data(), first_trans_output.data(), {chw, c_o_num}); + + // second trans [co,ci,hi,wi] -> [co,hi,wi,ci] + transpose(first_trans_output.data(), + w->mutable_data(), + {c_o_num, c_i_num, h_i_num, w_i_num}, + {0, 2, 3, 1}); + } else { + LOG(FATAL) << "expect w_shape.size == 2 or 4, but got " + << cnml_w_shape.size() << std::endl; } + w->set_precision(PrecisionType::kFloat); } else if (w->precision() != PrecisionType::kFloat) { LOG(FATAL) << "UnSupported weight precision!"; @@ -110,9 +157,10 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { graph->SetComputingDataType( fc_op, w_tensor->mlu_tensor(), - 1 / 
*min_element(weight_scale.begin(), weight_scale.end())); + 1 / *max_element(weight_scale.begin(), weight_scale.end())); graph->FuseOp(fc_op); + CNML_CALL(cnmlDestroyBaseOp(&fc_op)); return REBUILD_WHEN_SHAPE_CHANGED; } diff --git a/lite/kernels/mlu/bridges/fc_op_test.cc b/lite/kernels/mlu/bridges/fc_op_test.cc index fe1c889f431350b4175ac400aefe77e6392405c5..af856a55a2ddc563d210af3b4ef0e669b32f5a57 100644 --- a/lite/kernels/mlu/bridges/fc_op_test.cc +++ b/lite/kernels/mlu/bridges/fc_op_test.cc @@ -175,9 +175,9 @@ void test_fc(const std::vector& input_shape, TEST(MLUBridges, fc) { for (bool use_bias : {true, false}) { - // test_fc({1, 8, 8, 1}, {64, 4}, 1, use_bias); - // test_fc({1, 5, 5, 1}, {25, 7}, 1, use_bias); - // test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias); + test_fc({1, 8, 8, 1}, {64, 4}, 1, use_bias); + test_fc({1, 5, 5, 1}, {25, 7}, 1, use_bias); + test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias); test_fc({1, 1024, 1, 1}, {1024, 32}, 1, use_bias); } } diff --git a/lite/kernels/mlu/bridges/flatten_op.cc b/lite/kernels/mlu/bridges/flatten_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..faf7e6fd2801cdcaad4bce0a20921843f1d1b516 --- /dev/null +++ b/lite/kernels/mlu/bridges/flatten_op.cc @@ -0,0 +1,124 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
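+//
+// MLU bridge for flatten/flatten2. Graph tensors are laid out as NHWC, so
+// the converter fuses three ops: a transpose back to NCHW, an ND reshape to
+// the flattened output shape, and a final transpose that returns the result
+// to NHWC.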
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int FlattenConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + + // ================== Trans1: NHWC => NCHW =========================== + auto input_tensor = graph->GetNode(x_var_name); + auto trans_1_axis = std::move(GetAxisNHWC2NCHW(x->dims().size())); + auto trans1_out = graph->AddNode(x_var_name + ".trans.i", + x->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + graph->FPType(), + CNML_NCHW); + cnmlBaseOp_t trans1_op{nullptr}; + cnmlNdTransposeOpParam_t trans1_param{nullptr}; + CNML_CALL(cnmlCreateNdTransposeOpParam( + &trans1_param, trans_1_axis.data(), trans_1_axis.size())); + CNML_CALL(cnmlCreateNdTransposeProOp(&trans1_op, + input_tensor->mlu_tensor(), + trans1_out->mlu_tensor(), + trans1_param)); + // ======================== Trans1 End ================================== + + // ======================= Flatten op =================================== + cnmlBaseOp_t flatten_op; + auto trans2_input = graph->AddNode(out_var_name + ".trans.o", + output_dims, + CNML_TENSOR, + CNML_NCHW, + graph->FPType(), + CNML_NCHW); + int cnml_trans2_input_shape[4]; + CNML_CALL( + cnmlGetTensorShape(trans2_input->mlu_tensor(), cnml_trans2_input_shape)); + cnmlReshapeOpParam_t reshape_param{nullptr}; + CNML_CALL(cnmlCreateNdReshapeOpParam( + &reshape_param, cnml_trans2_input_shape, output->dims().size())); + + // Use cnmlCreatexxxOpForward to create op. 
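+  // The reshape reads the NCHW tensor produced by Trans1 and writes into
+  // trans2_input, whose shape (queried above) defines reshape_param; Trans2
+  // below restores the NHWC order the rest of the graph expects.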
+ CNML_CALL(cnmlCreateReshapeOp(&flatten_op, + reshape_param, + trans1_out->mlu_tensor(), + trans2_input->mlu_tensor())); + // ======================= Flatten End =================================== + + // ================== Trans2: NCHW => NHWC =============================== + auto trans_2_axis = std::move(GetAxisNCHW2NHWC(output->dims().size())); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + cnmlBaseOp_t trans2_op{nullptr}; + cnmlNdTransposeOpParam_t trans2_param{nullptr}; + CNML_CALL(cnmlCreateNdTransposeOpParam( + &trans2_param, trans_2_axis.data(), trans_2_axis.size())); + CNML_CALL(cnmlCreateNdTransposeProOp(&trans2_op, + trans2_input->mlu_tensor(), + output_tensor->mlu_tensor(), + trans2_param)); + // ======================== Trans2 End ================================== + + // ============== DEBUG LOG =============== + + VLOG(6) << "x_var_name: " << x_var_name; + VLOG(6) << "out_var_name: " << out_var_name; + VLOG(6) << "input dim: " << x->dims(); + VLOG(6) << "output dim: " << output->dims(); + // cnmlPrintTensor(input_tensor->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(trans1_out->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(trans2_input->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(output_tensor->mlu_tensor(), CNML_TENSOR); + // ============== DEBUG END =============== + graph->FuseOp(trans1_op); + graph->FuseOp(flatten_op); + graph->FuseOp(trans2_op); + CNML_CALL(cnmlDestroyBaseOp(&trans1_op)); + CNML_CALL(cnmlDestroyBaseOp(&flatten_op)); + CNML_CALL(cnmlDestroyBaseOp(&trans2_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(flatten, + kMLU, + paddle::lite::subgraph::mlu::FlattenConverter); +REGISTER_SUBGRAPH_BRIDGE(flatten2, + kMLU, + paddle::lite::subgraph::mlu::FlattenConverter); diff --git a/lite/kernels/mlu/bridges/flatten_op_test.cc b/lite/kernels/mlu/bridges/flatten_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..190b837ffeecfd494ffbd748220207cd63da5c06 --- /dev/null +++ b/lite/kernels/mlu/bridges/flatten_op_test.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/flatten_op.h" + +#include + +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void test_flatten(std::vector input_shape, int axis) { + // prepare input&output variables + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(input_shape); + Tensor x_cpu; + + // initialize input&output data + FillTensor(x); + x_cpu.CopyDataFrom(*x); + + Tensor input_trans; + input_trans.Resize(input_shape); + transpose(x->mutable_data(), + input_trans.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_trans); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("flatten2"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axis", axis); + auto op = CreateOp(opdesc, &scope); + + LaunchOp(op, {x_var_name}, {out_var_name}); + // compare results + auto* out_data = out->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], x_cpu.mutable_data()[i], 1e-5); + } +} + +TEST(MLUBridges, flatten) { test_flatten({1, 2, 4, 4}, 2); } +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(flatten, kMLU); +USE_SUBGRAPH_BRIDGE(flatten2, kMLU); diff --git a/lite/kernels/mlu/bridges/gather_op.cc b/lite/kernels/mlu/bridges/gather_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b68f1af76456eede14ec550c623d6a8355f5d5e8 --- /dev/null +++ b/lite/kernels/mlu/bridges/gather_op.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int GatherConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("X").front(); + auto index_var_name = op_info->Input("Index").front(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + + CHECK(graph->HasNode(x_var_name)); + auto x_tensor = graph->GetNode(x_var_name); + auto index_tensor = graph->GetNode(index_var_name); + + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + + cnmlBaseOp_t gather_op; + CNML_CALL(cnmlCreateGatherV2Op(&gather_op, + x_tensor->mlu_tensor(), + index_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + CNML_DIM_N)); + graph->FuseOp(gather_op); + CNML_CALL(cnmlDestroyBaseOp(&gather_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(gather, + kMLU, + paddle::lite::subgraph::mlu::GatherConverter); diff --git a/lite/kernels/mlu/bridges/gather_op_test.cc b/lite/kernels/mlu/bridges/gather_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..413de7c9d7fda750b387c2daa21ef1e40e7982c7 --- /dev/null +++ b/lite/kernels/mlu/bridges/gather_op_test.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/gather_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +template +void gather_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto index = + scope->FindVar(op_info->Input("Index").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + + auto x_dims = x->dims(); + auto index_dims = index->dims(); + CHECK(index_dims.size() == 1 || + (index_dims.size() == 2 && index_dims[1] == 1)); + + int batch_size = index_dims[0]; + DDim out_dims = x_dims; + out_dims[0] = batch_size; + out->Resize(out_dims); + + auto x_data = x->data(); + auto index_data = index->data(); + auto out_data = out->mutable_data(); + + auto slice_num = x_dims[0]; + auto slice_size = x_dims.Slice(1, x_dims.size()).production(); + for (int i = 0; i < batch_size; i++) { + auto index = index_data[i]; + CHECK_LT(index, slice_num) << "index <= slice_num"; + CHECK_GE(index, 0) << "index > 0"; + memcpy(out_data + i * slice_size, + x_data + index * slice_size, + slice_size * sizeof(float)); + } +} + +void test_gather() { + // prepare input&output variables + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + std::string index_var_name = "index"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + auto* index = scope.Var(index_var_name)->GetMutable(); + + x->Resize({5, 4, 3, 2}); + index->Resize({2}); + // initialize input&output data + FillTensor(x); + FillTensor(index, 1, 3); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("gather"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetInput("Index", {index_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + auto op = CreateOp(opdesc, &scope); + gather_ref(op); + out_ref->CopyDataFrom(*out); + + Tensor input; + input.Resize({5, 4, 3, 2}); + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(5), + static_cast(4), + static_cast(3), + static_cast(2)}, + {0, 2, 3, 1}); + x->CopyDataFrom(input); + LaunchOp(op, {x_var_name, index_var_name}, {out_var_name}); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + + Tensor output; + output.Resize(out->dims()); + transpose(out_data, + output.mutable_data(), + {static_cast(out->dims()[0]), + static_cast(out->dims()[2]), + static_cast(out->dims()[3]), + static_cast(out->dims()[1])}, + {0, 3, 1, 2}); + out_data = output.mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], out_ref_data[i], 5e-4); + } +} + +TEST(MLUBridges, gather) { test_gather(); } + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(gather, kMLU); diff --git a/lite/kernels/mlu/bridges/graph.cc b/lite/kernels/mlu/bridges/graph.cc index 65c2f8214c13ee8d004dbe4b2e706523d007469c..bbe88547c8d60e1468653a28dad97af09b24f952 100644 --- a/lite/kernels/mlu/bridges/graph.cc +++ b/lite/kernels/mlu/bridges/graph.cc @@ -27,10 +27,14 @@ 
std::shared_ptr Graph::AddNode(const std::string& name, cnmlTensorType_t tensor_type, cnmlDataOrder_t shape_order, cnmlDataType_t mlu_dtype, + cnmlDataOrder_t data_order, void* raw_ptr) { CHECK(!HasNode(name)); + VLOG(5) << "add mlu node: " << name << "\t data type " + << static_cast(mlu_dtype) << "\t data order " + << static_cast(data_order); auto node = std::shared_ptr( - new MLUTensor(shape, tensor_type, shape_order, mlu_dtype)); + new MLUTensor(shape, tensor_type, shape_order, mlu_dtype, data_order)); node->set_mlu_ptr(raw_ptr); nodes_.insert(std::make_pair(name, node)); return node; diff --git a/lite/kernels/mlu/bridges/graph.h b/lite/kernels/mlu/bridges/graph.h index 2c6bd63a87e53332a329d0c5c66fcf372a2584ca..07c6b20efb9a72106cf6ae288c411e490345b089 100644 --- a/lite/kernels/mlu/bridges/graph.h +++ b/lite/kernels/mlu/bridges/graph.h @@ -15,13 +15,15 @@ #pragma once #include -#include #include #include +#include #include + #include "lite/core/op_lite.h" #include "lite/core/tensor.h" #include "lite/kernels/mlu/bridges/tensor.h" +#include "lite/utils/env.h" #define PRINT_HW_TIME false @@ -45,32 +47,30 @@ class Graph { CNRT_CALL(cnrtCreateNotifier(¬ifier_end_)); #endif } - ~Graph() { FreeConstData(); CNML_CALL(cnmlDestroyFusionOp(&fusion_op_)); - for (auto op : ops_) { - CNML_CALL(cnmlDestroyBaseOp(&op)); - } #if PRINT_HW_TIME CNRT_CALL(cnrtDestroyNotifier(¬ifier_start_)); CNRT_CALL(cnrtDestroyNotifier(¬ifier_end_)); double total_time = 0; - for (auto& f : time_log_) { - total_time += f; + if (!time_log_.empty()) { + for (auto& f : time_log_) { + total_time += f; + } + std::cout << "cnml hardware time for " << time_log_.size() + << " process:" << total_time / time_log_.size() << std::endl; } - std::cout << "cnml hardware time for " << time_log_.size() - << " process:" << total_time / time_log_.size() << std::endl; #endif } - // Data node std::shared_ptr AddNode( const std::string& name, std::vector shape, cnmlTensorType_t tensor_type = CNML_TENSOR, - cnmlDataOrder_t data_order = CNML_NCHW, + cnmlDataOrder_t shape_order = CNML_NCHW, cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32, + cnmlDataOrder_t data_order = CNML_NHWC, void* raw_ptr = nullptr); std::shared_ptr GetNode(const std::string& name) { @@ -82,9 +82,16 @@ class Graph { return nodes_.find(name) != nodes_.end(); } - void AddInput(std::shared_ptr tensor) { + void AddInput(std::shared_ptr tensor, + bool disable_batch_size_changeable = true) { inputs_.push_back(tensor->mlu_tensor()); input_tensors_.push_back(tensor); + if (!disable_batch_size_changeable) { + constexpr int input_dimNb = 4; + bool input_dim_mutable[4] = {true, false, false, false}; + CNML_CALL(cnmlSetTensorDimMutable( + tensor->mlu_tensor(), input_dim_mutable, input_dimNb)); + } } void AddOutput(std::shared_ptr tensor) { @@ -92,6 +99,22 @@ class Graph { output_tensors_.push_back(tensor); } + std::vector>* MutableInputs() { + return &input_tensors_; + } + + std::vector>* MutableOutputs() { + return &output_tensors_; + } + void GenOfflineModel(const std::string& name) { + cnmlModel_t model; + const std::string& symbol = "subnet0"; + const auto& filename = name + ".offline.cambricon"; + CNML_CALL(cnmlCreateModel(&model, filename.c_str())); + CNML_CALL(cnmlAddFusionOpToModel(model, fusion_op_, symbol.c_str())); + CNML_CALL(cnmlSaveModel(model, filename.c_str())); + CNML_CALL(cnmlDestroyModel(model)); + } void FuseOp(cnmlBaseOp_t op) { CNML_CALL(cnmlFuseOp(op, fusion_op_)); } void Compile(cnmlCoreVersion_t core_version, int core_number) { @@ -103,18 +126,37 @@ class Graph { 
CNML_CALL(cnmlSetFusionOpCorenum(fusion_op_, core_number)); CNML_CALL(cnmlSetFusionOpCoreVersion(fusion_op_, core_version)); CNML_CALL(cnmlCompileFusionOp_V2(fusion_op_)); - for (auto in : input_tensors_) { - input_addrs_.push_back(in->mlu_data()); - } - for (auto out : output_tensors_) { - output_addrs_.push_back(out->mlu_data()); - } } +#define MEASURE_HWTIME_START(que) \ + do { \ + CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que)); \ + } while (0) + +#define MEASURE_HWTIME_END(que) \ + do { \ + thread_local float hw_time; \ + CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que)); \ + CNRT_CALL(cnrtSyncQueue(que)); \ + CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time)); \ + hw_time /= 1000.0f; \ + DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl; \ + std::lock_guard lk(time_mut_); \ + time_log_.push_back(hw_time); \ + } while (0) + void Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) { + input_addrs_.resize(input_tensors_.size()); + output_addrs_.resize(output_tensors_.size()); + for (size_t i = 0; i < input_addrs_.size(); ++i) { + input_addrs_[i] = input_tensors_[i]->mlu_data(); + } + for (size_t i = 0; i < output_addrs_.size(); ++i) { + output_addrs_[i] = output_tensors_[i]->mlu_data(); + } + #if PRINT_HW_TIME - thread_local float hw_time; - CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que)); + MEASURE_HWTIME_START(que); #endif CNML_CALL(cnmlComputeFusionOpForward_V3(fusion_op_, input_addrs_.data(), @@ -124,18 +166,46 @@ class Graph { &forward_param, que)); #if PRINT_HW_TIME - CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que)); + MEASURE_HWTIME_END(que); #endif + } - CNRT_CALL(cnrtSyncQueue(que)); + void Compute(cnrtQueue_t que, + const std::vector>& in, + const std::vector>& out) { + std::vector in_tensor; + std::vector out_tensor; + input_addrs_.resize(in.size()); + output_addrs_.resize(out.size()); + for (size_t i = 0; i < input_addrs_.size(); ++i) { + input_addrs_[i] = in[i]->mlu_data(); + in_tensor.push_back(in[i]->mlu_tensor()); + } + for (size_t i = 0; i < output_addrs_.size(); ++i) { + output_addrs_[i] = out[i]->mlu_data(); + out_tensor.push_back(out[i]->mlu_tensor()); + } + +#if PRINT_HW_TIME + MEASURE_HWTIME_START(que); +#endif + /* Because of using cnmlSetTensorDimMutable, cnmlComputeFusionOpForward_V3 + * -> cnmlComputeFusionOpForward_V4 */ + CNML_CALL(cnmlComputeFusionOpForward_V4(fusion_op_, + &in_tensor[0], + input_addrs_.data(), + input_addrs_.size(), + &out_tensor[0], + output_addrs_.data(), + output_addrs_.size(), + que, + NULL)); #if PRINT_HW_TIME - CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time)); - hw_time /= 1000.0f; - DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl; - std::lock_guard lk(time_mut_); - time_log_.push_back(hw_time); + MEASURE_HWTIME_END(que); #endif } +#undef MEASURE_HWTIME_START +#undef MEASURE_HWTIME_END template void* RegisterConstData(size_t len) { @@ -165,7 +235,7 @@ class Graph { CNML_CALL(cnmlBindConstData_V2( nodes_[tensor_name]->mlu_tensor(), alloc_data, false)); } else if (fp_type_ == CNML_DATA_FLOAT16) { - void* data_fp16 = RegisterConstData<::paddle::lite::fluid::float16>(len); + void* data_fp16 = RegisterConstData(len); CNRT_CALL( cnrtCastDataType(const_cast(static_cast(data)), CNRT_FLOAT32, @@ -180,7 +250,7 @@ class Graph { } } - void BindConstData(std::string tensor_name, ::paddle::lite::Tensor* tensor) { + void BindConstData(std::string tensor_name, paddle::lite::Tensor* tensor) { const float* data = tensor->data(); size_t len = 
tensor->data_size(); if (fp_type_ == CNML_DATA_FLOAT32) { @@ -189,10 +259,14 @@ class Graph { const_cast(static_cast(data)), false)); } else if (fp_type_ == CNML_DATA_FLOAT16) { - auto* data_fp16 = tensor->mutable_data<::paddle::lite::fluid::float16>(); - for (size_t i = 0; i < len; ++i) { - data_fp16[i] = static_cast<::paddle::lite::fluid::float16>(data[i]); - } + void* data_fp16 = RegisterConstData(len); + CNRT_CALL( + cnrtCastDataType(const_cast(static_cast(data)), + CNRT_FLOAT32, + data_fp16, + CNRT_FLOAT16, + len, + nullptr)); CNML_CALL(cnmlBindConstData_V2(nodes_[tensor_name]->mlu_tensor(), static_cast(data_fp16), false)); @@ -206,19 +280,23 @@ class Graph { float scale, cnmlDataType_t data_type = CNML_DATA_INT8) { cnmlQuantizedParam_t quant_param; - CNML_CALL( - cnmlCreateQuantizedParam(&quant_param, scale2position(scale), 1, 0.0)); + int pos = scale2position(scale); + auto cnml_scale = pow(2, pos) * scale; + VLOG(5) << "[cnml quantized param] pos: " << pos + << "\tscale: " << cnml_scale << std::endl; + CNML_CALL(cnmlCreateQuantizedParam(&quant_param, pos, cnml_scale, 0.0)); CNML_CALL( cnmlSetOperationComputingDataType(op, tensor, data_type, quant_param)); CNML_CALL(cnmlDestroyQuantizedParam(&quant_param)); } - void SetFPType(::paddle::lite_api::PrecisionType type) { + void SetFPType(paddle::lite_api::PrecisionType type) { + origin_fp_type_ = type; switch (type) { - case ::paddle::lite_api::PrecisionType::kFP16: + case paddle::lite_api::PrecisionType::kFP16: fp_type_ = CNML_DATA_FLOAT16; break; - case ::paddle::lite_api::PrecisionType::kFloat: + case paddle::lite_api::PrecisionType::kFloat: fp_type_ = CNML_DATA_FLOAT32; break; default: @@ -230,14 +308,14 @@ class Graph { private: cnmlDataType_t fp_type_{CNML_DATA_FLOAT32}; - std::map> nodes_; + paddle::lite_api::PrecisionType origin_fp_type_{PRECISION(kFloat)}; + std::unordered_map> nodes_; std::vector inputs_; std::vector outputs_; std::vector input_addrs_; std::vector output_addrs_; std::vector> input_tensors_; std::vector> output_tensors_; - std::vector ops_; cnmlFusionOp_t fusion_op_; std::vector const_data_storage_; #if PRINT_HW_TIME diff --git a/lite/kernels/mlu/bridges/interpolate_op.cc b/lite/kernels/mlu/bridges/interpolate_op.cc index 2c1a2aeeff799d31d4328169fce058259543fb1f..32840736b8d9a9712d59a8175cd7d70311a34aad 100644 --- a/lite/kernels/mlu/bridges/interpolate_op.cc +++ b/lite/kernels/mlu/bridges/interpolate_op.cc @@ -85,6 +85,7 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { nn_param)); CNML_CALL(cnmlDestroyNearestNeighborOpParam(&nn_param)); graph->FuseOp(interp_op); + CNML_CALL(cnmlDestroyBaseOp(&interp_op)); return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/layout_op.cc b/lite/kernels/mlu/bridges/layout_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d14695c4357e06832e06a68646628bfa8d211c43 --- /dev/null +++ b/lite/kernels/mlu/bridges/layout_op.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int LayoutConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("Input").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + std::shared_ptr output_tensor; + + CHECK(graph->HasNode(x_var_name)); + std::vector axis; + auto x_tensor = graph->GetNode(x_var_name); + auto x_data_order = x_tensor->dorder(); + auto x_dims = x->dims().Vectorize(); + if (x_data_order == CNML_NCHW) { + switch (x_dims.size()) { + case 2: + axis = {0, 1}; + break; + case 3: + axis = {0, 2, 1}; + break; + case 4: + axis = {0, 2, 3, 1}; + break; + case 5: + axis = {0, 2, 3, 4, 1}; + break; + default: + CHECK(0) << "Unsupport shape"; + } + output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, x_tensor->dtype()); + VLOG(3) << "layout transpose nchw to nhwc" << std::endl; + } else { + switch (x_dims.size()) { + case 2: + axis = {0, 1}; + break; + case 3: + axis = {0, 2, 1}; + break; + case 4: + axis = {0, 3, 1, 2}; + break; + case 5: + axis = {0, 4, 1, 2, 3}; + break; + default: + CHECK(0) << "Unsupport shpae"; + } + VLOG(3) << "layout transpose nhwc to nchw" << std::endl; + output_tensor = graph->AddNode(out_var_name, + output_dims, + CNML_TENSOR, + CNML_NCHW, + x_tensor->dtype(), + CNML_NCHW); + } + cnmlBaseOp_t layout_op; + cnmlNdTransposeOpParam_t transpose_param; + CNML_CALL( + cnmlCreateNdTransposeOpParam(&transpose_param, axis.data(), axis.size())); + CNML_CALL(cnmlCreateNdTransposeProOp(&layout_op, + x_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + transpose_param)); + graph->FuseOp(layout_op); + CNML_CALL(cnmlDestroyBaseOp(&layout_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(layout, + kMLU, + paddle::lite::subgraph::mlu::LayoutConverter); diff --git a/lite/kernels/mlu/bridges/layout_op_test.cc b/lite/kernels/mlu/bridges/layout_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..69b905b0750fe99e29c6aaa9bffdc9f20229a239 --- /dev/null +++ b/lite/kernels/mlu/bridges/layout_op_test.cc @@ -0,0 +1,190 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/layout_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void test_layout_NHWC2NCHW(std::vector input_shape) { + // prepare input&output variables + std::string x_var_name = "input"; + std::string out_var_name = "out"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(DDim(input_shape)); + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("layout"); + opdesc.SetInput("Input", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + auto op = CreateOp(opdesc, &scope); + + // execute reference implementation and save to output tensor + Tensor input; + input.Resize(DDim(input_shape)); + switch (input_shape.size()) { + case 2: + transpose( + x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), static_cast(input_shape[1])}, + {0, 1}); + break; + case 3: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[2]), + static_cast(input_shape[1])}, + {0, 2, 1}); + break; + case 4: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[2]), + static_cast(input_shape[3]), + static_cast(input_shape[1])}, + {0, 3, 1, 2}); + break; + case 5: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[2]), + static_cast(input_shape[3]), + static_cast(input_shape[4]), + static_cast(input_shape[1])}, + {0, 4, 1, 2, 3}); + break; + default: + CHECK(0) << "Unsupport"; + } + auto* x_data = input.mutable_data(); + LaunchOp(op, {x_var_name}, {out_var_name}); + + // compare results + auto* out_data = out->mutable_data(); + + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], x_data[i], 5e-4); + } +} + +void test_layout_NCHW2NHWC(std::vector input_shape) { + // prepare input&output variables + std::string x_var_name = "input"; + std::string out_var_name = "out"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(DDim(input_shape)); + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("layout"); + opdesc.SetInput("Input", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + auto op = CreateOp(opdesc, &scope); + + // execute reference implementation and save to output tensor + Tensor input; + input.Resize(DDim(input_shape)); + switch (input_shape.size()) { + case 2: + transpose( + x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), static_cast(input_shape[1])}, + {0, 1}); + break; + case 3: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2])}, + {0, 2, 1}); + break; + case 4: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + break; + case 5: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + 
static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3]), + static_cast(input_shape[4])}, + {0, 2, 3, 4, 1}); + break; + default: + CHECK(0) << "Unsupport"; + } + auto* x_data = input.mutable_data(); + LaunchOp(op, {x_var_name}, {out_var_name}, CNML_NCHW); + + // compare results + auto* out_data = out->mutable_data(); + + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], x_data[i], 5e-4); + } +} + +TEST(MLUBridges, layout) { + test_layout_NHWC2NCHW({12, 32, 4}); + test_layout_NHWC2NCHW({12, 32, 44, 3}); + test_layout_NHWC2NCHW({12, 32, 44, 3, 6}); + test_layout_NCHW2NHWC({12, 32, 55}); + test_layout_NCHW2NHWC({12, 32, 44, 3}); + test_layout_NCHW2NHWC({12, 32, 44, 3, 8}); + test_layout_NHWC2NCHW({12, 32}); + test_layout_NCHW2NHWC({12, 32}); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(layout, kMLU); diff --git a/lite/kernels/mlu/bridges/lrn_op.cc b/lite/kernels/mlu/bridges/lrn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..657f0dd6781590e1a9ca90bf25e4efcf789863dd --- /dev/null +++ b/lite/kernels/mlu/bridges/lrn_op.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int LrnConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Create lrn node and get params from op + auto fp_type = graph->FPType(); + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, fp_type); + CHECK(graph->HasNode(x_var_name)); + auto input_tensor = graph->GetNode(x_var_name); + + auto alpha = op_info->GetAttr("alpha"); + auto beta = op_info->GetAttr("beta"); + auto k = op_info->GetAttr("k"); + if (op_info->HasAttr("norm_region")) { + CHECK(op_info->GetAttr("norm_region") == "AcrossChannels") + << "Unsuport WithinChannel"; + } + auto local_size = op_info->GetAttr("n"); + CHECK(op_info->HasAttr("input_scale")); + auto input_scale = op_info->GetAttr("input_scale"); + VLOG(5) << "lrn input scale: " << input_scale; + + cnmlLrnOpParam_t param; + cnmlBaseOp_t lrn_op; + CNML_CALL( + cnmlCreateLrnOpParam(¶m, CNML_LRN_V3, local_size, alpha, beta, k)); + CNML_CALL(cnmlCreateLrnOp( + &lrn_op, param, input_tensor->mlu_tensor(), output_tensor->mlu_tensor())); + CNML_CALL(cnmlDestroyLrnOpParam(¶m)); + + graph->SetComputingDataType( + lrn_op, input_tensor->mlu_tensor(), 1 / input_scale); + CNML_CALL(cnmlSetOperationComputingDataType( + lrn_op, output_tensor->mlu_tensor(), fp_type, nullptr)); + + graph->FuseOp(lrn_op); + CNML_CALL(cnmlDestroyBaseOp(&lrn_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(lrn, kMLU, paddle::lite::subgraph::mlu::LrnConverter); diff --git a/lite/kernels/mlu/bridges/lrn_op_test.cc b/lite/kernels/mlu/bridges/lrn_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..21f7e816baeac264bf1b43b7520d464afa38c395 --- /dev/null +++ b/lite/kernels/mlu/bridges/lrn_op_test.cc @@ -0,0 +1,242 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/lrn_op.h" +#include +#include +#include +#include +#include + +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +/** + * @brief get sum of x^2 between channels [size elements] + * + * @tparam float + * @param input + * @param channel_id: the c-th channel within n-th graph. + * @param offset_within_channel: the pixel's offset within a channel. + * @param offset_num: the first address of n-th graph. + * @param c + * @param h + * @param w + * @param size + * @return float + */ +float lrn_square(const float* input, + int channel_id, + int offset_within_channel, + int offset_num, + int c, + int h, + int w, + int size) { + int pre_pad = (size - 1) / 2; + float res = 0; + const float* src = input + offset_num; + + // handle left channels with padding situation. + if (channel_id - pre_pad < 0) { + for (int i = 0; i <= channel_id; ++i) { + res += src[i * h * w + offset_within_channel] * + src[i * h * w + offset_within_channel]; + } + } + + // handle left channels. + if (channel_id - pre_pad >= 0) { + for (int i = channel_id - pre_pad; i <= channel_id; ++i) { + res += src[i * h * w + offset_within_channel] * + src[i * h * w + offset_within_channel]; + } + } + + // handle right channels. + if (channel_id + pre_pad < c) { + for (int i = channel_id + 1; i <= channel_id + pre_pad; ++i) { + res += src[i * h * w + offset_within_channel] * + src[i * h * w + offset_within_channel]; + } + } + + // handle right channels with padding situation. + if (channel_id + pre_pad >= c && channel_id + 1 < c) { + for (int i = channel_id + 1; i < c; ++i) { + res += src[i * h * w + offset_within_channel] * + src[i * h * w + offset_within_channel]; + } + } + + return res; +} + +void lrn_compute_ref(std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = + scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = scope->FindVar(op_info->Output("Out").front()) + ->GetMutable(); + + const float* x_data = x->data(); + float* out_data = out->mutable_data(); + auto x_dims = x->dims(); + + auto alpha = op_info->GetAttr("alpha"); + auto beta = op_info->GetAttr("beta"); + auto k = op_info->GetAttr("k"); + auto norm_region = op_info->GetAttr("norm_region"); + auto local_size = op_info->GetAttr("n"); + + int N = x_dims[0]; + int C = x_dims[1]; + int H = x_dims[2]; + int W = x_dims[3]; + + int offset_num = 0; + int offset_within_channel = 0; + int dst_id; + + float square; + + for (int n = 0; n < N; ++n) { + offset_num = n * C * H * W; + + for (int c = 0; c < C; ++c) { + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + offset_within_channel = h * W + w; + dst_id = offset_num + c * H * W + offset_within_channel; + square = lrn_square(x_data, + c, + offset_within_channel, + offset_num, + C, + H, + W, + local_size); + out_data[dst_id] = x_data[dst_id] * pow(k + alpha * square, -beta); + } + } + } + } +} + +void test_lrn(float alpha, + float beta, + float k, + int local_size, + int n, + int c, + int h, + int w, + const std::string& norm_region) { + Scope scope; + std::string x_var_name("X_test"); + std::string out_var_name("Out_test"); + std::string out_ref_var_name("Out_ref"); + auto* x = scope.NewTensor(x_var_name); + auto* out = scope.NewTensor(out_var_name); + auto* out_ref = scope.NewTensor(out_ref_var_name); + + std::vector x_dim{n, c, 
h, w}; + x->Resize(x_dim); + out->Resize(x_dim); + out_ref->Resize(x_dim); + auto* x_data = x->mutable_data(); + FillTensor(x, 0.f, 1.f); + float *dmax, *dmin; + std::tie(dmin, dmax) = + std::minmax_element(x_data, x_data + x->data_size() - 1); + + cpp::OpDesc opdesc; + opdesc.SetType("lrn"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("alpha", alpha); + opdesc.SetAttr("beta", beta); + opdesc.SetAttr("k", k); + opdesc.SetAttr("n", local_size); + opdesc.SetAttr("norm_region", norm_region); + opdesc.SetAttr("input_scale", (*dmax - *dmin) / 255.f); + + auto op = CreateOp(opdesc, &scope); + + // baseline + lrn_compute_ref(op); + out_ref->CopyDataFrom(*out); + + Tensor input_x; + input_x.Resize(x->dims()); + transpose(x->mutable_data(), + input_x.mutable_data(), + {static_cast(x_dim[0]), + static_cast(x_dim[1]), + static_cast(x_dim[2]), + static_cast(x_dim[3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_x); + + LaunchOp(op, {x_var_name}, {out_var_name}); + + Tensor output_trans; + auto os = out->dims(); + output_trans.Resize(os); + transpose(out->mutable_data(), + output_trans.mutable_data(), + {static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}, + {0, 3, 1, 2}); + + auto output_data = output_trans.mutable_data(); + auto* output_ref_data = out_ref->mutable_data(); + for (size_t i = 0; i < out->data_size(); i++) { + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-4); + } +} + +TEST(MLUBridges, lrn) { + int local_size = 5; + float alpha = 0.0001f; + float beta = 0.75; + float k = 2.0f; + std::string norm_region = "AcrossChannels"; + for (int w : {2, 4, 8}) { + for (int h : {2, 4, 8}) { + for (int c : {1, 2, 3, 4}) { + for (int n : {1, 2, 3, 4}) { + test_lrn(alpha, beta, k, local_size, n, c, h, w, norm_region); + } + } + } + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(lrn, kMLU) diff --git a/lite/kernels/mlu/bridges/norm_op.cc b/lite/kernels/mlu/bridges/norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..492c3932a8c8a68f7eba687dde30d888d6e0f297 --- /dev/null +++ b/lite/kernels/mlu/bridges/norm_op.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int NormConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Get input vars and op attributes + auto x_var_name = op_info->Input("X").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto x_dims = x->dims().Vectorize(); + + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + int axis = op_info->GetAttr("axis"); + int epsilon = op_info->GetAttr("epsilon"); + if (axis < 0) { + axis = axis + x_dims.size(); + } + std::vector nchw2nhwc = {0, 3, 1, 2}; + int nhwc_axis = nchw2nhwc[axis]; + + CHECK(graph->HasNode(x_var_name)); + auto input_tensor = graph->GetNode(x_var_name); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + + // ======== DEBUG =============== + VLOG(6) << "x name=" << x_var_name; + VLOG(6) << "out name=" << out_var_name; + VLOG(6) << "x dims=" << x->dims(); + VLOG(6) << "out dims=" << output->dims(); + VLOG(6) << "axis =" << axis; + VLOG(6) << "nwhc axis=" << nhwc_axis; + VLOG(6) << "epsilon =" << epsilon; + // cnmlPrintTensor(input_tensor->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(output_tensor->mlu_tensor(), CNML_TENSOR); + // ======== DEBUG END ============ + cnmlBaseOp_t norm_op{nullptr}; + + cnmlNormalizeOpParam_t param; + int mode = -1; + switch (axis) { + case 0: + mode = 3; // N + break; + case 1: + mode = 0; // C + break; + case 2: + mode = 4; // H + break; + case 3: + mode = 5; // W + break; + default: + CHECK(0); + break; + } + cnmlCreateNormalizeOpParamV2(¶m, + 0, // p + 0, // use_scale + mode, + 1, // weight + epsilon); + + CNML_CALL(cnmlCreateNormalizeOp(&norm_op, + param, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + nullptr, + false /*is_fix8_mode*/)); + graph->FuseOp(norm_op); + CNML_CALL(cnmlDestroyBaseOp(&norm_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(norm, + kMLU, + paddle::lite::subgraph::mlu::NormConverter); diff --git a/lite/kernels/mlu/bridges/norm_op_test.cc b/lite/kernels/mlu/bridges/norm_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..35b5eabbb9ffacd96c3ca6500dd9181f4d5bec5b --- /dev/null +++ b/lite/kernels/mlu/bridges/norm_op_test.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/norm_op.h" + +#include + +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +// void ToFile(std::string file_name, Tensor* tensor) { +// int count = tensor->dims().production(); +// auto data = tensor->mutable_data(); +// std::ostringstream outs; +// for (size_t i = 0; i < count; i++) { +// outs << data[i] << std::endl; +// } +// std::ofstream of; +// of.open(file_name, std::ios::out); +// of << outs.str(); +// of.close(); +// } + +void norm_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + int axis = op_info->GetAttr("axis"); + int epsilon = op_info->GetAttr("epsilon"); + auto x_dims = x->dims(); + if (axis < 0) { + axis += x_dims.size(); + } + out->Resize(x_dims.Vectorize()); + auto* out_data = out->mutable_data(); + + const auto* x_data = x->data(); + int pre_n = x_dims.count(0, axis); + int n = x_dims[axis]; + int post_n = x_dims.count(axis + 1, x_dims.size()); + for (int i = 0; i < pre_n; i++) { + for (int k = 0; k < post_n; k++) { + float sum = epsilon; + const float* in_tmp = x_data + i * n * post_n + k; + for (int j = 0; j < n; j++) { + sum += in_tmp[j * post_n] * in_tmp[j * post_n]; + } + sum = std::sqrt(sum); + float* out_tmp = out_data + i * n * post_n + k; + for (int j = 0; j < n; j++) { + out_tmp[j * post_n] = in_tmp[j * post_n] / sum; + } + } + } +} + +void test_norm(const std::vector& input_shape, int axis) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize(input_shape); + // initialize input&output data + FillTensor(x, -9, 9); + // initialize op desc + cpp::OpDesc opdesc; + float epsilon = 1e-9f; + opdesc.SetType("norm"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axis", static_cast(axis)); + opdesc.SetAttr("epsilon", static_cast(epsilon)); + + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + norm_ref(op); + out_ref->CopyDataFrom(*out); + Tensor input_x; + input_x.Resize(DDim(input_shape)); + // change input layout from NCHW to NHWC + transpose(x->mutable_data(), + input_x.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_x); + + LaunchOp(op, {x_var_name}, {out_var_name}); + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + std::vector out_shape = input_shape; + Tensor output_trans; + output_trans.Resize(out_shape); + // Change output layout from NHWC to NCHW + transpose(out_data, + output_trans.mutable_data(), + {static_cast(out_shape[0]), + static_cast(out_shape[2]), + static_cast(out_shape[3]), + static_cast(out_shape[1])}, + {0, 3, 1, 2}); + out_data = output_trans.mutable_data(); + + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], 
out_ref_data[i], 1e-2); + } +} + +TEST(MLUBridges, norm) { + test_norm({1, 2, 3, 4}, 1); + test_norm({1, 2, 3, 4}, 2); + test_norm({1, 2, 3, 4}, 3); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(norm, kMLU); diff --git a/lite/kernels/mlu/bridges/paddle_use_bridges.h b/lite/kernels/mlu/bridges/paddle_use_bridges.h index d31ba0dd41111860a3b26d8ac3afb3273bef4557..be5c64b3b7056d0b8de1589d198db541b5a3777b 100644 --- a/lite/kernels/mlu/bridges/paddle_use_bridges.h +++ b/lite/kernels/mlu/bridges/paddle_use_bridges.h @@ -15,6 +15,7 @@ #pragma once USE_SUBGRAPH_BRIDGE(relu, kMLU); +USE_SUBGRAPH_BRIDGE(relu6, kMLU) USE_SUBGRAPH_BRIDGE(conv2d, kMLU); USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kMLU); USE_SUBGRAPH_BRIDGE(elementwise_add, kMLU); @@ -24,5 +25,26 @@ USE_SUBGRAPH_BRIDGE(batch_norm, kMLU); USE_SUBGRAPH_BRIDGE(fc, kMLU); USE_SUBGRAPH_BRIDGE(nearest_interp, kMLU); USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU); +USE_SUBGRAPH_BRIDGE(transpose, kMLU); +USE_SUBGRAPH_BRIDGE(transpose2, kMLU); USE_SUBGRAPH_BRIDGE(concat, kMLU); USE_SUBGRAPH_BRIDGE(scale, kMLU); +USE_SUBGRAPH_BRIDGE(sigmoid, kMLU); +USE_SUBGRAPH_BRIDGE(elementwise_mul, kMLU); +USE_SUBGRAPH_BRIDGE(dropout, kMLU); +USE_SUBGRAPH_BRIDGE(arg_max, kMLU); +USE_SUBGRAPH_BRIDGE(split, kMLU); +USE_SUBGRAPH_BRIDGE(cast, kMLU); +USE_SUBGRAPH_BRIDGE(layout, kMLU); +USE_SUBGRAPH_BRIDGE(slice, kMLU); +USE_SUBGRAPH_BRIDGE(squeeze, kMLU); +USE_SUBGRAPH_BRIDGE(squeeze2, kMLU); +USE_SUBGRAPH_BRIDGE(flatten, kMLU); +USE_SUBGRAPH_BRIDGE(flatten2, kMLU); +USE_SUBGRAPH_BRIDGE(reshape, kMLU); +USE_SUBGRAPH_BRIDGE(reshape2, kMLU); +#ifdef LITE_BUILD_EXTRA +USE_SUBGRAPH_BRIDGE(gather, kMLU); +USE_SUBGRAPH_BRIDGE(lrn, kMLU) +USE_SUBGRAPH_BRIDGE(norm, kMLU) +#endif diff --git a/lite/kernels/mlu/bridges/pool_op.cc b/lite/kernels/mlu/bridges/pool_op.cc index f77c8084c76fc52c39938e723f02bde9b3cac41b..c734de1eec75d253a9b6b8d7a7f21d710df3d949 100644 --- a/lite/kernels/mlu/bridges/pool_op.cc +++ b/lite/kernels/mlu/bridges/pool_op.cc @@ -55,6 +55,9 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto global_pooling = op_info->GetAttr("global_pooling"); auto ksize = op_info->GetAttr>("ksize"); auto strides = op_info->GetAttr>("strides"); + CHECK(!(op_info->HasAttr("exclusive") && + op_info->GetAttr("exclusive") == false)) + << "Unsupport param exclusive is false!"; if (paddings.size() == 2L) { for (size_t i = 0; i < 2L; ++i) { @@ -62,8 +65,6 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); } } - int pad_height = paddings[0]; - int pad_width = paddings[2]; std::string padding_algorithm(""); if (op_info->HasAttr("padding_algorithm")) { padding_algorithm = op_info->GetAttr("padding_algorithm"); @@ -72,6 +73,8 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { if (op_info->HasAttr("adaptive")) { adaptive = op_info->GetAttr("adaptive"); } + auto input_dims = x->dims(); + lite::operators::UpdatePadding(&paddings, global_pooling, adaptive, @@ -80,31 +83,31 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { strides, ksize); - // std::vector output_shape({input_dims[0], input_dims[1]}); - // for (size_t i = 0; i < 2; i++) { - // output_shape.push_back( - // (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - - // ksize[0]) / - // strides[i] + - // 1); - // } + if (global_pooling) { + ksize.resize(static_cast(input_dims.size()) - 2); + for (size_t i = 0; i < ksize.size(); ++i) { + ksize[i] = 
static_cast(input_dims[i + 2]); + } + } auto output_tensor = graph->AddNode( output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType()); cnmlPoolOpParam_t pool_param; CNML_CALL( - cnmlCreatePoolOpParam_V2(&pool_param, + cnmlCreatePoolOpParam_V3(&pool_param, ksize[0], ksize[1], strides[0], strides[1], - pad_height, - pad_width, - 1, // dilation - 1, + paddings[0], + paddings[1], + paddings[2], + paddings[3], + 1, // dilation h + 1, // dilation w ToCnmlPoolMode(pooling_type), - ceil_mode ? CNML_POOL_KVALID : CNML_POOL_KFULL, + ceil_mode ? CNML_POOL_KFULL : CNML_POOL_KVALID, true, /* real */ 1 /* blend factor */)); cnmlBaseOp_t pool_op; @@ -114,6 +117,7 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { output_tensor->mlu_tensor())); CNML_CALL(cnmlDestroyPoolOpParam(&pool_param)); graph->FuseOp(pool_op); + CNML_CALL(cnmlDestroyBaseOp(&pool_op)); return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/pool_op_test.cc b/lite/kernels/mlu/bridges/pool_op_test.cc index 8cee8dbe86109b14cff49f329d71074a9b3bfb61..2ae888744fde3e94e857f04d50ceb1eb878f3c1c 100644 --- a/lite/kernels/mlu/bridges/pool_op_test.cc +++ b/lite/kernels/mlu/bridges/pool_op_test.cc @@ -43,6 +43,12 @@ void pool_ref(const std::shared_ptr op) { std::string pooling_type = op_info->GetAttr("pooling_type"); bool global_pooling = op_info->GetAttr("global_pooling"); + if (pooling_type == "max") { + for (int i = 0; i < out_dims.production(); ++i) { + dst_ptr[i] = -65504.f; + } + } + int in_n = in_dims[0]; int in_c = in_dims[1]; int in_h = in_dims[2]; @@ -203,62 +209,46 @@ void test_pool(int bs, } TEST(MLUBridges, pool) { - // for (auto pooling_type : {"max", "avg"}) { - // for (auto ceil_mode : {true, false}) { - // for (auto global_pooling : {/*true, */ false}) { - // for (auto exclusive : {true /*, false*/}) { - // for (auto ksize : {2, 3}) { - // for (auto stride : {1, 2}) { - // for (auto padding : {0, 1}) { - // for (auto bs : {1, 3}) { - // for (auto ic : {1, 3}) { - // for (auto ih : {3, 7}) { - // for (auto iw : {3, 7}) { - // test_pool(bs, - // ic, - // ih, - // iw, - // pooling_type, - // ceil_mode, - // global_pooling, - // exclusive, - // ksize, - // stride, - // padding); - // } - // } - // } - // } - // } - // } - // } - // } - // } - // } - // } - for (auto pooling_type : {"max", "avg"}) { for (auto ceil_mode : {true, false}) { - bool global_pooling = false; - bool exclusive = true; - int ksize = 2; - int stride = 1; - int padding = 0; - int bs = 6; - int ic = 6; - int ih = 6; - int iw = 6; - test_pool(bs, - ic, - ih, - iw, - pooling_type, - ceil_mode, - global_pooling, - exclusive, - ksize, - stride, - padding); + for (auto global_pooling : {true, false}) { + for (auto exclusive : {true /*, false*/}) { + for (auto ksize : {2, 3}) { + for (auto stride : {1, 2}) { + for (auto padding : {0, 1}) { + for (auto bs : {1, 3}) { + for (auto ic : {1, 3}) { + for (auto ih : {3, 7}) { + for (auto iw : {3, 7}) { + LOG(INFO) + << "shape: " << bs << ',' << ic << ',' << ih << ',' + << iw << '\t' << "pooling type: " << pooling_type + << '\t' << "ceil model: " << ceil_mode << '\t' + << "global_pooling: " << global_pooling << '\t' + << "exclusive: " << exclusive << '\t' + << "ksize: " << ksize << '\t' + << "stride: " << stride << '\t' + << "padding: " << padding; + test_pool(bs, + ic, + ih, + iw, + pooling_type, + ceil_mode, + global_pooling, + exclusive, + ksize, + stride, + padding); + } + } + } + } + } + } + } + } + } } } } diff --git a/lite/kernels/mlu/bridges/reshape_op.cc 
b/lite/kernels/mlu/bridges/reshape_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b47322b3462525be64e42b608d052719d7c5f0b --- /dev/null +++ b/lite/kernels/mlu/bridges/reshape_op.cc @@ -0,0 +1,130 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + + // ================== Trans1: NHWC => NCHW =========================== + auto input_tensor = graph->GetNode(x_var_name); + auto trans_1_axis = std::move(GetAxisNHWC2NCHW(x->dims().size())); + auto trans1_out = graph->AddNode(x_var_name + ".trans.i", + x->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + graph->FPType(), + CNML_NCHW); + cnmlBaseOp_t trans1_op{nullptr}; + cnmlNdTransposeOpParam_t trans1_param{nullptr}; + CNML_CALL(cnmlCreateNdTransposeOpParam( + &trans1_param, trans_1_axis.data(), trans_1_axis.size())); + CNML_CALL(cnmlCreateNdTransposeProOp(&trans1_op, + input_tensor->mlu_tensor(), + trans1_out->mlu_tensor(), + trans1_param)); + // ======================== Trans1 End ================================== + + // ======================= Reshape op =================================== + cnmlBaseOp_t reshape_op; + auto trans2_input = graph->AddNode(out_var_name + ".trans.o", + output_dims, + CNML_TENSOR, + CNML_NCHW, + graph->FPType(), + CNML_NCHW); + cnmlReshapeOpParam_t reshape_param{nullptr}; + int cnml_trans2_input_shape[4]; + CNML_CALL( + cnmlGetTensorShape(trans2_input->mlu_tensor(), cnml_trans2_input_shape)); + CNML_CALL( + cnmlCreateNdReshapeOpParam(&reshape_param, cnml_trans2_input_shape, 4)); + + // Use cnmlCreatexxxOpForward to create op. 
+ CNML_CALL(cnmlCreateReshapeOp(&reshape_op, + reshape_param, + trans1_out->mlu_tensor(), + trans2_input->mlu_tensor())); + // ======================= Reshape op End =================================== + + // ================== Trans2: NCHW => NHWC =============================== + auto trans_2_axis = std::move(GetAxisNCHW2NHWC(output->dims().size())); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + cnmlBaseOp_t trans2_op{nullptr}; + cnmlNdTransposeOpParam_t trans2_param{nullptr}; + CNML_CALL(cnmlCreateNdTransposeOpParam( + &trans2_param, trans_2_axis.data(), trans_2_axis.size())); + CNML_CALL(cnmlCreateNdTransposeProOp(&trans2_op, + trans2_input->mlu_tensor(), + output_tensor->mlu_tensor(), + trans2_param)); + // ======================== Trans2 End ================================== + + // =============== DEBUG ==================== + VLOG(6) << "x_var_name: " << x_var_name; + VLOG(6) << "out_var_name: " << out_var_name; + VLOG(6) << "input dim: " << x->dims(); + VLOG(6) << "output dim: " << output->dims(); + int cnml_input_shape[4]; + CNML_CALL(cnmlGetTensorShape(input_tensor->mlu_tensor(), cnml_input_shape)); + VLOG(6) << "cnml input dim: "; + for (size_t i = 0; i < 4; i++) { + VLOG(6) << cnml_input_shape[i]; + } + // cnmlPrintTensor(input_tensor->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(trans1_out->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(trans2_input->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(output_tensor->mlu_tensor(), CNML_TENSOR); + // =============== DEBUG END ================= + + graph->FuseOp(trans1_op); + graph->FuseOp(reshape_op); + graph->FuseOp(trans2_op); + CNML_CALL(cnmlDestroyBaseOp(&trans1_op)); + CNML_CALL(cnmlDestroyBaseOp(&reshape_op)); + CNML_CALL(cnmlDestroyBaseOp(&trans2_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(reshape, + kMLU, + paddle::lite::subgraph::mlu::ReshapeConverter); +REGISTER_SUBGRAPH_BRIDGE(reshape2, + kMLU, + paddle::lite::subgraph::mlu::ReshapeConverter); diff --git a/lite/kernels/mlu/bridges/reshape_op_test.cc b/lite/kernels/mlu/bridges/reshape_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0cd2c6cc26f8f40ee83c99755d8842b072693b1a --- /dev/null +++ b/lite/kernels/mlu/bridges/reshape_op_test.cc @@ -0,0 +1,98 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/reshape_op.h" + +#include + +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void test_reshape(std::vector input_shape, + std::vector out_shape) { + // prepare input&output variables + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(input_shape); + Tensor x_cpu; + + // initialize input&output data + FillTensor(x); + x_cpu.CopyDataFrom(*x); + + Tensor input_trans; + input_trans.Resize(input_shape); + transpose(x->mutable_data(), + input_trans.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_trans); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("reshape2"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + std::vector shape_attr; + shape_attr.resize(out_shape.size()); + for (size_t i = 0; i < out_shape.size(); i++) { + shape_attr[i] = static_cast(out_shape[i]); + } + + opdesc.SetAttr>("shape", shape_attr); + auto op = CreateOp(opdesc, &scope); + + auto os = out->dims(); + out->Resize(out_shape); + LaunchOp(op, {x_var_name}, {out_var_name}); + + Tensor out_trans; + out_trans.Resize(out_shape); + transpose(out->mutable_data(), + out_trans.mutable_data(), + {static_cast(out_shape[0]), + static_cast(out_shape[1]), + static_cast(out_shape[2]), + static_cast(out_shape[3])}, + {0, 3, 1, 2}); + out->CopyDataFrom(out_trans); + // compare results + auto* out_data = out->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], x_cpu.mutable_data()[i], 1e-5); + } +} + +TEST(MLUBridges, reshape) { test_reshape({1, 2, 4, 4}, {1, 4, 2, 4}); } +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(reshape, kMLU); +USE_SUBGRAPH_BRIDGE(reshape2, kMLU); diff --git a/lite/kernels/mlu/bridges/scale_op.cc b/lite/kernels/mlu/bridges/scale_op.cc index 5557602bd7576ccd71c51f52a538a45fe27f7ada..5b6b3dff7969562b19344f9eccbf219d26c3e02d 100644 --- a/lite/kernels/mlu/bridges/scale_op.cc +++ b/lite/kernels/mlu/bridges/scale_op.cc @@ -61,6 +61,7 @@ int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { alpha_tensor->mlu_tensor(), beta_tensor->mlu_tensor())); graph->FuseOp(scale_op); + CNML_CALL(cnmlDestroyBaseOp(&scale_op)); return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/slice_op.cc b/lite/kernels/mlu/bridges/slice_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..067d110bf4160c5bcf2bbd3009d82bbb5804c998 --- /dev/null +++ b/lite/kernels/mlu/bridges/slice_op.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int SliceConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto scope = op->scope(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // input + auto input_var_name = op_info->Input("Input").front(); + auto input = scope->FindVar(input_var_name)->GetMutable(); + auto input_shape = input->dims().Vectorize(); + // output + auto output_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(output_var_name)->GetMutable(); + // attr + auto axes = op_info->GetAttr>("axes"); + auto starts = op_info->GetAttr>("starts"); + auto ends = op_info->GetAttr>("ends"); + + CHECK(graph->HasNode(input_var_name)); + auto input_tensor = graph->GetNode(input_var_name); + auto output_tensor = graph->AddNode(output_var_name, + output->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + graph->FPType()); + + std::vector begin_index(input_shape.size(), 0); + std::vector end_index(input_shape.size()); + std::vector strides(input_shape.size(), 1); + auto nhwc2nchw_axis = std::move(GetAxisNHWC2NCHW(input_shape.size())); + for (size_t i = 0; i < input_shape.size(); ++i) { + end_index[nhwc2nchw_axis[i]] = input_shape[i]; + } + for (size_t i = 0; i < axes.size(); i++) { + int dim_value = input_shape[axes[i]]; + int end = ends[i] < 0 ? std::max(ends[i] + dim_value, 0) : ends[i]; + begin_index[nhwc2nchw_axis[axes[i]]] = + starts[i] < 0 ? std::max(starts[i] + dim_value, 0) : starts[i]; + end_index[nhwc2nchw_axis[axes[i]]] = std::min(end, dim_value); + } + + cnmlNdStridedSliceOpParam_t param; + cnmlBaseOp_t slice_op; + CNML_CALL(cnmlCreateNdStridedSliceOpParam(¶m, + input_shape.size(), + begin_index.data(), + end_index.data(), + strides.data())); + CNML_CALL(cnmlCreateNdStridedSliceOp(&slice_op, + param, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + CNML_CALL(cnmlDestroyNdStridedSliceOpParam(¶m)); + + graph->FuseOp(slice_op); + CNML_CALL(cnmlDestroyBaseOp(&slice_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(slice, + kMLU, + paddle::lite::subgraph::mlu::SliceConverter); diff --git a/lite/kernels/mlu/bridges/slice_op_test.cc b/lite/kernels/mlu/bridges/slice_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a5e2a9f5a4c99b6f46fff24686cdbe546cae727d --- /dev/null +++ b/lite/kernels/mlu/bridges/slice_op_test.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/slice_op.h" +#include +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +static void slice_ref(const float* input, + std::vector in_dims, + std::vector axes, + std::vector starts, + std::vector ends, + float* out) { + auto out_dims = in_dims; + std::vector real_starts(in_dims.size(), 0); + std::vector real_ends(in_dims.size(), 0); + std::vector real_step(in_dims.size(), 0); + for (size_t i = 0; i < in_dims.size(); i++) { + real_ends[i] = in_dims[i]; + } + for (size_t i = 0; i < axes.size(); i++) { + int dim_value = in_dims[axes[i]]; + if (dim_value > 0) { + int start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; + int end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i]; + start = std::max(start, 0); + end = std::max(end, 0); + end = std::min(end, dim_value); + out_dims[axes[i]] = end - start; + real_starts[axes[i]] = start; + real_ends[axes[i]] = end; + } + } + const int LEN = in_dims.size(); + int dst_step[LEN]; + for (size_t i = 0; i < in_dims.size(); ++i) { + dst_step[i] = 1; + } + int src_step[LEN]; + for (size_t i = 0; i < in_dims.size(); ++i) { + src_step[i] = 1; + } + int out_num = out_dims[in_dims.size() - 1]; + for (int i = in_dims.size() - 2; i >= 0; i--) { + dst_step[i] = out_dims[i + 1] * dst_step[i + 1]; + src_step[i] = in_dims[i + 1] * src_step[i + 1]; + out_num *= out_dims[i]; + } + + for (int dst_id = 0; dst_id < out_num; dst_id++) { + int src_id = 0; + int index_id = dst_id; + for (size_t j = 0; j < out_dims.size(); j++) { + int cur_id = index_id / dst_step[j]; + index_id = index_id % dst_step[j]; + src_id += (cur_id + real_starts[j]) * src_step[j]; + } + out[dst_id] = input[src_id]; + } +} + +static void test_case(std::vector x_shape, + std::vector out_shape, + std::vector starts, + std::vector ends, + std::vector axes) { + Scope scope; + + std::string x_var_name = "x"; + std::string out_var_name = "out"; + auto* x = scope.NewTensor(x_var_name); + auto* out = scope.NewTensor(out_var_name); + x->Resize(lite::DDim(x_shape)); + out->Resize(lite::DDim(out_shape)); + + auto x_data = x->mutable_data(); + FillTensor(x, 0.f, 2.f); + + cpp::OpDesc opdesc; + opdesc.SetType("slice"); + opdesc.SetInput("Input", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axes", axes); + opdesc.SetAttr("starts", starts); + opdesc.SetAttr("ends", ends); + + std::vector out_ref(out->data_size(), 0); + slice_ref(x_data, x_shape, axes, starts, ends, out_ref.data()); + + auto type_cast = [](int64_t in) { return static_cast(in); }; + std::vector i_dims; + std::transform( + x_shape.cbegin(), x_shape.cend(), std::back_inserter(i_dims), type_cast); + + auto nchw2nhwc_axis = std::move(GetAxisNCHW2NHWC(x_shape.size())); + + Tensor input_x; + input_x.Resize(x->dims()); + transpose(x->mutable_data(), + input_x.mutable_data(), + i_dims, + nchw2nhwc_axis); + x->CopyDataFrom(input_x); + + auto op = CreateOp(opdesc, &scope); + LaunchOp(op, {x_var_name}, {out_var_name}); + + Tensor output_trans; + auto os = out->dims().Vectorize(); + output_trans.Resize(os); + std::vector o_dims(os.size()); + for (size_t i = 0; i < os.size(); ++i) { + o_dims[i] = os[nchw2nhwc_axis[i]]; + } + transpose(out->mutable_data(), + output_trans.mutable_data(), + o_dims, + GetAxisNHWC2NCHW(x_shape.size())); + + auto out_data = output_trans.mutable_data(); + for 
(DDim::value_type i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_ref[i], out_data[i], 1e-4); + } +} + +TEST(MLUBridges, slice) { + /* test_case({3}, {3}, {-3}, {3}, {0}); */ + test_case({3, 4}, {3, 4}, {-3, 0}, {3, 100}, {0, 1}); + test_case({3, 4, 5}, {3, 4, 2}, {-3, 0, 2}, {3, 100, -1}, {0, 1, 2}); + test_case({3, 4, 5, 6}, {3, 4, 2, 6}, {-3, 0, 2}, {3, 100, -1}, {0, 1, 2}); + /* test_case({3, 4, 5, 6, 3}, {3, 4, 2, 6, 3}, {-3, 0, 2}, {3, 100, -1}, {0, + * 1, 2}); */ + /* test_case({3, 4, 5, 6, 5, 2}, {3, 4, 2, 6, 5, 2}, {-3, 0, 2}, {3, 100, 1}, + * {0, 1, 2}); */ +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(slice, kMLU); diff --git a/lite/kernels/mlu/bridges/softmax_op.cc b/lite/kernels/mlu/bridges/softmax_op.cc index 17c911675718a15c7ede4888b268ffcd62b4d8ed..b1b621c1efc6cbc54092a8082e4d624355e07652 100644 --- a/lite/kernels/mlu/bridges/softmax_op.cc +++ b/lite/kernels/mlu/bridges/softmax_op.cc @@ -35,9 +35,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto out_var_name = op_info->Output("Out").front(); auto output = scope->FindVar(out_var_name)->GetMutable(); auto output_dims = output->dims().Vectorize(); + auto x_shape = + scope->FindVar(x_var_name)->GetMutable()->dims().Vectorize(); - // nchw axis to nhwc aixs - int nchw_to_nhwc_aixs_map[4] = {0, 3, 1, 2}; + // nchw axis to nhwc axis int axis = 1; if (op_info->HasAttr("axis")) { axis = op_info->GetAttr("axis"); @@ -45,7 +46,9 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { axis = output_dims.size() + axis; } } - int nhwc_axis = nchw_to_nhwc_aixs_map[axis]; + // value of nhwc2nchw_axis is index of nhwc + // order of nhwc2nchw_axis is nchw + int nhwc_axis = GetAxisNHWC2NCHW(x_shape.size())[axis]; auto output_tensor = graph->AddNode( out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); @@ -55,6 +58,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { graph->GetNode(x_var_name)->mlu_tensor(), output_tensor->mlu_tensor())); graph->FuseOp(softmax_op); + CNML_CALL(cnmlDestroyBaseOp(&softmax_op)); return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/softmax_op_test.cc b/lite/kernels/mlu/bridges/softmax_op_test.cc index a5251ed43c9187fc2874f9b01853b45b8abf7f1c..d5d7251205a0f60b9e5c8568a58ba48661c9df3e 100644 --- a/lite/kernels/mlu/bridges/softmax_op_test.cc +++ b/lite/kernels/mlu/bridges/softmax_op_test.cc @@ -93,7 +93,7 @@ void test_softmax(const std::vector& input_shape, int axis) { opdesc.SetOutput("Out", {out_var_name}); opdesc.SetAttr("axis", axis); - // create and convert op to NPU model, then run it on NPU + // create and convert op to MLU model, then run it on MLU auto op = CreateOp(opdesc, &scope); // execute reference implementation and save to output tensor softmax_ref(op); diff --git a/lite/kernels/mlu/bridges/split_op.cc b/lite/kernels/mlu/bridges/split_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4188ba3ec08161552bc688c212408fa81ae815a3 --- /dev/null +++ b/lite/kernels/mlu/bridges/split_op.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/mlu/bridges/graph.h"
+#include "lite/kernels/mlu/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+int SplitConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto scope = op->scope();
+  VLOG(3) << "[MLU] Converting " + op_type + "...";
+
+  auto x_var_name = op_info->Input("X").front();
+  auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
+  auto x_dims = x->dims().Vectorize();
+
+  auto out_var_name = op_info->Output("Out");
+
+  auto param_axis = op_info->GetAttr<int>("axis");
+
+  auto num = op_info->GetAttr<int>("num");
+  auto sections = op_info->GetAttr<std::vector<int>>("sections");
+  int64_t sections_num = static_cast<int64_t>(sections.size());
+  auto output_num = num > 0 ? num : sections_num;
+
+  std::vector<cnmlTensor_t> output_tensor;
+  for (auto out_name : out_var_name) {
+    auto out = scope->FindVar(out_name)->GetMutable<Tensor>();
+    auto out_dims = out->dims().Vectorize();
+    auto out_tensor = graph->AddNode(
+        out_name, out_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
+    output_tensor.push_back(out_tensor->mlu_tensor());
+  }
+
+  auto dims = x_dims.size();
+  int axis = (param_axis < 0) ? (param_axis + dims) : param_axis;
+  CHECK_LE(axis, 4) << "Unsupported dims in MLU split";
+  int nhwc_axis = GetAxisNHWC2NCHW<int>(dims)[axis];
+
+  CHECK(graph->HasNode(x_var_name));
+  auto input_tensor = graph->GetNode(x_var_name);
+
+  cnmlBaseOp_t split_op;
+  cnmlTensor_t inputs = input_tensor->mlu_tensor();
+  CNML_CALL(cnmlCreateNdSplitOp(
+      &split_op, nhwc_axis, &inputs, 1, output_tensor.data(), output_num));
+  graph->FuseOp(split_op);
+  CNML_CALL(cnmlDestroyBaseOp(&split_op));
+  return SUCCESS;
+}
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(split,
+                         kMLU,
+                         paddle::lite::subgraph::mlu::SplitConverter);
diff --git a/lite/kernels/mlu/bridges/split_op_test.cc b/lite/kernels/mlu/bridges/split_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a44a45504036e9ef6199e9d2b534aa3dde63bb01
--- /dev/null
+++ b/lite/kernels/mlu/bridges/split_op_test.cc
@@ -0,0 +1,199 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "lite/operators/split_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +template +void split_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + int num = op_info->GetAttr("num"); + int axis = op_info->GetAttr("axis"); + std::vector sections = op_info->GetAttr>("sections"); + std::vector output_vec; + auto output = op_info->Output("Out"); + for (auto out_var : output) { + output_vec.push_back(scope->Var(out_var)->GetMutable()); + } + auto in_dims = x->dims(); + auto rank = in_dims.size(); + int outs_number = output_vec.size(); + std::vector outs_dims; + outs_dims.reserve(outs_number); + if (axis < 0) { + axis += rank; + } + if (num > 0) { + int out_axis_dim = in_dims[axis] / num; + for (int i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = out_axis_dim; + outs_dims.push_back(dim); + } + } else if (sections.size() > 0) { + for (size_t i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = sections[i]; + outs_dims.push_back(dim); + } + } + for (int j = 0; j < outs_dims.size(); ++j) { + output_vec[j]->Resize(outs_dims[j]); + } + + const dtype* din = x->mutable_data(); + std::vector in_strides(in_dims.size()); + in_strides[in_dims.size() - 1] = in_dims[in_dims.size() - 1]; + for (int i = in_dims.size() - 2; i >= 0; --i) { + in_strides[i] = in_strides[i + 1] * in_dims[i]; + } + + int input_offset = 0; + for (auto out : output_vec) { + auto out_dim = out->dims(); + std::vector out_strides(out_dim.size()); + out_strides[out_dim.size() - 1] = out_dim[out_dim.size() - 1]; + for (int i = out_dim.size() - 2; i >= 0; --i) { + out_strides[i] = out_strides[i + 1] * out_dim[i]; + } + + dtype* out_data = out->mutable_data(); + int before = out_strides[0] / out_strides[axis]; + int in_after = in_strides[axis]; + int out_after = out_strides[axis]; + + for (int i = 0; i < before; ++i) { + std::memcpy(out_data + i * out_after, + din + input_offset + i * in_after, + sizeof(dtype) * out_after); + } + input_offset += out_strides[axis]; + } +} + +void test_split(int bs, + int ic, + int ih, + int iw, + int axis, + int num, + std::vector sections) { + // prepare input&output variables + std::string x_var_name = "x"; + std::string out_var_name_1 = "out_1"; + std::string out_var_name_2 = "out_2"; + std::string out_ref_var_name_1 = "out_ref_1"; + std::string out_ref_var_name_2 = "out_ref_2"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out_1 = scope.Var(out_var_name_1)->GetMutable(); + auto* out_2 = scope.Var(out_var_name_2)->GetMutable(); + auto* out_ref_1 = scope.Var(out_ref_var_name_1)->GetMutable(); + auto* out_ref_2 = scope.Var(out_ref_var_name_2)->GetMutable(); + x->Resize({bs, ic, ih, iw}); + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("split"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name_1, out_var_name_2}); + opdesc.SetAttr("axis", axis); + opdesc.SetAttr("sections", sections); + opdesc.SetAttr("num", num); + + auto op = CreateOp(opdesc, &scope); + split_ref(op); + out_ref_1->CopyDataFrom(*out_1); + out_ref_2->CopyDataFrom(*out_2); + // execute reference 
implementation and save to output tensor + + Tensor input; + input.Resize({bs, ic, ih, iw}); + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(bs), + static_cast(ic), + static_cast(ih), + static_cast(iw)}, + {0, 2, 3, 1}); + x->CopyDataFrom(input); + LaunchOp(op, {x_var_name}, {out_var_name_1, out_var_name_2}); + + // compare results + auto* out_data_1 = out_1->mutable_data(); + auto* out_data_2 = out_2->mutable_data(); + auto* out_ref_data_1 = out_ref_1->mutable_data(); + auto* out_ref_data_2 = out_ref_2->mutable_data(); + + Tensor output1, output2; + output1.Resize(out_1->dims()); + output2.Resize(out_2->dims()); + transpose(out_data_1, + output1.mutable_data(), + {static_cast(out_1->dims()[0]), + static_cast(out_1->dims()[2]), + static_cast(out_1->dims()[3]), + static_cast(out_1->dims()[1])}, + {0, 3, 1, 2}); + transpose(out_data_2, + output2.mutable_data(), + {static_cast(out_2->dims()[0]), + static_cast(out_2->dims()[2]), + static_cast(out_2->dims()[3]), + static_cast(out_2->dims()[1])}, + {0, 3, 1, 2}); + out_data_1 = output1.mutable_data(); + out_data_2 = output2.mutable_data(); + for (int i = 0; i < out_1->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data_1[i], out_ref_data_1[i], 5e-4); + } + for (int i = 0; i < out_2->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data_2[i], out_ref_data_2[i], 5e-4); + } +} + +TEST(MLUBridges, split) { + test_split(4, 2, 3, 1, 0, 2, {}); + test_split(4, 2, 3, 1, 0, 0, {3, 1}); + test_split(4, 6, 3, 1, 1, 2, {}); + test_split(4, 6, 3, 1, 1, 0, {2, 4}); + test_split(4, 2, 2, 1, 2, 2, {}); + test_split(4, 2, 6, 1, 2, 0, {3, 3}); + test_split(4, 2, 3, 4, 3, 2, {}); + test_split(4, 2, 3, 6, 3, 0, {5, 1}); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(split, kMLU); diff --git a/lite/kernels/mlu/bridges/squeeze_op.cc b/lite/kernels/mlu/bridges/squeeze_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f8af5b014bdba29bb50036473f671ec359f26d4 --- /dev/null +++ b/lite/kernels/mlu/bridges/squeeze_op.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int SqueezeConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Create act node and set params from op + auto fp_type = graph->FPType(); + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, fp_type); + CHECK(graph->HasNode(x_var_name)); + auto input_tensor = graph->GetNode(x_var_name); + + auto output_dims_nhwc = DimNCHW2NHWC(output_dims); + std::vector o_dims(output_dims.size()); + std::transform(output_dims_nhwc.cbegin(), + output_dims_nhwc.cend(), + o_dims.begin(), + [](DDim::value_type d) { return static_cast(d); }); + + cnmlReshapeOpParam_t param; + cnmlBaseOp_t squeeze_op; + CNML_CALL(cnmlCreateNdReshapeOpParam(¶m, o_dims.data(), o_dims.size())); + CNML_CALL(cnmlCreateReshapeOp(&squeeze_op, + param, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + CNML_CALL(cnmlDestroyReshapeOpParam(¶m)); + graph->FuseOp(squeeze_op); + CNML_CALL(cnmlDestroyBaseOp(&squeeze_op)); + + if (op_type == "squeeze2") { + auto xshape_var_name = op_info->Output("XShape").front(); + auto xshape = scope->FindVar(xshape_var_name)->GetMutable(); + auto dims_64 = xshape->dims().Vectorize(); + auto dims_64_nhwc = DimNCHW2NHWC(dims_64); + auto xshape_tensor = graph->AddNode( + xshape_var_name, dims_64, CNML_TENSOR, CNML_NCHW, fp_type); + + std::vector xshape_dims(dims_64.size()); + std::transform(dims_64_nhwc.cbegin(), + dims_64_nhwc.cend(), + xshape_dims.begin(), + [](DDim::value_type d) { return static_cast(d); }); + + cnmlBaseOp_t squeeze2_op; + CNML_CALL(cnmlCreateNdReshapeOpParam( + ¶m, xshape_dims.data(), xshape_dims.size())); + CNML_CALL(cnmlCreateReshapeOp(&squeeze2_op, + param, + input_tensor->mlu_tensor(), + xshape_tensor->mlu_tensor())); + CNML_CALL(cnmlDestroyReshapeOpParam(¶m)); + graph->FuseOp(squeeze2_op); + CNML_CALL(cnmlDestroyBaseOp(&squeeze2_op)); + } + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(squeeze, + kMLU, + paddle::lite::subgraph::mlu::SqueezeConverter); +REGISTER_SUBGRAPH_BRIDGE(squeeze2, + kMLU, + paddle::lite::subgraph::mlu::SqueezeConverter); diff --git a/lite/kernels/mlu/bridges/squeeze_op_test.cc b/lite/kernels/mlu/bridges/squeeze_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..ad16dac2e978fa977acacf62ed6adca16365ed6d --- /dev/null +++ b/lite/kernels/mlu/bridges/squeeze_op_test.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/squeeze_op.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +// squeeze +TEST(MLUBridges, squeeze) { + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + std::string ref_var_name("ref"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(ref_var_name)->GetMutable(); + std::vector x_shape({1, 3, 1, 5}); + x->Resize(x_shape); + out_ref->Resize(x_shape); + std::vector out_shape({3, 5}); + out->Resize(out_shape); + + FillTensor(x, 0, 10); + out_ref->CopyDataFrom(*x); + + // SqueezeCompute squeeze; + cpp::OpDesc opdesc; + opdesc.SetType("squeeze"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + std::vector axes{0, -2}; + opdesc.SetAttr("axes", axes); + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + LaunchOp(op, {x_var_name}, {out_var_name}); + + auto x_data = out_ref->data(); + auto out_data = out->data(); + for (int j = 0; j < out->numel(); ++j) { + EXPECT_NEAR(out_data[j], x_data[j], 1e-5); + } +} + +// squeeze2 +TEST(MLUBridges, squeeze2) { + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + std::string xshape_var_name("xshape"); + std::string ref_var_name("ref"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* xshape = scope.Var(xshape_var_name)->GetMutable(); + auto* out_ref = scope.Var(ref_var_name)->GetMutable(); + std::vector x_shape({1, 3, 1, 5}); + x->Resize(x_shape); + out_ref->Resize(x_shape); + std::vector out_shape({3, 5}); + out->Resize(out_shape); + std::vector xshape_shape({1, 3, 1, 5}); + xshape->Resize(xshape_shape); + + FillTensor(x, 0, 10); + out_ref->CopyDataFrom(*x); + + // Squeeze2Compute squeeze2; + cpp::OpDesc opdesc; + opdesc.SetType("squeeze2"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetOutput("XShape", {xshape_var_name}); + + std::vector axes({0, -2}); + opdesc.SetAttr("axes", axes); + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + LaunchOp(op, {x_var_name}, {out_var_name, xshape_var_name}); + + auto x_data = out_ref->mutable_data(); + auto out_data = out->mutable_data(); + auto xshape_data = xshape->mutable_data(); + for (int j = 0; j < out->numel(); ++j) { + EXPECT_NEAR(out_data[j], x_data[j], 1e-5); + EXPECT_NEAR(xshape_data[j], x_data[j], 1e-5); + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(squeeze, kMLU); +USE_SUBGRAPH_BRIDGE(squeeze2, kMLU); diff --git a/lite/kernels/mlu/bridges/tensor.cc b/lite/kernels/mlu/bridges/tensor.cc index be7e1f09beaee61dace598b958ab4f95f14b38f8..f1bf48d66e8693e72a96f0f52c285a717f464128 100644 --- 
a/lite/kernels/mlu/bridges/tensor.cc +++ b/lite/kernels/mlu/bridges/tensor.cc @@ -16,6 +16,9 @@ #include #include #include +#include +#include +#include #include namespace paddle { @@ -25,8 +28,9 @@ namespace mlu { MLUTensor::MLUTensor(const std::vector& shape, cnmlTensorType_t tensor_type, - cnmlDataOrder_t data_order, - cnmlDataType_t mlu_dtype) + cnmlDataOrder_t shape_order, + cnmlDataType_t mlu_dtype, + cnmlDataOrder_t data_order) : mlu_tensor_(nullptr), tensor_type_(tensor_type), mlu_ptr_(nullptr) { std::vector int_shape; for (auto i : shape) { @@ -36,15 +40,18 @@ MLUTensor::MLUTensor(const std::vector& shape, LOG(FATAL) << "Shape size is beyond the limitation of MLUTensor!"; } } - remember(int_shape, tensor_type, mlu_dtype, data_order); + remember(int_shape, tensor_type, mlu_dtype, shape_order, data_order); } void MLUTensor::remember(const std::vector& shape, cnmlTensorType_t tensor_type, cnmlDataType_t mlu_dtype, - cnmlDataOrder_t shape_order) { + cnmlDataOrder_t shape_order, + cnmlDataOrder_t data_order) { tensor_type_ = tensor_type; mlu_dtype_ = mlu_dtype; + data_order_ = data_order; + origin_shape_.assign(shape.begin(), shape.end()); int size = 4; if (shape.size() > 4 || shape_order == CNML_ARRAY) { @@ -239,13 +246,22 @@ void MLUTensor::remember(const std::vector& shape, break; } } - dim_ = shape_.size(); + auto shape_NCHW = DimNHWC2NCHW(shape_); + shape_NCHW.erase(shape_NCHW.begin() + shape.size(), shape_NCHW.end()); + dim_ = shape_NCHW.size(); + shape_ = DimNCHW2NHWC(shape_NCHW); } void MLUTensor::Create() { if (mlu_tensor_ == nullptr) { CNML_CALL(cnmlCreateTensor_V2(&mlu_tensor_, tensor_type_)); std::vector dim_shape(shape_); + if (data_order_ == CNML_NCHW) { + std::transform(origin_shape_.cbegin(), + origin_shape_.cend(), + dim_shape.begin(), + [](DDim::value_type in) { return static_cast(in); }); + } int* dim_strides = nullptr; CNML_CALL(cnmlSetTensorShape_V2( mlu_tensor_, dim_, dim_shape.data(), dim_strides)); @@ -258,6 +274,84 @@ cnmlTensor_t MLUTensor::mlu_tensor() { return mlu_tensor_; } +void MLUTensor::ToFile(std::string file_name) { + if (mlu_ptr_) { + VLOG(5) << "to dump mlu ptr: " << mlu_ptr_ << " to: " << file_name; + int count = 1; + for (size_t i = 0; i < shape_.size(); i++) { + count *= shape_[i]; + } + VLOG(6) << " dump count: " << count; + VLOG(6) << " dump shape: "; + for (size_t i = 0; i < shape_.size(); i++) { + VLOG(6) << shape_[i] << " "; + } + + std::vector cpu_data_fp32(count); + // fp16 to fp32 + if (mlu_dtype_ == CNML_DATA_FLOAT16) { + VLOG(6) << " convert fp16 to fp32 "; + std::vector cpu_data_fp16(count); + cnrtMemcpy(cpu_data_fp16.data(), + mlu_ptr_, + count * sizeof(uint16_t), + CNRT_MEM_TRANS_DIR_DEV2HOST); + for (int i = 0; i < count; i++) { + cnrtConvertHalfToFloat(&(cpu_data_fp32[i]), cpu_data_fp16[i]); + } + } else { + cnrtMemcpy(cpu_data_fp32.data(), + mlu_ptr_, + count * sizeof(float), + CNRT_MEM_TRANS_DIR_DEV2HOST); + } + + // trans to nchw + std::vector cpu_data_trans(count); + if (data_order_ != CNML_NCHW) { + switch (shape_.size()) { + case 4: + transpose(cpu_data_fp32.data(), + cpu_data_trans.data(), + shape_, + {0, 3, 1, 2}); + break; + case 3: + transpose( + cpu_data_fp32.data(), cpu_data_trans.data(), shape_, {0, 2, 1}); + break; + case 2: + transpose( + cpu_data_fp32.data(), cpu_data_trans.data(), shape_, {0, 1}); + break; + case 1: + transpose(cpu_data_fp32.data(), cpu_data_trans.data(), shape_, {0}); + break; + default: + CHECK(0) << "ToFile only support dim <=4"; + break; + } + } + + // to file + std::ostringstream outs; + for 
(int i = 0; i < count; i++) { + if (data_order_ == CNML_NCHW) { + outs << cpu_data_fp32[i] << std::endl; + } else { + outs << cpu_data_trans[i] << std::endl; + } + } + std::ofstream of; + of.open(file_name, std::ios::out); + of << outs.str(); + of.close(); + } else { + LOG(FATAL) << "mlu ptr is null ,can not dump mlu content to : " + << file_name; + } +} + MLUTensor::~MLUTensor() { if (mlu_tensor_ != nullptr) { CNML_CALL(cnmlDestroyTensor(&mlu_tensor_)); diff --git a/lite/kernels/mlu/bridges/tensor.h b/lite/kernels/mlu/bridges/tensor.h index 12dc97a772dabc529bf183f783a22a9f2dfa936d..22268f69ba39926dbbfb1bbb18e3a86331097f90 100644 --- a/lite/kernels/mlu/bridges/tensor.h +++ b/lite/kernels/mlu/bridges/tensor.h @@ -14,6 +14,8 @@ #pragma once +#include +#include #include #include "lite/kernels/mlu/bridges/utility.h" @@ -33,13 +35,15 @@ class MLUTensor { MLUTensor(const std::vector& shape, cnmlTensorType_t tensor_type = CNML_TENSOR, - cnmlDataOrder_t data_order = CNML_NCHW, - cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32); + cnmlDataOrder_t shape_order = CNML_NCHW, + cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32, + cnmlDataOrder_t data_order = CNML_NHWC); void remember(const std::vector& shape, cnmlTensorType_t tensor_type, cnmlDataType_t mlu_dtype, - cnmlDataOrder_t shape_order); + cnmlDataOrder_t shape_order, + cnmlDataOrder_t data_order); void Create(); cnmlTensor_t mlu_tensor(); void* mlu_data() { @@ -47,14 +51,21 @@ class MLUTensor { return mlu_ptr_; } + cnmlDataType_t dtype() { return mlu_dtype_; } void set_mlu_dtype(cnmlDataType_t type) { mlu_dtype_ = type; } + const std::vector& get_origin_shape() const { return origin_shape_; } + ~MLUTensor(); + void ToFile(std::string file_name); + cnmlDataOrder_t dorder() { return data_order_; } + private: cnmlTensor_t mlu_tensor_; std::vector shape_; + std::vector origin_shape_; cnmlTensorType_t tensor_type_; cnmlDataType_t mlu_dtype_; int dim_{0}; diff --git a/lite/kernels/mlu/bridges/test_helper.cc b/lite/kernels/mlu/bridges/test_helper.cc index 377a00689ef3a27f78ae008072578ab3701cd337..36eeb473f6a37aa28a9447280f808f5fb08978d0 100644 --- a/lite/kernels/mlu/bridges/test_helper.cc +++ b/lite/kernels/mlu/bridges/test_helper.cc @@ -24,18 +24,38 @@ namespace lite { namespace subgraph { namespace mlu { +template +void PrepareInput(Graph* graph, + const std::string& input_name, + Tensor* input_tensor, + cnmlDataOrder_t order) { + thread_local Tensor temp_input; + temp_input.Resize(input_tensor->dims().Vectorize()); + temp_input.CopyDataFrom(*input_tensor); + using data_type = typename MLUTypeTraits::type; + auto input_node = graph->AddNode( + input_name, + input_tensor->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + MLUTypeTraits::cnml_type, + order, + reinterpret_cast( + input_tensor->template mutable_data(TARGET(kMLU)))); + CHECK(input_node); + CNRT_CHECK(cnrtMemcpy(input_tensor->template mutable_data(), + temp_input.mutable_data(), + sizeof(data_type) * input_tensor->dims().production(), + CNRT_MEM_TRANS_DIR_HOST2DEV)); +} + void LaunchOp(const std::shared_ptr op, const std::vector& input_var_names, - const std::vector& output_var_names) { + const std::vector& output_var_names, + cnmlDataOrder_t order) { CNRT_CALL(cnrtInit(0)); - ::paddle::lite::SetMluDevice(0); + lite::SetMluDevice(0); cnrtQueue_t queue_; - cnrtInvokeFuncParam_t forward_param; - u32_t affinity = 1; - int data_param = 1; - forward_param.data_parallelism = &data_param; - forward_param.affinity = &affinity; - forward_param.end = CNRT_PARAM_END; CNRT_CALL(cnrtCreateQueue(&queue_)); 
cnrtDev_t dev_handle; CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, 0)); @@ -50,23 +70,21 @@ void LaunchOp(const std::shared_ptr op, // Convert input data var and add it into the MLU IR graph for (auto& input_name : input_var_names) { auto input_tensor = scope->FindMutableTensor(input_name); - CHECK(input_tensor); - Tensor temp_input; - temp_input.Resize(input_tensor->dims().Vectorize()); - temp_input.CopyDataFrom(*input_tensor); - auto input_node = - graph.AddNode(input_name, - input_tensor->dims().Vectorize(), - CNML_TENSOR, - CNML_NCHW, - graph.FPType(), - reinterpret_cast( - input_tensor->mutable_data(TARGET(kMLU)))); - CHECK(input_node); - CNRT_CHECK(cnrtMemcpy(input_tensor->mutable_data(), - temp_input.mutable_data(), - sizeof(float) * input_tensor->dims().production(), - CNRT_MEM_TRANS_DIR_HOST2DEV)); + auto data_type = input_tensor->precision(); + + switch (data_type) { +#define PREPARE_INPUT(type__) \ + case PRECISION(type__): \ + PrepareInput(&graph, input_name, input_tensor, order); \ + break; + PREPARE_INPUT(kFP16) + PREPARE_INPUT(kFloat) + PREPARE_INPUT(kInt8) + PREPARE_INPUT(kInt32) +#undef PREPARE_INPUT + default: + CHECK(0); + } } op->CheckShape(); op->InferShape(); @@ -89,8 +107,9 @@ void LaunchOp(const std::shared_ptr op, } graph.Compile(CNML_MLU270, 1); + graph.Compute(queue_, *(graph.MutableInputs()), *(graph.MutableOutputs())); + CNRT_CALL(cnrtSyncQueue(queue_)); - graph.Compute(forward_param, queue_); for (auto& output_name : output_var_names) { auto output_tensor = scope->FindMutableTensor(output_name); Tensor temp_out; diff --git a/lite/kernels/mlu/bridges/test_helper.h b/lite/kernels/mlu/bridges/test_helper.h index 4da9e72dfcc5a81a68467f7622e2c16aedb2ded5..36fe6f1efaed76deccdc6e9542bb52a2aefc2571 100644 --- a/lite/kernels/mlu/bridges/test_helper.h +++ b/lite/kernels/mlu/bridges/test_helper.h @@ -58,7 +58,8 @@ void FillTensor(Tensor* x, void LaunchOp(const std::shared_ptr op, const std::vector& input_var_names, - const std::vector& output_var_names); + const std::vector& output_var_names, + cnmlDataOrder_t order = CNML_NHWC); } // namespace mlu } // namespace subgraph diff --git a/lite/kernels/mlu/bridges/transpose_op.cc b/lite/kernels/mlu/bridges/transpose_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b6caeb3613fea8f348e3990ec2c9660321590116 --- /dev/null +++ b/lite/kernels/mlu/bridges/transpose_op.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +std::vector axis_to_nhwc(const std::vector& axis) { + std::vector new_axis(axis.size()); + + auto nhwc2nchw_axis = std::move(GetAxisNHWC2NCHW(axis.size())); + auto nchw2nhwc_axis = std::move(GetAxisNCHW2NHWC(axis.size())); + + for (size_t i = 0; i < new_axis.size(); ++i) { + new_axis[i] = nhwc2nchw_axis[axis[nchw2nhwc_axis[i]]]; + } + return new_axis; +} + +int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Get input vars and op attributes + auto x_var_name = op_info->Input("X").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto x_dims = x->dims().Vectorize(); + + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + + auto axis = op_info->GetAttr>("axis"); + std::vector axis_nhwc = axis_to_nhwc(axis); + + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + + CHECK(graph->HasNode(x_var_name)); + auto input_tensor = graph->GetNode(x_var_name); + cnmlBaseOp_t transpose_op{nullptr}; + + cnmlNdTransposeOpParam_t transpose_param{nullptr}; + + CNML_CALL(cnmlCreateNdTransposeOpParam( + &transpose_param, axis_nhwc.data(), axis_nhwc.size())); + + // Use cnmlCreatexxxOpForward to create op. + CNML_CALL(cnmlCreateNdTransposeProOp(&transpose_op, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + transpose_param)); + + graph->FuseOp(transpose_op); + CNML_CALL(cnmlDestroyBaseOp(&transpose_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle +REGISTER_SUBGRAPH_BRIDGE(transpose, + kMLU, + paddle::lite::subgraph::mlu::TransposeConverter); +REGISTER_SUBGRAPH_BRIDGE(transpose2, + kMLU, + paddle::lite::subgraph::mlu::TransposeConverter); diff --git a/lite/kernels/mlu/bridges/transpose_op_test.cc b/lite/kernels/mlu/bridges/transpose_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6e8f7890581279f0ab4d51006c194967fd9c61e7 --- /dev/null +++ b/lite/kernels/mlu/bridges/transpose_op_test.cc @@ -0,0 +1,153 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "lite/operators/transpose_op.h"
+#include <gtest/gtest.h>
+#include "lite/core/op_registry.h"
+#include "lite/kernels/mlu/bridges/test_helper.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+int data_index(std::vector<int> pos, DDimLite dims) {
+  int d1 = dims[1];
+  int d2 = dims[2];
+  int d3 = dims[3];
+  return pos[3] + pos[2] * d3 + pos[1] * d3 * d2 + pos[0] * d3 * d2 * d1;
+}
+
+std::vector<int> pos_trans(std::vector<int> in_pos, std::vector<int> axis) {
+  std::vector<int> out_pos(in_pos.size());
+  for (size_t i = 0; i < axis.size(); i++) {
+    out_pos[axis[i]] = in_pos[i];
+  }
+  return out_pos;
+}
+
+template <typename dtype>
+void transpose_ref(const std::shared_ptr<operators::TransposeOp> op) {
+  Scope* scope = op->scope();
+  const OpInfo* op_info = op->op_info();
+
+  auto input =
+      scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
+  auto output =
+      scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
+  auto x_dims = input->dims();
+  auto y_dims = output->dims();
+  auto axis = op_info->GetAttr<std::vector<int>>("axis");
+
+  // auto input_data = input->data();
+  auto* input_data = input->mutable_data<dtype>();
+  auto* output_data = output->mutable_data<dtype>();
+
+  int input_n = x_dims[0];
+  int input_c = x_dims[1];
+  int input_h = x_dims[2];
+  int input_w = x_dims[3];
+
+  for (int n = 0; n < input_n; ++n) {
+    for (int c = 0; c < input_c; ++c) {
+      for (int h = 0; h < input_h; ++h) {
+        for (int w = 0; w < input_w; ++w) {
+          std::vector<int> in_pos{n, c, h, w};
+          std::vector<int> out_pos = pos_trans(in_pos, axis);
+          int in_index = data_index(in_pos, x_dims);
+          int out_index = data_index(out_pos, y_dims);
+          output_data[out_index] = input_data[in_index];
+        }
+      }
+    }
+  }
+}
+
+void test_transpose(const std::vector<int64_t>& input_shape,
+                    std::vector<int> axis) {
+  // prepare input&output variables
+  Scope scope;
+  std::string x_var_name = "x";
+  std::string out_var_name = "out";
+  std::string out_ref_var_name = "out_ref";
+  auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
+  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
+  auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
+  x->Resize(input_shape);
+
+  // initialize input&output data
+  FillTensor(x);
+
+  // initialize op desc
+  cpp::OpDesc opdesc;
+  opdesc.SetType("transpose");
+  opdesc.SetInput("X", {x_var_name});
+  opdesc.SetOutput("Out", {out_var_name});
+  opdesc.SetAttr("axis", axis);
+
+  // create and convert op to MLU model, then run it on MLU
+  auto op = CreateOp<operators::TransposeOp>(opdesc, &scope);
+
+  // transpose_ref must run before LaunchOp;
+  // otherwise it fails with "Cannot access memory".
+  // execute reference implementation and save to output tensor
+  transpose_ref<float>(op);
+  out_ref->CopyDataFrom(*out);
+
+  Tensor input_x;
+  input_x.Resize(DDim(input_shape));
+  transpose(x->mutable_data<float>(),
+            input_x.mutable_data<float>(),
+            {static_cast<int>(input_shape[0]),
+             static_cast<int>(input_shape[1]),
+             static_cast<int>(input_shape[2]),
+             static_cast<int>(input_shape[3])},
+            {0, 2, 3, 1});
+  x->CopyDataFrom(input_x);
+
+  LaunchOp(op, {x_var_name}, {out_var_name});
+  // compare results
+  auto* out_data = out->mutable_data<float>();
+  auto* out_ref_data = out_ref->mutable_data<float>();
+
+  Tensor output_trans;
+  output_trans.Resize(out->dims());
+  auto os = out->dims();
+  transpose(out_data,
+            output_trans.mutable_data<float>(),
+            {static_cast<int>(os[0]),
+             static_cast<int>(os[2]),
+             static_cast<int>(os[3]),
+             static_cast<int>(os[1])},
+            {0, 3, 1, 2});
+  out_data = output_trans.mutable_data<float>();
+  for (int i = 0; i < out->dims().production(); i++) {
+    EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2);
+  }
+}
+
+// TODO(pmshst): fix the transpose test
+TEST(MLUBridges, transpose) { + std::vector input_shape = {2, 3, 4, 5}; + test_transpose(input_shape, std::vector{0, 1, 3, 2}); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(transpose, kMLU); +USE_SUBGRAPH_BRIDGE(transpose2, kMLU); diff --git a/lite/kernels/mlu/bridges/utility.cc b/lite/kernels/mlu/bridges/utility.cc index cd78553a652433fc41334a6bff5575031f5125e0..b53debd643ae2b1080644d2844d702797addabec 100644 --- a/lite/kernels/mlu/bridges/utility.cc +++ b/lite/kernels/mlu/bridges/utility.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "lite/kernels/mlu/bridges/utility.h" + #include namespace paddle { @@ -20,33 +21,21 @@ namespace lite { namespace subgraph { namespace mlu { -void transpose(float* input_data, - float* output_data, - std::vector input_shape, - std::vector axis) { +void transpose2d(float* input_data, + float* output_data, + std::vector input_shape) { + CHECK_EQ(input_shape.size(), 2); int old_index = -1; int new_index = -1; - int dim[4] = {0}; - std::vector shape = input_shape; - for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) { - for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) { - for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) { - for (dim[3] = 0; dim[3] < input_shape[3]; dim[3]++) { - old_index = dim[0] * shape[1] * shape[2] * shape[3] + - dim[1] * shape[2] * shape[3] + dim[2] * shape[3] + dim[3]; - new_index = - dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] + - dim[axis[1]] * shape[axis[2]] * shape[axis[3]] + - dim[axis[2]] * shape[axis[3]] + dim[axis[3]]; - output_data[new_index] = input_data[old_index]; - } - } + for (int i = 0; i < input_shape[0]; i++) { + for (int j = 0; j < input_shape[1]; j++) { + old_index = i * input_shape[1] + j; + new_index = j * input_shape[0] + i; + output_data[new_index] = input_data[old_index]; } } } -int scale2position(float scale) { return static_cast(-std::log2(scale)); } - void dequant(float* dst, int8_t* src, size_t size, float scale) { for (size_t i = 0; i < size; ++i) { dst[i] = static_cast(src[i]) * scale; diff --git a/lite/kernels/mlu/bridges/utility.h b/lite/kernels/mlu/bridges/utility.h index fa8fb1597c0fb068a855928dd20057d48ecd5eaf..fd1e5eb265936f11f258d86e2b6a91af1d55c6ed 100644 --- a/lite/kernels/mlu/bridges/utility.h +++ b/lite/kernels/mlu/bridges/utility.h @@ -16,24 +16,76 @@ #include #include + #include #include #include + #include "lite/backends/mlu/mlu_utils.h" #include "lite/core/op_lite.h" #include "lite/core/tensor.h" -#include "lite/fluid/data_type.h" +#include "lite/fluid/float16.h" namespace paddle { namespace lite { namespace subgraph { namespace mlu { -void transpose(float* input_data, - float* output_data, +void transpose2d(float* input_data, + float* output_data, + std::vector input_shape); + +template +void transpose(dtype* input_data, + dtype* output_data, std::vector input_shape, - std::vector axis); -int scale2position(float scale); + std::vector axis) { + int old_index = -1; + int new_index = -1; + std::vector shape; + std::vector expand_axis; + if (input_shape.size() < 5u) { + for (size_t i = 0; i < 5 - input_shape.size(); i++) { + shape.push_back(1); + expand_axis.push_back(i); + } + for (size_t i = 0; i < input_shape.size(); i++) { + shape.push_back(input_shape[i]); + expand_axis.push_back(axis[i] + 5 - input_shape.size()); + } + } else { + shape = input_shape; + expand_axis = axis; + } + int dim[5] = {0}; + for (dim[0] = 0; dim[0] < shape[0]; dim[0]++) { + for (dim[1] = 0; dim[1] < 
shape[1]; dim[1]++) { + for (dim[2] = 0; dim[2] < shape[2]; dim[2]++) { + for (dim[3] = 0; dim[3] < shape[3]; dim[3]++) { + for (dim[4] = 0; dim[4] < shape[4]; dim[4]++) { + old_index = dim[0] * shape[1] * shape[2] * shape[3] * shape[4] + + dim[1] * shape[2] * shape[3] * shape[4] + + dim[2] * shape[3] * shape[4] + dim[3] * shape[4] + + dim[4]; + new_index = dim[expand_axis[0]] * shape[expand_axis[1]] * + shape[expand_axis[2]] * shape[expand_axis[3]] * + shape[expand_axis[4]] + + dim[expand_axis[1]] * shape[expand_axis[2]] * + shape[expand_axis[3]] * shape[expand_axis[4]] + + dim[expand_axis[2]] * shape[expand_axis[3]] * + shape[expand_axis[4]] + + dim[expand_axis[3]] * shape[expand_axis[4]] + + dim[expand_axis[4]]; + output_data[new_index] = input_data[old_index]; + } + } + } + } + } +} + +inline int scale2position(float scale) { return std::floor(-std::log2(scale)); } + void dequant(float* dst, int8_t* src, size_t size, float scale); void dequant(float* dst, @@ -64,27 +116,94 @@ inline const ::paddle::lite::DDimLite DimNCHW2NHWC( std::vector({dim[0], dim[2], dim[3], dim[1]})); } -inline const std::vector DimNHWC2NCHW( - const std::vector& dim) { - return std::vector({dim[0], dim[3], dim[1], dim[2]}); +template +inline const std::vector DimNHWC2NCHW( + const std::vector& dim) { + switch (dim.size()) { + case 1: + return dim; + case 2: + return dim; + case 3: + return std::vector({dim[0], dim[2], dim[1]}); + case 4: + return std::vector({dim[0], dim[3], dim[1], dim[2]}); + case 5: + return std::vector({dim[0], dim[4], dim[1], dim[2], dim[3]}); + default: + CHECK(0) << "unsupport dimension"; + } +} + +template +inline const std::vector DimNCHW2NHWC( + const std::vector& dim) { + switch (dim.size()) { + case 1: + return dim; + case 2: + return dim; + case 3: + return std::vector({dim[0], dim[2], dim[1]}); + case 4: + return std::vector({dim[0], dim[2], dim[3], dim[1]}); + case 5: + return std::vector({dim[0], dim[2], dim[3], dim[4], dim[1]}); + default: + CHECK(0) << "unsupport dimension"; + } } -inline const std::vector DimNCHW2NHWC( - const std::vector& dim) { - return std::vector({dim[0], dim[2], dim[3], dim[1]}); +template +inline std::vector GetAxisNHWC2NCHW(size_t n_dims) { + std::vector nhwc2nchw_axis(n_dims); + nhwc2nchw_axis[0] = 0; + if (n_dims > 1) nhwc2nchw_axis[1] = n_dims - 1; + for (size_t i = 2; i < n_dims; ++i) { + nhwc2nchw_axis[i] = i - 1; + } + return nhwc2nchw_axis; +} + +template +inline std::vector GetAxisNCHW2NHWC(size_t n_dims) { + std::vector nchw2nhwc_axis(n_dims); + nchw2nhwc_axis[0] = 0; + for (size_t i = 1; i < n_dims - 1; ++i) { + nchw2nhwc_axis[i] = i + 1; + } + if (n_dims > 1) nchw2nhwc_axis[n_dims - 1] = 1; + return nchw2nhwc_axis; } template -struct FPTypeTraits {}; +struct MLUTypeTraits { + /* using type = void; */ + /* static constexpr cnmlDataType_t cnml_type = CNML_DATA_INVALID; */ +}; + +template <> +struct MLUTypeTraits { + using type = float; + static constexpr cnmlDataType_t cnml_type = CNML_DATA_FLOAT32; +}; + +template <> +struct MLUTypeTraits { + using type = paddle::lite::fluid::float16; + static constexpr cnmlDataType_t cnml_type = CNML_DATA_FLOAT16; +}; template <> -struct FPTypeTraits { - typedef float T; +struct MLUTypeTraits { + using type = int8_t; + static constexpr cnmlDataType_t cnml_type = CNML_DATA_INT8; }; template <> -struct FPTypeTraits { - typedef paddle::lite::fluid::float16 T; +struct MLUTypeTraits { + using type = int32_t; + static constexpr cnmlDataType_t cnml_type = CNML_DATA_INT32; }; } // namespace mlu diff --git 
a/lite/kernels/mlu/io_copy_compute.cc b/lite/kernels/mlu/io_copy_compute.cc index 02e4d8b28e81e88201b895a4b8fbe9e93d3f17f9..ff8a7ddf6e4c465f288ba42b5b2537294a9d7ffd 100644 --- a/lite/kernels/mlu/io_copy_compute.cc +++ b/lite/kernels/mlu/io_copy_compute.cc @@ -41,6 +41,9 @@ class IoCopyHostToMluCompute auto mem_size = param.x->memory_size(); // LOG(INFO) << "copy size " << mem_size; auto* data = param.y->mutable_data(TARGET(kMLU), mem_size); + VLOG(6) << "io_copy host to mlu] memory size: " << mem_size + << " precision type: " << PrecisionToStr(Precision); + param.y->set_precision(param.x->precision()); CopyFromHostSync(data, param.x->raw_data(), mem_size); } @@ -79,6 +82,13 @@ class IoCopyMluToHostCompute CHECK(param.x->target() == TARGET(kMLU)); auto mem_size = param.x->memory_size(); auto* data = param.y->mutable_data(TARGET(kHost), mem_size); + VLOG(6) << "io_copy mlu to host] memory size: " << mem_size + << " precision type: " << PrecisionToStr(Precision); + + // sync queue to ensure process done + auto& mlu_context = this->ctx_->template As(); + CNRT_CALL(cnrtSyncQueue(mlu_context.exec_queue())); + CopyToHostSync(data, param.x->raw_data(), mem_size); } @@ -97,8 +107,14 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::IoCopyHostToMluCompute, host_to_device_kFloat) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))}) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kFloat), + DATALAYOUT(kAny))}) .Finalize(); REGISTER_LITE_KERNEL( @@ -108,8 +124,31 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::IoCopyHostToMluCompute, host_to_device_kFP16) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))}) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFP16), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kFP16), + DATALAYOUT(kAny))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + io_copy, + kMLU, + kInt32, + kNHWC, + paddle::lite::kernels::mlu::IoCopyHostToMluCompute, + host_to_device_kInt32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt32), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kInt32), + DATALAYOUT(kAny))}) .Finalize(); REGISTER_LITE_KERNEL( @@ -119,8 +158,14 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::IoCopyMluToHostCompute, device_to_host_kFloat) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kFloat), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kAny))}) .Finalize(); REGISTER_LITE_KERNEL( @@ -130,6 +175,29 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::IoCopyMluToHostCompute, device_to_host_kFP16) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kFP16), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFP16), + DATALAYOUT(kAny))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + io_copy, + kMLU, + kInt8, + kNHWC, + 
paddle::lite::kernels::mlu::IoCopyHostToMluCompute, + host_to_device_to_kInt8) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt8), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kInt8), + DATALAYOUT(kAny))}) .Finalize(); diff --git a/lite/kernels/mlu/layout_compute.cc b/lite/kernels/mlu/layout_compute.cc index d4e16734d6d2dae6f5c119194008bce114a2e918..42b12740ff0edb88ea2944e25ca03ade36caa956 100644 --- a/lite/kernels/mlu/layout_compute.cc +++ b/lite/kernels/mlu/layout_compute.cc @@ -24,9 +24,9 @@ namespace mlu {} // namespace mlu REGISTER_LITE_KERNEL( layout, - kMLU, + kX86, kFloat, - kNHWC, + kNCHW, paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute, def_layout_nhwc2nchw_fp32) .BindInput("Input", @@ -41,9 +41,9 @@ REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL( layout, - kMLU, + kX86, kFP16, - kNHWC, + kNCHW, paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute, def_layout_nhwc2nchw_fp16) .BindInput("Input", @@ -58,9 +58,9 @@ REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL( layout, - kMLU, + kX86, kFloat, - kNHWC, + kNCHW, paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute, def_layout_nchw2nhwc_fp32) .BindInput("Input", @@ -75,9 +75,9 @@ REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL( layout, - kMLU, + kX86, kFP16, - kNHWC, + kNCHW, paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute, def_layout_nchw2nhwc_fp16) .BindInput("Input", @@ -92,11 +92,11 @@ REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL( layout, - kMLU, + kX86, kInt8, - kNHWC, + kNCHW, paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute, - def_layout_nchw2nhwc_fp32_int8) + def_layout_nchw2nhwc_int8) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt8), diff --git a/lite/kernels/mlu/layout_compute.h b/lite/kernels/mlu/layout_compute.h index edacdf8a98a2ffde6e538f61d4dd8259e3211b22..df254865994fe8548df0e021ecb471f5a1020080 100644 --- a/lite/kernels/mlu/layout_compute.h +++ b/lite/kernels/mlu/layout_compute.h @@ -22,6 +22,7 @@ #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" #include "lite/core/type_system.h" +#include "lite/kernels/mlu/bridges/utility.h" #include "lite/operators/layout_op.h" namespace paddle { @@ -29,24 +30,6 @@ namespace lite { namespace kernels { namespace mlu { -template -struct FPTypeTraits {}; - -template <> -struct FPTypeTraits { - typedef float T; -}; - -template <> -struct FPTypeTraits { - typedef paddle::lite::fluid::float16 T; -}; - -template <> -struct FPTypeTraits { - typedef int8_t T; -}; - template inline void LayoutTransCompute(const int dim, const lite::Context& context, @@ -73,7 +56,7 @@ inline void LayoutTransCompute(const int dim, template class LayoutNchwToNhwcCompute - : public KernelLite { + : public KernelLite { public: using param_t = operators::LayoutParam; @@ -81,36 +64,37 @@ class LayoutNchwToNhwcCompute auto& param = this->template Param(); auto* x = param.x; auto* out = param.y; - out->template mutable_data::T>(); - auto x_dims = param.x->dims().size(); + out->template mutable_data< + typename subgraph::mlu::MLUTypeTraits::type>(); + auto x_ndims = param.x->dims().size(); auto& context = this->ctx_->template As(); const auto origin_dims = out->dims().Vectorize(); std::vector axis; - switch (x_dims) { + switch (x_ndims) { case 2: axis = {0, 1}; break; case 3: axis = {0, 2, 1}; out->Resize(std::vector{ - out->dims()[0], out->dims()[2], out->dims()[1]}); + origin_dims[0], origin_dims[2], origin_dims[1]}); break; case 4: axis = {0, 2, 3, 1}; out->Resize(std::vector{ - out->dims()[0], 
out->dims()[2], out->dims()[3], out->dims()[1]}); + origin_dims[0], origin_dims[2], origin_dims[3], origin_dims[1]}); break; default: CHECK(0) << "Unsupport dim in mlu layout nchw to nhwc"; } LayoutTransCompute::T>( - x_dims, context, *x, out, axis); + typename subgraph::mlu::MLUTypeTraits::type>( + x_ndims, context, *x, out, axis); - if (x_dims > 2) { + if (x_ndims > 2) { out->Resize(origin_dims); } } @@ -122,7 +106,7 @@ class LayoutNchwToNhwcCompute template class LayoutNhwcToNchwCompute - : public KernelLite { + : public KernelLite { public: using param_t = operators::LayoutParam; @@ -130,25 +114,27 @@ class LayoutNhwcToNchwCompute auto& param = this->template Param(); auto* x = param.x; auto* out = param.y; - out->template mutable_data::T>(); - auto x_dims = param.x->dims().size(); + out->template mutable_data< + typename subgraph::mlu::MLUTypeTraits::type>(); auto& context = this->ctx_->template As(); - const auto origin_dims = out->dims().Vectorize(); + TensorLite tmp_t; + tmp_t.ShareDataWith(*x); + const auto x_dims = x->dims().Vectorize(); + auto x_ndims = param.x->dims().size(); std::vector axis; - switch (x_dims) { + switch (x_ndims) { case 2: axis = {0, 1}; break; case 3: - out->Resize(std::vector{ - out->dims()[0], out->dims()[2], out->dims()[1]}); + tmp_t.Resize(std::vector{x_dims[0], x_dims[2], x_dims[1]}); axis = {0, 2, 1}; break; case 4: - out->Resize(std::vector{ - out->dims()[0], out->dims()[3], out->dims()[1], out->dims()[2]}); + tmp_t.Resize( + std::vector{x_dims[0], x_dims[2], x_dims[3], x_dims[1]}); axis = {0, 3, 1, 2}; break; default: @@ -156,12 +142,8 @@ class LayoutNhwcToNchwCompute } LayoutTransCompute::T>( - x_dims, context, *x, out, axis); - - if (x_dims > 2) { - out->Resize(origin_dims); - } + typename subgraph::mlu::MLUTypeTraits::type>( + x_ndims, context, tmp_t, out, axis); } std::string doc() const override { diff --git a/lite/kernels/mlu/subgraph_compute.cc b/lite/kernels/mlu/subgraph_compute.cc index 73ca9dcc20a6311d33e5cff6c6ed6be08f3c7a1f..450031021d3ad70c6abb348a6e498d8876f5ec56 100644 --- a/lite/kernels/mlu/subgraph_compute.cc +++ b/lite/kernels/mlu/subgraph_compute.cc @@ -36,8 +36,14 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::SubgraphCompute, def_kFloat) - .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kMLU))}) - .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kMLU))}) + .BindInput("Inputs", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .BindOutput("Outputs", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kAny), + DATALAYOUT(kAny))}) .Finalize(); REGISTER_LITE_KERNEL( @@ -47,6 +53,12 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::SubgraphCompute, def_FP16) - .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kMLU))}) - .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kMLU))}) + .BindInput("Inputs", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .BindOutput("Outputs", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kAny), + DATALAYOUT(kAny))}) .Finalize(); diff --git a/lite/kernels/mlu/subgraph_compute.h b/lite/kernels/mlu/subgraph_compute.h index 3bfba33f4d7e8fd86f7aaf276da2ca4a8b0bd7cf..a76f57ad90bb7f7ea2b2629c80da68cba4c7fffa 100644 --- a/lite/kernels/mlu/subgraph_compute.h +++ b/lite/kernels/mlu/subgraph_compute.h @@ -14,17 +14,24 @@ #pragma once +#include +#include #include #include #include + #include "lite/api/paddle_place.h" #include "lite/core/kernel.h" +#include "lite/core/op_lite.h" #include "lite/core/op_registry.h" 
+#include "lite/core/tensor.h" #include "lite/core/type_system.h" #include "lite/core/types.h" #include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/tensor.h" #include "lite/kernels/npu/bridges/engine.h" #include "lite/kernels/npu/bridges/registry.h" +#include "lite/utils/env.h" namespace paddle { namespace lite { @@ -40,56 +47,115 @@ class SubgraphEngine : public subgraph::Engine { const std::vector& input_names, const std::vector& output_names, Scope* scope, - ::paddle::lite_api::PrecisionType type) + paddle::lite_api::PrecisionType type) : subgraph::Engine( - ctx, block_idx, block_desc, input_names, output_names, scope) { - graph_.SetFPType(type); + ctx, block_idx, block_desc, input_names, output_names, scope), + fp_type_(type) { + VLOG(4) << "[MLU] PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL is " + << GetBoolFromEnv("PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL"); + VLOG(4) << "[MLU] PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE is " + << GetBoolFromEnv("PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE"); + VLOG(4) << "[MLU] LITE_DISABLE_MLU_CAST is " + << GetBoolFromEnv("LITE_DISABLE_MLU_CAST"); + if (GetBoolFromEnv("PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE")) { + disable_batch_size_changeable_ = true; + } } - int Build() { - // In order to attach all of the ops of the block desc, we need to build - // the original program firstly. - BuildOriginProgram(); - // Run InferShape() of all of ops, and convert Paddle ops to MLU IR graph - build_device_program_status_ = BuildDeviceProgram(); - return build_device_program_status_; + bool InputShapeChanged() { + std::vector> new_shape; + // used in batch changable situation + std::vector> all_shape; + for (auto origin_itensor : origin_itensors_) { + if (!disable_batch_size_changeable_) { + auto iv = origin_itensor->dims().Vectorize(); + all_shape.push_back(iv); + iv.erase(iv.begin()); + new_shape.push_back(iv); + } else { + new_shape.push_back(origin_itensor->dims().Vectorize()); + } + } + inputs_shape_ = new_shape; + all_inputs_shape_ = all_shape; + if (shape_graph_map_.count(inputs_shape_) > 0) { + return false; + } + VLOG(3) << "MLU graph input shape changed" << std::endl; + return true; } - int Launch() { - // Rebuild device program when the shapes of input tensors have been - // changed. 
- if (subgraph::CHECK_SUCCESS(build_device_program_status_) && - subgraph::CHECK_REBUILD_WHEN_SHAPE_CHANGED( - build_device_program_status_) && - InputShapeChanged()) { - Build(); - } - if (subgraph::CHECK_FAILED(build_device_program_status_)) { - LaunchOriginProgram(); - } else { - LaunchDeviceProgram(); + inline cnmlDataType_t PrecisionToDatatype(PrecisionType data_type) { + switch (data_type) { + case paddle::lite_api::PrecisionType::kFP16: + return CNML_DATA_FLOAT16; + case paddle::lite_api::PrecisionType::kFloat: + return CNML_DATA_FLOAT32; + case paddle::lite_api::PrecisionType::kInt32: + return CNML_DATA_INT32; + case paddle::lite_api::PrecisionType::kInt8: + return CNML_DATA_UINT8; + default: + return PrecisionToDatatype(fp_type_); } - return 0; } protected: - int BuildDeviceProgram() override { + bool BuildDeviceProgram() override { + if (origin_program_.empty()) { + BuildOriginProgram(); + } + if (!error_compile_batch_size_changeable_ && + !disable_batch_size_changeable_) { + int status = BuildDeviceProgramImpl(); + if (subgraph::CHECK_SUCCESS(status)) { + return status; + } + LOG(INFO) << "[MLU] build batch_size changeable subgraph op failed, " + "changed to input_shape changeable"; + } + error_compile_batch_size_changeable_ = true; + disable_batch_size_changeable_ = true; + return BuildDeviceProgramImpl(); + } + + bool BuildDeviceProgramImpl() { int status = 0; + auto graph = std::make_shared(); + graph->SetFPType(fp_type_); + std::vector> new_shape; + origin_itensors_.clear(); + origin_otensors_.clear(); + + auto data_order = block_desc_->GetOp(0)->Type() == "layout" + ? CNML_NCHW + : CNML_NHWC; // Convert all of input data vars and added into the MLU IR graph + status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED; for (auto& input_name : input_names_) { auto input_tensor = scope_->FindMutableTensor(input_name); + auto data_type = input_tensor->precision(); + cnmlDataType_t fp_type = PrecisionToDatatype(data_type); + origin_itensors_.push_back(input_tensor); + if (!disable_batch_size_changeable_) { + auto iv = input_tensor->dims().Vectorize(); + iv.erase(iv.begin()); + new_shape.push_back(iv); + } else { + new_shape.push_back(input_tensor->dims().Vectorize()); + } + CHECK(input_tensor); - auto input_node = - graph_.AddNode(input_name, - input_tensor->dims().Vectorize(), - CNML_TENSOR, - CNML_NCHW, - graph_.FPType(), - const_cast(input_tensor->raw_data())); + VLOG(4) << "subgraph input tensor " << input_name << std::endl; + auto input_node = graph->AddNode(input_name, + input_tensor->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + fp_type, + data_order); CHECK(input_node); // MLU doesn't support dynamic dimensions/shapes, so need to rebuild // the program when the shape of any input tensor is changed. 
- status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED; } LOG(INFO) << "START TO CONVERT "; // Convert all of ops and its weights and added into the MLU IR graph @@ -98,63 +164,304 @@ class SubgraphEngine : public subgraph::Engine { auto op = inst.op(); CHECK(op); std::string op_type = op->op_info()->Type(); + // since cnml's compile api will not return error now, we simply check + // op's type + if (!disable_batch_size_changeable_ && + std::find(unsupport_batch_size_changeable_op_type_.begin(), + unsupport_batch_size_changeable_op_type_.end(), + op_type) != + unsupport_batch_size_changeable_op_type_.end()) { + status |= subgraph::FAILED; + VLOG(4) << "[MLU] found unsupported batch_size changeable op type: " + << op_type; + if (subgraph::CHECK_FAILED(status)) { + return false; + } + return true; + } op->CheckShape(); const_cast(op)->InferShape(); if (!bridges.Exists(op_type, TARGET(kMLU))) { LOG(INFO) << "MLU bridges doesn't support op_type: " << op_type; - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); status |= bridges.Select(op_type, TARGET(kMLU))( - reinterpret_cast(&graph_), + reinterpret_cast(graph.get()), const_cast(op), const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } // Obtain the output nodes of the MLU IR graph and build the graph to MLU // runtime - std::vector valid_output_names; for (auto& output_name : output_names_) { - if (graph_.HasNode(output_name)) { - graph_.AddOutput(graph_.GetNode(output_name)); + if (graph->HasNode(output_name)) { + graph->AddOutput(graph->GetNode(output_name)); auto output_tensor = scope_->FindMutableTensor(output_name); - void* p_data = static_cast( - output_tensor->mutable_data::T>( - TARGET(kMLU))); - auto node = graph_.GetNode(output_name); - CHECK(p_data); - node->set_mlu_ptr(p_data); - valid_output_names.push_back(output_name); + origin_otensors_.push_back(output_tensor); + VLOG(4) << "subgraph output tensor " << output_name << std::endl; + + // auto node = graph->GetNode(output_name); + // CHECK(p_data); + // node->set_mlu_ptr(p_data); } } for (auto& input_name : input_names_) { - graph_.AddInput(graph_.GetNode(input_name)); + graph->AddInput(graph->GetNode(input_name), + disable_batch_size_changeable_); } - CHECK(!valid_output_names.empty()) << "[MLU] no valid output names"; + + CHECK(!origin_otensors_.empty()) << "[MLU] no valid output names"; auto& mlu_context = this->ctx_->template As(); auto core_version = mlu_context.MLUCoreVersion(); auto core_number = mlu_context.MLUCoreNumber(); - graph_.Compile(core_version, core_number); - return status; + graph->Compile(core_version, core_number); + shape_graph_map_[new_shape] = graph; + if (GetBoolFromEnv("PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL")) { + graph->GenOfflineModel(GetOfflineModName()); + } + return true; + } + + std::string TrimStrings(const std::string& origin_str) { + std::string str = origin_str; + std::size_t found = str.find("0x"); + std::size_t found_end = 0; + const std::vector del_strs = { + "/trans_io_copy", "/trans_cast", "/trans_layout"}; + for (const auto& iterm : del_strs) { + found_end = str.find(iterm); + // trim point address and one of the del_strs + if (found != std::string::npos && found_end != std::string::npos) { + str.replace(found, found_end - found, ""); + found_end = str.find(iterm); + str.replace(found_end, iterm.size(), ""); + break; + } + } + return str; + } + + std::string GetOfflineModName() { + sort(input_names_.begin(), input_names_.end()); + sort(output_names_.begin(), 
output_names_.end()); + const auto& delimiter = "__"; + const auto& delimiter_num = "_"; + const auto& input_shape_str = "input_shape_"; + const auto& output_shape_str = "output_shape_"; + std::string name = ""; + std::string tmp = ""; + for (const auto& input_name : input_names_) { + tmp = input_name; + name += TrimStrings(tmp) + delimiter + input_shape_str; + auto input_tensor = scope_->FindMutableTensor(input_name); + for (const auto& iterm : input_tensor->dims().Vectorize()) { + name += std::to_string(iterm) + delimiter_num; + } + name += delimiter; + } + for (const auto& output_name : output_names_) { + tmp = output_name; + name += TrimStrings(tmp) + delimiter + output_shape_str; + auto output_tensor = scope_->FindMutableTensor(output_name); + for (const auto& iterm : output_tensor->dims().Vectorize()) { + name += std::to_string(iterm) + delimiter_num; + } + name += delimiter; + } + std::replace(name.begin(), name.end(), '/', '-'); + return name; + } + + void InferOutputsShapeOnly() { + // infer outputs shape when enable BATCH_SIZE_CHANGEABLE + const auto iter = in_out_shape_map_.find(all_inputs_shape_); + if (iter != in_out_shape_map_.end()) { + for (size_t i = 0; i < origin_otensors_.size(); ++i) { + origin_otensors_[i]->Resize(iter->second[i]); + } + } else { + for (auto& inst : origin_program_) { + auto op = inst.op(); + CHECK(op); + op->CheckShape(); + const_cast(op)->InferShape(); + } + std::vector> outs_shape; + for (size_t i = 0; i < origin_otensors_.size(); ++i) { + outs_shape.push_back(origin_otensors_[i]->dims().Vectorize()); + } + in_out_shape_map_[all_inputs_shape_] = outs_shape; + } } - int LaunchDeviceProgram() override { + inline void* GetOutputDataPtr(Tensor* tensor, bool use_mlu_cast) { + if (use_mlu_cast) { + // output is float, since cast fused in subgraph + return static_cast(tensor->mutable_data(TARGET(kMLU))); + } else { + return static_cast( + tensor->template mutable_data< + typename subgraph::mlu::MLUTypeTraits::type>( + TARGET(kMLU))); + } + } + + bool LaunchDeviceProgram() override { + // prepare input and output memory auto& mlu_context = this->ctx_->template As(); auto exec_queue = mlu_context.exec_queue(); - u32_t affinity = mlu_context.affinity(); - cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param(); - int data_param = 1; - forward_param.data_parallelism = &data_param; - forward_param.affinity = &affinity; - forward_param.end = CNRT_PARAM_END; - graph_.Compute(forward_param, exec_queue); - return 0; + + auto graph = shape_graph_map_[inputs_shape_]; + auto* graph_input = graph->MutableInputs(); + auto* graph_output = graph->MutableOutputs(); + CHECK_EQ(graph_input->size(), origin_itensors_.size()); + CHECK_EQ(graph_output->size(), origin_otensors_.size()); + + bool disable_mlu_cast = GetBoolFromEnv("LITE_DISABLE_MLU_CAST"); + + if (!disable_batch_size_changeable_) { + std::vector> + graph_in; + if (shape_tensor_map_in_.find(all_inputs_shape_) != + shape_tensor_map_in_.end()) { + graph_in = shape_tensor_map_in_[all_inputs_shape_]; + for (size_t i = 0; i < origin_itensors_.size(); ++i) { + graph_in[i]->set_mlu_ptr( + const_cast(origin_itensors_[i]->raw_data())); + } + } else { + graph_in.reserve(origin_itensors_.size()); + for (size_t i = 0; i < origin_itensors_.size(); ++i) { + paddle::lite::subgraph::mlu::MLUTensor tmp( + origin_itensors_[i]->dims().Vectorize()); + tmp.set_mlu_dtype(graph_input->at(i)->dtype()); + tmp.set_mlu_ptr(const_cast(origin_itensors_[i]->raw_data())); + graph_in.push_back( + std::make_shared(tmp)); + } + 
shape_tensor_map_in_[all_inputs_shape_] = graph_in; + } + + // TODO(zhangmingwei): we just call every op's infer_shape to get outputs' + // shape, may be it's better to use cnml's api to get output shape. This + // can be done when cnml's tensor dimension is totally equal to lite's + // tensor + // shape. + InferOutputsShapeOnly(); + // const std::vector> new_output_size = + // graph->InferOutputsShape(graph_in); + + std::vector> + graph_out; + + if (shape_tensor_map_out_.find(all_inputs_shape_) != + shape_tensor_map_out_.end()) { + graph_out = shape_tensor_map_out_[all_inputs_shape_]; + for (size_t i = 0; i < origin_otensors_.size(); ++i) { + // origin_otensors_[i]->Resize(new_output_size.at(i)); + graph_out[i]->set_mlu_ptr( + GetOutputDataPtr(origin_otensors_[i], !disable_mlu_cast)); + } + } else { + graph_out.reserve(origin_otensors_.size()); + for (size_t i = 0; i < origin_otensors_.size(); ++i) { + // origin_otensors_[i]->Resize(new_output_size.at(i)); + paddle::lite::subgraph::mlu::MLUTensor tmp( + origin_otensors_[i]->dims().Vectorize()); + tmp.set_mlu_dtype(graph_output->at(i)->dtype()); + tmp.set_mlu_ptr( + GetOutputDataPtr(origin_otensors_[i], !disable_mlu_cast)); + graph_out.push_back( + std::make_shared(tmp)); + } + shape_tensor_map_out_[all_inputs_shape_] = graph_out; + } + graph->Compute(exec_queue, graph_in, graph_out); + } else { + for (size_t i = 0; i < origin_itensors_.size(); ++i) { + graph_input->at(i)->set_mlu_ptr( + const_cast(origin_itensors_[i]->raw_data())); + } + for (size_t i = 0; i < origin_otensors_.size(); ++i) { + origin_otensors_[i]->Resize(graph_output->at(i)->get_origin_shape()); + graph_output->at(i)->set_mlu_ptr( + GetOutputDataPtr(origin_otensors_[i], !disable_mlu_cast)); + } + // only cnmlComputeFusionOpForward_V3 need cnrtInvokeFuncParam_t + cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param(); + int data_param = 1; + forward_param.data_parallelism = &data_param; + u32_t affinity = mlu_context.affinity(); + forward_param.affinity = &affinity; + forward_param.end = CNRT_PARAM_END; + graph->Compute(forward_param, exec_queue); + +#ifdef MLU_DUMP_SUBGRAPH_IO + // Graph node store compile-time tensor while batchsize mutable is set. 
+ // Only batchsize mutable is disabled, data exists in graph node at + // runtime + // =========== DUMP =================== + for (auto input_name : input_names_) { + auto input_tensor = + shape_graph_map_[inputs_shape_]->GetNode(input_name); + auto dump_name = input_name; + while (dump_name.find("/") != std::string::npos) { + dump_name = dump_name.replace(dump_name.find("/"), 1, "_"); + } + VLOG(6) << "dump_name: " << dump_name; + input_tensor->ToFile(dump_name); + } + for (auto output_name : output_names_) { + if (shape_graph_map_[inputs_shape_]->HasNode(output_name)) { + auto output_tensor = + shape_graph_map_[inputs_shape_]->GetNode(output_name); + auto dump_name = output_name; + while (dump_name.find("/") != std::string::npos) { + dump_name = dump_name.replace(dump_name.find("/"), 1, "_"); + } + VLOG(6) << "dump_name: " << dump_name; + output_tensor->ToFile(dump_name); + } else { + VLOG(6) << "graph does not have " << output_name << " as output" + << std::endl; + } + } +#endif + // =========== DUMP END ================ + } + + return true; } - paddle::lite::subgraph::mlu::Graph graph_; + paddle::lite_api::PrecisionType fp_type_; + std::vector> inputs_shape_{}; + std::vector> all_inputs_shape_{}; + std::map>, + std::shared_ptr> + shape_graph_map_{}; + // enable batch size changeable by default, this cound be changed by + // environment variable PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE and + // whether the op can be compiled with batch size changeable way + bool disable_batch_size_changeable_{false}; + bool error_compile_batch_size_changeable_{false}; + std::vector unsupport_batch_size_changeable_op_type_{"concat"}; + // search output runtime MLUTensor for certain output shape when enable + // BATCH_SIZE_CHANGEABLE + std::map>, + std::vector>> + shape_tensor_map_out_{}; + // search input runtime MLUTensor for certain input shape when enable + // BATCH_SIZE_CHANGEABLE + std::map>, + std::vector>> + shape_tensor_map_in_{}; + // search output shape for certain input shape when enable + // BATCH_SIZE_CHANGEABLE + std::map>, std::vector>> + in_out_shape_map_{}; }; template @@ -174,12 +481,11 @@ class SubgraphCompute param.scope, this->precision())); CHECK(engine_); - engine_->Build(); } void Run() override { CHECK(engine_); - engine_->Launch(); + engine_->Run(); } virtual ~SubgraphCompute() = default; diff --git a/lite/kernels/npu/bridges/engine.cc b/lite/kernels/npu/bridges/engine.cc index 8ca8357710e1f36a7c3f21417d7633e47f18c59a..884ab1acce8f0927def660ae35941d85b4c85901 100644 --- a/lite/kernels/npu/bridges/engine.cc +++ b/lite/kernels/npu/bridges/engine.cc @@ -15,6 +15,7 @@ #include "lite/kernels/npu/bridges/engine.h" #include #include +#include #include #include "lite/kernels/npu/bridges/registry.h" @@ -22,11 +23,50 @@ namespace paddle { namespace lite { namespace subgraph { -int Engine::BuildDeviceProgram() { return FAILED; } +Engine::Engine(KernelContext *ctx, + int block_idx, + cpp::BlockDesc *block_desc, + const std::vector &input_names, + const std::vector &output_names, + lite::Scope *scope) + : ctx_(ctx), block_idx_(block_idx), block_desc_(block_desc), scope_(scope) { + input_names_ = input_names; + output_names_ = output_names; + // Sort the name of input and output tensors, it's convenient for us to get + // the info of input and output tensors in the same order from the device + // program, because the result of subgraph division may be different but right + // at each call of the subgraph pass. 
+ std::stable_sort(input_names_.begin(), input_names_.end()); + std::stable_sort(output_names_.begin(), output_names_.end()); +} -int Engine::LaunchDeviceProgram() { return 0; } +bool Engine::Run() { + if (is_first_epoch_) { + PrepareWorkspaceForDeviceProgram(); + is_first_epoch_ = false; + } + if (InputShapeChanged()) { + BuildDeviceProgram(); + } + return LaunchDeviceProgram(); +} -int Engine::BuildOriginProgram() { +bool Engine::PrepareWorkspaceForOriginProgram() { + origin_idims_.resize(input_names_.size()); + origin_itensors_.resize(input_names_.size()); + for (int i = 0; i < input_names_.size(); i++) { + origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]); + CHECK(origin_itensors_[i]); + } + origin_otensors_.resize(output_names_.size()); + for (int i = 0; i < output_names_.size(); i++) { + origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]); + CHECK(origin_otensors_[i]); + } + return true; +} + +bool Engine::BuildOriginProgram() { // TODO(hong19860320) The block_desc needs to be divided into subgraphs during // the execution time. For now it is treated as a single subgraph. origin_program_.clear(); @@ -34,11 +74,14 @@ int Engine::BuildOriginProgram() { auto op_desc = block_desc_->GetOp(op_idx); CHECK(op_desc); std::string op_type = op_desc->Type(); + // Create op and pick up the best kernel auto op = LiteOpRegistry::Global().Create(op_desc->Type()); + CHECK(op) << "no Op found for " << op_type; op->Attach(*op_desc, scope_); std::unique_ptr picked_kernel; if (op_desc->HasAttr(kKernelTypeAttr)) { - // Create op and pick up kernel according to the kKernelTypeAttr attribute + // Create op and pick up the best kernel according to the + // kKernelTypeAttr attribute auto kernel_type = op_desc->GetAttr(kKernelTypeAttr); std::string alias; Place place; @@ -48,12 +91,14 @@ int Engine::BuildOriginProgram() { auto kernels = op->CreateKernels({place}); CHECK_GT(kernels.size(), 0u) << "No kernels found for " << op_type; auto it = std::find_if( - kernels.begin(), kernels.end(), [&](std::unique_ptr& it) { + kernels.begin(), kernels.end(), [&](std::unique_ptr &it) { return it->alias() == alias; }); CHECK(it != kernels.end()); picked_kernel = std::move(*it); } else { + // TODO(hong19860320) add kernel picking according to the type of input + // and output tensors VLOG(3) << "The attr '" << kKernelTypeAttr << "' not found, pick the first kernel for " << op_type; std::vector> kernels; @@ -74,52 +119,41 @@ int Engine::BuildOriginProgram() { } origin_program_.emplace_back(std::move(op), std::move(picked_kernel)); } - return 0; + CHECK(!origin_program_.empty()) << "no instructions"; + return true; } -int Engine::LaunchOriginProgram() { - for (auto& inst : origin_program_) { - auto op_type = inst.op()->op_info()->Type(); - if (op_type == "feed" || op_type == "fetch") continue; - inst.Run(); +bool Engine::LaunchOriginProgram() { + if (origin_program_.empty()) { + BuildOriginProgram(); + } + if (!origin_program_.empty()) { + for (auto &inst : origin_program_) { + auto op_type = inst.op()->op_info()->Type(); + if (op_type == "feed" || op_type == "fetch") continue; + inst.Run(); + } + return true; } - return 0; + return false; } -int Engine::Build() { - // In order to attach all of the ops of the block desc, we need to build the - // original program firstly. 
- BuildOriginProgram(); - // Run InferShape() of all of ops, and convert Paddle ops to NPU/XPU IR graph - build_device_program_status_ = BuildDeviceProgram(); - return build_device_program_status_; +bool Engine::PrepareWorkspaceForDeviceProgram() { + return PrepareWorkspaceForOriginProgram(); } -void Engine::InitDeviceTensor() { return; } +bool Engine::BuildDeviceProgram() { return BuildOriginProgram(); } + +bool Engine::LaunchDeviceProgram() { return LaunchOriginProgram(); } bool Engine::InputShapeChanged() { + bool changed = false; for (size_t i = 0; i < origin_itensors_.size(); i++) { - if (origin_itensors_[i]->dims() != origin_idims_[i]) { - return true; - } - } - return false; -} - -int Engine::Launch() { - // Rebuild device program when the shapes of input tensors have been changed. - if (CHECK_SUCCESS(build_device_program_status_) && - CHECK_REBUILD_WHEN_SHAPE_CHANGED(build_device_program_status_) && - InputShapeChanged()) { - Build(); - InitDeviceTensor(); - } - if (CHECK_FAILED(build_device_program_status_)) { - LaunchOriginProgram(); - } else { - LaunchDeviceProgram(); + auto origin_idim = origin_itensors_[i]->dims().Vectorize(); + changed |= origin_idim != origin_idims_[i]; + origin_idims_[i] = origin_idim; } - return 0; + return changed; } } // namespace subgraph diff --git a/lite/kernels/npu/bridges/engine.h b/lite/kernels/npu/bridges/engine.h index 6a3f72077a9bed7a296b184330af119262472ada..b49b8fea5a6d39610ea7398e177e7d1ec5a35f92 100644 --- a/lite/kernels/npu/bridges/engine.h +++ b/lite/kernels/npu/bridges/engine.h @@ -33,49 +33,36 @@ class Engine { cpp::BlockDesc *block_desc, const std::vector &input_names, const std::vector &output_names, - lite::Scope *scope, - std::string model_cache_dir = "") - : ctx_(ctx), - block_idx_(block_idx), - block_desc_(block_desc), - input_names_(input_names), - output_names_(output_names), - scope_(scope), - model_cache_dir_(model_cache_dir) {} + lite::Scope *scope); virtual ~Engine() = default; - virtual int Build(); - virtual int Launch(); + virtual bool Run(); private: Engine(const Engine &) = delete; protected: - virtual int BuildDeviceProgram(); - virtual int LaunchDeviceProgram(); + virtual bool PrepareWorkspaceForOriginProgram(); + virtual bool BuildOriginProgram(); + virtual bool LaunchOriginProgram(); - virtual int BuildOriginProgram(); - virtual int LaunchOriginProgram(); + virtual bool PrepareWorkspaceForDeviceProgram(); + virtual bool BuildDeviceProgram(); + virtual bool LaunchDeviceProgram(); - virtual void InitDeviceTensor(); virtual bool InputShapeChanged(); KernelContext *ctx_{nullptr}; - int block_idx_; - cpp::BlockDesc *block_desc_; + int block_idx_{-1}; + cpp::BlockDesc *block_desc_{nullptr}; std::vector input_names_; std::vector output_names_; Scope *scope_{nullptr}; - // SUCCESS: device program build successed. FAILED: device program build - // failed. REBUILD_WHEN_SHAPE_CHANGED: device program build successed but need - // to rebuild when input shape changed. 
- int build_device_program_status_{0}; - std::vector origin_idims_; - std::vector origin_odims_; + bool is_first_epoch_{true}; + std::vector> origin_idims_; std::vector origin_itensors_; std::vector origin_otensors_; std::vector origin_program_; - std::string model_cache_dir_{""}; }; } // namespace subgraph diff --git a/lite/kernels/npu/bridges/graph.h b/lite/kernels/npu/bridges/graph.h index 38b03e06fa212728888cf47b3048d71fd4de06fc..1bc588496a253aa82183e020adc39989ad8d7312 100644 --- a/lite/kernels/npu/bridges/graph.h +++ b/lite/kernels/npu/bridges/graph.h @@ -19,7 +19,7 @@ #include #include #include -#include "graph/op/all_ops.h" +#include "graph/compatible/all_ops.h" #include "lite/core/op_lite.h" #include "lite/core/tensor.h" diff --git a/lite/kernels/npu/bridges/matmul_op.cc b/lite/kernels/npu/bridges/matmul_op.cc index 32af1916899454ef7a045339da5e9fc8a6131cfc..79ba82d94f24f61c2b9f51bd29634151bfcfa0ab 100644 --- a/lite/kernels/npu/bridges/matmul_op.cc +++ b/lite/kernels/npu/bridges/matmul_op.cc @@ -94,10 +94,10 @@ int MatMulConverter(void* ctx, OpLite* op, KernelBase* kernel) { } else { matmul_node = graph->Add(out_name); auto matmul_op = matmul_node->data(); - matmul_op->set_input_x(*x_node->data()); - matmul_op->set_input_y(*y_node->data()); - matmul_op->set_attr_adj_x(transpose_x); - matmul_op->set_attr_adj_y(transpose_y); + matmul_op->set_input_x1(*x_node->data()); + matmul_op->set_input_x2(*y_node->data()); + matmul_op->set_attr_adj_x1(transpose_x); + matmul_op->set_attr_adj_x2(transpose_y); } if (fabs(alpha - 1.f) > 1e-6f) { diff --git a/lite/kernels/npu/bridges/utility.h b/lite/kernels/npu/bridges/utility.h index 107d90c116b8239a9060f252c45c2b2d7901ddf7..6e75e58187909ad59da37dbcb0737a92ec014e22 100644 --- a/lite/kernels/npu/bridges/utility.h +++ b/lite/kernels/npu/bridges/utility.h @@ -20,11 +20,11 @@ #include #include #include "graph/buffer.h" +#include "graph/compatible/operator_reg.h" #include "graph/graph.h" #include "graph/model.h" #include "graph/op/all_ops.h" #include "graph/operator.h" -#include "graph/operator_reg.h" #include "lite/core/op_lite.h" #include "lite/utils/macros.h" @@ -97,25 +97,26 @@ REG_OP(Pad) /* * Multiplies slices of two tensors in batches. * - * x : The input tensor - * y : The input tensor + * x1 : The input tensor + * x2 : The input tensor * - * z : The output tensor + * y : The output tensor * - * adj_x : adj_x is true, the input tensor x is transposed, otherwise - * it will not be transposed. Default is false (The current version only - * supports false). - * adj_y : adj_y is true, the input tensor y is transposed, otherwise - * it will not be transposed. Default is false. + * adj_x1 : adj_x1 is true, the input tensor x1 is transposed, + * otherwise it will not be transposed. + * Default is false (The current version only supports false). + * adj_x2 : adj_x2 is true, the input tensor x2 is transposed, + * otherwise it will not be transposed. + * Default is false. 
* - * 100.320.010.010 + * 100.320.010.010 */ REG_OP(BatchMatMul) - .INPUT(x, TensorType({DT_FLOAT})) - .INPUT(y, TensorType({DT_FLOAT})) - .OUTPUT(z, TensorType({DT_FLOAT})) - .ATTR(adj_x, AttrValue::BOOL{false}) - .ATTR(adj_y, AttrValue::BOOL{false}) + .INPUT(x1, TensorType({DT_FLOAT})) + .INPUT(x2, TensorType({DT_FLOAT})) + .OUTPUT(y, TensorType({DT_FLOAT})) + .ATTR(adj_x1, AttrValue::BOOL{false}) + .ATTR(adj_x2, AttrValue::BOOL{false}) .OP_END() } // namespace ge diff --git a/lite/kernels/npu/subgraph_compute.cc b/lite/kernels/npu/subgraph_compute.cc index f17d73f8dfd540c8a1b809d780084b05299ccc2f..6afb445e0ed411251d203bcb0420b0fba8ab6beb 100644 --- a/lite/kernels/npu/subgraph_compute.cc +++ b/lite/kernels/npu/subgraph_compute.cc @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "hiai_ir_build.h" // NOLINT #include "lite/backends/npu/device.h" @@ -24,205 +25,275 @@ #include "lite/kernels/npu/bridges/paddle_use_bridges.h" #include "lite/kernels/npu/bridges/utility.h" #include "lite/utils/io.h" +#include "lite/utils/md5.h" namespace paddle { namespace lite { namespace kernels { namespace npu { -std::string SubgraphEngine::GenerateModelCacheName() const { - auto inames = device_inames_; - auto onames = device_onames_; - std::stable_sort(inames.begin(), inames.end()); - - std::string model_cache_name = "subgraph_" + std::to_string(block_idx_); - for (auto iname : inames) { - model_cache_name += "_"; - auto itensor = scope_->FindTensor(iname); - int tmp = 0; - for (auto i : itensor->dims().Vectorize()) { - tmp += i * i; +// Generate the model name by using md5 hashes based on: +// 1. the sorted variable input names +// 2. the shapes of the origin input tensors +// 3. the sorted variable output names +std::string DeviceProgram::GenerateModelName( + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims) { + std::ostringstream os; + CHECK_EQ(input_names.size(), origin_idims.size()); + for (int i = 0; i < input_names.size(); i++) { + os << input_names[i]; + for (auto dim : origin_idims[i]) { + os << dim; } - model_cache_name += std::to_string(tmp % 1999); } - model_cache_name += "_.om"; + for (auto output_name : output_names) { + os << output_name; + } + return MD5(os.str()); +} - return model_cache_name; +// Deserialize the generated model, the precisions and dimensions of the origin +// output tensors of the subgraph op into files +bool DeviceProgram::LoadFromCacheFile( + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::string& model_cache_dir) { + // Generate the model name if not initialized + if (model_name_.empty()) { + model_name_ = GenerateModelName(input_names, output_names, origin_idims); + } + // Load from the cached model file, return a HiAI model manager client for + // inference + auto model_path = model_cache_dir + "/" + model_name_ + ".om"; + VLOG(3) << "[NPU] Load model from " << model_path; + std::vector model_buffer; + if (!ReadFile(model_path, &model_buffer)) { + LOG(WARNING) << "[NPU] read from " << model_path << " failed!"; + return false; + } + bool model_comp = false; + model_client_ = + lite::npu::Device::Global().Load(model_name_, &model_buffer, &model_comp); + if (!model_client_) { + LOG(WARNING) << "[NPU] Load model failed!"; + return false; + } + // Rewrite with the compatible model data if the cached + // model file is incompatible with the current device + if (!model_comp) { + VLOG(3) << "[NPU] Export the compatible model to " << 
model_path; + if (!WriteFile(model_path, model_buffer)) { + LOG(WARNING) << "[NPU] Open " << model_path << " for writting failed!"; + } + } + // Deserialize the precisions and shapes of the origin output tensors from the + // cached configuration file + auto config_path = model_cache_dir + "/" + model_name_ + ".cfg"; + VLOG(3) << "[NPU] Load configuration from " << config_path; + std::vector config_buffer; + if (!ReadFile(config_path, &config_buffer)) { + LOG(WARNING) << "[NPU] read from " << config_path << " failed!"; + return false; + } + std::string config_str(config_buffer.begin(), config_buffer.end()); + // Parse the precision and shapes of the output tensors + auto output_options = Split(config_str, ";"); + CHECK_EQ(output_options.size(), output_names.size()); + origin_otypes_.resize(output_names.size()); + origin_odims_.resize(output_names.size()); + for (int i = 0; i < output_names.size(); i++) { + auto items = Split(output_options[i], ":"); + CHECK_EQ(items.size(), 2); // precision and shapes + origin_otypes_[i] = static_cast(std::stoi(items[0])); + origin_odims_[i] = Split(items[1], ","); + } + return true; } -int SubgraphEngine::BuildDeviceProgram() { +bool DeviceProgram::BuildGraphAndCacheToFile( + const std::vector& origin_program, + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::vector& origin_otensors, + const std::string& model_cache_dir) { + // Generate the model name if not initialized + if (model_name_.empty()) { + model_name_ = GenerateModelName(input_names, output_names, origin_idims); + } + // Convert all of ops and their input vars and weights to HiAI IR nodes, + // then added them into the HiAI IR graph int status = 0; - // Convert all of ops and their input vars and weights and added into the NPU - // HiAI IR graph + CHECK(!origin_program.empty()) << "no instructions"; subgraph::npu::Graph graph; const auto& bridges = subgraph::Registry::Instance(); - for (auto& inst : origin_program_) { + for (auto& inst : origin_program) { auto op = const_cast(inst.op()); CHECK(op); op->CheckShape(); op->InferShape(); std::string op_type = op->op_info()->Type(); if (!bridges.Exists(op_type, TARGET(kNPU))) { - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); status |= bridges.Select(op_type, TARGET(kNPU))( reinterpret_cast(&graph), op, const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } - // Collect the valid input and output nodes in the HiAI IR graph and update - // the input and output names - device_inames_.clear(); - device_onames_.clear(); + // Collect the input and output nodes of the HiAI IR graph std::vector device_inodes; + for (size_t i = 0; i < input_names.size(); i++) { + CHECK(graph.Has(input_names[i]) && graph.Get(input_names[i])->is_data()); + device_inodes.push_back(*graph.Get(input_names[i])->data()); + } std::vector device_onodes; - for (auto& input_name : input_names_) { - if (graph.Has(input_name)) { - if (graph.Get(input_name)->is_data()) { - device_inodes.push_back(*graph.Get(input_name)->data()); - device_inames_.push_back(input_name); - } else { - LOG(WARNING) << "[NPU] Input node " << input_name - << " is ignored because it is not a data node."; - } - } else { - LOG(WARNING) << "[NPU] Input node " << input_name - << " is ignored because it does not exist."; - } + for (size_t i = 0; i < output_names.size(); i++) { + CHECK(graph.Has(output_names[i])); + device_onodes.push_back(*graph.Get(output_names[i])->data()); 
} - for (auto& output_name : output_names_) { - if (graph.Has(output_name)) { - device_onodes.push_back(*graph.Get(output_name)->data()); - device_onames_.push_back(output_name); - } else { - LOG(WARNING) << "[NPU] Output node " << output_name - << " is ignored because it does not exist."; - } + // Build the HiAI IR graph to the HiAI om model + std::vector model_buffer; + if (!lite::npu::Device::Global().Build( + device_inodes, device_onodes, &model_buffer)) { + LOG(WARNING) << "[NPU] Build model failed!"; + return false; } - CHECK(!device_inames_.empty()) - << "[NPU] No input nodes found for building NPU model"; - CHECK(!device_onames_.empty()) - << "[NPU] No output nodes found for building NPU model"; - - // Build the HiAI IR graph to HiAI om model as the device program - if (device_program_map_.count(inputs_shape_) > 0) { - return status; + // Load the HiAI om model and create a HiAI model manager client (from HiAI + // Service) to run inference. + bool model_comp = true; + model_client_ = + lite::npu::Device::Global().Load(model_name_, &model_buffer, &model_comp); + if (!model_client_) { + LOG(WARNING) << "[NPU] Load model failed!"; + return false; } - std::string model_cache_full_dir = - model_cache_dir_.empty() ? "" : model_cache_dir_ + "/" + - GenerateModelCacheName(); - auto device_client = lite::npu::Device::Global().Build( - model_name_, device_inodes, device_onodes, model_cache_full_dir); - if (device_client == nullptr) { - LOG(WARNING) << "[NPU] Build model failed!"; - return subgraph::FAILED; + // Update the precision and dimensions of the origin output tensors + CHECK_EQ(origin_otensors.size(), output_names.size()); + origin_otypes_.resize(output_names.size()); + origin_odims_.resize(output_names.size()); + for (size_t i = 0; i < output_names.size(); i++) { + origin_otypes_[i] = graph.Get(output_names[i])->precision(); + origin_odims_[i] = origin_otensors[i]->dims().Vectorize(); } - auto device_program = std::make_shared(device_client); - if (!inputs_shape_.empty()) { - device_program_map_[inputs_shape_] = device_program; + if (!model_cache_dir.empty()) { + // Save the generated model to file, used for the model caching or the + // offline model generation + auto model_path = model_cache_dir + "/" + model_name_ + ".om"; + VLOG(3) << "[NPU] Save model to " << model_path; + if (!WriteFile(model_path, model_buffer)) { + LOG(WARNING) << "[NPU] Open " << model_path << " for writing failed!"; + } + // Serialize the precisions and shapes of the origin output tensors into the + // configuration file + std::ostringstream os; + for (int i = 0; i < output_names.size(); i++) { + os << static_cast(origin_otypes_[i]) << ":"; + for (auto dim : origin_odims_[i]) { + os << dim << ","; + } + os << ";"; + } + auto str = os.str(); + std::vector config_buffer(str.begin(), str.end()); + auto config_path = model_cache_dir + "/" + model_name_ + ".cfg"; + VLOG(3) << "[NPU] Save configuration to " << config_path; + if (!WriteFile(config_path, config_buffer)) { + LOG(WARNING) << "[NPU] Open " << config_path << " for writing failed!"; + } } + return true; +} - // Query and check the dimensions of valid input and output tensors - std::vector device_idims, device_odims; - if (device_program->client->GetModelIOTensorDim( - model_name_, device_idims, device_odims) != hiai::AI_SUCCESS) { - LOG(WARNING) - << "[NPU] Get the dimensions of input and output tensors failed!"; - return subgraph::FAILED; +bool DeviceProgram::ShareBufferWithOriginTensors( + const std::vector& input_names, + const std::vector& 
output_names, + std::vector* origin_itensors, + std::vector* origin_otensors, + std::vector>* device_itensors, + std::vector>* device_otensors) { + CHECK(!model_name_.empty() && model_client_); + // Query the dimensions of the device input and output tensors if not + // initialized + if (device_idims_.empty() || device_odims_.empty()) { + if (model_client_->GetModelIOTensorDim( + model_name_, device_idims_, device_odims_) != hiai::AI_SUCCESS) { + LOG(WARNING) + << "[NPU] Get the dimensions of input and output tensors failed!"; + return false; + } } - device_program->device_idims = device_idims; - device_program->device_odims = device_odims; + // Check the dimensions of the device tensors and the origin tensors + CHECK_EQ(device_itensors->size(), input_names.size()); + CHECK_EQ(device_otensors->size(), output_names.size()); + CHECK_EQ(origin_otypes_.size(), output_names.size()); + CHECK_EQ(origin_odims_.size(), output_names.size()); + CHECK_EQ(device_idims_.size(), input_names.size()); + CHECK_EQ(device_odims_.size(), output_names.size()); + for (int i = 0; i < input_names.size(); i++) { + VLOG(3) << "[NPU] Inputs[" << i << "] name: " << input_names[i] + << " origin dims:" << (*origin_itensors)[i]->dims().repr() + << " device dims: {" << device_idims_[i].GetNumber() << "," + << device_idims_[i].GetChannel() << "," + << device_idims_[i].GetHeight() << "," + << device_idims_[i].GetWidth() << "}"; + CHECK_EQ((*origin_itensors)[i]->dims().production(), + device_idims_[i].GetNumber() * device_idims_[i].GetChannel() * + device_idims_[i].GetHeight() * device_idims_[i].GetWidth()); + VLOG(3) << "[NPU] Init the input tensors for the device program and share " + "their buffers with the origin input tensors"; + // reinit device tensor will free shared buffer, so copy data to a tmp + // tensor + Tensor tmp; + tmp.CopyDataFrom(*(*origin_itensors)[i]); + (*device_itensors)[i]->Init(&(device_idims_[i])); - CHECK_EQ(device_idims.size(), device_inames_.size()); - CHECK_EQ(device_odims.size(), device_onames_.size()); - origin_idims_.resize(device_inames_.size()); - origin_itensors_.resize(device_inames_.size()); - device_itensors_.resize(device_inames_.size()); - origin_odims_.resize(device_onames_.size()); - origin_otensors_.resize(device_onames_.size()); - device_otensors_.resize(device_onames_.size()); + std::memcpy( + (*device_itensors)[i]->GetBuffer(), tmp.raw_data(), tmp.memory_size()); - for (int i = 0; i < device_inames_.size(); i++) { - auto node = graph.Get(device_inames_[i]); - auto precision = node->precision(); - auto layout = node->layout(); - origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]); - CHECK(origin_itensors_[i]); - origin_idims_[i] = origin_itensors_[i]->dims(); - VLOG(3) << "[NPU] Inputs[" << i << "] name: " << device_inames_[i] - << " precision: " << PrecisionToStr(precision) - << " layout: " << DataLayoutToStr(layout) << " dims: {" - << device_idims[i].GetNumber() << "," - << device_idims[i].GetChannel() << "," - << device_idims[i].GetHeight() << "," << device_idims[i].GetWidth() - << "}"; - // Prepare the device input tensors - CHECK_EQ(origin_idims_[i].production(), - device_idims[i].GetNumber() * device_idims[i].GetChannel() * - device_idims[i].GetHeight() * device_idims[i].GetWidth()); - device_itensors_[i].reset(new hiai::AiTensor); - device_itensors_[i]->Init(&(device_idims[i])); + // Share data buf between device_itensor and origin_itensor + std::shared_ptr buffer = + std::make_shared((*device_itensors)[i]->GetBuffer(), + lite_api::TargetType::kHost, + 
(*device_itensors)[i]->GetSize()); + (*origin_itensors)[i]->ResetBuffer(buffer, + (*device_itensors)[i]->GetSize()); } - device_program->origin_idims = origin_idims_; - - for (int i = 0; i < device_onames_.size(); i++) { - auto node = graph.Get(device_onames_[i]); - auto precision = node->precision(); - auto layout = node->layout(); - origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]); - CHECK(origin_otensors_[i]); - origin_odims_[i] = origin_otensors_[i]->dims(); - VLOG(3) << "[NPU] Outputs[" << i << "] name: " << device_onames_[i] - << " precision: " << PrecisionToStr(precision) - << " layout: " << DataLayoutToStr(layout) << " dims: {" - << device_odims[i].GetNumber() << "," - << device_odims[i].GetChannel() << "," - << device_odims[i].GetHeight() << "," << device_odims[i].GetWidth() - << "}"; - // Prepare the device output tensors - switch (precision) { - case PRECISION(kFloat): - origin_otensors_[i]->mutable_data(); - break; - case PRECISION(kBool): - origin_otensors_[i]->mutable_data(); - break; - case PRECISION(kInt8): - origin_otensors_[i]->mutable_data(); - break; - case PRECISION(kInt16): - origin_otensors_[i]->mutable_data(); - break; - case PRECISION(kInt32): - origin_otensors_[i]->mutable_data(); - break; - case PRECISION(kInt64): - origin_otensors_[i]->mutable_data(); - break; - default: - LOG(FATAL) << "[NPU] " << device_onames_[i] - << " can't mutable data with precision type " - << PrecisionToStr(precision); - break; - } - device_program->origin_odims = origin_odims_; - - CHECK_EQ(origin_odims_[i].production(), - device_odims[i].GetNumber() * device_odims[i].GetChannel() * - device_odims[i].GetHeight() * device_odims[i].GetWidth()); - device_otensors_[i].reset(new hiai::AiTensor); - device_otensors_[i]->Init(&(device_odims[i])); + for (int i = 0; i < output_names.size(); i++) { + (*origin_otensors)[i]->set_precision(origin_otypes_[i]); + (*origin_otensors)[i]->Resize(origin_odims_[i]); + VLOG(3) << "[NPU] Outputs[" << i << "] name: " << output_names[i] + << " origin dims:" << (*origin_otensors)[i]->dims().repr() + << " device dims: {" << device_odims_[i].GetNumber() << "," + << device_odims_[i].GetChannel() << "," + << device_odims_[i].GetHeight() << "," + << device_odims_[i].GetWidth() << "}"; + CHECK_EQ((*origin_otensors)[i]->dims().production(), + device_odims_[i].GetNumber() * device_odims_[i].GetChannel() * + device_odims_[i].GetHeight() * device_odims_[i].GetWidth()); + (*device_otensors)[i]->Init(&(device_odims_[i])); + VLOG(3) << "[NPU] Init the output tensors for the device program and share " + "their buffers with the origin output tensors"; + // Share data buf between device_itensor and origin_itensor + std::shared_ptr buffer = + std::make_shared((*device_otensors)[i]->GetBuffer(), + lite_api::TargetType::kHost, + (*device_otensors)[i]->GetSize()); + (*origin_otensors)[i]->ResetBuffer(buffer, + (*device_otensors)[i]->GetSize()); } - return status; + return true; } -int SubgraphEngine::LaunchDeviceProgram() { - // Copy the data of origin input tensors to the buffer of input HiAI tensors - // init device_itensors_, device_otensors_, origin_otensors_ - auto device_program = device_program_map_[inputs_shape_]; - +bool DeviceProgram::ZeroCopyRun( + std::vector>* device_itensors, + std::vector>* device_otensors) { + CHECK(!model_name_.empty() && model_client_); // Run the HiAI model by name std::string key = "model_name"; // Note: key seems must be model_name hiai::AiContext model_context; @@ -234,70 +305,87 @@ int SubgraphEngine::LaunchDeviceProgram() 
{ }; int istamp; auto start_time = GetCurrentUS(); - CHECK_EQ(device_program->client->Process( - model_context, device_itensors_, device_otensors_, 1000, istamp), + CHECK_EQ(model_client_->Process( + model_context, *device_itensors, *device_otensors, 1000, istamp), hiai::AI_SUCCESS); VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us"; - - return 0; + return true; } -int SubgraphEngine::Build() { - if (device_program_map_.count(inputs_shape_) > 0) { - return subgraph::SUCCESS; +bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() { + // Obtain the origin input tensors, and create the origin output + // tensors(Don't try to access them before launch the device program or the + // origin program) + PrepareWorkspaceForOriginProgram(); + // Create the device input and output tensors, but don't initialize them + // with the dimensions + device_itensors_.resize(input_names_.size()); + for (int i = 0; i < input_names_.size(); i++) { + device_itensors_[i].reset(new hiai::AiTensor); + CHECK(device_itensors_[i]); + } + device_otensors_.resize(output_names_.size()); + for (int i = 0; i < output_names_.size(); i++) { + device_otensors_[i].reset(new hiai::AiTensor); + CHECK(device_otensors_[i]); } - // In order to attach all of the ops of the block desc, we need to build the - // original program firstly. - BuildOriginProgram(); - // Run InferShape() of all of ops, and convert Paddle ops to NPU/XPU IR graph - build_device_program_status_ = BuildDeviceProgram(); - return build_device_program_status_; + return true; } -void SubgraphEngine::InitDeviceTensor() { - auto device_program = device_program_map_[inputs_shape_]; - for (size_t i = 0; i < device_itensors_.size(); i++) { - if (device_itensors_[i]->GetBuffer() != origin_itensors_[i]->raw_data()) { - VLOG(3) << "init device_itensors and share input tensor buf between " - "device and host"; - device_itensors_[i]->Init(&(device_program->device_idims[i])); - std::memcpy(device_itensors_[i]->GetBuffer(), - origin_itensors_[i]->raw_data(), - origin_itensors_[i]->memory_size()); - // share data buf between device_itensor and origin_itensor - std::shared_ptr buffer = - std::make_shared(device_itensors_[i]->GetBuffer(), - lite_api::TargetType::kHost, - device_itensors_[i]->GetSize()); - origin_itensors_[i]->ResetBuffer(buffer, device_itensors_[i]->GetSize()); +bool SubgraphEngine::BuildDeviceProgram() { + // Check if the cache device program exists + if (!device_programs_.count(origin_idims_)) { + auto device_program = std::make_shared(); + // Obtain the model cache dir from the NPU Context of the subgraph op + auto model_cache_dir = ctx_->As().SubgraphModelCacheDir(); + VLOG(3) << "[NPU] Getting subgraph model_cache_dir is: " << model_cache_dir; + // Check and load if the cached model and configuration file exists + if (model_cache_dir.empty() || + !device_program->LoadFromCacheFile( + input_names_, output_names_, origin_idims_, model_cache_dir)) { + // Build the model online, including converting the paddle ops to the HiAI + // IR nodes, building the HiAI IR graph to the om model, then load it as a + // new HiAI model manager client for inference. 
+ if (origin_program_.empty()) { + BuildOriginProgram(); + } + CHECK(!origin_program_.empty()) << "no instructions"; + if (!device_program->BuildGraphAndCacheToFile(origin_program_, + input_names_, + output_names_, + origin_idims_, + origin_otensors_, + model_cache_dir)) { + return false; + } } - } - for (size_t i = 0; i < device_otensors_.size(); i++) { - if (device_otensors_[i]->GetBuffer() != origin_otensors_[i]->raw_data()) { - VLOG(3) << "init device_otensors and share output tensor buf between " - "device and host"; - device_otensors_[i]->Init(&(device_program->device_odims[i])); - // share data buf between device_itensor and origin_itensor - origin_otensors_[i]->Resize(device_program->origin_odims[i]); - std::shared_ptr buffer = - std::make_shared(device_otensors_[i]->GetBuffer(), - lite_api::TargetType::kHost, - device_otensors_[i]->GetSize()); - origin_otensors_[i]->ResetBuffer(buffer, device_otensors_[i]->GetSize()); + if (device_program->model_client_ == nullptr) { + return false; } + device_programs_[origin_idims_] = device_program; } + auto device_program = device_programs_[origin_idims_]; + CHECK(device_program && device_program->model_client_); + return device_program->ShareBufferWithOriginTensors(input_names_, + output_names_, + &origin_itensors_, + &origin_otensors_, + &device_itensors_, + &device_otensors_); } -bool SubgraphEngine::InputShapeChanged() { - std::vector> new_shape; - for (auto origin_itensor : origin_itensors_) { - new_shape.push_back(origin_itensor->dims().Vectorize()); +bool SubgraphEngine::LaunchDeviceProgram() { + // Roll back to launch the origin program if the device program can't be + // found or the model client isn't initialized. + if (device_programs_.count(origin_idims_) == 0 || + device_programs_[origin_idims_]->model_client_ == nullptr) { + return LaunchOriginProgram(); } - if (inputs_shape_ == new_shape) { - return false; + auto device_program = device_programs_[origin_idims_]; + if (!device_program->model_client_) { + return LaunchOriginProgram(); } - inputs_shape_ = new_shape; - return true; + return device_program->ZeroCopyRun(&device_itensors_, &device_otensors_); } void SubgraphCompute::PrepareForRun() { @@ -307,15 +395,13 @@ void SubgraphCompute::PrepareForRun() { param.sub_block_desc, param.input_data_names, param.output_data_names, - param.scope, - NPUContext::SubgraphModelCacheDir())); + param.scope)); CHECK(engine_); - engine_->Build(); } void SubgraphCompute::Run() { CHECK(engine_); - engine_->Launch(); + engine_->Run(); } } // namespace npu diff --git a/lite/kernels/npu/subgraph_compute.h b/lite/kernels/npu/subgraph_compute.h index 9f0b5a944137dbf9a521235b80398feca1cd82b0..33321a7789fbc1eee5ff759dcf682d8e875ffe96 100644 --- a/lite/kernels/npu/subgraph_compute.h +++ b/lite/kernels/npu/subgraph_compute.h @@ -28,52 +28,65 @@ namespace lite { namespace kernels { namespace npu { -class SubgraphEngine : public subgraph::Engine { +class DeviceProgram { public: - SubgraphEngine(KernelContext *ctx, - int block_idx, - cpp::BlockDesc *block_desc, - const std::vector &input_names, - const std::vector &output_names, - Scope *scope, - std::string model_cache_dir = "") - : subgraph::Engine(ctx, - block_idx, - block_desc, - input_names, - output_names, - scope, - model_cache_dir) {} + DeviceProgram() {} + ~DeviceProgram() {} + std::string GenerateModelName( + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims); + bool LoadFromCacheFile(const std::vector& input_names, + const std::vector& 
output_names, + const std::vector>& origin_idims, + const std::string& model_cache_dir); + bool BuildGraphAndCacheToFile( + const std::vector& origin_program, + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::vector& origin_otensors, + const std::string& model_cache_dir); + bool ShareBufferWithOriginTensors( + const std::vector& input_names, + const std::vector& output_names, + std::vector* origin_itensors, + std::vector* origin_otensors, + std::vector>* device_itensors, + std::vector>* device_otensors); + bool ZeroCopyRun( + std::vector>* device_itensors, + std::vector>* device_otensors); - struct device_program_t { - explicit device_program_t(std::shared_ptr _client) - : client(_client) {} - std::shared_ptr client{nullptr}; - std::vector origin_idims{}; - std::vector origin_odims{}; - std::vector device_idims{}; - std::vector device_odims{}; - }; + public: + std::string model_name_{""}; + std::shared_ptr model_client_{nullptr}; + std::vector> origin_odims_; + std::vector origin_otypes_; + std::vector device_idims_{}; + std::vector device_odims_{}; +}; - int Build() override; +class SubgraphEngine : public subgraph::Engine { + public: + SubgraphEngine(KernelContext* ctx, + int block_idx, + cpp::BlockDesc* block_desc, + const std::vector& input_names, + const std::vector& output_names, + Scope* scope) + : subgraph::Engine( + ctx, block_idx, block_desc, input_names, output_names, scope) {} protected: - int BuildDeviceProgram() override; - int LaunchDeviceProgram() override; - - void InitDeviceTensor() override; - bool InputShapeChanged() override; - - std::string GenerateModelCacheName() const; + bool PrepareWorkspaceForDeviceProgram() override; + bool BuildDeviceProgram() override; + bool LaunchDeviceProgram() override; - std::string model_name_{"model.om"}; - std::vector> inputs_shape_{}; - std::map>, std::shared_ptr> - device_program_map_{}; - std::vector device_inames_{}; - std::vector device_onames_{}; std::vector> device_itensors_{}; std::vector> device_otensors_{}; + std::map>, std::shared_ptr> + device_programs_; }; class SubgraphCompute : public KernelLite { diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt index 600d0d22553af9d857d03491aabd2067db8f32ef..81e1a4d7562a9decab2e2daf4001faec7ac2fcee 100644 --- a/lite/kernels/opencl/CMakeLists.txt +++ b/lite/kernels/opencl/CMakeLists.txt @@ -21,6 +21,7 @@ add_kernel(fusion_elementwise_sub_activation_opencl add_kernel(pool_opencl OPENCL basic SRCS pool_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(activation_opencl OPENCL basic SRCS activation_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(reshape_opencl OPENCL basic SRCS reshape_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(transpose_opencl OPENCL basic SRCS transpose_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(conv_opencl OPENCL basic SRCS conv_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(layout_opencl OPENCL basic SRCS layout_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(concat_opencl OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps}) @@ -67,6 +68,9 @@ lite_cc_test(test_scale_image_opencl SRCS scale_image_compute_test.cc lite_cc_test(test_reshape_image_opencl SRCS reshape_image_compute_test.cc DEPS reshape_opencl op_registry program context) +lite_cc_test(test_transpose_image_opencl SRCS transpose_image_compute_test.cc + DEPS transpose_opencl layout_opencl op_registry program context) + lite_cc_test(test_concat_image_opencl SRCS 
concat_image_compute_test.cc DEPS concat_opencl layout_opencl op_registry program context) diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index fed8171cc273b437be411225363bf4a732769ae3..083f72134eba8afc7db696f68d64098b9c59a0f9 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ b/lite/kernels/opencl/conv_image_compute.cc @@ -28,91 +28,83 @@ namespace paddle { namespace lite { namespace kernels { namespace opencl { -/* image kernel*/ + void ConvImageCompute::PrepareForRun() { - const auto& param = this->Param<param_t>(); - auto x_dims = param.x->dims(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); + ReInitWhenNeeded(); + + auto filter_dims = conv_param_->filter->dims(); + filter_tensor_n_ = filter_dims[0]; + filter_tensor_c_ = filter_dims[1]; + filter_tensor_h_ = filter_dims[2]; + filter_tensor_w_ = filter_dims[3]; - float* filter_cpu = param.filter->mutable_data<float>(); auto& context = ctx_->As<OpenCLContext>(); CHECK(context.cl_context() != nullptr); const bool is_mali = context.cl_context()->IsArmMali(); - filter_gpu_image_ = std::unique_ptr<Tensor>(new Tensor); - tensor_hold_filter_image_ = std::unique_ptr<Tensor>(new Tensor); - tensor_hold_bias_image_ = std::unique_ptr<Tensor>(new Tensor); - int bs = x_dims[0]; - int c_in = x_dims[1]; - int h_out = output_dims[2]; - int w_out = output_dims[3]; - int kernel_h = filter_dims[2]; // oihw - int kernel_w = filter_dims[3]; - auto paddings = *param.paddings; - auto dilations = *param.dilations; - int stride_h = param.strides[0]; - int stride_w = param.strides[1]; - int pad_h = paddings[0]; - int pad_w = paddings[2]; - int groups = param.groups; - bool relu_fused = param.fuse_relu; - bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); - bool zero_pad = (pad_h == 0) && (pad_w == 0); - - bool pad_equal = - ((paddings[0] == paddings[1]) && (paddings[1] == paddings[2]) && - (paddings[2] == paddings[3])); - bool stride_equal = stride_h == stride_w; - bool dilation_equal = dilations[0] == dilations[1]; + + auto paddings = *conv_param_->paddings; + pad_up_ = paddings[0]; + pad_down_ = paddings[1]; + pad_left_ = paddings[2]; + pad_right_ = paddings[3]; + + auto dilations = *conv_param_->dilations; + dilation_h_ = dilations[0]; + dilation_w_ = dilations[1]; + + stride_h_ = conv_param_->strides[0]; + stride_w_ = conv_param_->strides[1]; + + groups_ = conv_param_->groups; + relu_fused_ = conv_param_->fuse_relu; + has_bias_ = (conv_param_->bias) != nullptr; + offset_ = filter_tensor_h_ / 2 - pad_up_; + + bool pad_equal = ((pad_left_ == pad_up_) && (pad_up_ == pad_down_) && + (pad_left_ == pad_right_)); + bool stride_equal = stride_h_ == stride_w_; + bool dilation_equal = dilation_h_ == dilation_w_; VLOG(3) << "Is arm mali / " << (is_mali ? "Yes" : "No"); - VLOG(3) << "Is relu fused? / " << (relu_fused ? "Yes" : "No"); - VLOG(3) << "groups:" << groups << " stride_h:" << stride_h - << " stride_w:" << stride_w << " pad_h:" << pad_h - << " pad_w:" << pad_w << " kernel_h:" << kernel_h - << " kernel_h:" << kernel_h; - VLOG(3) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2] - << " " << x_dims[3]; - VLOG(3) << "dialtion:" << dilations[0] << " " << dilations[1]; - VLOG(3) << "output_dims:" << output_dims[0] << " " << output_dims[1] << " " - << output_dims[2] << " " << output_dims[3]; - VLOG(3) << "filter_dims:" << filter_dims[0] << " " << filter_dims[1] << " " - << filter_dims[2] << " " << filter_dims[3]; + VLOG(3) << "Is relu fused? / " << (relu_fused_ ? "Yes" : "No"); + VLOG(3) << "groups:" << groups_ << " stride_h_:" << stride_h_ + << " stride_w_:" << stride_w_ << " pad_left_:" << pad_left_ + << " pad_up_:" << pad_up_ << " filter_tensor_h_:" << filter_tensor_h_ + << " filter_tensor_w_:" << filter_tensor_w_; + VLOG(3) << "input_tensor_nchw:" << input_tensor_n_ << " " << input_tensor_c_ + << " " << input_tensor_h_ << " " << input_tensor_w_; + VLOG(3) << "dilation:" << dilation_h_ << " " << dilation_w_; + VLOG(3) << "output_dims:" << output_tensor_n_ << " " << output_tensor_c_ + << " " << output_tensor_h_ << " " << output_tensor_w_; + VLOG(3) << "filter_dims:" << filter_tensor_n_ << " " << filter_tensor_c_ + << " " << filter_tensor_h_ << " " << filter_tensor_w_; VLOG(3) << "pad_equal:" << pad_equal; VLOG(3) << "stride_equal:" << stride_equal; VLOG(3) << "dilation_equal:" << dilation_equal; - VLOG(3) << "padding :" << paddings[0] << " " << paddings[1] << " " - << paddings[2] << " " << paddings[3]; + VLOG(3) << "padding :" << pad_up_ << " " << pad_down_ << " " << pad_left_ + << " " << pad_right_; CHECK(pad_equal && stride_equal && dilation_equal); + CHECK_GE(conv_param_->dilations->size(), 2); + CHECK(dilation_h_ == dilation_w_); + CHECK_GE(conv_param_->paddings->size(), 2); + CHECK(pad_left_ == pad_up_); + CHECK_GE(conv_param_->strides.size(), 2); + CHECK(stride_h_ == stride_w_); + + if (!is_mali) { + use_tune_ = false; + } - // general gws.. - auto out_image_shape = InitImageDimInfoWith(output_dims); - - const std::vector& default_work_size = - DefaultWorkSize(output_dims, - DDim(std::vector{ - static_cast(out_image_shape["width"]), - static_cast(out_image_shape["height"])})); - - default_c_blk_ = default_work_size[0]; - default_w_blk_ = default_work_size[1]; - default_nh_blk_ = default_work_size[2]; - c_blk_ = default_c_blk_; - w_blk_ = default_w_blk_; - nh_blk_ = default_nh_blk_; - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - - if (kernel_h == 1 && kernel_w == 1) { - // conv2d_1x1 - // if (param.x->dims()[1] % 4 == 0) { - // kernel_func_names_.push_back("conv2d_1x1_simple"); - // } else { - // kernel_func_names_.push_back("conv2d_1x1_opt"); - // } + /********************************************* + * Upload filter, bias to opencl device + *********************************************/ + float* filter_cpu = conv_param_->filter->mutable_data<float>(); + filter_gpu_image_ = std::unique_ptr<Tensor>(new Tensor); + tensor_hold_filter_image_ = std::unique_ptr<Tensor>(new Tensor); + tensor_hold_bias_image_ = std::unique_ptr<Tensor>(new Tensor); - if (param.x->dims()[1] % 4 == 0) { + if (filter_tensor_h_ == 1 && filter_tensor_w_ == 1) { + if (input_tensor_c_ % 4 == 0) { kernel_func_names_.push_back("conv2d_1x1_simple"); } else { kernel_func_names_.push_back("conv2d_1x1_opt"); @@ -121,89 +113,49 @@ void ConvImageCompute::PrepareForRun() { CLImageConverterNWBlock converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - // std::vector filter_image_v(filter_image_dims[0] * - // filter_image_dims[1] * 4); // 4 : - // RGBA - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); - + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data<half_t>(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data<half_t, cl::Image2D>( - filter_image_dims[0], filter_image_dims[1], 
filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d1x1opt; - { - // calc 1x1 gws - w_blk_ = maptofactor(default_w_blk_, 4); - c_blk_ = default_c_blk_; - nh_blk_ = default_nh_blk_; - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } #define DEPTH_CONV_USE_SPL #ifdef DEPTH_CONV_USE_SPL - } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] && - kernel_h == 3 && kernel_w == 3 && groups > 1) { + } else if (filter_tensor_c_ == 1 && input_tensor_c_ == output_tensor_c_ && + filter_tensor_h_ == 3 && filter_tensor_w_ == 3 && groups_ > 1) { // depth_conv2d_3x3s1, depth_conv2d_3x3 - if (stride_h == 1 && dilations[0] == 1) { + if (stride_h_ == 1 && dilation_h_ == 1) { kernel_func_names_.push_back("depth_conv2d_3x3s1"); impl_ = &ConvImageCompute::DepthwiseConv2d3x3s1; - { - // depthwise spl gws s1 - int c_block = (output_dims[1] + 3) / 4; - int w = output_dims[3]; - int nh = output_dims[0] * output_dims[2]; - int w_blk_size = 2; - int w_blk = (w + w_blk_size - 1) / w_blk_size; - - c_blk_ = c_block; - w_blk_ = w_blk; - nh_blk_ = nh; - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } } else { kernel_func_names_.push_back("depth_conv2d_3x3"); impl_ = &ConvImageCompute::DepthwiseConv2d3x3; - { - // depthwise spl gws - int c_block = (output_dims[1] + 3) / 4; - int w = output_dims[3]; - int nh = output_dims[0] * output_dims[2]; - - c_blk_ = c_block; - w_blk_ = w; - nh_blk_ = nh; - - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } } kernel_func_paths_.push_back("image/depthwise_conv2d_kernel.cl"); CLImageConverterNWBlock converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); #endif - } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] + } else if (filter_tensor_c_ == 1 && input_tensor_c_ == output_tensor_c_ #ifdef DEPTH_CONV_USE_SPL && - kernel_h != 3 + filter_tensor_h_ != 3 #endif #undef DEPTH_CONV_USE_SPL ) { @@ -213,75 +165,61 @@ void ConvImageCompute::PrepareForRun() { CLImageConverterNWBlock converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::DepthwiseConv2d; - } else if (kernel_w == 3 && kernel_h == 3) { + } else if (filter_tensor_h_ == 3 && filter_tensor_w_ == 3) { // #define 
CONV3x3OPT_FALL_BACK #ifndef CONV3x3OPT_FALL_BACK // conv2d_3x3 - kernel_func_names_.push_back(bs > 1 ? "conv2d_3x3_multi_batch" - : "conv2d_3x3_opt"); + kernel_func_names_.push_back(input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch" + : "conv2d_3x3_opt"); kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl"); CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d3x3opt; - - { - int w_blk_size = 5; - int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; - - int h_blk_size = 1; - int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; - - c_blk_ = default_c_blk_; - w_blk_ = w_blk; - nh_blk_ = h_blk; - - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } #else kernel_func_names_.push_back("conv2d_3x3"); kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl"); CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d3x3; - #endif #undef CONV3x3OPT_FALL_BACK - } else if (kernel_h == 5 && kernel_w == 5) { + } else if (filter_tensor_h_ == 5 && filter_tensor_w_ == 5) { #define CONV_5x5_OPT #ifndef CONV_5x5_OPT // conv2d_5x5 @@ -290,55 +228,42 @@ void ConvImageCompute::PrepareForRun() { CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d5x5; #else // conv2d_5x5_opt - kernel_func_names_.push_back(bs > 1 ? "conv2d_5x5_multi_batch" - : "conv2d_5x5_opt"); + kernel_func_names_.push_back(input_tensor_n_ > 1 ? 
"conv2d_5x5_multi_batch" + : "conv2d_5x5_opt"); kernel_func_paths_.push_back("image/conv2d_5x5_opt_kernel.cl"); CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d5x5opt; - { - int w_blk_size = 5; - int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; - - int h_blk_size = 1; - int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; - - c_blk_ = default_c_blk_; - w_blk_ = w_blk; - nh_blk_ = h_blk; - - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } #endif #undef CONV_5x5_OPT - } else if (kernel_h == 7 && kernel_w == 7) { + } else if (filter_tensor_h_ == 7 && filter_tensor_w_ == 7) { #define CONV_7x7_OPT #ifndef CONV_7x7_OPT // conv2d_7x7 @@ -347,52 +272,39 @@ void ConvImageCompute::PrepareForRun() { CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d7x7; #else // conv2d_7x7 - kernel_func_names_.push_back(bs > 1 ? "conv2d_7x7_multi_batch" - : "conv2d_7x7_opt"); + kernel_func_names_.push_back(input_tensor_n_ > 1 ? 
"conv2d_7x7_multi_batch" + : "conv2d_7x7_opt"); kernel_func_paths_.push_back("image/conv2d_7x7_opt_kernel.cl"); CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d7x7opt; - { - int w_blk_size = 5; - int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; - - int h_blk_size = 1; - int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; - - c_blk_ = default_c_blk_; - w_blk_ = w_blk; - nh_blk_ = h_blk; - - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } #endif #undef CONV_7x7_OPT } else { @@ -404,30 +316,30 @@ void ConvImageCompute::PrepareForRun() { // build options std::string build_options_single(" -DCL_DTYPE_half"); // relu options - VLOG(3) << "relu_fused:" << relu_fused - << " param.activation_param.active_type:" - << static_cast(param.activation_param.active_type) - << " param.activation_param.has_active:" - << param.activation_param.has_active; - if (param.activation_param.has_active) { - if (param.activation_param.active_type == - lite_api::ActivationType::kRelu) { // Note: judge using `relu_fused` + VLOG(3) << "relu_fused_:" << relu_fused_ + << " conv_param_->activation_param.active_type:" + << static_cast(conv_param_->activation_param.active_type) + << " conv_param_->activation_param.has_active:" + << conv_param_->activation_param.has_active; + if (conv_param_->activation_param.has_active) { + if (conv_param_->activation_param.active_type == + lite_api::ActivationType::kRelu) { // Note: judge using `relu_fused_` // also is ok build_options_single += " -DRELU"; - } else if (param.activation_param.active_type == + } else if (conv_param_->activation_param.active_type == lite_api::ActivationType::kRelu6) { build_options_single += " -DRELU6"; } else { LOG(FATAL) << "Unsupported activation type:" - << static_cast(param.activation_param.active_type); + << static_cast(conv_param_->activation_param.active_type); } } + GetGlobalWorkSize(); // bias options - const bool has_bias = param.bias != nullptr; const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - if (has_bias) { + has_bias_ && conv_param_->output->dims() == conv_param_->bias->dims(); + if (has_bias_) { bias_gpu_image_ = std::unique_ptr(new Tensor); build_options_single += is_element_wise_bias ? 
" -DBIASE_ELE" : " -DBIASE_CH"; @@ -435,21 +347,36 @@ void ConvImageCompute::PrepareForRun() { // convert cpu buffer bias --> gpu image CLImageConverterFolder bias_converter; const DDim& bias_image_dims = - bias_converter.InitImageDimInfoWith(param.bias->dims()); - + bias_converter.InitImageDimInfoWith(conv_param_->bias->dims()); + bias_image_h_ = bias_image_dims[1]; + bias_image_w_ = bias_image_dims[0]; tensor_hold_bias_image_->Resize( {1, bias_image_dims[0], bias_image_dims[1], 4}); half_t* bias_image_data = tensor_hold_bias_image_->mutable_data(); - float* bias_cpu_data = param.bias->mutable_data(); + float* bias_cpu_data = conv_param_->bias->mutable_data(); bias_converter.NCHWToImage( - bias_cpu_data, bias_image_data, param.bias->dims()); + bias_cpu_data, bias_image_data, conv_param_->bias->dims()); this->bias_gpu_image_->mutable_data( bias_image_dims[0], bias_image_dims[1], bias_image_data); // convert cpu buffer bias --> gpu image --- end ---- + } else { + bias_gpu_image_ = std::unique_ptr(new Tensor); + CLImageConverterFolder bias_converter; + tensor_hold_bias_image_->Resize({1, 1, 1, 4}); + half_t* bias_image_data = tensor_hold_bias_image_->mutable_data(); + this->bias_gpu_image_->mutable_data( + 1, 1, bias_image_data); } + // define image pointer for filter, bias + input_image_p_ = conv_param_->x->data(); + filter_image_p_ = filter_gpu_image_->data(); + bias_image_p_ = bias_gpu_image_->data(); + output_image_p_ = conv_param_->output->mutable_data( + output_image_w_, output_image_h_); + build_options_.push_back(build_options_single); for (size_t i = 0; i < kernel_func_names_.size(); i++) { @@ -475,55 +402,55 @@ void ConvImageCompute::PrepareForRun() { VLOG(4) << "max_work_group_size: " << max_work_group_size; if (max_work_group_size > 0 && use_lws_) { - double min_turn_time = DBL_MAX; + double min_tune_time = DBL_MAX; cl::NDRange best_local_work_size = context.cl_context()->LocalWorkSize( global_work_size_, max_work_group_size); VLOG(3) << "origin :local_work_size_ : " << best_local_work_size[0] << " " << best_local_work_size[1] << " " << best_local_work_size[2]; cl::NDRange last_local_work_size = cl::NDRange{ static_cast(0), static_cast(0), static_cast(0)}; - if (use_turn_) { + if (use_tune_) { for (size_t i = 1; i < 15; i++) { - if (kernel_h == 1 && kernel_w == 1) { + if (filter_tensor_h_ == 1 && filter_tensor_w_ == 1) { // todo use diff logics - local_work_size_ = context.cl_context()->LocalWorkSizeTurn( + local_work_size_ = context.cl_context()->LocalWorkSizeTune( global_work_size_, max_work_group_size, i); } else { - local_work_size_ = context.cl_context()->LocalWorkSizeTurn( + local_work_size_ = context.cl_context()->LocalWorkSizeTune( global_work_size_, max_work_group_size, i); } if (last_local_work_size[0] == local_work_size_[0] && last_local_work_size[1] == local_work_size_[1] && last_local_work_size[2] == local_work_size_[2]) { - // skiped turned lws + // skiped tuneed lws continue; } - auto turn_time = this->Turn(10); - if (min_turn_time > turn_time) { - min_turn_time = turn_time; + auto tune_time = this->Tune(10); + if (min_tune_time > tune_time) { + min_tune_time = tune_time; best_local_work_size = local_work_size_; } last_local_work_size = local_work_size_; } // reverse for (size_t i = 1; i < 15; i++) { - if (kernel_h == 1 && kernel_w == 1) { + if (filter_tensor_h_ == 1 && filter_tensor_w_ == 1) { // todo use diff logics - local_work_size_ = context.cl_context()->LocalWorkSizeTurnReverse( + local_work_size_ = context.cl_context()->LocalWorkSizeTuneReverse( 
global_work_size_, max_work_group_size, i); } else { - local_work_size_ = context.cl_context()->LocalWorkSizeTurnReverse( + local_work_size_ = context.cl_context()->LocalWorkSizeTuneReverse( global_work_size_, max_work_group_size, i); } if (last_local_work_size[0] == local_work_size_[0] && last_local_work_size[1] == local_work_size_[1] && last_local_work_size[2] == local_work_size_[2]) { - // skiped turned lws + // skiped tuneed lws continue; } - auto turn_time = this->Turn(10); - if (min_turn_time > turn_time) { - min_turn_time = turn_time; + auto tune_time = this->Tune(10); + if (min_tune_time > tune_time) { + min_tune_time = tune_time; best_local_work_size = local_work_size_; } last_local_work_size = local_work_size_; @@ -537,548 +464,316 @@ void ConvImageCompute::PrepareForRun() { } } -void ConvImageCompute::Conv2d1x1opt(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); - - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; - +void ConvImageCompute::ReInitWhenNeeded() { + conv_param_ = param_.get_mutable(); + auto x_dims = conv_param_->x->dims(); #ifdef LITE_WITH_LOG - // VLOG(4) << "out_image: " << out_image; - VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," - << global_work_size_[1] << "," << global_work_size_[2] << "}"; + LOG(INFO) << "is_first_epoch_for_run_:" << is_first_epoch_for_run_ + << ", last_input_dims_:" << last_input_dims_ + << ", x_dims:" << x_dims; #endif -#ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d_1x1 params ============"; - VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - << input_image_shape["height"]; - VLOG(4) << "input_c_block: " << input_c_block; - VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "offset: " << offset; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; -// VLOG(4) << "default work size{c_block, w, nh}: " -// << "{" << c_block << ", " << w << ", " << nh << "" -// << 
"}"; -#endif - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - // handle bias use buffer for channel wise , use image for element wise - const cl::Buffer* bias_buf = nullptr; - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } - - auto kernel = kernel_; - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, default_w_blk_); - CL_CHECK_FATAL(status); - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - if (is_turn) { - CLRuntime::Global()->command_queue().finish(); - } -} -void ConvImageCompute::Conv2d3x3(bool is_turn) { - auto kernel = kernel_; - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int input_channel = input_dims[1]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int output_channel = output_dims[1]; - int filter_width = filter_dims[3]; - int filter_height = filter_dims[2]; - int filter_channel = filter_dims[1]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); - - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; - - // re-calc group - int new_groups{param.groups}; - if (filter_dims[0] == output_dims[1] && filter_dims[1] == input_dims[1]) { 
- new_groups = 1; - } else if (!(filter_dims[0] == input_dims[1] && filter_dims[1] == 1)) { - new_groups = input_channel / filter_channel; - } - /* TODO(ysh329): mobile has no case below - else { - LOG(FATAL) << "Not support conv3x3 case with" - << " input_dims:" << input_dims << " output_dims:" << - output_dims - << " filter_dims:" << filter_dims; + if (is_first_epoch_for_run_ || last_input_dims_ != x_dims) { + is_first_epoch_for_run_ = false; + last_input_dims_ = x_dims; + + input_tensor_n_ = x_dims[0]; + input_tensor_c_ = x_dims[1]; + input_tensor_h_ = x_dims[2]; + input_tensor_w_ = x_dims[3]; + auto x_image_shape = InitImageDimInfoWith(x_dims); + input_image_h_ = x_image_shape["height"]; + input_image_w_ = x_image_shape["width"]; + + auto output_dims = conv_param_->output->dims(); + output_tensor_n_ = output_dims[0]; + output_tensor_c_ = output_dims[1]; + output_tensor_h_ = output_dims[2]; + output_tensor_w_ = output_dims[3]; + auto output_image_shape = InitImageDimInfoWith(output_dims); + output_image_h_ = output_image_shape["height"]; + output_image_w_ = output_image_shape["width"]; + + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + CHECK_GE(conv_param_->x->dims().size(), 4); + CHECK_GE(conv_param_->output->dims().size(), 4); + if (kernel_func_names_.size() > 0 && + kernel_func_names_[0] == "conv2d_3x3") { + groups_ = conv_param_->groups; + if (filter_tensor_n_ == output_tensor_c_ && + filter_tensor_c_ == input_tensor_c_) { + groups_ = 1; + } else if (!(filter_tensor_n_ == input_tensor_c_ && + filter_tensor_c_ == 1)) { + groups_ = input_tensor_c_ / filter_tensor_c_; + } } - */ - - // const std::vector& default_work_size = - // DefaultWorkSize(output_dims, - // DDim(std::vector{ - // static_cast(out_image_shape["width"]), - // static_cast(out_image_shape["height"])})); - - // int c_block = default_work_size[0]; - // int w = default_work_size[1]; - // int nh = default_work_size[2]; - - // VLOG(4) << "============ conv2d params ============"; - // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - // << input_image_shape["height"]; - // VLOG(4) << "input_c_block: " << input_c_block; - // VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - // VLOG(4) << "input_dims: " << input_dims; - // VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - // VLOG(4) << "output_dims: " << output_dims; - // VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - // << out_image_shape["height"]; - // VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - // VLOG(4) << "has bias: " << has_bias; - // VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - // VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - // VLOG(4) << "offset: " << offset; - // VLOG(4) << "dilations.size : " << dilations.size(); - // VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; - // VLOG(4) << "param.groups(groups):" << param.groups; - // VLOG(4) << "new_groups:" << new_groups; - // VLOG(4) << "default work size{c_block, w, nh}: " - // << "{" << c_block << ", " << w << ", " << nh << "" - // << "}"; - - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } - auto& context = 
ctx_->As(); - CHECK(context.cl_context() != nullptr); - // STL::stringstream kernel_key; - // kernel_key << kernel_func_names_[0] << build_options_[0]; - // auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - // VLOG(4) << "kernel_key: " << kernel_key.str(); - // VLOG(4) << "kernel ready ... " << kernel_key.str(); - // VLOG(4) << "w: " << w; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { - VLOG(4) << "set bias_image: "; - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); + // define image pointer for input, output + input_image_p_ = conv_param_->x->data(); + output_image_p_ = conv_param_->output->mutable_data( + output_image_w_, output_image_h_); + + GetGlobalWorkSize(); } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, new_groups); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(input_dims[1])); - CL_CHECK_FATAL(status); - - // auto global_work_size = - // cl::NDRange{static_cast(default_work_size.data()[0]), - // static_cast(default_work_size.data()[1]), - // static_cast(default_work_size.data()[2])}; - - // VLOG(4) << "out_image: " << out_image; - // VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," - // << global_work_size[1] << "," << global_work_size[2] << "}"; - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); } -void ConvImageCompute::Conv2d3x3opt(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int input_channel = input_dims[1]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int output_channel = output_dims[1]; - CHECK_EQ(input_dims[0], output_dims[0]); - int 
batch = input_dims[0]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); -#ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d params ============"; - // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - // << input_image_shape["height"]; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; -#endif +void ConvImageCompute::GetGlobalWorkSize() { + if (kernel_func_names_.size() <= 0) return; + // general input_c_block + input_c_block_ = static_cast(input_image_w_ / input_tensor_w_); - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } + // general gws + auto output_dims = conv_param_->output->dims(); + const std::vector& default_work_size = + DefaultWorkSize(output_dims, + DDim(std::vector{ + static_cast(output_image_w_), + static_cast(output_image_h_)})); + default_c_blk_ = default_work_size[0]; + default_w_blk_ = default_work_size[1]; + default_nh_blk_ = default_work_size[2]; + c_blk_ = default_c_blk_; + w_blk_ = default_w_blk_; + nh_blk_ = default_nh_blk_; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { -#ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; -#endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); + if (kernel_func_names_[0] == "conv2d_1x1_simple" || + kernel_func_names_[0] == "conv2d_1x1_opt") { + w_blk_ = maptofactor(default_w_blk_, 4); + c_blk_ = default_c_blk_; + nh_blk_ = default_nh_blk_; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + + } else if (kernel_func_names_[0] == "depth_conv2d_3x3s1") { + // depthwise spl gws s1 + int c_block = (output_tensor_c_ + 3) / 4; + int w = output_tensor_w_; + int nh = output_tensor_n_ * output_tensor_h_; + int w_blk_size = 2; + int w_blk = (w + w_blk_size - 1) / w_blk_size; + + c_blk_ = c_block; + w_blk_ = w_blk; + nh_blk_ = nh; + global_work_size_ = 
cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + } else if (kernel_func_names_[0] == "depth_conv2d_3x3") { + // depthwise spl gws + int c_block = (output_tensor_c_ + 3) / 4; + int w = output_tensor_w_; + int nh = output_tensor_n_ * output_tensor_h_; + + c_blk_ = c_block; + w_blk_ = w; + nh_blk_ = nh; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + input_c_block_ = static_cast((input_tensor_c_ + 3) / 4); + } else if (kernel_func_names_[0] == "conv2d_3x3_multi_batch" || + kernel_func_names_[0] == "conv2d_3x3_opt") { + int w_blk_size = 5; + int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; + + int h_blk_size = 1; + int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; + + c_blk_ = default_c_blk_; + w_blk_ = w_blk; + nh_blk_ = h_blk; + + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + } else if (kernel_func_names_[0] == "conv2d_5x5_multi_batch" || + kernel_func_names_[0] == "conv2d_5x5_opt") { + int w_blk_size = 5; + int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; + + int h_blk_size = 1; + int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; + + c_blk_ = default_c_blk_; + w_blk_ = w_blk; + nh_blk_ = h_blk; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + } else if (kernel_func_names_[0] == "conv2d_7x7_multi_batch" || + kernel_func_names_[0] == "conv2d_7x7_opt") { + int w_blk_size = 5; + int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; + + int h_blk_size = 1; + int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; + + c_blk_ = default_c_blk_; + w_blk_ = w_blk; + nh_blk_ = h_blk; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, paddings[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, batch); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); +} +void ConvImageCompute::Conv2d1x1opt(bool enable_tune) { #ifdef LITE_WITH_LOG - // VLOG(4) << "out_image: " << out_image; - VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," - << global_work_size_[1] << "," << global_work_size_[2] << "}"; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = 
kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, output_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(16, default_w_blk_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::Conv2d5x5(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int filter_width = filter_dims[3]; - int filter_height = filter_dims[2]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); - - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; - +void ConvImageCompute::Conv2d3x3(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d params ============"; - VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - << input_image_shape["height"]; - VLOG(4) << "input_c_block: " << input_c_block; - VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "offset: " << offset; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 
2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, output_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(16, filter_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(17, filter_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(18, filter_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(19, groups_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(20, input_tensor_c_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); +} - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { +void ConvImageCompute::Conv2d3x3opt(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; + PrintConvInfo(); #endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); + auto& context = ctx_->As(); + + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + 
CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, pad_left_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_n_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, output_tensor_h_); + CL_CHECK_FATAL(status_); #ifdef LITE_WITH_LOG // VLOG(4) << "out_image: " << out_image; @@ -1086,697 +781,406 @@ void ConvImageCompute::Conv2d5x5(bool is_turn) { << global_work_size_[1] << "," << global_work_size_[2] << "}"; #endif - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); - if (is_turn) { + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::Conv2d5x5opt(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int input_channel = input_dims[1]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int output_channel = output_dims[1]; - CHECK_EQ(input_dims[0], output_dims[0]); - int batch = input_dims[0]; - - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - -// default_work_size[2] = h_blk; +void ConvImageCompute::Conv2d5x5(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d params ============"; - // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - // << input_image_shape["height"]; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << 
dilations[0] << ", " << dilations[1]; + PrintConvInfo(); #endif - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } - - auto kernel = kernel_; - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, paddings[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, batch); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - - // VLOG(4) << "out_image: " << out_image; + auto& context = ctx_->As(); - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::Conv2d7x7(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto* 
input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int filter_width = filter_dims[3]; - int filter_height = filter_dims[2]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); - - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; - +void ConvImageCompute::Conv2d5x5opt(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d params ============"; - VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - << input_image_shape["height"]; - VLOG(4) << "input_c_block: " << input_c_block; - VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "offset: " << offset; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, pad_left_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_n_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = 
kernel_.setArg(15, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { + CLRuntime::Global()->command_queue().finish(); } +} - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { +void ConvImageCompute::Conv2d7x7(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; + PrintConvInfo(); #endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); + auto& context = ctx_->As(); + + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { + CLRuntime::Global()->command_queue().finish(); } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); +} +void ConvImageCompute::Conv2d7x7opt(bool enable_tune) { #ifdef LITE_WITH_LOG - // VLOG(4) << "out_image: " << out_image; - VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," - << global_work_size_[1] << "," << global_work_size_[2] << "}"; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); - - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_);
+ status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, pad_left_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_n_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::Conv2d7x7opt(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int input_channel = input_dims[1]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int output_channel = output_dims[1]; - CHECK_EQ(input_dims[0], output_dims[0]); - int batch = input_dims[0]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); +void ConvImageCompute::DepthwiseConv2d3x3s1(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d 7x7 params ============"; - // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - // << input_image_shape["height"]; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; + PrintConvInfo(); #endif - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == 
paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } + auto& context = ctx_->As(); - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, paddings[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, batch); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, pad_left_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::DepthwiseConv2d3x3s1(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto x_dims = param.x->dims(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - - auto* input_img = param.x->data(); - auto* filter_img = filter_gpu_image_->data(); - - const cl::Image2D* bias_img = nullptr; - if 
(param.bias) { - bias_img = bias_gpu_image_->data(); - } - - auto image_shape = InitImageDimInfoWith(output_dims); - - auto* output_img = param.output->mutable_data( - image_shape["width"], image_shape["height"]); - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_img); - CL_CHECK_FATAL(status); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); +void ConvImageCompute::DepthwiseConv2d3x3(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; + PrintConvInfo(); #endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *output_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(strides[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(paddings[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(dilations[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[1])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[2])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[2])); - CL_CHECK_FATAL(status); - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - - if (is_turn) { + auto& context = ctx_->As(); + + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); + + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::DepthwiseConv2d3x3(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != 
nullptr); - const auto& param = *param_.get_mutable(); - auto x_dims = param.x->dims(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - int offset = filter_dims[2] / 2 - paddings[0]; - int input_c_block = (x_dims[1] + 3) / 4; - - auto* input_img = param.x->data(); - auto* filter_img = filter_gpu_image_->data(); - - const cl::Image2D* bias_img = nullptr; - if (param.bias) { - bias_img = bias_gpu_image_->data(); - } - - auto image_shape = InitImageDimInfoWith(output_dims); - - auto* output_img = param.output->mutable_data( - image_shape["width"], image_shape["height"]); - - auto kernel = kernel_; - +void ConvImageCompute::DepthwiseConv2d(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "setArg"; - VLOG(4) << "strides = " << strides[0]; - VLOG(4) << "offset = " << offset; - VLOG(4) << "dilations = " << dilations[0]; - VLOG(4) << "input_c_block = " << input_c_block; - VLOG(4) << "x_dims[3] = " << x_dims[3]; - VLOG(4) << "x_dims[2] = " << x_dims[2]; - VLOG(4) << "output_dims[3] = " << output_dims[3]; - VLOG(4) << "output_dims[2] = " << output_dims[2]; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_img); - CL_CHECK_FATAL(status); - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); -#ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; -#endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *output_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(strides[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(offset)); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(dilations[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(input_c_block)); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[2])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[2])); - CL_CHECK_FATAL(status); - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); - - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, 
stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, filter_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(16, filter_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); + + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::DepthwiseConv2d(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int filter_width = filter_dims[3]; - int filter_height = filter_dims[2]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); +void ConvImageCompute::Run() { (this->*impl_)(false); } - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; +void ConvImageCompute::PrintConvInfo() { + const bool is_element_wise_bias = + has_bias_ && conv_param_->output->dims() == conv_param_->bias->dims(); -#ifdef LITE_WITH_LOG - VLOG(4) << "============ depthwise conv2d params ============"; - VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - << input_image_shape["height"]; - VLOG(4) << "input_c_block: " << input_c_block; - VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "filter_dims: " << filter_dims; + VLOG(4) << "input_image_shape: " << input_image_w_ << "," << input_image_h_; + // VLOG(4) << "input_image: " << input_image_p_; + VLOG(4) << "input_dims: " << conv_param_->x->dims(); + VLOG(4) << "filter_dims: " << conv_param_->filter->dims(); // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; + VLOG(4) << "output_dims: " << conv_param_->output->dims(); + VLOG(4) << "out_image_shape: " << output_image_w_ << ", " << output_image_h_; + VLOG(4) << "paddings: " << pad_left_ << "," << pad_up_; 
+ VLOG(4) << "has bias: " << has_bias_; VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "offset: " << offset; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; -#endif - - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - // handle bias use buffer for channel wise , use image for element wise - const cl::Buffer* bias_buf = nullptr; - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } - - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { -#ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; -#endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_height); - CL_CHECK_FATAL(status); - -#ifdef LITE_WITH_LOG + VLOG(4) << "strides: " << stride_h_ << "," << stride_w_; + VLOG(4) << "offset: "; + VLOG(4) << "dilations.size : " << conv_param_->dilations->size(); + VLOG(4) << "dilations: " << dilation_h_ << ", " << dilation_w_; VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," << global_work_size_[1] << "," << global_work_size_[2] << "}"; -#endif - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); } -void ConvImageCompute::Run() { (this->*impl_)(false); } - -double ConvImageCompute::Turn(int times) { +double ConvImageCompute::Tune(int times) { auto GetCurrentUS = []() -> double { struct timeval time; gettimeofday(&time, NULL); diff --git a/lite/kernels/opencl/conv_image_compute.h b/lite/kernels/opencl/conv_image_compute.h index 64276a5721cb20718604d91d3cfac31e583ddbf1..4eab7be1f1ac6459250c6df984160f0f6060ea1c 100644 --- a/lite/kernels/opencl/conv_image_compute.h +++ b/lite/kernels/opencl/conv_image_compute.h @@ -33,6 +33,7 @@ namespace paddle { namespace lite { namespace kernels { namespace opencl { + class ConvImageCompute : public KernelLite { @@ -42,8 +43,11 @@ class ConvImageCompute : public KernelLite kernel_func_names_{}; @@ -79,19 +87,72 @@ class ConvImageCompute 
: public KernelLite tensor_hold_bias_image_{nullptr}; cl::NDRange global_work_size_ = cl::NDRange{ static_cast(1), static_cast(1), static_cast(1)}; + + // opencl kernel args int c_blk_ = 1; int w_blk_ = 1; int nh_blk_ = 1; + const cl::Image2D* input_image_p_{nullptr}; + const cl::Image2D* filter_image_p_{nullptr}; + const cl::Image2D* bias_image_p_{nullptr}; + const cl::Image2D* output_image_p_{nullptr}; + + int stride_h_{-1}; + int stride_w_{-1}; + + int dilation_h_{-1}; + int dilation_w_{-1}; + + int pad_up_{-1}; + int pad_down_{-1}; + int pad_left_{-1}; + int pad_right_{-1}; + + int offset_{-1}; + int groups_{-1}; + bool relu_fused_{false}; + bool has_bias_{false}; + + int input_tensor_n_{-1}; + int input_tensor_c_{-1}; + int input_tensor_h_{-1}; + int input_tensor_w_{-1}; + int input_image_h_{-1}; + int input_image_w_{-1}; + int input_c_block_{-1}; + + int output_tensor_n_{-1}; + int output_tensor_c_{-1}; + int output_tensor_h_{-1}; + int output_tensor_w_{-1}; + int output_image_h_{-1}; + int output_image_w_{-1}; + + int filter_tensor_n_{-1}; + int filter_tensor_c_{-1}; + int filter_tensor_h_{-1}; + int filter_tensor_w_{-1}; + int filter_image_h_{-1}; + int filter_image_w_{-1}; + + int bias_image_h_{-1}; + int bias_image_w_{-1}; + int default_c_blk_ = 1; int default_w_blk_ = 1; int default_nh_blk_ = 1; + // ================= + + DDim last_input_dims_{}; + bool is_first_epoch_for_run_{true}; cl::Kernel kernel_; + cl_int status_; cl::NDRange local_work_size_ = cl::NDRange{ static_cast(1), static_cast(1), static_cast(1)}; bool use_lws_{true}; - bool use_turn_{false}; + bool use_tune_{false}; }; } // namespace opencl diff --git a/lite/kernels/opencl/expand_image_compute_test.cc b/lite/kernels/opencl/expand_image_compute_test.cc index e3188777df9752c8ac6fd2849bdaddced975bda1..c372855193e938081208addce058e3e38b692cbb 100644 --- a/lite/kernels/opencl/expand_image_compute_test.cc +++ b/lite/kernels/opencl/expand_image_compute_test.cc @@ -11,9 +11,9 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
-#include #include +#include #include "lite/backends/opencl/target_wrapper.h" #include "lite/core/op_registry.h" #include "lite/core/tensor.h" diff --git a/lite/kernels/opencl/fc_buffer_compute.cc b/lite/kernels/opencl/fc_buffer_compute.cc index 9763faf2f33f578e6f62b07a8c89390e1b80c159..3a31c8993d77388b95260ad5c0be65f791c433eb 100644 --- a/lite/kernels/opencl/fc_buffer_compute.cc +++ b/lite/kernels/opencl/fc_buffer_compute.cc @@ -35,10 +35,27 @@ class FcCompute public: using param_t = operators::FcParam; - void PrepareForRun() override {} + void PrepareForRun() override { + fc_param_ = param_.get_mutable(); + auto w_t = fc_param_->w; + auto bias_t = fc_param_->bias; + + w_gpu_t_ = std::unique_ptr(new Tensor); + auto w_gpu_data = + w_gpu_t_->mutable_data(TARGET(kOpenCL), w_t->memory_size()); + TargetWrapperCL::MemcpySync( + w_gpu_data, w_t->raw_data(), w_t->memory_size(), IoDirection::HtoD); + + bias_gpu_t_ = std::unique_ptr(new Tensor); + auto b_gpu_data = + bias_gpu_t_->mutable_data(TARGET(kOpenCL), bias_t->memory_size()); + TargetWrapperCL::MemcpySync(b_gpu_data, + bias_t->raw_data(), + bias_t->memory_size(), + IoDirection::HtoD); + } void ReInitWhenNeeded() override { - fc_param_ = param_.get_mutable(); const auto x_dims = fc_param_->input->dims(); if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) || first_epoch_for_reinit_) { @@ -93,7 +110,7 @@ class FcCompute } void GetGlobalWorkSize() { - if (m_ == 1) { // gemv + if (kernel_func_name_ == "fc_gemv_1x4") { // gemv global_work_size_ = cl::NDRange{static_cast((n_ + 3) / 4)}; } else { // gemm global_work_size_ = cl::NDRange{static_cast((m_ + 3) / 4), @@ -103,8 +120,8 @@ class FcCompute void Run() override { auto* x_buf = fc_param_->input->data(); - auto* w_buf = fc_param_->w->data(); - auto* bias_buf = fc_param_->bias->data(); + auto* w_buf = w_gpu_t_->data(); + auto* bias_buf = bias_gpu_t_->data(); auto* out_buf = fc_param_->output->mutable_data(TARGET(kOpenCL)); @@ -154,6 +171,10 @@ class FcCompute std::string time_stamp_{GetTimeStamp()}; bool first_epoch_for_reinit_{true}; DDim last_x_dims_; + + std::unique_ptr w_gpu_t_{nullptr}; + std::unique_ptr bias_gpu_t_{nullptr}; + cl::NDRange global_work_size_; cl::Kernel kernel_; }; @@ -166,7 +187,7 @@ class FcCompute REGISTER_LITE_KERNEL( fc, kOpenCL, kFloat, kNCHW, paddle::lite::kernels::opencl::FcCompute, def) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kOpenCL))}) - .BindInput("W", {LiteType::GetTensorTy(TARGET(kOpenCL))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .Finalize(); diff --git a/lite/kernels/opencl/fc_buffer_compute_test.cc b/lite/kernels/opencl/fc_buffer_compute_test.cc index 4c9c8c47e4306c92486dd1b847884200959453dd..85793dffee9e4717e257ad8c73258ce35ad61d54 100644 --- a/lite/kernels/opencl/fc_buffer_compute_test.cc +++ b/lite/kernels/opencl/fc_buffer_compute_test.cc @@ -126,9 +126,11 @@ TEST(fc, compute) { out.Resize(out_dim); out_ref.Resize(out_dim); + VLOG(2) << "out.dims():" << out.dims() << ", out_dim:" << out_dim; + auto* x_data = x.mutable_data(TARGET(kOpenCL)); - auto* w_data = w.mutable_data(TARGET(kOpenCL)); - auto* bias_data = bias.mutable_data(TARGET(kOpenCL)); + auto* w_data = w.mutable_data(); + auto* bias_data = bias.mutable_data(); auto* out_data = out.mutable_data(TARGET(kOpenCL)); std::default_random_engine engine; @@ -148,17 +150,15 @@ TEST(fc, compute) 
{ } for (size_t i = 0; i < w_dim.production(); ++i) { w_source[i] = static_cast(dist(engine)); + w_data[i] = w_source[i]; } for (size_t i = 0; i < bias_dim.production(); ++i) { bias_source[i] = 10; // static_cast(dist(engine)); + bias_data[i] = 10; } TargetWrapperCL::MemcpySync( x_data, x_source.data(), x_size, IoDirection::HtoD); - TargetWrapperCL::MemcpySync( - w_data, w_source.data(), w_size, IoDirection::HtoD); - TargetWrapperCL::MemcpySync( - bias_data, bias_source.data(), bias_size, IoDirection::HtoD); // run opencl kernel kernel->Launch(); @@ -186,8 +186,10 @@ TEST(fc, compute) { #endif std::vector out_data_from_gpu(out_dim.production()); - TargetWrapperCL::MemcpySync( - out_data_from_gpu.data(), out_data, bias_size, IoDirection::DtoH); + TargetWrapperCL::MemcpySync(out_data_from_gpu.data(), + out_data, + out_data_from_gpu.size() * sizeof(float), + IoDirection::DtoH); // run cpu ref auto* out_ref_data = out_ref.mutable_data(TARGET(kARM)); diff --git a/lite/kernels/opencl/transpose_image_compute.cc b/lite/kernels/opencl/transpose_image_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..31184092efa40cea47c3cacb6a65f03d15a229b2 --- /dev/null +++ b/lite/kernels/opencl/transpose_image_compute.cc @@ -0,0 +1,395 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
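+// OpenCL image2d implementation of the transpose and transpose2 operators.
+// transpose / transpose_4d run on the GPU via image/transpose_kernel.cl, while
+// transpose2 maps the input image back to host memory, permutes it with CPU
+// loops (a shuffle-channel fast path or a general permute), and writes the
+// result back to the output image.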
+ +#include "lite/backends/opencl/cl_half.h" +#include "lite/backends/opencl/cl_include.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/opencl/image_helper.h" +#include "lite/operators/op_params.h" +#include "lite/utils/logging.h" +#include "lite/utils/replace_stl/stream.h" +#ifdef LITE_WITH_PROFILE +#include "lite/core/profile/profiler.h" +#endif +#include "lite/backends/opencl/cl_utility.h" + +#undef LITE_WITH_LOG + +namespace paddle { +namespace lite { +namespace kernels { +namespace opencl { + +// transpose operator +class TransposeComputeFloatImage + : public KernelLite { + public: + using param_t = operators::TransposeParam; + + void PrepareForRun() override { + auto& param = *param_.get_mutable(); + Tensor* const output = param.output; + const DDimLite& out_dims = output->dims(); + if (out_dims.size() == 4) { + kernel_func_name_ = "transpose_4d"; + } else { + kernel_func_name_ = "transpose"; + } + auto& context = ctx_->As(); + VLOG(1) << "kernel_func_name_:" << kernel_func_name_; + context.cl_context()->AddKernel(kernel_func_name_, + "image/transpose_kernel.cl", + build_options_, + time_stamp_); + } + +#ifdef LITE_WITH_PROFILE + void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) { + ch->kernel_func_name = kernel_func_name_; + ch->cl_event = + event_; // `event_` defined in `kernel.h`, valid after kernel::Run + } +#endif + + void Run() override { + auto& param = *param_.get_mutable(); + const Tensor* const x = param.x; + const auto x_dims = x->dims(); + const std::map& input_image_shape = + InitImageDimInfoWith(x_dims); + const int64_t& input_image_width = input_image_shape.at("width"); + const int64_t& input_image_height = input_image_shape.at("height"); + const cl::Image2D* const x_image = x->data(); + + Tensor* const output = param.output; + const DDimLite& out_dims = output->dims(); + VLOG(4) << "out_dims= " << out_dims; + const std::map& out_image_shape = + InitImageDimInfoWith(out_dims); + cl::Image2D* const out_image = output->mutable_data( + out_image_shape.at("width"), out_image_shape.at("height")); +#ifdef LITE_WITH_LOG + VLOG(4) << "out_dims= " << out_dims; +#endif + const std::vector& default_work_size = DefaultWorkSize( + out_dims, + DDim(std::vector{ + static_cast(out_image_shape.at("width")), + static_cast(out_image_shape.at("height"))})); + + int out_C = 0, out_H = 0, out_W = 0, in_W = 0; + if (param.output->dims().size() == 4) { + out_C = out_dims[1]; + out_H = out_dims[2]; + out_W = out_dims[3]; + in_W = x_dims[3]; + } else if (param.output->dims().size() == 3) { + out_C = out_dims[0]; + out_H = out_dims[1]; + out_W = out_dims[2]; + in_W = x_dims[2]; + } else if (param.output->dims().size() == 2) { + out_C = 1; + out_H = out_dims[0]; + out_W = out_dims[1]; + in_W = x_dims[1]; + } + +#ifdef LITE_WITH_LOG + VLOG(4) << "out_C=" << out_C; + VLOG(4) << "out_H=" << out_H; + VLOG(4) << "out_W=" << out_W; + VLOG(4) << "in_W=" << in_W; + VLOG(4) << "default_work_size= " << default_work_size[0] << ", " + << default_work_size[1] << ", " << default_work_size[2]; +#endif + + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + STL::stringstream kernel_key; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; + auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + +#ifdef LITE_WITH_LOG + VLOG(4) << TargetToStr(x->target()); + VLOG(4) << TargetToStr(param.output->target()); +#endif + + int arg_idx = 0; + cl_int status; + status = kernel.setArg(arg_idx, *x_image); + 
CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *out_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, out_C); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, out_H); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, out_W); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, in_W); + CL_CHECK_FATAL(status); + + auto global_work_size = + cl::NDRange{static_cast(default_work_size.data()[0]), + static_cast(default_work_size.data()[1]), + static_cast(default_work_size.data()[2])}; + + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status); + } + + private: + std::string kernel_func_name_{"transpose"}; + std::string build_options_{"-DCL_DTYPE_half"}; + std::string time_stamp_{GetTimeStamp()}; +}; + +// transpose2 operator +class Transpose2ComputeFloatImage + : public KernelLite { + public: + using param_t = operators::TransposeParam; + + void PrepareForRun() override {} + +#ifdef LITE_WITH_PROFILE + void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {} +#endif + + bool IsShuffleChannel(const std::vector& axis) { + bool is_shuffle_channel = true; + if (axis.size() > 2 && axis[0] == 0 && axis[1] == 2 && axis[2] == 1) { + for (int i = 3; i < axis.size(); ++i) { + if (axis[i] != i) { + is_shuffle_channel = false; + break; + } + } + } else { + return false; + } + return is_shuffle_channel; + } + + template + void DeviceTensorToHostTensor(const Tensor* device_tensor, + Tensor* host_tensor) { + host_tensor->Resize(device_tensor->dims()); + Dtype* host_ptr = host_tensor->mutable_data(); + CLRuntime::Global()->command_queue().finish(); + CLImageConverterDefault default_converter; + auto device_tensor_image_dim = + default_converter.InitImageDimInfoWith(device_tensor->dims()); + half_t* image_data = new half_t[device_tensor_image_dim.production() * 4]; + TargetWrapperCL::ImgcpySync(image_data, + device_tensor->data(), + device_tensor_image_dim[0], + device_tensor_image_dim[1], + 0, + 0, + IoDirection::DtoH); + default_converter.ImageToNCHW( + image_data, host_ptr, device_tensor_image_dim, host_tensor->dims()); + delete[] image_data; + } + + template + void HostTensorToDeviceTensor(const Tensor* host_tensor, + Tensor* device_tensor) { + Dtype* host_ptr = const_cast(host_tensor->data()); + CLImageConverterDefault default_converter; + auto device_tensor_image_dim = + default_converter.InitImageDimInfoWith(device_tensor->dims()); + device_tensor->mutable_data( + device_tensor_image_dim[0], device_tensor_image_dim[1]); + half_t* image_data = new half_t[device_tensor->dims().production() * 4]; + default_converter.NCHWToImage(host_ptr, image_data, device_tensor->dims()); + + TargetWrapperCL::ImgcpySync( + device_tensor->mutable_data(), + image_data, + device_tensor_image_dim[0], + device_tensor_image_dim[1], + 0, + 0, + IoDirection::HtoD); + + delete[] image_data; + } + + template + void ShuffleChannelCompute(const operators::TransposeParam& param) { + const Tensor* input = param.x; + Tensor* input_tensor = new Tensor(); + DeviceTensorToHostTensor(input, input_tensor); + Dtype* input_ptr = input_tensor->mutable_data(); + + Tensor* output = param.output; + Tensor* output_tensor = new Tensor(); + output_tensor->Resize(output->dims()); + Dtype* output_ptr = output_tensor->mutable_data(); + + // input and output's shape dimension must >= 2 && <= 6. 
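+ // Shuffle-channel fast path (axis starts with {0, 2, 1}): swap dims 1 and 2
+ // by copying one contiguous block of `offset` elements (the product of the
+ // trailing dims from index 3 onward) per (batch, c1, c2) triple.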
+ const DDim& in_dim = input->dims(); + const DDim& out_dim = output->dims(); + size_t offset = 1; + for (int i = 3; i < param.axis.size(); ++i) { + offset *= in_dim[i]; + } +#pragma omp parallel for collapse(3) + for (int batch = 0; batch < out_dim[0]; ++batch) { + for (int c1 = 0; c1 < out_dim[1]; ++c1) { + for (int c2 = 0; c2 < out_dim[2]; ++c2) { + size_t out_offset = + ((batch * out_dim[1] + c1) * out_dim[2] + c2) * offset; + size_t in_offset = + ((batch * in_dim[1] + c2) * in_dim[2] + c1) * offset; + memcpy(output_ptr + out_offset, + input_ptr + in_offset, + offset * sizeof(Dtype)); + } + } + } + HostTensorToDeviceTensor(output_tensor, output); + delete input_tensor; + delete output_tensor; + } + + template + void Transpose2Compute(const operators::TransposeParam& param) { + const Tensor* input = param.x; + Tensor* input_tensor = new Tensor(); + DeviceTensorToHostTensor(input, input_tensor); + Dtype* input_ptr = input_tensor->mutable_data(); + + Tensor* output = param.output; + Tensor* output_tensor = new Tensor(); + output_tensor->Resize(output->dims()); + Dtype* output_ptr = output_tensor->mutable_data(); + + // input and output's shape dimension must >= 2 && <= 6. + const DDim& in_dim = input->dims(); + const DDim& out_dim = output->dims(); + + // precompute inverted output dim and strides + size_t rout_dim[6], strides[6]; + auto& axis = param.axis; + int permute = axis.size(); // permute must >=2 && <= 6. + for (int i = 0; i < permute; ++i) { + int k = permute - 1 - i; + strides[k] = 1; + for (int j = axis[i] + 1; j < permute; ++j) { + strides[k] *= in_dim[j]; + } + rout_dim[k] = out_dim[i]; + } + + // unroll the first 2 dimensions + int reamin_dim = 1; + for (int i = 2; i < out_dim.size(); ++i) { + reamin_dim *= out_dim[i]; + } + +#pragma omp parallel for collapse(2) + for (int batch = 0; batch < out_dim[0]; ++batch) { + for (int j = 0; j < out_dim[1]; ++j) { + size_t offset = batch * strides[permute - 1] + j * strides[permute - 2]; + Dtype* out_ptr = output_ptr + (batch * out_dim[1] + j) * reamin_dim; + int indics[4] = {0, 0, 0, 0}; + for (int k = 0; k < reamin_dim; ++k) { + out_ptr[k] = input_ptr[offset]; + indics[0] += 1; + offset += strides[0]; + for (int p = 0; p < permute - 3; ++p) { + if (indics[p] == rout_dim[p]) { + indics[p + 1] += 1; + indics[p] = 0; + offset += strides[p + 1]; + offset -= rout_dim[p] * strides[p]; + } else { + break; + } + } + } + } + } + HostTensorToDeviceTensor(output_tensor, output); + delete input_tensor; + delete output_tensor; + } + + void Run() override { + auto& param = *param_.get_mutable(); + const std::vector axis = param.axis; + + bool shuffle_channel = IsShuffleChannel(axis); + if (shuffle_channel) { + ShuffleChannelCompute(param); + } else { + Transpose2Compute(param); + } + } +}; + +} // namespace opencl +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(transpose, + kOpenCL, + kFP16, + kImageDefault, + paddle::lite::kernels::opencl::TransposeComputeFloatImage, + image2d) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .Finalize(); + +REGISTER_LITE_KERNEL(transpose2, + kOpenCL, + kFP16, + kImageDefault, + paddle::lite::kernels::opencl::Transpose2ComputeFloatImage, + image2d) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindOutput("Out", + 
{LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); + +#define LITE_WITH_LOG diff --git a/lite/kernels/opencl/transpose_image_compute_test.cc b/lite/kernels/opencl/transpose_image_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9db9b3732d44aa3f342a8cf8b7b2fe5819586a5f --- /dev/null +++ b/lite/kernels/opencl/transpose_image_compute_test.cc @@ -0,0 +1,172 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/backends/opencl/target_wrapper.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +#include "lite/kernels/opencl/test_helper.h" +#include "lite/operators/reshape_op.h" +#include "lite/utils/logging.h" + +#define FP16_MAX_DIFF (5e-1) + +namespace paddle { +namespace lite { +namespace kernels { +namespace opencl { + +static inline void TestWithKernel( + const std::unique_ptr& kernel) { + int64_t batch_size = 1; + int64_t ic = 2; + int64_t ih = 3; + int64_t iw = 4; + + int64_t oc = 3; + int64_t oh = 4; + int64_t ow = 2; + + lite::Tensor input, output; + operators::TransposeParam param; + + param.x = &input; + param.output = &output; + param.axis = std::vector({0, 2, 3, 1}); + const DDim input_dim = + lite::DDim{std::vector({batch_size, ic, ih, iw})}; + input.Resize(input_dim); + const DDim output_dim = + lite::DDim{std::vector({batch_size, oc, oh, ow})}; + param.output->Resize(output_dim); + + LOG(INFO) << "prepare kernel SetParam------"; + kernel->SetParam(param); + + size_t input_image_width = iw * ((ic + 3) / 4); + size_t input_image_height = ih * batch_size; + + size_t output_image_width = ow * ((oc + 3) / 4); + size_t output_image_height = oh * batch_size; + + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + + std::vector input_v(batch_size * ic * ih * iw); + + LOG(INFO) << "gen input ..."; + + float* input_v_data = &input_v[0]; + auto index = 0; + for (auto& i : input_v) { + i = index++; + } + + paddle::lite::CLImageConverterDefault default_convertor; + + std::vector x_image_data(input_image_width * input_image_height * + 4); // 4 : RGBA + + LOG(INFO) << "set mapped input ..."; + default_convertor.NCHWToImage(input_v_data, x_image_data.data(), input_dim); + + auto* input_image = input.mutable_data( + input_image_width, input_image_height, x_image_data.data()); + + LOG(INFO) << "prepare kernel ready"; + + LOG(INFO) << "mutable output ..."; + CLImageConverterDefault default_converter; + DDim out_image_shape = default_converter.InitImageDimInfoWith(output_dim); + LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " " + << out_image_shape[1]; + auto* out_image = output.mutable_data( + out_image_shape[0], out_image_shape[1]); + + LOG(INFO) << "kernel context ..."; + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + + std::unique_ptr 
transpose_context(new KernelContext); + context->As().CopySharedTo( + &(transpose_context->As())); + kernel->SetContext(std::move(transpose_context)); + + LOG(INFO) << "kernel launch ..."; + kernel->Launch(); + + CLRuntime::Global()->command_queue().finish(); + + half_t* out_image_data = new half_t[out_image_shape.production() * 4]; + TargetWrapperCL::ImgcpySync(out_image_data, + output.data(), + out_image_shape[0], + out_image_shape[1], + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + float* out_data = new float[out_image_shape.production() * 4]; + default_converter.ImageToNCHW( + out_image_data, out_data, out_image_shape, output_dim); + + // check output data + index = 0; + auto hxw = ih * iw; + auto cxhxw = ic * hxw; + for (auto n = 0; n < batch_size; n++) { + for (auto h = 0; h < ih; h++) { + for (auto w = 0; w < iw; w++) { + for (auto c = 0; c < ic; c++) { + auto input_index = n * cxhxw + c * hxw + h * iw + w; + auto input_value = input_v_data[input_index]; + auto output_value = out_data[index]; + auto abs_diff = abs(input_value - output_value); + auto relative_diff = COMPUTE_RELATIVE_DIFF(input_value, output_value); + EXPECT_EQ( + (relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF), + true); + index++; + } + } + } + } +} + +TEST(transpose_opencl, compute) { + auto kernels = KernelRegistry::Global().Create("transpose", + TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault)); + ASSERT_FALSE(kernels.empty()); + auto kernel = std::move(kernels.front()); + TestWithKernel(kernel); +} + +TEST(transpose2_opencl, compute) { + auto kernels = KernelRegistry::Global().Create("transpose2", + TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault)); + ASSERT_FALSE(kernels.empty()); + auto kernel = std::move(kernels.front()); + TestWithKernel(kernel); +} + +} // namespace opencl +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(transpose, kOpenCL, kFP16, kImageDefault, image2d); diff --git a/lite/kernels/rknpu/subgraph_compute.cc b/lite/kernels/rknpu/subgraph_compute.cc index e0b63205705609b6899918ce8e254ccdf6cbad47..a50505c38c0740f762256cd71e006caf9249838e 100644 --- a/lite/kernels/rknpu/subgraph_compute.cc +++ b/lite/kernels/rknpu/subgraph_compute.cc @@ -28,13 +28,36 @@ namespace lite { namespace kernels { namespace rknpu { -int SubgraphEngine::BuildDeviceProgram() { +bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() { + // Obtain the origin input tensors, and create the origin output + // tensors(Don't try to access them before launch the device program or the + // origin program) + PrepareWorkspaceForOriginProgram(); + // Create the device input and output tensors, but don't initialize them + // with the dimensions + device_itensors_.resize(input_names_.size()); + for (int i = 0; i < input_names_.size(); i++) { + device_itensors_[i].reset(new hiai::AiTensor); + CHECK(device_itensors_[i]); + } + device_otensors_.resize(output_names_.size()); + for (int i = 0; i < output_names_.size(); i++) { + device_otensors_[i].reset(new hiai::AiTensor); + CHECK(device_otensors_[i]); + } + return true; +} + +bool SubgraphEngine::BuildDeviceProgram() { LOG(INFO) << "[RKNPU]:BuildDeviceProgram"; int status = 0; // Convert all of ops and their input vars and weights and added into the NPU // RKNPU IR graph subgraph::rknpu::Graph graph; const auto& bridges = subgraph::Registry::Instance(); + if (origin_program_.empty()) { + BuildOriginProgram(); + } for (auto& inst : origin_program_) { auto op = 
const_cast(inst.op()); CHECK(op); @@ -42,13 +65,13 @@ int SubgraphEngine::BuildDeviceProgram() { op->InferShape(); std::string op_type = op->op_info()->Type(); if (!bridges.Exists(op_type, TARGET(kRKNPU))) { - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); status |= bridges.Select(op_type, TARGET(kRKNPU))( reinterpret_cast(&graph), op, const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } // Collect the valid input and output nodes in the RKNPU IR graph and update @@ -91,7 +114,7 @@ int SubgraphEngine::BuildDeviceProgram() { model_name_, graph.GetHandle(), device_itensors_, device_otensors_); if (device_program_ == nullptr) { LOG(WARNING) << "[RKNPU] Build model failed!"; - return subgraph::FAILED; + return false; } // input @@ -165,10 +188,10 @@ int SubgraphEngine::BuildDeviceProgram() { break; } } - return status; + return true; } -int SubgraphEngine::LaunchDeviceProgram() { +bool SubgraphEngine::LaunchDeviceProgram() { LOG(INFO) << "[RKNPU]:LaunchDeviceProgram"; std::vector inputs; std::vector outputs; @@ -195,7 +218,7 @@ int SubgraphEngine::LaunchDeviceProgram() { device_program_->SetInputs(inputs); device_program_->Run(); device_program_->GetOutputs(outputs); - return 0; + return true; } void SubgraphCompute::PrepareForRun() { @@ -208,13 +231,12 @@ void SubgraphCompute::PrepareForRun() { param.output_data_names, param.scope)); CHECK(engine_); - engine_->Build(); } void SubgraphCompute::Run() { LOG(INFO) << "[RKNPU]:Run"; CHECK(engine_); - engine_->Launch(); + engine_->Run(); } } // namespace rknpu diff --git a/lite/kernels/rknpu/subgraph_compute.h b/lite/kernels/rknpu/subgraph_compute.h index 863e6aef39ad54f0e9d94d4b507c6fca4128ebb8..a4bdadc658a81decd8107072f7b5948613d0c68a 100644 --- a/lite/kernels/rknpu/subgraph_compute.h +++ b/lite/kernels/rknpu/subgraph_compute.h @@ -42,14 +42,15 @@ class SubgraphEngine : public subgraph::Engine { ctx, block_idx, block_desc, input_names, output_names, scope) {} protected: - int BuildDeviceProgram() override; - int LaunchDeviceProgram() override; + bool PrepareWorkspaceForDeviceProgram() override; + bool BuildDeviceProgram() override; + bool LaunchDeviceProgram() override; std::string model_name_; std::vector device_inames_; std::vector device_onames_; - std::vector> device_itensors_; - std::vector> device_otensors_; + std::vector> device_itensors_{}; + std::vector> device_otensors_{}; std::unique_ptr device_program_{nullptr}; }; diff --git a/lite/kernels/x86/activation_compute.cc b/lite/kernels/x86/activation_compute.cc index 2910364f37b74d94977e2397e31eb97fd367825e..9b4c2fadd9ce427db272a9bb0cfd0e0a10716f11 100644 --- a/lite/kernels/x86/activation_compute.cc +++ b/lite/kernels/x86/activation_compute.cc @@ -78,3 +78,13 @@ REGISTER_LITE_KERNEL(softsign, .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); + +REGISTER_LITE_KERNEL(sigmoid, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SoftsignCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/activation_compute_test.cc b/lite/kernels/x86/activation_compute_test.cc index 8cc2607e73e605214e08e42e70de457a206e2468..550cf299f676105271e758eb1a13e880045ee1cc 100644 --- a/lite/kernels/x86/activation_compute_test.cc +++ b/lite/kernels/x86/activation_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific 
language governing permissions and // limitations under the License. -#include "lite/kernels/x86/activation_compute.cc" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/activation_compute.cc" namespace paddle { namespace lite { @@ -26,8 +28,7 @@ namespace kernels { namespace x86 { TEST(relu_x86, retrive_op) { - auto relu = - KernelRegistry::Global().Create("relu"); + auto relu = KernelRegistry::Global().Create("relu"); ASSERT_FALSE(relu.empty()); ASSERT_TRUE(relu.front()); } diff --git a/lite/kernels/x86/attention_padding_mask_compute_test.cc b/lite/kernels/x86/attention_padding_mask_compute_test.cc index 35ce822e010fc3ce2dc756b86e3a437789cc8359..5c672a1ee05116ccefec074f54d0726a7cd010ea 100644 --- a/lite/kernels/x86/attention_padding_mask_compute_test.cc +++ b/lite/kernels/x86/attention_padding_mask_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/attention_padding_mask_compute.cc" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/attention_padding_mask_compute.cc" namespace paddle { namespace lite { @@ -81,8 +83,7 @@ int get_max_len(const LoD& lod) { TEST(attention_padding_mask_x86, retrive_op) { auto attention_padding_mask = - KernelRegistry::Global().Create( - "attention_padding_mask"); + KernelRegistry::Global().Create("attention_padding_mask"); ASSERT_FALSE(attention_padding_mask.empty()); ASSERT_TRUE(attention_padding_mask.front()); } diff --git a/lite/kernels/x86/batch_norm_compute_test.cc b/lite/kernels/x86/batch_norm_compute_test.cc index 5ec2cdcdda0e9ff3698c80584b36396b38328e03..dd70f78efa7334355c459fd1d85a7da4f5b05b60 100644 --- a/lite/kernels/x86/batch_norm_compute_test.cc +++ b/lite/kernels/x86/batch_norm_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/batch_norm_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/batch_norm_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace x86 { TEST(batch_norm_x86, retrive_op) { - auto batch_norm = - KernelRegistry::Global().Create( - "batch_norm"); + auto batch_norm = KernelRegistry::Global().Create("batch_norm"); ASSERT_FALSE(batch_norm.empty()); ASSERT_TRUE(batch_norm.front()); } diff --git a/lite/kernels/x86/cast_compute_test.cc b/lite/kernels/x86/cast_compute_test.cc index f7aa52ca6d0dde603357f009220b4a3a53f56833..b039cf5d3b01032e60ef7bdcf31a45c8ed302215 100644 --- a/lite/kernels/x86/cast_compute_test.cc +++ b/lite/kernels/x86/cast_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
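Note on the activation_compute.cc hunk above: the new block registers the `sigmoid` op but binds it to `paddle::lite::kernels::x86::SoftsignCompute`, i.e. it appears to reuse the softsign kernel class from the registration immediately before it. If that is unintended and a dedicated sigmoid kernel class exists in activation_compute.h (an assumption here, not something visible in this patch), the registration would presumably look like the sketch below; otherwise `sigmoid` resolves to the softsign computation on x86.

REGISTER_LITE_KERNEL(sigmoid,
                     kX86,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::x86::SigmoidCompute,  // hypothetical class name
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
    .Finalize();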
-#include "lite/kernels/x86/cast_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/cast_compute.h" namespace paddle { namespace lite { @@ -25,8 +27,7 @@ namespace kernels { namespace x86 { TEST(cast_x86, retrive_op) { - auto cast = - KernelRegistry::Global().Create("cast"); + auto cast = KernelRegistry::Global().Create("cast"); ASSERT_FALSE(cast.empty()); ASSERT_TRUE(cast.front()); } diff --git a/lite/kernels/x86/concat_compute_test.cc b/lite/kernels/x86/concat_compute_test.cc index 468e9422752561ff6416e8859b485462b9e2abbe..4be51dff6ed613842de431cce8a7960182073c4f 100644 --- a/lite/kernels/x86/concat_compute_test.cc +++ b/lite/kernels/x86/concat_compute_test.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/concat_compute.h" #include + #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/concat_compute.h" namespace paddle { namespace lite { @@ -23,9 +25,7 @@ namespace kernels { namespace x86 { TEST(concat_x86, retrive_op) { - auto concat = - KernelRegistry::Global().Create( - "concat"); + auto concat = KernelRegistry::Global().Create("concat"); ASSERT_FALSE(concat.empty()); ASSERT_TRUE(concat.front()); } diff --git a/lite/kernels/x86/conv_compute_test.cc b/lite/kernels/x86/conv_compute_test.cc index 2827c6577e5bf311b4002526d4ac10f636162d96..cd46571a2a9fd6b428f84ca278a453c8675d6ed6 100644 --- a/lite/kernels/x86/conv_compute_test.cc +++ b/lite/kernels/x86/conv_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/conv_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/conv_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(conv_x86, retrive_op) { - auto conv2d = - KernelRegistry::Global().Create( - "conv2d"); + auto conv2d = KernelRegistry::Global().Create("conv2d"); ASSERT_FALSE(conv2d.empty()); ASSERT_TRUE(conv2d.front()); } diff --git a/lite/kernels/x86/dropout_compute_test.cc b/lite/kernels/x86/dropout_compute_test.cc index 279f639f40ece0a10e45fe16f36fcb443cea550a..d30fbbea670d9509e722e3a27fd3dbf1d89a308c 100644 --- a/lite/kernels/x86/dropout_compute_test.cc +++ b/lite/kernels/x86/dropout_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/dropout_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/dropout_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace x86 { TEST(dropout_x86, retrive_op) { - auto dropout = - KernelRegistry::Global().Create( - "dropout"); + auto dropout = KernelRegistry::Global().Create("dropout"); ASSERT_FALSE(dropout.empty()); ASSERT_TRUE(dropout.front()); } diff --git a/lite/kernels/x86/elementwise_compute_test.cc b/lite/kernels/x86/elementwise_compute_test.cc index 9850c0ce86756cd12e28ab95688b79a1c539189c..6379faacad75f98f73eafbdfc2f8c9deb4d086cb 100644 --- a/lite/kernels/x86/elementwise_compute_test.cc +++ b/lite/kernels/x86/elementwise_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/elementwise_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/elementwise_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace x86 { TEST(elementwise_add_x86, retrive_op) { - auto elementwise_add = - KernelRegistry::Global().Create( - "elementwise_add"); + auto elementwise_add = KernelRegistry::Global().Create("elementwise_add"); ASSERT_FALSE(elementwise_add.empty()); ASSERT_TRUE(elementwise_add.front()); } diff --git a/lite/kernels/x86/elementwise_op_function.h b/lite/kernels/x86/elementwise_op_function.h index f736248ed3632af92dea2823439e6e7d28ff3e1b..4cb7160097e320798c1b1e2ee94d7fec8aedc6d6 100644 --- a/lite/kernels/x86/elementwise_op_function.h +++ b/lite/kernels/x86/elementwise_op_function.h @@ -22,7 +22,6 @@ limitations under the License. */ #include "lite/fluid/for_range.h" #include "lite/fluid/transform.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/paddle_enforce.h" #include "lite/utils/variant.h" namespace paddle { @@ -66,9 +65,8 @@ inline void get_mid_dims(const lite::DDim &x_dims, for (size_t i = 0; i < y_dims.size(); ++i) { if (x_dims[i + axis] != y_dims[i]) { // only support single y_dims[i] = 1 now. - PADDLE_ENFORCE_EQ( - *mid_flag, 0, "Broadcast support y_dims with single 1."); - PADDLE_ENFORCE_EQ(y_dims[i], 1, "Broadcast dimension mismatch."); + CHECK_EQ(*mid_flag, 0) << "Broadcast support y_dims with single 1."; + CHECK_EQ(y_dims[i], 1) << "Broadcast dimension mismatch."; // m*n*k m*1*k for (size_t j = 0; j < i; ++j) { (*pre) *= y_dims[j]; @@ -95,8 +93,7 @@ inline void get_mid_dims(const lite::DDim &x_dims, } for (size_t i = 0; i < y_dims.size(); ++i) { - PADDLE_ENFORCE_EQ( - x_dims[i + axis], y_dims[i], "Broadcast dimension mismatch."); + CHECK_EQ(x_dims[i + axis], y_dims[i]) << "Broadcast dimension mismatch."; (*n) *= y_dims[i]; } @@ -314,17 +311,16 @@ void ElementwiseComputeEx(const lite::Context &ctx, TransformFunctor functor(x, y, z, ctx, func); auto x_dims = x->dims(); auto y_dims_untrimed = y->dims(); - PADDLE_ENFORCE_GE(x_dims.size(), - y_dims_untrimed.size(), - "Rank of first input must >= rank of second input."); + CHECK_GE(x_dims.size(), y_dims_untrimed.size()) + << "Rank of first input must >= rank of second input."; if (x_dims == y_dims_untrimed) { functor.Run(); return; } axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis); - PADDLE_ENFORCE(axis >= 0 && axis < static_cast(x_dims.size()), - "Axis should be in range [0, x_dims)"); + CHECK(axis >= 0 && axis < static_cast(x_dims.size())) + << "Axis should be in range [0, x_dims)"; auto y_dims = trim_trailing_singular_dims(y_dims_untrimed); axis = (y_dims.size() == 0) ? 
x_dims.size() : axis; int pre, n, post, mid_flag = 0; @@ -560,9 +556,8 @@ void FusedElemwiseAndActComputeEx(const lite::Context &ctx, lite::Tensor *out, lite::Tensor *intermediate_out) { if (KeepIntermediateOut) { - PADDLE_ENFORCE(intermediate_out, - "The save_intermediate_out is opened, " - "intermediate_out should not be nullptr."); + CHECK(intermediate_out) << "The save_intermediate_out is opened, " + "intermediate_out should not be nullptr."; } const lite::DDim &x_dim = x.dims(); diff --git a/lite/kernels/x86/fill_constant_batch_size_like_compute_test.cc b/lite/kernels/x86/fill_constant_batch_size_like_compute_test.cc index 16bec18a1c1c4d0075e1ed1dcc4f3a3462917868..e3e8b13413808b447018ac14acf9d4a16c0f47a6 100644 --- a/lite/kernels/x86/fill_constant_batch_size_like_compute_test.cc +++ b/lite/kernels/x86/fill_constant_batch_size_like_compute_test.cc @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/fill_constant_batch_size_like_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/fill_constant_batch_size_like_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -26,8 +29,7 @@ namespace x86 { TEST(fill_constant_batch_size_like_x86, retrive_op) { auto fill_constant_batch_size_like = - KernelRegistry::Global().Create( - "fill_constant_batch_size_like"); + KernelRegistry::Global().Create("fill_constant_batch_size_like"); ASSERT_FALSE(fill_constant_batch_size_like.empty()); ASSERT_TRUE(fill_constant_batch_size_like.front()); } diff --git a/lite/kernels/x86/gather_compute_test.cc b/lite/kernels/x86/gather_compute_test.cc index 286dfcb08a0c2c7bc038e0ad3b5673bd7c0f8b19..63284452244b19b807f8b101cab5cbabbbf68476 100644 --- a/lite/kernels/x86/gather_compute_test.cc +++ b/lite/kernels/x86/gather_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/gather_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/gather_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(gather_x86, retrive_op) { - auto gather = - KernelRegistry::Global().Create( - "gather"); + auto gather = KernelRegistry::Global().Create("gather"); ASSERT_FALSE(gather.empty()); int cnt = 0; for (auto item = gather.begin(); item != gather.end(); ++item) { diff --git a/lite/kernels/x86/gelu_compute_test.cc b/lite/kernels/x86/gelu_compute_test.cc index e930cd32df91196fa9f4559ee6ba22bd8b82d337..9bda9ac4c1c0cee84141095b3100bb82a99661b7 100644 --- a/lite/kernels/x86/gelu_compute_test.cc +++ b/lite/kernels/x86/gelu_compute_test.cc @@ -13,10 +13,12 @@ // limitations under the License. 
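For context on the elementwise_op_function.h hunks above (where the PADDLE_ENFORCE_* calls are replaced by the equivalent CHECK_* macros): get_mid_dims splits x's shape around the region that y is broadcast over, so the elementwise kernels can loop over a [pre, n, post] view of x. Only the changed lines are visible in this diff, so the sketch below simply recomputes those three factors for made-up dimensions rather than calling the function itself:

#include <cassert>
#include <cstdint>

// Illustration: x has shape [2, 3, 4, 5], y has shape [3, 4], axis = 1.
// y then lines up with dims 1..2 of x, and the kernel can treat x as a
// [pre, n, post] = [2, 12, 5] block, broadcasting y across pre and post.
int main() {
  const int64_t x_dims[] = {2, 3, 4, 5};
  const int64_t y_dims[] = {3, 4};
  const int axis = 1;
  const int x_rank = 4, y_rank = 2;

  int64_t pre = 1, n = 1, post = 1;
  for (int i = 0; i < axis; ++i) pre *= x_dims[i];
  for (int i = 0; i < y_rank; ++i) {
    assert(x_dims[i + axis] == y_dims[i]);  // CHECK_EQ(...) << "..." in the real code
    n *= y_dims[i];
  }
  for (int i = axis + y_rank; i < x_rank; ++i) post *= x_dims[i];

  assert(pre == 2 && n == 12 && post == 5);
  return 0;
}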
#include + #include #include #include #include + #include "lite/core/op_registry.h" #include "lite/kernels/x86/activation_compute.cc" @@ -26,8 +28,7 @@ namespace kernels { namespace x86 { TEST(gelu_x86, retrive_op) { - auto gelu = - KernelRegistry::Global().Create("gelu"); + auto gelu = KernelRegistry::Global().Create("gelu"); ASSERT_FALSE(gelu.empty()); ASSERT_TRUE(gelu.front()); } diff --git a/lite/kernels/x86/gru_compute_test.cc b/lite/kernels/x86/gru_compute_test.cc index 3e0e944f23bafda6a5eb742a8e4b023c268c9955..c4a0045b3c1b27dfb1b518aede7dad2872cd1dc2 100644 --- a/lite/kernels/x86/gru_compute_test.cc +++ b/lite/kernels/x86/gru_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/gru_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/gru_compute.h" namespace paddle { namespace lite { @@ -26,8 +28,7 @@ namespace kernels { namespace x86 { TEST(gru_x86, retrive_op) { - auto gru = - KernelRegistry::Global().Create("gru"); + auto gru = KernelRegistry::Global().Create("gru"); ASSERT_FALSE(gru.empty()); ASSERT_TRUE(gru.front()); } diff --git a/lite/kernels/x86/layer_norm_compute.h b/lite/kernels/x86/layer_norm_compute.h index 46d151bbc406e19b498b87420029da7f9c1c2f12..ba75dad11b75441dc09b75224bfc4dfb271396a8 100644 --- a/lite/kernels/x86/layer_norm_compute.h +++ b/lite/kernels/x86/layer_norm_compute.h @@ -63,10 +63,10 @@ class LayerNormCompute : public KernelLite { out.ShareDataWith(*y); out.Resize(matrix_shape); - PADDLE_ENFORCE_EQ(Mean->numel(), left); - PADDLE_ENFORCE_EQ(Var->numel(), left); - PADDLE_ENFORCE_EQ(Scale->numel(), right); - PADDLE_ENFORCE_EQ(Bias->numel(), right); + CHECK_EQ(Mean->numel(), left); + CHECK_EQ(Var->numel(), left); + CHECK_EQ(Scale->numel(), right); + CHECK_EQ(Bias->numel(), right); auto ker = paddle::lite::jit::KernelFuncs, lite::fluid::CPUPlace>::Cache() diff --git a/lite/kernels/x86/layer_norm_compute_test.cc b/lite/kernels/x86/layer_norm_compute_test.cc index d39500a5e8827230ddeecd6bbe30f8c0a47ee929..617f1fae066aa6dc5068d293f8e977a2d37fe496 100644 --- a/lite/kernels/x86/layer_norm_compute_test.cc +++ b/lite/kernels/x86/layer_norm_compute_test.cc @@ -12,15 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/layer_norm_compute.h" #include + #include #include #include + #include "lite/backends/x86/jit/helper.h" #include "lite/backends/x86/jit/kernel_base.h" #include "lite/backends/x86/jit/kernels.h" #include "lite/core/op_registry.h" +#include "lite/kernels/x86/layer_norm_compute.h" namespace paddle { namespace lite { @@ -74,9 +76,7 @@ std::vector ref(lite::Tensor* x, // layer_norm TEST(layer_norm_x86, retrive_op) { - auto layer_norm = - KernelRegistry::Global().Create( - "layer_norm"); + auto layer_norm = KernelRegistry::Global().Create("layer_norm"); ASSERT_FALSE(layer_norm.empty()); ASSERT_TRUE(layer_norm.front()); } diff --git a/lite/kernels/x86/leaky_relu_compute_test.cc b/lite/kernels/x86/leaky_relu_compute_test.cc index 76daf4ff9ffc5dea8b532610abc917406356b3a5..75ebcf071298d072682b6ea535b3c8244c328500 100644 --- a/lite/kernels/x86/leaky_relu_compute_test.cc +++ b/lite/kernels/x86/leaky_relu_compute_test.cc @@ -13,8 +13,10 @@ // limitations under the License. 
#include + #include #include + #include "lite/core/op_registry.h" #include "lite/kernels/x86/activation_compute.h" @@ -24,9 +26,7 @@ namespace kernels { namespace x86 { TEST(leaky_relu_x86, retrive_op) { - auto leaky_relu = - KernelRegistry::Global().Create( - "leaky_relu"); + auto leaky_relu = KernelRegistry::Global().Create("leaky_relu"); ASSERT_FALSE(leaky_relu.empty()); ASSERT_TRUE(leaky_relu.front()); } diff --git a/lite/kernels/x86/match_matrix_tensor_compute_test.cc b/lite/kernels/x86/match_matrix_tensor_compute_test.cc index 0c3f3ad50940ab0059ab04fb507a786f735584b9..02ed8e1b4bb3a7bccc8560cb1f51166d3833e6bf 100644 --- a/lite/kernels/x86/match_matrix_tensor_compute_test.cc +++ b/lite/kernels/x86/match_matrix_tensor_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/match_matrix_tensor_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/match_matrix_tensor_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(match_matrix_tensor_x86, retrive_op) { - auto kernel = - KernelRegistry::Global().Create( - "match_matrix_tensor"); + auto kernel = KernelRegistry::Global().Create("match_matrix_tensor"); ASSERT_FALSE(kernel.empty()); ASSERT_TRUE(kernel.front()); } diff --git a/lite/kernels/x86/matmul_compute_test.cc b/lite/kernels/x86/matmul_compute_test.cc index 53d2d1a47a0cdbdaf5dfa83a79987d908171a36d..1e98702193af11ea8678bdfbc2382c7845c49b38 100644 --- a/lite/kernels/x86/matmul_compute_test.cc +++ b/lite/kernels/x86/matmul_compute_test.cc @@ -12,22 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/matmul_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/matmul_compute.h" + namespace paddle { namespace lite { namespace kernels { namespace x86 { TEST(matmul_x86, retrive_op) { - auto matmul = - KernelRegistry::Global().Create( - "matmul"); + auto matmul = KernelRegistry::Global().Create("matmul"); ASSERT_FALSE(matmul.empty()); ASSERT_TRUE(matmul.front()); } diff --git a/lite/kernels/x86/mul_compute_test.cc b/lite/kernels/x86/mul_compute_test.cc index 32d82cbb77aeb71dcd1c172ec0c1e343c3954fea..0d66a2dbd6eb27dac6acde47cc395c3c6245b1b5 100644 --- a/lite/kernels/x86/mul_compute_test.cc +++ b/lite/kernels/x86/mul_compute_test.cc @@ -12,21 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/mul_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/mul_compute.h" + namespace paddle { namespace lite { namespace kernels { namespace x86 { TEST(mul_x86, retrive_op) { - auto mul = - KernelRegistry::Global().Create("mul"); + auto mul = KernelRegistry::Global().Create("mul"); ASSERT_FALSE(mul.empty()); ASSERT_TRUE(mul.front()); } diff --git a/lite/kernels/x86/pool_compute_test.cc b/lite/kernels/x86/pool_compute_test.cc index 4ea727cedd5206f5f1ac2685297f72c3019bb313..d67d3a1de2248a1f8c180867c76b5d31affc11b9 100644 --- a/lite/kernels/x86/pool_compute_test.cc +++ b/lite/kernels/x86/pool_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/pool_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/pool_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace x86 { TEST(pool_x86, retrive_op) { - auto pool2d = - KernelRegistry::Global().Create( - "pool2d"); + auto pool2d = KernelRegistry::Global().Create("pool2d"); ASSERT_FALSE(pool2d.empty()); ASSERT_TRUE(pool2d.front()); } diff --git a/lite/kernels/x86/relu_compute_test.cc b/lite/kernels/x86/relu_compute_test.cc index 37ed6db7f919e31828f89462fa46d5263c480fcc..c2233bd04cf33c983db521335d88339592d2ce6b 100644 --- a/lite/kernels/x86/relu_compute_test.cc +++ b/lite/kernels/x86/relu_compute_test.cc @@ -13,8 +13,10 @@ // limitations under the License. #include + #include #include + #include "lite/core/op_registry.h" #include "lite/kernels/x86/activation_compute.h" @@ -24,8 +26,7 @@ namespace kernels { namespace x86 { TEST(relu_x86, retrive_op) { - auto relu = - KernelRegistry::Global().Create("relu"); + auto relu = KernelRegistry::Global().Create("relu"); ASSERT_FALSE(relu.empty()); ASSERT_TRUE(relu.front()); } diff --git a/lite/kernels/x86/reshape_compute_test.cc b/lite/kernels/x86/reshape_compute_test.cc index 16fc8f31aded0ef62fdf14aa671a73ccf6635fb7..88f38adee4aa413ac91bfdec0294c816020942b5 100644 --- a/lite/kernels/x86/reshape_compute_test.cc +++ b/lite/kernels/x86/reshape_compute_test.cc @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/reshape_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/reshape_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -26,9 +29,7 @@ namespace x86 { // reshape TEST(reshape_x86, retrive_op) { - auto reshape = - KernelRegistry::Global().Create( - "reshape"); + auto reshape = KernelRegistry::Global().Create("reshape"); ASSERT_FALSE(reshape.empty()); ASSERT_TRUE(reshape.front()); } @@ -86,9 +87,7 @@ TEST(reshape_x86, run_test) { // reshape2 TEST(reshape2_x86, retrive_op) { - auto reshape2 = - KernelRegistry::Global().Create( - "reshape2"); + auto reshape2 = KernelRegistry::Global().Create("reshape2"); ASSERT_FALSE(reshape2.empty()); ASSERT_TRUE(reshape2.front()); } diff --git a/lite/kernels/x86/scale_compute_test.cc b/lite/kernels/x86/scale_compute_test.cc index 6da27f444c7ed4c5a86e5f08a6c1612110bb02b9..dafb1e590f27f14208cff1e9aef79b28256cd048 100644 --- a/lite/kernels/x86/scale_compute_test.cc +++ b/lite/kernels/x86/scale_compute_test.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/scale_compute.h" #include + #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/scale_compute.h" namespace paddle { namespace lite { @@ -24,8 +26,7 @@ namespace kernels { namespace x86 { TEST(scale_x86, retrive_op) { - auto scale = - KernelRegistry::Global().Create("scale"); + auto scale = KernelRegistry::Global().Create("scale"); ASSERT_FALSE(scale.empty()); ASSERT_TRUE(scale.front()); } diff --git a/lite/kernels/x86/search_fc_compute_test.cc b/lite/kernels/x86/search_fc_compute_test.cc index 425df2a0f0544d7345923cb2efdce96074845311..515a5e30c81e9edd6b9ebb8e52955b5de6ec9e24 100644 --- a/lite/kernels/x86/search_fc_compute_test.cc +++ b/lite/kernels/x86/search_fc_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/search_fc_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/search_fc_compute.h" namespace paddle { namespace lite { @@ -53,9 +55,7 @@ void fc_cpu_base(const lite::Tensor* X, } TEST(search_fc_x86, retrive_op) { - auto search_fc = - KernelRegistry::Global().Create( - "search_fc"); + auto search_fc = KernelRegistry::Global().Create("search_fc"); ASSERT_FALSE(search_fc.empty()); ASSERT_TRUE(search_fc.front()); } diff --git a/lite/kernels/x86/search_grnn_compute_test.cc b/lite/kernels/x86/search_grnn_compute_test.cc index b85d97e3f1be1f2f02837d347e42ce6731c58414..d120ca7500513bc99b71bf0003ec31bcf1e2ac19 100644 --- a/lite/kernels/x86/search_grnn_compute_test.cc +++ b/lite/kernels/x86/search_grnn_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/search_grnn_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/search_grnn_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(search_grnn_x86, retrive_op) { - auto kernel = - KernelRegistry::Global().Create( - "search_grnn"); + auto kernel = KernelRegistry::Global().Create("search_grnn"); ASSERT_FALSE(kernel.empty()); ASSERT_TRUE(kernel.front()); } diff --git a/lite/kernels/x86/search_group_padding_compute_test.cc b/lite/kernels/x86/search_group_padding_compute_test.cc index f4c36c2a63488a6bb902a2b8b4ad81fa32b37672..ae2007e463c0fc97a099cd5ae902b623e361066c 100644 --- a/lite/kernels/x86/search_group_padding_compute_test.cc +++ b/lite/kernels/x86/search_group_padding_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/search_group_padding_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/search_group_padding_compute.h" namespace paddle { namespace lite { @@ -26,8 +28,7 @@ namespace x86 { TEST(search_group_padding_x86, retrieve_op) { auto search_group_padding = - KernelRegistry::Global().Create( - "search_group_padding"); + KernelRegistry::Global().Create("search_group_padding"); ASSERT_FALSE(search_group_padding.empty()); ASSERT_TRUE(search_group_padding.front()); } diff --git a/lite/kernels/x86/search_seq_depadding_compute_test.cc b/lite/kernels/x86/search_seq_depadding_compute_test.cc index 0d978b35ed040d6b7c44354f37999e6e34e2e3ef..32bf3276bb378beafbf273ffe7142b9b8fc493ac 100644 --- a/lite/kernels/x86/search_seq_depadding_compute_test.cc +++ b/lite/kernels/x86/search_seq_depadding_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/search_seq_depadding_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/search_seq_depadding_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(search_seq_depadding_x86, retrive_op) { - auto kernel = - KernelRegistry::Global().Create( - "search_seq_depadding"); + auto kernel = KernelRegistry::Global().Create("search_seq_depadding"); ASSERT_FALSE(kernel.empty()); ASSERT_TRUE(kernel.front()); } diff --git a/lite/kernels/x86/sequence_arithmetic_compute_test.cc b/lite/kernels/x86/sequence_arithmetic_compute_test.cc index 3b41e7d7ce37ebaf6a3f8518bc248ff4ec5c1aec..d80d3c2d1097fe2bbb47eb4c9d1384ae54d7fe8c 100644 --- a/lite/kernels/x86/sequence_arithmetic_compute_test.cc +++ b/lite/kernels/x86/sequence_arithmetic_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/sequence_arithmetic_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/sequence_arithmetic_compute.h" namespace paddle { namespace lite { @@ -77,8 +79,7 @@ void prepare_input(Tensor* x, const LoD& x_lod) { TEST(sequence_arithmetic_x86, retrive_op) { auto sequence_arithmetic = - KernelRegistry::Global().Create( - "sequence_arithmetic"); + KernelRegistry::Global().Create("sequence_arithmetic"); ASSERT_FALSE(sequence_arithmetic.empty()); ASSERT_TRUE(sequence_arithmetic.front()); } diff --git a/lite/kernels/x86/sequence_concat_compute_test.cc b/lite/kernels/x86/sequence_concat_compute_test.cc index eb6678a655ed1eb5a7bcda1dc2a6b8afe4477d2d..9899e6c08a1d1af9dea3728b5105ff78286de819 100644 --- a/lite/kernels/x86/sequence_concat_compute_test.cc +++ b/lite/kernels/x86/sequence_concat_compute_test.cc @@ -12,12 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/sequence_concat_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/sequence_concat_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -94,9 +97,7 @@ static void sequence_concat_ref(const std::vector& xs, } // namespace TEST(sequence_concat_x86, retrive_op) { - auto sequence_concat = - KernelRegistry::Global().Create( - "sequence_concat"); + auto sequence_concat = KernelRegistry::Global().Create("sequence_concat"); ASSERT_FALSE(sequence_concat.empty()); ASSERT_TRUE(sequence_concat.front()); } diff --git a/lite/kernels/x86/sequence_expand_as_compute_test.cc b/lite/kernels/x86/sequence_expand_as_compute_test.cc index d49fdbb7a6164435abb9eb7189b18376066d55df..6eafb5f1e5275e375b7c61fda3c437b6959b8dd2 100644 --- a/lite/kernels/x86/sequence_expand_as_compute_test.cc +++ b/lite/kernels/x86/sequence_expand_as_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/sequence_expand_as_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/sequence_expand_as_compute.h" namespace paddle { namespace lite { @@ -27,8 +29,7 @@ namespace x86 { TEST(sequence_expand_as_x86, retrive_op) { auto sequence_expand_as = - KernelRegistry::Global().Create( - "sequence_expand_as"); + KernelRegistry::Global().Create("sequence_expand_as"); ASSERT_FALSE(sequence_expand_as.empty()); ASSERT_TRUE(sequence_expand_as.front()); } diff --git a/lite/kernels/x86/sequence_pool_compute_test.cc b/lite/kernels/x86/sequence_pool_compute_test.cc index 372bfaf8741cdcdc902efb6b8380eb4c34dd49ad..35116adbf6f06b87482cfff99182ee6c675ba7ed 100644 --- a/lite/kernels/x86/sequence_pool_compute_test.cc +++ b/lite/kernels/x86/sequence_pool_compute_test.cc @@ -12,21 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/sequence_pool_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/sequence_pool_compute.h" + namespace paddle { namespace lite { namespace kernels { namespace x86 { TEST(sequence_pool_x86, retrive_op) { - auto sequence_pool = - KernelRegistry::Global().Create( - "sequence_pool"); + auto sequence_pool = KernelRegistry::Global().Create("sequence_pool"); ASSERT_FALSE(sequence_pool.empty()); ASSERT_TRUE(sequence_pool.front()); } diff --git a/lite/kernels/x86/sequence_reverse_compute_test.cc b/lite/kernels/x86/sequence_reverse_compute_test.cc index adf9981b242bfbb7f60989369715354cc2043685..37c2f9571d486a36eccc1f01c06a1550d4609730 100644 --- a/lite/kernels/x86/sequence_reverse_compute_test.cc +++ b/lite/kernels/x86/sequence_reverse_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/sequence_reverse_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/sequence_reverse_compute.h" namespace paddle { namespace lite { @@ -44,9 +46,7 @@ static void sequence_reverse_ref(const lite::Tensor* x, lite::Tensor* y) { } // namespace TEST(sequence_reverse_x86, retrive_op) { - auto sequence_reverse = - KernelRegistry::Global().Create( - "sequence_reverse"); + auto sequence_reverse = KernelRegistry::Global().Create("sequence_reverse"); ASSERT_FALSE(sequence_reverse.empty()); ASSERT_TRUE(sequence_reverse.front()); } diff --git a/lite/kernels/x86/sgd_compute.cc b/lite/kernels/x86/sgd_compute.cc index a3241468f9f09d66401aa83e0d738779e555dfba..dd056e30209953c1f360d714db50e3236f278510 100644 --- a/lite/kernels/x86/sgd_compute.cc +++ b/lite/kernels/x86/sgd_compute.cc @@ -41,8 +41,8 @@ class SGDCompute : public KernelLite { auto *param_out = &sgd_param.ParamOut->raw_tensor(); auto sz = param_out->numel(); - PADDLE_ENFORCE_EQ(param->numel(), sz); - PADDLE_ENFORCE_EQ(grad->numel(), sz); + CHECK_EQ(param->numel(), sz); + CHECK_EQ(grad->numel(), sz); paddle::operators::jit::sgd_attr_t attr(1, sz, 1, sz, 1); const T *lr = learning_rate->template data(); diff --git a/lite/kernels/x86/shape_compute_test.cc b/lite/kernels/x86/shape_compute_test.cc index 88bd98f33ffc7a727de584543bc7392cdbb2883f..9fe5e6c51eaee783072717cea055b00b75c59c07 100644 --- a/lite/kernels/x86/shape_compute_test.cc +++ b/lite/kernels/x86/shape_compute_test.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/shape_compute.h" #include + #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/shape_compute.h" namespace paddle { namespace lite { @@ -23,8 +25,7 @@ namespace kernels { namespace x86 { TEST(shape_x86, retrive_op) { - auto shape = - KernelRegistry::Global().Create("shape"); + auto shape = KernelRegistry::Global().Create("shape"); ASSERT_FALSE(shape.empty()); ASSERT_TRUE(shape.front()); } diff --git a/lite/kernels/x86/slice_compute.h b/lite/kernels/x86/slice_compute.h index ad30215691cde66ab1c7c8c57930fc6d58de7cd5..d32327668bac389e42ff9411be50ce3df42e39ff 100644 --- a/lite/kernels/x86/slice_compute.h +++ b/lite/kernels/x86/slice_compute.h @@ -157,7 +157,7 @@ void slice_compute(const lite::Tensor* in, } } - out->mutable_data(lite::TargetType::kX86); + out->mutable_data(); auto new_out_dims = out->dims(); auto offsets = Eigen::array(); diff --git a/lite/kernels/x86/slice_compute_test.cc b/lite/kernels/x86/slice_compute_test.cc index a62a62cd88ce48c4d47d784ecbc2fd16d0f433d1..b978d4533ccb28ae8826b8304d93f9bdbe85d106 100644 --- a/lite/kernels/x86/slice_compute_test.cc +++ b/lite/kernels/x86/slice_compute_test.cc @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/slice_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/slice_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -79,8 +82,7 @@ static void slice_ref(const float* input, } TEST(slice_x86, retrive_op) { - auto slice = - KernelRegistry::Global().Create("slice"); + auto slice = KernelRegistry::Global().Create("slice"); ASSERT_FALSE(slice.empty()); ASSERT_TRUE(slice.front()); } diff --git a/lite/kernels/x86/softmax_compute_test.cc b/lite/kernels/x86/softmax_compute_test.cc index 0debeecb3150dfdd2626b6f8f3f6b5ef63981d93..f3def92992c7ca01e75d12b86b2680768a9fd2ee 100644 --- a/lite/kernels/x86/softmax_compute_test.cc +++ b/lite/kernels/x86/softmax_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/softmax_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/softmax_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(softmax_x86, retrive_op) { - auto softmax = - KernelRegistry::Global().Create( - "softmax"); + auto softmax = KernelRegistry::Global().Create("softmax"); ASSERT_FALSE(softmax.empty()); ASSERT_TRUE(softmax.front()); } diff --git a/lite/kernels/x86/stack_compute_test.cc b/lite/kernels/x86/stack_compute_test.cc index d105165a98f936b7a6973e57f5199977a0b8bed3..33942fca96508d2868520e5b5e242b83a1f38b0e 100644 --- a/lite/kernels/x86/stack_compute_test.cc +++ b/lite/kernels/x86/stack_compute_test.cc @@ -12,12 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/stack_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/stack_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -25,8 +28,7 @@ namespace x86 { // stack TEST(stack_x86, retrive_op) { - auto stack = - KernelRegistry::Global().Create("stack"); + auto stack = KernelRegistry::Global().Create("stack"); ASSERT_FALSE(stack.empty()); ASSERT_TRUE(stack.front()); } diff --git a/lite/kernels/x86/tanh_compute_test.cc b/lite/kernels/x86/tanh_compute_test.cc index 8132505fad6d93997c73ffb735a4a798c15d87a6..6cba531fd34df029a1cdaaf9d6925e379796260d 100644 --- a/lite/kernels/x86/tanh_compute_test.cc +++ b/lite/kernels/x86/tanh_compute_test.cc @@ -13,10 +13,12 @@ // limitations under the License. 
#include + #include #include #include #include + #include "lite/core/op_registry.h" #include "lite/kernels/x86/activation_compute.cc" @@ -26,8 +28,7 @@ namespace kernels { namespace x86 { TEST(tanh_x86, retrive_op) { - auto tanh = - KernelRegistry::Global().Create("tanh"); + auto tanh = KernelRegistry::Global().Create("tanh"); ASSERT_FALSE(tanh.empty()); ASSERT_TRUE(tanh.front()); } diff --git a/lite/kernels/x86/transpose_compute.h b/lite/kernels/x86/transpose_compute.h index 5f6faed2017b6bdef60e7505bf1f0088d86b3ec1..87e7fee7deec711914bd43039301f7180a4bcaa0 100644 --- a/lite/kernels/x86/transpose_compute.h +++ b/lite/kernels/x86/transpose_compute.h @@ -60,7 +60,7 @@ inline void TransCompute(const int dim, trans6(context, in, out, axis); break; default: - PADDLE_THROW("Tensors with rank at most 6 are supported"); + LOG(FATAL) << "Tensors with rank at most 6 are supported"; } } diff --git a/lite/kernels/x86/transpose_compute_test.cc b/lite/kernels/x86/transpose_compute_test.cc index d8533d98258637eba516974e03cd4d88fd452293..aa99db36c450326765d602aaf0b48f72a1a63e13 100644 --- a/lite/kernels/x86/transpose_compute_test.cc +++ b/lite/kernels/x86/transpose_compute_test.cc @@ -12,12 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/transpose_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/transpose_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -25,9 +28,7 @@ namespace x86 { // transpose TEST(transpose_x86, retrive_op) { - auto transpose = - KernelRegistry::Global().Create( - "transpose"); + auto transpose = KernelRegistry::Global().Create("transpose"); ASSERT_FALSE(transpose.empty()); ASSERT_TRUE(transpose.front()); } @@ -75,9 +76,7 @@ TEST(transpose_x86, run_test) { // transpose2 TEST(transpose2_x86, retrive_op) { - auto transpose2 = - KernelRegistry::Global().Create( - "transpose2"); + auto transpose2 = KernelRegistry::Global().Create("transpose2"); ASSERT_FALSE(transpose2.empty()); ASSERT_TRUE(transpose2.front()); } diff --git a/lite/kernels/x86/var_conv_2d_compute_test.cc b/lite/kernels/x86/var_conv_2d_compute_test.cc index edef8cb2df75dfb45ad4964975365d4ddbbe9086..a6787b2e3e84360a63618f130305446316a08e01 100644 --- a/lite/kernels/x86/var_conv_2d_compute_test.cc +++ b/lite/kernels/x86/var_conv_2d_compute_test.cc @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/var_conv_2d_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" #include "lite/core/tensor.h" +#include "lite/kernels/x86/var_conv_2d_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -197,9 +200,7 @@ static void var_conv_2d_ref(const lite::Tensor* bottom, } TEST(var_conv_2d_x86, retrive_op) { - auto var_conv_2d = - KernelRegistry::Global().Create( - "var_conv_2d"); + auto var_conv_2d = KernelRegistry::Global().Create("var_conv_2d"); ASSERT_FALSE(var_conv_2d.empty()); ASSERT_TRUE(var_conv_2d.front()); } diff --git a/lite/kernels/xpu/CMakeLists.txt b/lite/kernels/xpu/CMakeLists.txt index 7ded008387b7d7c92fb2ce6b18e73e1c1e51f29d..fdb485df02f366f7f4868965b1f20c6861b03d43 100644 --- a/lite/kernels/xpu/CMakeLists.txt +++ b/lite/kernels/xpu/CMakeLists.txt @@ -6,6 +6,7 @@ if(LITE_WITH_XTCL) add_subdirectory(bridges) add_kernel(subgraph_compute_xpu XPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_xpu subgraph_bridge_engine ${xpu_subgraph_bridges}) else() + # basic add_kernel(conv_compute_xpu XPU basic SRCS conv_compute.cc DEPS ${lite_kernel_deps}) add_kernel(io_copy_compute_xpu XPU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} target_wrapper_xpu) add_kernel(batch_norm_compute_xpu XPU basic SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps}) @@ -15,15 +16,32 @@ else() add_kernel(mul_compute_xpu XPU basic SRCS mul_compute.cc DEPS ${lite_kernel_deps}) add_kernel(softmax_compute_xpu XPU basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps}) add_kernel(scale_compute_xpu XPU basic SRCS scale_compute.cc DEPS ${lite_kernel_deps}) - add_kernel(lookup_table_compute_xpu XPU basic SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps}) - add_kernel(layer_norm_compute_xpu XPU basic SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps}) add_kernel(dropout_compute_xpu XPU basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps}) add_kernel(matmul_compute_xpu XPU basic SRCS matmul_compute.cc DEPS ${lite_kernel_deps}) add_kernel(stack_compute_xpu XPU basic SRCS stack_compute.cc DEPS ${lite_kernel_deps}) add_kernel(slice_compute_xpu XPU basic SRCS slice_compute.cc DEPS ${lite_kernel_deps}) add_kernel(cast_compute_xpu XPU basic SRCS cast_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(sequence_topk_avg_pooling_compute_xpu XPU basic SRCS sequence_topk_avg_pooling_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(concat_compute_xpu XPU basic SRCS concat_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(search_fc_compute_xpu XPU basic SRCS search_fc_compute.cc DEPS ${lite_kernel_deps}) + + # extra + add_kernel(lookup_table_compute_xpu XPU extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(layer_norm_compute_xpu XPU extra SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(sequence_reverse_compute_xpu XPU extra SRCS sequence_reverse_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(sequence_concat_compute_xpu XPU extra SRCS sequence_concat_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(sequence_arithmetic_compute_xpu XPU extra SRCS sequence_arithmetic_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(sequence_pool_compute_xpu XPU extra SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(match_matrix_tensor_compute_xpu XPU extra SRCS match_matrix_tensor_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(var_conv_2d_compute_xpu XPU extra SRCS var_conv_2d_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(search_grnn_compute_xpu XPU extra SRCS 
search_grnn_compute.cc DEPS ${lite_kernel_deps}) + + # extra(fused kernel) add_kernel(__xpu__resnet50_compute_xpu XPU extra SRCS __xpu__resnet50_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(__xpu__resnet_cbam_compute_xpu XPU extra SRCS __xpu__resnet_cbam_compute.cc DEPS ${lite_kernel_deps}) add_kernel(__xpu__multi_encoder_compute_xpu XPU extra SRCS __xpu__multi_encoder_compute.cc DEPS ${lite_kernel_deps}) add_kernel(__xpu__embedding_with_eltwise_add_compute_xpu XPU extra SRCS __xpu__embedding_with_eltwise_add_compute.cc DEPS ${lite_kernel_deps}) add_kernel(__xpu__fc_compute_xpu XPU extra SRCS __xpu__fc_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(__xpu__search_attention_compute_xpu XPU extra SRCS __xpu__search_attention_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(__xpu__mmdnn_compute_xpu XPU extra SRCS __xpu__mmdnn_compute.cc DEPS ${lite_kernel_deps}) endif() diff --git a/lite/kernels/xpu/__xpu__mmdnn_compute.cc b/lite/kernels/xpu/__xpu__mmdnn_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..39ddecb1139073cb1a0bd8e3c7afc89f1d739da8 --- /dev/null +++ b/lite/kernels/xpu/__xpu__mmdnn_compute.cc @@ -0,0 +1,1386 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
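The remainder of the patch is the new fused kernel file lite/kernels/xpu/__xpu__mmdnn_compute.cc registered above (roughly 1.4k lines). Reading ahead, it is organized as a set of host-side helper classes, each wrapping one stage of the MMDNN match pipeline: MMDNNIdInfo packs the LoD/offset metadata and copies it to the device in a single xpu_memcpy, MMDNNFcOp wraps the int16 GEMM with cached max values, MMDNNGrnnOp chains seq2batch, the batched GRNN and batch2seq, MMDNNAttentionOp chains an FC, two batched matmuls, a sequence softmax and max pooling, and MMDNNMatchConvTopk covers the match-matrix, convolution and top-k average-pooling branch.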
+ +#include +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +namespace { + +void FillMax(float max, float* xpu_ptr) { + float maxs[4] = {max, 0.0f, 0.0f, 0.0f}; + xpu_memcpy( + xpu_ptr, maxs, 4 * sizeof(float), XPUMemcpyKind::XPU_HOST_TO_DEVICE); +} + +void GrnnLayout(int batch, + const std::vector& offset, + std::vector* new_offset_ptr, + std::vector* idx_sorted_ptr) { + auto& new_offset = *new_offset_ptr; + auto& idx_sorted = *idx_sorted_ptr; + + std::vector width; + width.resize(batch); + new_offset.clear(); + idx_sorted.clear(); + + idx_sorted.resize(batch); + for (int i = 0; i < batch; i++) { + width[i] = offset[i + 1] - offset[i]; + idx_sorted[i] = i; + } + std::sort(idx_sorted.data(), + idx_sorted.data() + batch, + [&width](int a, int b) { return width[a] > width[b]; }); + int max_width = width[idx_sorted[0]]; + new_offset.resize(max_width + 1); + new_offset[0] = 0; + int j = batch - 1; + int last_width = 0; + int sub_row = 0; + int sub_col = 0; + + for (int i = 1; i <= max_width;) { + for (int k = j; k >= 0; --k) { + if (width[idx_sorted[k]] > last_width) { + sub_row = width[idx_sorted[k]] - last_width; + sub_col = k + 1; + for (int s = 0; s < sub_row; s++) { + new_offset[i] = new_offset[i - 1] + sub_col; + i++; + } + // move on + last_width = width[idx_sorted[k]]; + j = k - 1; + break; + } + } + } +} + +} // anonymous namespace + +class MMDNNIdInfo { + XPUScratchPadGuard l3_buffer_guard_; + char* l3_buffer_{nullptr}; + std::unique_ptr cpu_buffer_guard_; + char* cpu_buffer_{nullptr}; + + public: + const int64_t* id0_64{nullptr}; + const int64_t* id1_64{nullptr}; + int64_t* lod_64{nullptr}; + int* lod_32{nullptr}; + int* new_offset_32{nullptr}; + int* idx_sorted_32{nullptr}; + + std::vector lod; + std::vector new_offset; + std::vector idx_sorted; + int batch; + int seqlen_max; + int seqlen_sum; + int seqlen_square_sum; + + void Init(int upper_bound_batch, int upper_bound_seqlen) { + int ub_lod_64_size = (upper_bound_batch + 1) * sizeof(int64_t); + int ub_lod_32_size = (upper_bound_batch + 1) * sizeof(int); + int ub_new_offset_32_size = (upper_bound_seqlen + 1) * sizeof(int); + int ub_idx_sorted_32_size = (upper_bound_batch + 1) * sizeof(int); + int total_size = ub_lod_64_size + ub_lod_32_size + ub_new_offset_32_size + + ub_idx_sorted_32_size; + + // TODO(miaotianxiang): use l3? 
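  // Aside, for readers of GrnnLayout above (illustration only, not used by the
  // code): the helper produces the time-major batch layout that
  // xdnn::search_seq2batch / xdnn::search_grnn below consume. With
  // lod = {0, 3, 4, 6} (three sequences of length 3, 1 and 2):
  //   width      = {3, 1, 2}
  //   idx_sorted = {0, 2, 1}     // sequence ids, longest first
  //   new_offset = {0, 3, 5, 6}  // 3 rows at step 0, 2 at step 1, 1 at step 2
  // so new_offset[t + 1] - new_offset[t] is the number of sequences still
  // active at time step t, which is what the batched GRNN iterates over.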
+ l3_buffer_guard_ = TargetWrapperXPU::MallocScratchPad(total_size, false); + l3_buffer_ = reinterpret_cast(l3_buffer_guard_->addr_); + cpu_buffer_guard_.reset(new char[total_size]); + cpu_buffer_ = cpu_buffer_guard_.get(); + } + + void Update(lite::Tensor* id0, lite::Tensor* id1) { + auto& id0_lod = id0->lod()[0]; + lod.clear(); + for (auto e : id0_lod) { + lod.push_back(e); + } + + seqlen_max = 0; + seqlen_sum = 0; + seqlen_square_sum = 0; + batch = lod.size() - 1; + for (int i = 0; i < batch; i++) { + int seqlen = lod[i + 1] - lod[i]; + seqlen_max = std::max(seqlen_max, seqlen); + seqlen_sum = seqlen_sum + seqlen; + seqlen_square_sum = seqlen_square_sum + seqlen * seqlen; + } + GrnnLayout(batch, lod, &new_offset, &idx_sorted); + + id0_64 = id0->data(); + id1_64 = id1->data(); + + int offset = 0; + lod_64 = reinterpret_cast(l3_buffer_ + offset); + memcpy( + cpu_buffer_ + offset, id0_lod.data(), id0_lod.size() * sizeof(int64_t)); + offset += id0_lod.size() * sizeof(int64_t); + lod_32 = reinterpret_cast(l3_buffer_ + offset); + memcpy(cpu_buffer_ + offset, lod.data(), lod.size() * sizeof(int)); + offset += lod.size() * sizeof(int); + new_offset_32 = reinterpret_cast(l3_buffer_ + offset); + memcpy(cpu_buffer_ + offset, + new_offset.data(), + new_offset.size() * sizeof(int)); + offset += new_offset.size() * sizeof(int); + idx_sorted_32 = reinterpret_cast(l3_buffer_ + offset); + memcpy(cpu_buffer_ + offset, + idx_sorted.data(), + idx_sorted.size() * sizeof(int)); + offset += idx_sorted.size() * sizeof(int); + xpu_memcpy( + l3_buffer_, cpu_buffer_, offset, XPUMemcpyKind::XPU_HOST_TO_DEVICE); + } +}; + +class MMDNNFcOp { + const int16_t* weight_{nullptr}; + XPUScratchPadGuard weight_max_guard_; + float* weight_max_{nullptr}; + const float* bias_{nullptr}; + XPUScratchPadGuard in_max_guard_; + float* in_max_{nullptr}; + int n_; + int k_; + xdnn::Activation_t::act_enum act_type_; + XPUScratchPadGuard out_max_guard_; + + public: + float* out_max{nullptr}; + + void Init(const int16_t* weight, + float weight_max, + const float* bias, + int n, + int k, + xdnn::Activation_t::act_enum act_type) { + n_ = n; + k_ = k; + act_type_ = act_type; + + weight_ = weight; + weight_max_guard_ = + TargetWrapperXPU::MallocScratchPad(4 * sizeof(float), false); + weight_max_ = reinterpret_cast(weight_max_guard_->addr_); + FillMax(weight_max, weight_max_); + + bias_ = bias; + + in_max_guard_ = + TargetWrapperXPU::MallocScratchPad(4 * sizeof(float), false); + out_max_guard_ = + TargetWrapperXPU::MallocScratchPad(4 * sizeof(float), false); + in_max_ = reinterpret_cast(in_max_guard_->addr_); + out_max = reinterpret_cast(in_max_guard_->addr_); + } + + void Init(lite::Tensor* weight, + float weight_max, + lite::Tensor* bias, + int n, + int k, + xdnn::Activation_t::act_enum act_type) { + Init(weight->data(), + weight_max, + bias ? 
bias->data() : nullptr, + n, + k, + act_type); + } + + void Infer(xdnn::Context* ctx, + const float* in, + int m, + float* out, + const float* in_max_by_caller = nullptr) { + if (in_max_by_caller == nullptr) { + xdnn::findmax(ctx, in, m * k_, in_max_); + in_max_by_caller = in_max_; + } + xdnn::gemm_int16_maxptr(ctx, + false, + true, + m, + n_, + k_, + 1.0f, + in, + k_, + weight_, + k_, + 0.0f, + out, + n_, + bias_, + act_type_, + in_max_by_caller, + weight_max_, + out_max); + } +}; + +class MMDNNGrnnOp { + MMDNNFcOp fc_e2h0_; + MMDNNFcOp fc_e2h1_; + MMDNNFcOp fc_e2h2_; + const int16_t* dense_h2h_{nullptr}; + float dense_h2h_max_[3]; + XPUScratchPadGuard input_max_guard_; + float* input_max_{nullptr}; + XPUScratchPadGuard hbm_buffer_guard_; + float* hbm_buffer_{nullptr}; + // require: cap_l * max(cap_e_, cap_h_) * 5 + // seq2batch_out: [cap_l, cap_e_] + // fc_e2h_out: [3, cap_l, cap_h_] + // gru_out: [cap_l, cap_h_] + int cap_e_; + int cap_h_; + int max_cap_l_; + + public: + void Init(lite::Tensor* wh, + const std::vector& wh_maxs, + lite::Tensor* wi, + const std::vector& wi_maxs, + int cap_e, + int cap_h, + int max_cap_l) { + cap_e_ = cap_e; + cap_h_ = cap_h; + max_cap_l_ = max_cap_l; + + // weight + auto* dense_e2h = wi->data(); + fc_e2h0_.Init(dense_e2h, + wi_maxs[0], + nullptr, + cap_h_, + cap_e_, + xdnn::Activation_t::LINEAR); + fc_e2h1_.Init(dense_e2h + cap_e_ * cap_h_, + wi_maxs[1], + nullptr, + cap_h_, + cap_e_, + xdnn::Activation_t::LINEAR); + fc_e2h2_.Init(dense_e2h + cap_e_ * cap_h_ * 2, + wi_maxs[2], + nullptr, + cap_h_, + cap_e_, + xdnn::Activation_t::LINEAR); + + dense_h2h_ = wh->data(); + dense_h2h_max_[0] = wh_maxs[0]; + dense_h2h_max_[1] = wh_maxs[1]; + dense_h2h_max_[2] = wh_maxs[2]; + + input_max_guard_ = + TargetWrapperXPU::MallocScratchPad(4 * sizeof(float), false); + input_max_ = reinterpret_cast(input_max_guard_->addr_); + hbm_buffer_guard_ = TargetWrapperXPU::MallocScratchPad( + 5 * std::max(cap_e_, cap_h_) * max_cap_l_ * sizeof(float), false); + hbm_buffer_ = reinterpret_cast(hbm_buffer_guard_->addr_); + } + + void Infer(xdnn::Context* ctx, + const MMDNNIdInfo& sentense, + const float* in, + float* out, + float* l3_buffer = nullptr, + int l3_size = 0) { + int batch = sentense.batch; + int cap_l = sentense.seqlen_sum; + int max_width = sentense.seqlen_max; + + int slot_size = cap_l * std::max(cap_e_, cap_h_); + float* seq2batch_out = hbm_buffer_; + float* fc_e2h_out = hbm_buffer_ + 1 * slot_size; + float* gru_out = hbm_buffer_ + 4 * slot_size; + if (l3_size > 0 && l3_size >= 5 * slot_size * sizeof(float)) { + seq2batch_out = l3_buffer; + fc_e2h_out = l3_buffer + 1 * slot_size; + gru_out = l3_buffer + 4 * slot_size; + } + + xdnn::search_seq2batch(ctx, + batch, + max_width, + cap_e_, + sentense.idx_sorted_32, + sentense.lod_32, + sentense.new_offset_32, + in, + seq2batch_out); + + xdnn::findmax(ctx, in, cap_l * cap_e_, input_max_); + fc_e2h0_.Infer(ctx, seq2batch_out, cap_l, fc_e2h_out, input_max_); + fc_e2h1_.Infer( + ctx, seq2batch_out, cap_l, fc_e2h_out + cap_l * cap_h_, input_max_); + fc_e2h2_.Infer( + ctx, seq2batch_out, cap_l, fc_e2h_out + cap_l * cap_h_ * 2, input_max_); + xdnn::search_grnn(ctx, + cap_l, + cap_h_, + cap_e_, + max_width, + sentense.new_offset_32, + fc_e2h_out, + dense_h2h_, + gru_out, + dense_h2h_max_[0], + dense_h2h_max_[1], + dense_h2h_max_[2]); + + xdnn::search_batch2seq(ctx, + batch, + max_width, + cap_h_, + sentense.idx_sorted_32, + sentense.lod_32, + sentense.new_offset_32, + gru_out, + out); + } +}; + +class MMDNNAttentionOp { + int dim_; 
+ float alpha0_; + float alpha1_; + MMDNNFcOp seqfc_; + XPUScratchPadGuard hbm_buffer_guard_; + float* hbm_buffer_{nullptr}; + // require: cap_l * dim_ + seqlen_square_sum + // seqfc_out: [cap_l, dim_] + // batchgemm0_out: [seqlen_square_sum] + // seq_softmax_out: [seqlen_square_sum], reuse of batchgemm0_out + // batchgemm1_out: [cap_l, dim_], reuse of seqfc_out + + public: + void Init(lite::Tensor* att_fc_w, + float att_fc_w_max, + lite::Tensor* att_fc_b, + int dim, + int upper_bound_batch, + int upper_bound_seqlen) { + dim_ = dim; + alpha0_ = 0.0883883461356163f; // TODO(miaotianxiang): + alpha1_ = 1.0f; + + seqfc_.Init(att_fc_w, + att_fc_w_max, + att_fc_b, + dim_, + dim_, + xdnn::Activation_t::LINEAR); + hbm_buffer_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch * (upper_bound_seqlen * dim_ + + upper_bound_seqlen * upper_bound_seqlen)) * + sizeof(float), + false); + hbm_buffer_ = reinterpret_cast(hbm_buffer_guard_->addr_); + } + + void Infer(xdnn::Context* ctx, + const MMDNNIdInfo& sentense, + const float* input, + float* pool_out, + float* l3_buffer = nullptr, + int l3_size = 0) { + int batch = sentense.batch; + int cap_l = sentense.seqlen_sum; + int max_width = sentense.seqlen_max; + int* lod_32 = sentense.lod_32; + + float* seqfc_out = hbm_buffer_; + float* batchgemm0_out = hbm_buffer_ + cap_l * dim_; + float* seq_softmax_out = batchgemm0_out; + float* batchgemm1_out = seqfc_out; + if (l3_size > 0 && + l3_size >= + (cap_l * dim_ + sentense.seqlen_square_sum) * sizeof(float)) { + seqfc_out = l3_buffer; + batchgemm0_out = l3_buffer + cap_l * dim_; + seq_softmax_out = batchgemm0_out; + batchgemm1_out = seqfc_out; + } + + seqfc_.Infer(ctx, input, cap_l, seqfc_out); + xdnn::search_noaligned_mat_mul(ctx, + 0, + 1, + batch, + lod_32, + max_width, + dim_, + alpha0_, + input, + seqfc_out, + batchgemm0_out); + xdnn::search_seq_softmax( + ctx, batchgemm0_out, seq_softmax_out, lod_32, batch, max_width); + xdnn::search_noaligned_mat_mul(ctx, + 0, + 0, + batch, + lod_32, + max_width, + dim_, + alpha1_, + seq_softmax_out, + input, + batchgemm1_out); + xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::MAX_WITHOUT_INDEX, + batch, + lod_32, + dim_, + batchgemm1_out, + nullptr, + pool_out); + } +}; + +class MMDNNMatchConvTopk { + std::vector topks_; + int dim_t_; + int dim_in_; + int out_channel_; + + MMDNNFcOp xw_fc_; + const int16_t* conv_weight_{nullptr}; + float conv_weight_max_; + XPUScratchPadGuard hbm_buffer_guard_; + float* hbm_buffer_{nullptr}; + // xw_out: [sum(left_len), dim_t_ * dim_in_] + // xwy_out: [sum(left_len * right_len) * dim_t_] + // conv_out: [sum(left_len * right_len) * out_channel_] + // seq_concat_out: [sum(left_len * right_len) * (dim_t_ + out_channel_)] + + XPUScratchPadGuard left_lod_32_guard_; + int* left_lod_32_{nullptr}; + XPUScratchPadGuard right_lod_32_guard_; + int* right_lod_32_{nullptr}; + XPUScratchPadGuard match_lod_32_guard_; + int* match_lod_32_{nullptr}; + XPUScratchPadGuard conv_lod_32_guard_; + int* conv_lod_32_{nullptr}; + XPUScratchPadGuard topk_offset_32_guard_; + int* topk_offset_32_{nullptr}; + XPUScratchPadGuard topks_xpu_guard_; + int* topks_xpu_{nullptr}; + XPUScratchPadGuard useless_topk_pos_guard_; + int* useless_topk_pos_{nullptr}; + + public: + float* seq_avg_topk_out{nullptr}; + + void Init(lite::Tensor* input_w, + float input_w_max, + lite::Tensor* conv_w, + float conv_w_max, + int dim_t, + int dim_in, + int upper_bound_batch, + int upper_bound_seqlen, + const std::vector& topks) { + dim_t_ = dim_t; + dim_in_ = dim_in; + 
out_channel_ = 5; // TODO(miaotianxiang): + topks_ = topks; + + xw_fc_.Init(input_w, + input_w_max, + nullptr, + dim_t_ * dim_in_, + dim_in_, + xdnn::Activation_t::LINEAR); + conv_weight_ = conv_w->data(); + conv_weight_max_ = conv_w_max; + + hbm_buffer_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch * upper_bound_seqlen * dim_t_ * dim_in_ + + upper_bound_batch * upper_bound_seqlen * upper_bound_seqlen * + (dim_t_ + out_channel_) * 2) * + sizeof(float), + false); + hbm_buffer_ = reinterpret_cast(hbm_buffer_guard_->addr_); + + left_lod_32_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch + 1) * sizeof(int), false); + left_lod_32_ = reinterpret_cast(left_lod_32_guard_->addr_); + right_lod_32_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch + 1) * sizeof(int), false); + right_lod_32_ = reinterpret_cast(right_lod_32_guard_->addr_); + match_lod_32_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch + 1) * sizeof(int), false); + match_lod_32_ = reinterpret_cast(match_lod_32_guard_->addr_); + conv_lod_32_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch + 1) * sizeof(int), false); + conv_lod_32_ = reinterpret_cast(conv_lod_32_guard_->addr_); + topk_offset_32_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch + 1) * sizeof(int), false); + topk_offset_32_ = reinterpret_cast(topk_offset_32_guard_->addr_); + topks_xpu_guard_ = + TargetWrapperXPU::MallocScratchPad(topks_.size() * sizeof(int), false); + topks_xpu_ = reinterpret_cast(topks_xpu_guard_->addr_); + xpu_memcpy(topks_xpu_, + topks_.data(), + topks_.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + useless_topk_pos_guard_ = + TargetWrapperXPU::MallocScratchPad(4 * sizeof(int), false); + useless_topk_pos_ = reinterpret_cast(useless_topk_pos_guard_->addr_); + } + + void Infer(xdnn::Context* ctx, + lite::Tensor* left, + lite::Tensor* right, + lite::Tensor* out, + float* l3_buffer = nullptr, + int l3_size = 0) { + auto left_lod = left->lod()[0]; + auto right_lod = right->lod()[0]; + int batch = left_lod.size() - 1; + + std::vector left_lod_32_cpu; + for (auto e : left_lod) { + left_lod_32_cpu.push_back(e); + } + xpu_memcpy(left_lod_32_, + left_lod_32_cpu.data(), + left_lod_32_cpu.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + std::vector right_lod_32_cpu; + for (auto e : right_lod) { + right_lod_32_cpu.push_back(e); + } + xpu_memcpy(right_lod_32_, + right_lod_32_cpu.data(), + right_lod_32_cpu.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + + std::vector lod_match = {0}; + std::vector lod_conv = {0}; + std::vector lod_topk = {0}; + int x_mul_y_sum = 0; + int left_seqlen_sum = 0; + int left_seqlen_max = 0; + int right_seqlen_sum = 0; + int right_seqlen_max = 0; + for (int i = 0; i < batch; i++) { + int len_x = left_lod[i + 1] - left_lod[i]; + int len_y = right_lod[i + 1] - right_lod[i]; + int imgsize = len_x * len_y; + x_mul_y_sum = x_mul_y_sum + imgsize; + lod_match.push_back(lod_match.back() + imgsize * dim_t_); + lod_conv.push_back(lod_conv.back() + imgsize * out_channel_); + lod_topk.push_back(lod_topk.back() + imgsize * (dim_t_ + out_channel_)); + + left_seqlen_max = std::max(left_seqlen_max, len_x); + right_seqlen_max = std::max(right_seqlen_max, len_y); + left_seqlen_sum += len_x; + right_seqlen_sum += len_y; + } + xpu_memcpy(match_lod_32_, + lod_match.data(), + lod_match.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + xpu_memcpy(conv_lod_32_, + lod_conv.data(), + lod_conv.size() * sizeof(int), 
+ XPUMemcpyKind::XPU_HOST_TO_DEVICE); + xpu_memcpy(topk_offset_32_, + lod_topk.data(), + lod_topk.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + + float* xwy_out = hbm_buffer_; + float* conv_out = hbm_buffer_ + x_mul_y_sum * dim_t_; + float* seq_concat_out = hbm_buffer_ + x_mul_y_sum * (dim_t_ + out_channel_); + float* xw_out = hbm_buffer_ + x_mul_y_sum * (dim_t_ + out_channel_) * 2; + int total_len = x_mul_y_sum * (dim_t_ + out_channel_) * 2 + + left_seqlen_sum * dim_t_ * dim_in_; + if (l3_size > 0 && l3_size >= total_len * sizeof(float)) { + xwy_out = l3_buffer; + conv_out = l3_buffer + x_mul_y_sum * dim_t_; + seq_concat_out = l3_buffer + x_mul_y_sum * (dim_t_ + out_channel_); + xw_out = l3_buffer + x_mul_y_sum * (dim_t_ + out_channel_) * 2; + } + seq_avg_topk_out = out->mutable_data(TARGET(kXPU)); + + int max_width = std::max(left_seqlen_max, right_seqlen_max); + xw_fc_.Infer(ctx, left->data(), left_seqlen_sum, xw_out); + xdnn::match_matrix_tensor(ctx, + batch, + xw_out, + right->data(), + left_lod_32_, + right_lod_32_, + dim_t_, + dim_in_, + xwy_out, + xw_fc_.out_max, + xdnn::Activation_t::RELU, + max_width); + xdnn::search_varconv( + ctx, + batch, + dim_t_, + out_channel_, + 5, + 5, + 1, + 1, + xwy_out, + conv_weight_, + right_lod_32_, + left_lod_32_, + conv_out, + conv_weight_max_, + xdnn::Activation_t::RELU); // TODO(miaotianxiang): + xdnn::sequence_concat(ctx, + xwy_out, + match_lod_32_, + conv_out, + conv_lod_32_, + seq_concat_out, + batch); + xdnn::sequence_topk_avg_pooling(ctx, + seq_concat_out, + seq_avg_topk_out, + useless_topk_pos_, + batch, + dim_t_ + out_channel_, + topk_offset_32_, + left_lod_32_, + right_lod_32_, + topks_xpu_, + topks_.size()); + } +}; + +class MMDNNBidEmbGrnnAtt { + const float* table_{nullptr}; + int table_len_; + int emb_dim_; + int cap_h_; + MMDNNGrnnOp bi_fw_; + MMDNNGrnnOp bi_rv_; + MMDNNAttentionOp att_; + XPUScratchPadGuard hbm_buffer_guard_; + float* hbm_buffer_{nullptr}; + // require at least: 4 * cap_l * emb_dim_ + // emb_rv: [cap_l, emb_dim_] + // grnn_fw: [cap_l, emb_dim_] + // grnn_rv: [cap_l, emb_dim_] + // grnn_rv_rv: [cap_l, emb_dim_] + // concat_2in: [cap_l, 2 * emb_dim_] + // L3.bi_fw: 5 * cap_l * emb_dim_ + // L3.bi_rv: 5 * cap_l * emb_dim_ + // L3.att: cap_l * 2 * emb_dim_ + seqlen_square_sum + + // execution-plan: + // 1. bid_emb_ew, alloc(emb_rv) + // 2. bi_rv, alloc(grnn_rv) + // 3. free(emb_rv) + // 4. sequence_reverse, alloc(grnn_rv_rv) + // 5. sequence_pooling(grnn_rv) + // 6. free(grnn_rv) + // 7. bi_fw alloc(grnn_fw) + // 8. sequence_pooling(grnn_fw) + // 9. concat_2 alloc(concat_2in) + // 10. concat_3 + // 11. 
att + + // alloc-plan: + // [0]: emb_rv, grnn_rv_rv + // [1]: grnn_rv, grnn_fw + // [2, 3]: concat_2in + // [2, 3, 4, 5, 6]: L3.bi_fw, L3.bi_rv + // [4, 5, ..., ?]: L3.att + + public: + float* emb_fw{nullptr}; + float* concat_3in{nullptr}; + float* pool_fw{nullptr}; + float* pool_rv{nullptr}; + float* att_out{nullptr}; + + void Init(lite::Tensor* table, + lite::Tensor* fw_wh, + const std::vector& fw_wh_maxs, + lite::Tensor* fw_wi, + const std::vector& fw_wi_maxs, + lite::Tensor* rv_wh, + const std::vector& rv_wh_maxs, + lite::Tensor* rv_wi, + const std::vector& rv_wi_maxs, + lite::Tensor* att_fc_w, + float att_fc_w_max, + lite::Tensor* att_fc_b, + int upper_bound_batch, + int upper_bound_seqlen) { + table_ = table->data(); + table_len_ = table->dims()[0]; + emb_dim_ = table->dims()[1]; + cap_h_ = emb_dim_; + int max_cap_l = upper_bound_batch * upper_bound_seqlen; + + bi_fw_.Init( + fw_wh, fw_wh_maxs, fw_wi, fw_wi_maxs, emb_dim_, cap_h_, max_cap_l); + bi_rv_.Init( + rv_wh, rv_wh_maxs, rv_wi, rv_wi_maxs, emb_dim_, cap_h_, max_cap_l); + att_.Init(att_fc_w, + att_fc_w_max, + att_fc_b, + 2 * cap_h_, + upper_bound_batch, + upper_bound_seqlen); + + hbm_buffer_guard_ = TargetWrapperXPU::MallocScratchPad( + 4 * max_cap_l * cap_h_ * sizeof(float), false); + hbm_buffer_ = reinterpret_cast(hbm_buffer_guard_->addr_); + } + + void Infer(xdnn::Context* ctx, + int batch, + const MMDNNIdInfo& sentense, + lite::Tensor* grnn_fw_pool_out, + lite::Tensor* grnn_rv_pool_out, + lite::Tensor* att_pool_out, + lite::Tensor* concat_3in1_out, + lite::Tensor* emb_fw_out, + float* l3_buffer = nullptr, + int l3_size = 0) { + int cap_l = sentense.seqlen_sum; + int slot_len = cap_l * cap_h_; + + float* emb_rv = hbm_buffer_; + float* grnn_fw = hbm_buffer_ + slot_len; + float* grnn_rv = hbm_buffer_ + slot_len; + float* grnn_rv_rv = hbm_buffer_; + float* concat_2in = hbm_buffer_ + 2 * slot_len; + if (l3_size > 0 && l3_size >= 4 * slot_len * sizeof(float)) { + emb_rv = l3_buffer; + grnn_fw = l3_buffer + slot_len; + grnn_rv = l3_buffer + slot_len; + grnn_rv_rv = l3_buffer; + } + emb_fw = emb_fw_out->mutable_data(TARGET(kXPU)); + concat_3in = concat_3in1_out->mutable_data(TARGET(kXPU)); + pool_fw = grnn_fw_pool_out->mutable_data(TARGET(kXPU)); + pool_rv = grnn_rv_pool_out->mutable_data(TARGET(kXPU)); + att_out = att_pool_out->mutable_data(TARGET(kXPU)); + + xdnn::search_bid_emb_ew(ctx, + batch, + sentense.lod_64, + sentense.id0_64, + sentense.id1_64, + table_, + table_len_, + emb_dim_, + emb_fw, + emb_rv, + table_len_ - 2, + 1); + bi_rv_.Infer(ctx, + sentense, + emb_rv, + grnn_rv, + l3_buffer + 2 * slot_len, + l3_size - 2 * slot_len * sizeof(float)); + xdnn::sequence_reverse( + ctx, batch, sentense.lod_32, cap_h_, grnn_rv, grnn_rv_rv); + xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::LAST, + batch, + sentense.lod_32, + cap_h_, + grnn_rv, + nullptr, + pool_rv); + + bi_fw_.Infer(ctx, + sentense, + emb_fw, + grnn_fw, + l3_buffer + 2 * slot_len, + l3_size - 2 * slot_len * sizeof(float)); + xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::LAST, + batch, + sentense.lod_32, + cap_h_, + grnn_fw, + nullptr, + pool_fw); + const int concat_widths[] = {cap_h_, cap_h_, cap_h_}; + const float* concat_ptrs[] = {emb_fw, grnn_fw, grnn_rv_rv}; + xdnn::concat( + ctx, cap_l, concat_widths + 1, 2, concat_ptrs + 1, concat_2in); + xdnn::concat(ctx, cap_l, concat_widths, 3, concat_ptrs, concat_3in); + att_.Infer(ctx, + sentense, + concat_2in, + att_out, + l3_buffer + 4 * slot_len, + l3_size - 4 * slot_len * sizeof(float)); + } +}; + 
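+// Worked example of MMDNNBidEmbGrnnAtt's alloc-plan (hypothetical sizes): with + // batch = 2, seqlens = {3, 5} and emb_dim_ = cap_h_ = 128, cap_l = 8 and + // slot_len = cap_l * cap_h_ = 1024 floats; emb_rv / grnn_rv_rv share + // [hbm_buffer_, hbm_buffer_ + slot_len), grnn_rv / grnn_fw share + // [hbm_buffer_ + slot_len, hbm_buffer_ + 2 * slot_len), and concat_2in occupies + // [hbm_buffer_ + 2 * slot_len, hbm_buffer_ + 4 * slot_len), so the + // 4 * max_cap_l * cap_h_ * sizeof(float) scratch pad reserved in Init() covers + // every step of the execution-plan whenever cap_l <= upper_bound_batch * upper_bound_seqlen.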
+class MMDNNEmbAtt { + const float* table_{nullptr}; + int table_len_; + int emb_dim_; + MMDNNAttentionOp att_; + + public: + float* emb_fw{nullptr}; + float* att_out{nullptr}; + + void Init(lite::Tensor* table, + lite::Tensor* att_fc_w, + float att_fc_w_max, + lite::Tensor* att_fc_b, + int upper_bound_batch, + int upper_bound_seqlen) { + table_ = table->data(); + table_len_ = table->dims()[0]; + emb_dim_ = table->dims()[1]; + att_.Init(att_fc_w, + att_fc_w_max, + att_fc_b, + emb_dim_, + upper_bound_batch, + upper_bound_seqlen); + } + + void Infer(xdnn::Context* ctx, + int batch, + const MMDNNIdInfo& sentense, + lite::Tensor* att_pool_out, + lite::Tensor* emb_fw_out, + float* l3_buffer = nullptr, + int l3_size = 0) { + emb_fw = emb_fw_out->mutable_data(TARGET(kXPU)); + att_out = att_pool_out->mutable_data(TARGET(kXPU)); + + int cap_l = sentense.lod.back(); + const float* emb_tables[] = {table_, table_}; + const int64_t* emb_indices[] = {sentense.id0_64, sentense.id1_64}; + xdnn::embedding_with_ewadd(ctx, + emb_dim_, + cap_l, + 2, + table_len_ - 2, + emb_tables, + emb_indices, + nullptr, + nullptr, + emb_fw); + att_.Infer(ctx, sentense, emb_fw, att_out, l3_buffer, l3_size); + } +}; + +class MMDNNMergeAll { + MMDNNGrnnOp coverage_fw_; + MMDNNGrnnOp coverage_rv_; + int cap_e_; + int cap_h_; + + // TODO(miaotianxiang): + const int fc0_k_ = 1152; + const int fc0_n_ = 512; + const int fc1_k_ = 640; + const int fc1_n_ = 320; + const int fc2_k_ = 320; + const int fc2_n_ = 1; + MMDNNFcOp fc0_; + MMDNNFcOp fc1_; + MMDNNFcOp fc2_; + + XPUScratchPadGuard hbm_buffer_guard_; + float* hbm_buffer_{nullptr}; + // topk_concat_out_fw: [cap_l, cap_e_] <= [cap_l, cap_h_] + // topk_concat_out_rv: [cap_l, cap_e_] <= [cap_l, cap_h_] + // grnn_fw: [cap_l, cap_h_] + // grnn_rv: [cap_l, cap_h_] + // pool_fw: [batch, cap_h_] + // pool_rv: [batch, cap_h_] + // fc0_in: [batch, fc0_k_] + // fc0_out: [batch, fc0_n_] + // fc1_in: [batch, fc1_k_] + // fc1_out: [batch, fc1_n_] + // fc2_out: [batch, fc2_n_] + + public: + void Init(lite::Tensor* grnn_fw_wh, + std::vector grnn_fw_wh_maxs, + lite::Tensor* grnn_fw_wi, + std::vector grnn_fw_wi_maxs, + lite::Tensor* grnn_rv_wh, + std::vector grnn_rv_wh_maxs, + lite::Tensor* grnn_rv_wi, + std::vector grnn_rv_wi_maxs, + lite::Tensor* fc0_w, + float fc0_w_max, + lite::Tensor* fc0_b, + lite::Tensor* fc1_w, + float fc1_w_max, + lite::Tensor* fc1_b, + lite::Tensor* fc2_w, + float fc2_w_max, + lite::Tensor* fc2_b, + int upper_bound_batch, + int upper_bound_seqlen) { + int max_cap_l = upper_bound_batch * upper_bound_seqlen; + cap_e_ = grnn_fw_wi->dims()[2]; + cap_h_ = grnn_fw_wi->dims()[1]; + + coverage_fw_.Init(grnn_fw_wh, + grnn_fw_wh_maxs, + grnn_fw_wi, + grnn_fw_wi_maxs, + cap_e_, + cap_h_, + max_cap_l); + coverage_rv_.Init(grnn_rv_wh, + grnn_rv_wh_maxs, + grnn_rv_wi, + grnn_rv_wi_maxs, + cap_e_, + cap_h_, + max_cap_l); + + fc0_.Init( + fc0_w, fc0_w_max, fc0_b, fc0_n_, fc0_k_, xdnn::Activation_t::RELU); + fc1_.Init( + fc1_w, fc1_w_max, fc1_b, fc1_n_, fc1_k_, xdnn::Activation_t::RELU); + fc2_.Init( + fc2_w, fc2_w_max, fc2_b, fc2_n_, fc2_k_, xdnn::Activation_t::LINEAR); + + int hbm_total_len = max_cap_l * cap_h_ * 4 + + upper_bound_batch * (2 * cap_h_ + fc0_k_ + fc0_n_ + + fc1_k_ + fc1_n_ + fc2_n_); + hbm_buffer_guard_ = TargetWrapperXPU::MallocScratchPad( + hbm_total_len * sizeof(float), false); + hbm_buffer_ = reinterpret_cast(hbm_buffer_guard_->addr_); + } + + void Infer(xdnn::Context* ctx, + const MMDNNIdInfo& sentense, + const std::vector concat_2in1_x, + const std::vector 
concat_7in1_x, + lite::Tensor* out, + float* l3_buffer = nullptr, + int l3_size = 0) { + int batch = sentense.batch; + int cap_l = sentense.seqlen_sum; + + float* topk_concat_out_fw = hbm_buffer_; + int hbm_total_len = + cap_l * cap_h_ * 4 + + batch * (2 * cap_h_ + fc0_k_ + fc0_n_ + fc1_k_ + fc1_n_ + fc2_n_); + if (l3_size > 0 && l3_size >= hbm_total_len * sizeof(float)) { + topk_concat_out_fw = l3_buffer; + } + float* topk_concat_out_rv = topk_concat_out_fw + cap_l * cap_h_; + float* grnn_fw = topk_concat_out_rv + cap_l * cap_h_; + float* grnn_rv = grnn_fw + cap_l * cap_h_; + float* pool_fw = grnn_rv + cap_l * cap_h_; + float* pool_rv = pool_fw + batch * cap_h_; + float* fc0_in = pool_fw + batch * cap_h_ * 2; + float* fc0_out = fc0_in + batch * fc0_k_; + float* fc1_in = fc0_out + batch * fc0_n_; + float* fc1_out = fc1_in + batch * fc1_k_; + // float* fc2_out = fc1_out + batch * fc1_n_; + float* fc2_out = out->mutable_data(TARGET(kXPU)); + + const int concat_widths[] = {static_cast(concat_2in1_x[0]->dims()[1]), + static_cast(concat_2in1_x[1]->dims()[1])}; + const float* concat_ptrs[] = {concat_2in1_x[0]->data(), + concat_2in1_x[1]->data()}; + xdnn::concat( + ctx, cap_l, concat_widths, 2, concat_ptrs, topk_concat_out_fw); + xdnn::sequence_reverse(ctx, + batch, + sentense.lod_32, + cap_e_, + topk_concat_out_fw, + topk_concat_out_rv); + coverage_fw_.Infer(ctx, + sentense, + topk_concat_out_fw, + grnn_fw, + l3_buffer + hbm_total_len, + l3_size - hbm_total_len * sizeof(float)); + coverage_rv_.Infer(ctx, + sentense, + topk_concat_out_rv, + grnn_rv, + l3_buffer + hbm_total_len, + l3_size - hbm_total_len * sizeof(float)); + xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::LAST, + batch, + sentense.lod_32, + cap_h_, + grnn_fw, + nullptr, + pool_fw); + xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::LAST, + batch, + sentense.lod_32, + cap_h_, + grnn_rv, + nullptr, + pool_rv); + + const int concat_widths_fc0[] = { + static_cast(concat_7in1_x[0]->dims()[1]), + static_cast(concat_7in1_x[1]->dims()[1]), + static_cast(concat_7in1_x[2]->dims()[1]), + static_cast(concat_7in1_x[3]->dims()[1]), + static_cast(concat_7in1_x[4]->dims()[1]), + static_cast(concat_7in1_x[5]->dims()[1]), + static_cast(concat_7in1_x[6]->dims()[1]), + }; + const float* concat_ptrs_fc0[] = { + concat_7in1_x[0]->data(), + concat_7in1_x[1]->data(), + concat_7in1_x[2]->data(), + concat_7in1_x[3]->data(), + concat_7in1_x[4]->data(), + concat_7in1_x[5]->data(), + concat_7in1_x[6]->data(), + }; + const int concat_widths_fc1[] = {cap_h_, cap_h_, fc0_n_}; + const float* concat_ptrs_fc1[] = {pool_fw, pool_rv, fc0_out}; + + xdnn::concat( + ctx, batch, concat_widths_fc0, 7, concat_ptrs_fc0, fc0_in); + fc0_.Infer(ctx, fc0_in, batch, fc0_out); + xdnn::concat( + ctx, batch, concat_widths_fc1, 3, concat_ptrs_fc1, fc1_in); + fc1_.Infer(ctx, fc1_in, batch, fc1_out); + fc2_.Infer(ctx, fc1_out, batch, fc2_out); + } +}; + +class XPUMmdnnBidEmbGrnnAttCompute + : public KernelLite { + public: + using param_t = operators::XPUMmdnnBidEmbGrnnAttParam; + + void PrepareForRun() override; + + void Run() override; + + private: + MMDNNIdInfo id_; + MMDNNBidEmbGrnnAtt compound_; + int upper_bound_batch_ = 40; + int upper_bound_seqlen_ = 512; +}; + +void XPUMmdnnBidEmbGrnnAttCompute::PrepareForRun() { + auto& param = this->Param(); + + id_.Init(upper_bound_batch_, upper_bound_seqlen_); + compound_.Init(param.emb_tbl, + param.grnn_fw_wh, + param.grnn_fw_wh_maxs, + param.grnn_fw_wi, + param.grnn_fw_wi_maxs, + param.grnn_rv_wh, + param.grnn_rv_wh_maxs, + 
param.grnn_rv_wi, + param.grnn_rv_wi_maxs, + param.att_fc_w, + param.att_fc_w_max, + param.att_fc_b, + upper_bound_batch_, + upper_bound_seqlen_); +} + +void XPUMmdnnBidEmbGrnnAttCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* xpu_ctx = ctx.GetRawContext(); + + int batch = param.id0->lod()[0].size() - 1; + id_.Update(param.id0, param.id1); + compound_.Infer(ctx.GetRawContext(), + batch, + id_, + param.grnn_fw_pool_out, + param.grnn_rv_pool_out, + param.att_pool_out, + param.concat_3in1_out, + param.emb_fw_out, + reinterpret_cast( + reinterpret_cast(xpu_ctx->workspace_l3_ptr) + + xpu_ctx->used_l3_size), + xpu_ctx->workspace_l3_size - xpu_ctx->used_l3_size); +} + +class XPUMmdnnBidEmbAttCompute + : public KernelLite { + public: + using param_t = operators::XPUMmdnnBidEmbAttParam; + + void PrepareForRun() override; + + void Run() override; + + private: + MMDNNIdInfo id_; + MMDNNEmbAtt compound_; + int upper_bound_batch_ = 40; + int upper_bound_seqlen_ = 512; +}; + +void XPUMmdnnBidEmbAttCompute::PrepareForRun() { + auto& param = this->Param(); + + id_.Init(upper_bound_batch_, upper_bound_seqlen_); + compound_.Init(param.emb_tbl, + param.att_fc_w, + param.att_fc_w_max, + param.att_fc_b, + upper_bound_batch_, + upper_bound_seqlen_); +} + +void XPUMmdnnBidEmbAttCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* xpu_ctx = ctx.GetRawContext(); + + int batch = param.id0->lod()[0].size() - 1; + id_.Update(param.id0, param.id1); + compound_.Infer(ctx.GetRawContext(), + batch, + id_, + param.att_pool_out, + param.emb_fw_out, + reinterpret_cast( + reinterpret_cast(xpu_ctx->workspace_l3_ptr) + + xpu_ctx->used_l3_size), + xpu_ctx->workspace_l3_size - xpu_ctx->used_l3_size); +} + +class XPUMmdnnMatchConvTopkCompute + : public KernelLite { + public: + using param_t = operators::XPUMmdnnMatchConvTopkParam; + + void PrepareForRun() override; + + void Run() override; + + private: + MMDNNMatchConvTopk compound_; + int upper_bound_batch_ = 40; + int upper_bound_seqlen_ = 512; +}; + +void XPUMmdnnMatchConvTopkCompute::PrepareForRun() { + auto& param = this->Param(); + + compound_.Init(param.input_w, + param.input_w_max, + param.conv_w, + param.conv_w_max, + param.dim_t, + param.input_w->dims()[0], + upper_bound_batch_, + upper_bound_seqlen_, + param.topks); +} + +void XPUMmdnnMatchConvTopkCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* xpu_ctx = ctx.GetRawContext(); + + compound_.Infer(ctx.GetRawContext(), + param.input_x, + param.input_y, + param.topk_out, + reinterpret_cast( + reinterpret_cast(xpu_ctx->workspace_l3_ptr) + + xpu_ctx->used_l3_size), + xpu_ctx->workspace_l3_size - xpu_ctx->used_l3_size); +} + +class XPUMmdnnMergeAllCompute + : public KernelLite { + public: + using param_t = operators::XPUMmdnnMergeAllParam; + + void PrepareForRun() override; + + void Run() override; + + private: + MMDNNIdInfo id_; + MMDNNMergeAll compound_; + int upper_bound_batch_ = 40; + int upper_bound_seqlen_ = 512; +}; + +void XPUMmdnnMergeAllCompute::PrepareForRun() { + auto& param = this->Param(); + + id_.Init(upper_bound_batch_, upper_bound_seqlen_); + compound_.Init(param.grnn_fw_wh, + param.grnn_fw_wh_maxs, + param.grnn_fw_wi, + param.grnn_fw_wi_maxs, + param.grnn_rv_wh, + param.grnn_rv_wh_maxs, + param.grnn_rv_wi, + param.grnn_rv_wi_maxs, + param.fc0_w, + param.fc0_w_max, + param.fc0_b, + param.fc1_w, + param.fc1_w_max, + param.fc1_b, + param.fc2_w, + param.fc2_w_max, + param.fc2_b, + 
upper_bound_batch_, + upper_bound_seqlen_); +} + +void XPUMmdnnMergeAllCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* xpu_ctx = ctx.GetRawContext(); + + id_.Update(param.concat_2in1_x[0], param.concat_2in1_x[1]); + compound_.Infer(ctx.GetRawContext(), + id_, + param.concat_2in1_x, + param.concat_7in1_x, + param.out, + reinterpret_cast( + reinterpret_cast(xpu_ctx->workspace_l3_ptr) + + xpu_ctx->used_l3_size), + xpu_ctx->workspace_l3_size - xpu_ctx->used_l3_size); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(__xpu__mmdnn_bid_emb_grnn_att, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUMmdnnBidEmbGrnnAttCompute, + def) + .BindInput("id0", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("id1", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("emb_tbl", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_fw_wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_fw_wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_rv_wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_rv_wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("att_fc_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("att_fc_b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("grnn_fw_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("grnn_rv_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("att_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("concat_3in1_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("emb_fw_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__mmdnn_bid_emb_att, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUMmdnnBidEmbAttCompute, + def) + .BindInput("id0", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("id1", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("emb_tbl", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("att_fc_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("att_fc_b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("att_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("concat_3in1_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("emb_fw_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__mmdnn_match_conv_topk, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUMmdnnMatchConvTopkCompute, + def) + .BindInput("input_x", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("input_y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("input_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("conv_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("topk_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__mmdnn_merge_all, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUMmdnnMergeAllCompute, + def) + .BindInput("concat_7in1_x", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("concat_2in1_x", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_fw_wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_fw_wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_rv_wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_rv_wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc0_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc0_b", 
{LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc1_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc1_b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc2_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc2_b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/__xpu__resnet_cbam_compute.cc b/lite/kernels/xpu/__xpu__resnet_cbam_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..0d57445cd44953f504e292ad38d44d047daa3a7a --- /dev/null +++ b/lite/kernels/xpu/__xpu__resnet_cbam_compute.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/__xpu__resnet_cbam_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void XPUResNetCbamCompute::PrepareForRun() { + auto& param = this->Param(); + + for (auto* filter : param.filter) { + arg_filter_.push_back( + reinterpret_cast(filter->data())); + } + for (auto* bias : param.bias) { + if (bias == nullptr) { + arg_bias_.push_back(nullptr); + } else { + arg_bias_.push_back(bias->data()); + } + } + for (auto* max_filter : param.max_filter) { + arg_max_filter_.push_back(max_filter->data()); + } +} + +void XPUResNetCbamCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto input_dims = param.input->dims(); + int batch_size = input_dims[0]; + int height = input_dims[2]; + int width = input_dims[3]; + + int r = xdnn::conv2d_int16_resnet_cbam( + ctx.GetRawContext(), /* context */ + batch_size, /* num */ + height, /* height */ + width, /* width */ + param.input->data(), /* bottom */ + &arg_filter_[0], /* weight_list */ + param.output->mutable_data(TARGET(kXPU)), /* top */ + &arg_bias_[0], /* bias_list */ + &arg_max_filter_[0], /* max_filter_list */ + param.pool_p, /* pool_p */ + true, /* midtype_fp16 */ + false /* dynamic_shape */); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(__xpu__resnet_cbam, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUResNetCbamCompute, + def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("MaxFilter", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/__xpu__resnet_cbam_compute.h b/lite/kernels/xpu/__xpu__resnet_cbam_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..b952bb088ea88399966c170cbeadebfa698889d8 --- /dev/null +++ b/lite/kernels/xpu/__xpu__resnet_cbam_compute.h @@ -0,0 +1,45 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class XPUResNetCbamCompute + : public KernelLite { + public: + using param_t = operators::XPUResNetCbamParam; + + virtual void PrepareForRun(); + + virtual void Run(); + + private: + std::vector arg_filter_; + std::vector arg_max_filter_; + std::vector arg_bias_; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/__xpu__search_attention_compute.cc b/lite/kernels/xpu/__xpu__search_attention_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..515be8935637d89d58db830f96f2ea439e7d7e68 --- /dev/null +++ b/lite/kernels/xpu/__xpu__search_attention_compute.cc @@ -0,0 +1,219 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/xpu/__xpu__search_attention_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void XPUMmdnnSearchAttentionCompute::PrepareForRun() { + offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + pad_begin_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + w_max_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(8 * sizeof(float)); + buffer_at_l3_guard_ = TargetWrapperXPU::MallocScratchPad( + 5 * L3_SLOT_SIZE * sizeof(float), false /* use_l3 */); + buffer_at_gm_guard_ = TargetWrapperXPU::MallocScratchPad( + 5 * GM_SLOT_SIZE * sizeof(float), false /* use_l3 */); + + offset_cpu.reset(new int[64]); + pad_begin_cpu.reset(new int[64]); +} + +void XPUMmdnnSearchAttentionCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* X = param.X; + auto* W = param.W; + auto* b = param.b; + float W_max = param.W_max; + float alpha0 = param.alpha0; + float alpha1 = param.alpha1; + float mask = param.mask; + + const int16_t* w_data = W->data(); + const float* b_data = b->data(); + + int batch = X->lod()[0].size() - 1; + int dim0 = X->dims()[0]; + int dim1 = X->dims()[1]; + const auto offset = X->lod()[0]; + int max_seq = 0; + + auto* top = param.Out; + LoD top_lod; + top_lod.push_back(X->lod()[0]); + top->set_lod(top_lod); + top->Resize({dim0, dim1}); + auto* top_data = top->mutable_data(TARGET(kXPU)); + + float maxs_cpu[8] = {0.0f, 0.0f, 0.0f, 0.0f, W_max, 0.0f, 0.0f, 0.0f}; + for (int i = 0; i < batch; ++i) { + offset_cpu[i] = offset[i]; // type of offset is int64, not supported by xpu + pad_begin_cpu[i] = offset[i + 1] - offset[i]; + if (offset[i + 1] - offset[i] > max_seq) { + max_seq = offset[i + 1] - offset[i]; + } + } + offset_cpu[batch] = offset[batch]; + + xpu_memcpy(offset_xpu_guard_->addr_, + offset_cpu.get(), + offset.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + xpu_memcpy(pad_begin_xpu_guard_->addr_, + pad_begin_cpu.get(), + batch * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + xpu_memcpy(w_max_xpu_guard_->addr_, + maxs_cpu, + 8 * sizeof(float), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + + int* offset_xpu = reinterpret_cast(offset_xpu_guard_->addr_); + int* pad_begin_xpu = reinterpret_cast(pad_begin_xpu_guard_->addr_); + float* maxs_xpu = reinterpret_cast(w_max_xpu_guard_->addr_); + float* buffer_at_l3 = reinterpret_cast(buffer_at_l3_guard_->addr_); + float* buffer_at_gm = reinterpret_cast(buffer_at_gm_guard_->addr_); + + // when use l3, max_seq <= 128: + // group_padding: batch * max_seq * dim1; at (slot0, slot1) + // seq_fc: batch * max_seq * dim1; at (slot2, slot3) + // batchgemm0: batch * max_seq * max_seq; at slot4 + // attention_padding_mask: batch * max_seq * max_seq; at slot3 + // seq_softmax: batch * max_seq * max_seq; at slot4 + // batchgemm1: batch * max_seq * dim1; at (slot2, slot3) + float* group_padding_output = buffer_at_l3; + float* seq_fc_output = buffer_at_l3 + 2 * L3_SLOT_SIZE; + float* batchgemm0_output = buffer_at_l3 + 4 * L3_SLOT_SIZE; + float* attention_output = buffer_at_l3 + 3 * L3_SLOT_SIZE; + float* seq_softmax_output = buffer_at_l3 + 4 * L3_SLOT_SIZE; + float* batchgemm1_output = buffer_at_l3 + 2 * L3_SLOT_SIZE; + + if (max_seq > 128) { + group_padding_output = buffer_at_gm; + seq_fc_output = buffer_at_gm + 1 * GM_SLOT_SIZE; + batchgemm0_output = buffer_at_gm + 2 * GM_SLOT_SIZE; + attention_output = buffer_at_gm + 1 * GM_SLOT_SIZE; 
+ seq_softmax_output = buffer_at_gm + 3 * GM_SLOT_SIZE; + batchgemm1_output = buffer_at_gm + 4 * GM_SLOT_SIZE; + } + + const auto* bottom_data = X->data(); + xdnn::search_sequence_pad_depad(ctx.GetRawContext(), + const_cast(bottom_data), + group_padding_output, + offset_xpu, + max_seq, + batch, + dim1, + 0); // is_depad = 0 + // do-findmax + xdnn::findmax(ctx.GetRawContext(), + group_padding_output, + batch * max_seq * dim1, + maxs_xpu); + xdnn::gemm_int16_maxptr( + ctx.GetRawContext(), + false, + true, // trans_a, trans_b + batch * max_seq, + dim1, + dim1, // m, n, k + 1.0f, + group_padding_output, + dim1, // alpha, data_a, lda + w_data, + dim1, + 0.0f, // data_b, ldb, beta + seq_fc_output, + dim1, + b_data, // data_c, ldc, bias + xdnn::Activation_t::LINEAR, + maxs_xpu, + maxs_xpu + 4, + nullptr); // max_a, max_b, max_c + xdnn::search_aligned_mat_mul(ctx.GetRawContext(), + 0, + 1, + batch, + max_seq, + max_seq, + dim1, + alpha0, + group_padding_output, + dim1, + seq_fc_output, + dim1, + batchgemm0_output, + max_seq); + xdnn::search_pad_mask(ctx.GetRawContext(), + batchgemm0_output, + attention_output, + pad_begin_xpu, + batch, + max_seq, + max_seq, + batch, + mask); + xdnn::softmax2d_forward(ctx.GetRawContext(), + attention_output, + seq_softmax_output, + batch * max_seq, + max_seq, + true); + xdnn::search_aligned_mat_mul(ctx.GetRawContext(), + 0, + 0, + batch, + max_seq, + dim1, + max_seq, + alpha1, + seq_softmax_output, + max_seq, + group_padding_output, + dim1, + batchgemm1_output, + dim1); + xdnn::search_sequence_pad_depad(ctx.GetRawContext(), + top_data, + batchgemm1_output, + offset_xpu, + max_seq, + batch, + dim1, + 1); // is_depad = 1 +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(__xpu__mmdnn_search_attention, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUMmdnnSearchAttentionCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/__xpu__search_attention_compute.h b/lite/kernels/xpu/__xpu__search_attention_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..f9670dbab6247927acf6ac7d7b47f98a464a3489 --- /dev/null +++ b/lite/kernels/xpu/__xpu__search_attention_compute.h @@ -0,0 +1,52 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include <memory> +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class XPUMmdnnSearchAttentionCompute + : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> { + public: + using param_t = operators::XPUMmdnnSearchAttentionParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard offset_xpu_guard_; + XPUScratchPadGuard pad_begin_xpu_guard_; + XPUScratchPadGuard w_max_xpu_guard_; + XPUScratchPadGuard buffer_at_l3_guard_; + XPUScratchPadGuard buffer_at_gm_guard_; + + std::unique_ptr<int[]> offset_cpu; + std::unique_ptr<int[]> pad_begin_cpu; + + const int L3_SLOT_SIZE = 40 * 128 * 128; + const int GM_SLOT_SIZE = 40 * 512 * 512; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/concat_compute.cc b/lite/kernels/xpu/concat_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..f088bb80f0c500c6f900726195bcb5903049d3fb --- /dev/null +++ b/lite/kernels/xpu/concat_compute.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/concat_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void ConcatCompute::Run() { + auto& param = this->Param<param_t>(); + auto& ctx = this->ctx_->As<XPUContext>(); + + auto ins = param.x; + auto out = param.output; + int64_t axis = param.axis; + + int n = ins.size(); + int h = 1; + int w_except_axis = 1; + CHECK(n <= 8) << "XPU only supports at most 8 tensors for now"; + for (int i = 0; i < axis; ++i) { + h *= (ins[0]->dims())[i]; + } + for (int i = axis + 1; i < ins[0]->dims().size(); ++i) { + w_except_axis *= (ins[0]->dims())[i]; + } + CHECK(axis >= 0) << "concat: axis should >= 0!"; + CHECK(axis < ins[0]->dims().size()) << "concat: axis should < ins[0]->dims()!"; + for (int i = 0; i < n; ++i) { + int hh = 1; + int ww = 1; + for (int j = 0; j < axis; ++j) { + hh *= (ins[i]->dims())[j]; + } + for (int j = axis + 1; j < ins[i]->dims().size(); ++j) { + ww *= (ins[i]->dims())[j]; + } + CHECK(hh == h) << "concat: h should be equal!"; + CHECK(ww == w_except_axis) << "concat: w should be equal except for axis!"; + } + + int in_w_host[n]; // NOLINT + const float* ptrs[n]; // NOLINT + + for (int i = 0; i < n; ++i) { + ptrs[i] = ins[i]->data<float>(); + in_w_host[i] = w_except_axis * (ins[i]->dims())[axis]; + } + + int r = xdnn::concat<float>(ctx.GetRawContext(), /* ctx */ + h, /* height */ + in_w_host, /* width_x */ + n, /* n */ + ptrs, /* lm_ptrs */ + out->mutable_data<float>(TARGET(kXPU)) /*y*/); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + concat, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::ConcatCompute, def) + .BindInput("X", 
{LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("AxisTensor", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/concat_compute.h b/lite/kernels/xpu/concat_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..f29899a741194270272770d8b781cd9b0b54abc9 --- /dev/null +++ b/lite/kernels/xpu/concat_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class ConcatCompute : public KernelLite { + public: + using param_t = operators::ConcatParam; + + virtual void Run(); + + virtual ~ConcatCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/match_matrix_tensor_compute.cc b/lite/kernels/xpu/match_matrix_tensor_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c4e896d23add6df99a7b66a830dc526dc808e95 --- /dev/null +++ b/lite/kernels/xpu/match_matrix_tensor_compute.cc @@ -0,0 +1,179 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/xpu/match_matrix_tensor_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void MatchMatrixTensorCompute::PrepareForRun() { + wx_max_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + offset_l_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + offset_r_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + + offset_l_cpu.reset(new int[64]); + offset_r_cpu.reset(new int[64]); +} + +void MatchMatrixTensorCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* x = param.x; + auto* y = param.y; + auto* w = param.w; + auto* out = param.out; + auto* tmp = param.tmp; + int dim_t = param.dim_t; + float w_max = param.__xpu__w_max; + bool fuse_relu = param.fuse_relu; + bool float_to_fix = param.__xpu__float_to_fix; + CHECK(float_to_fix) << "W should be fixed point"; + + xdnn::Activation_t act = xdnn::Activation_t::LINEAR; + if (fuse_relu) { + act = xdnn::Activation_t::RELU; + } + + int dim_in = x->dims()[1]; + const auto& offset_l = x->lod()[0]; + const auto& offset_r = y->lod()[0]; + + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + top_size += dim_t * len_l * len_r; + top_offset.push_back(top_size); + } + auto* bottom_l_data = x->data(); + auto* bottom_r_data = y->data(); + auto* w_data = w->data(); + auto* out_data = out->mutable_data(TARGET(kXPU)); + auto* bottom_l_trans_data = tmp->mutable_data(TARGET(kXPU)); + int batch_size = x->lod()[0].size() - 1; + + float* wx_max = reinterpret_cast(wx_max_xpu_guard_->addr_); + int* offset_l_xpu = reinterpret_cast(offset_l_xpu_guard_->addr_); + int* offset_r_xpu = reinterpret_cast(offset_r_xpu_guard_->addr_); + + int r = xdnn::gemm_int16_tmp_api( + ctx.GetRawContext(), /* ctx */ + false, + false, /* trans_a, trans_b */ + x->dims()[0], + dim_t * dim_in, + dim_in, /* m, n, k */ + 1.0f, + bottom_l_data, + dim_in, /* alpha, data_a, lda */ + w_data, + dim_t * dim_in, + 0.0f, /* data_b, ldb, beta */ + bottom_l_trans_data, + dim_t * dim_in, /* data_c, ldc */ + nullptr, /* bias */ + xdnn::Activation_t::LINEAR, + 0.0f, + w_max, + wx_max /* max_a, max_b, max_c */); + CHECK_EQ(r, 0); + + int max_width = 0; + for (int i = 0; i < offset_l.size(); ++i) { + offset_l_cpu[i] = offset_l[i]; + if (i != 0 && (offset_l_cpu[i] - offset_l_cpu[i - 1] > max_width)) { + max_width = offset_l_cpu[i] - offset_l_cpu[i - 1]; + } + } + for (int i = 0; i < offset_r.size(); ++i) { + offset_r_cpu[i] = offset_r[i]; + if (i != 0 && (offset_r_cpu[i] - offset_r_cpu[i - 1] > max_width)) { + max_width = offset_r_cpu[i] - offset_r_cpu[i - 1]; + } + } + xpu_memcpy(offset_l_xpu, + offset_l_cpu.get(), + offset_l.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + xpu_memcpy(offset_r_xpu, + offset_r_cpu.get(), + offset_r.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + + r = xdnn::match_matrix_tensor(ctx.GetRawContext(), + batch_size, + bottom_l_trans_data, + bottom_r_data, + offset_l_xpu, + offset_r_xpu, + dim_t, + dim_in, + out_data, + wx_max, + act, + max_width); + CHECK_EQ(r, 0); + + int lod_lv1_size = batch_size * dim_t; + int lod_lv2_size = x->lod()[0].back() * dim_t; + std::vector out_lod0(batch_size + 1, 0); + std::vector out_lod1(lod_lv1_size + 1, 0); + 
std::vector out_lod2(lod_lv2_size + 1, 0); + for (int i = 0; i < batch_size; i++) { + out_lod0[i + 1] = out_lod0[i] + dim_t; + int len_l = offset_l[i + 1] - offset_l[i]; + + for (int j = 0; j < dim_t; j++) { + out_lod1[i * dim_t + j + 1] = out_lod1[i * dim_t + j] + len_l; + int len_r = offset_r[i + 1] - offset_r[i]; + + for (int k = 0; k < len_l; k++) { + out_lod2[offset_l[i] * dim_t + j * len_l + k + 1] = + out_lod2[offset_l[i] * dim_t + j * len_l + k] + len_r; + } + } + } + + paddle::lite::LoD out_lod; + out_lod.push_back(top_offset); + out_lod.push_back(offset_l); + out_lod.push_back(offset_r); + out->set_lod(out_lod); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(match_matrix_tensor, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::MatchMatrixTensorCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Tmp", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/match_matrix_tensor_compute.h b/lite/kernels/xpu/match_matrix_tensor_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..3bd0b622db1fce178ea66604d89dc50d6477a105 --- /dev/null +++ b/lite/kernels/xpu/match_matrix_tensor_compute.h @@ -0,0 +1,47 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class MatchMatrixTensorCompute + : public KernelLite { + public: + using param_t = operators::MatchMatrixTensorParam; + + virtual void PrepareForRun(); + + virtual void Run(); + + private: + XPUScratchPadGuard wx_max_xpu_guard_; + XPUScratchPadGuard offset_l_xpu_guard_; + XPUScratchPadGuard offset_r_xpu_guard_; + + std::unique_ptr offset_l_cpu; + std::unique_ptr offset_r_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/search_fc_compute.cc b/lite/kernels/xpu/search_fc_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..79f4c2d0d809ea9848fb383863d0f9dd2ec5a2ae --- /dev/null +++ b/lite/kernels/xpu/search_fc_compute.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/search_fc_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void SearchFcCompute::PrepareForRun() { + maxs_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(float)); +} + +void SearchFcCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* bottom = param.X; + auto* w = param.W; + auto* b = param.b; + auto* top = param.Out; + float w_max = param.__xpu__w_max; + int out_size = param.out_size; + bool fuse_relu = param.fuse_relu; + bool float_to_fix = param.__xpu__float_to_fix; + CHECK(float_to_fix) << "W should be fixed point"; + + int batch = bottom->dims()[0]; + int _out = w->dims()[0]; + int _in = w->dims()[1]; + + xdnn::Activation_t act = xdnn::Activation_t::LINEAR; + if (fuse_relu) { + act = xdnn::Activation_t::RELU; + } + + std::vector top_dims{bottom->dims()[0], out_size}; + top->Resize(top_dims); + + const auto* bottom_data = bottom->data(); + const auto* weights = w->data(); + const auto* bias_data = b->data(); + auto* top_data = top->mutable_data(TARGET(kXPU)); + + float* maxs_xpu = reinterpret_cast(maxs_xpu_guard_->addr_); + float maxs_cpu[8] = {0.0f, 0.0f, 0.0f, 0.0f, w_max, 0.0f, 0.0f, 0.0f}; + xpu_memcpy(maxs_xpu, + &maxs_cpu[0], + 8 * sizeof(float), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + + int r = xdnn::findmax( + ctx.GetRawContext(), bottom_data, batch * _in, maxs_xpu); + CHECK_EQ(r, 0); + r = xdnn::gemm_int16_maxptr( + ctx.GetRawContext(), /* ctx */ + false, + true, /*trans_a, trans_b*/ + batch, + _out, + _in, /*m, n, k*/ + 1.0f, + bottom_data, + _in, /*alpha, data_a, lda*/ + weights, + _in, + 0.0f, /*data_b, ldb, beta*/ + top_data, + _out, + bias_data, /* data_c, ldc, bias*/ + act, + maxs_xpu, + maxs_xpu + 4, + nullptr /*act, max_a, max_b, max_c*/); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_fc, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SearchFcCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/search_fc_compute.h b/lite/kernels/xpu/search_fc_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..c7ee06abd957187c18c1306f40a77735f40558e7 --- /dev/null +++ b/lite/kernels/xpu/search_fc_compute.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class SearchFcCompute : public KernelLite { + public: + using param_t = operators::SearchFcParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard maxs_xpu_guard_; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/search_grnn_compute.cc b/lite/kernels/xpu/search_grnn_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..1c19f58da1b5deaa3d74791561494f13b681cf3a --- /dev/null +++ b/lite/kernels/xpu/search_grnn_compute.cc @@ -0,0 +1,282 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/xpu/search_grnn_compute.h" +#include +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void SearchGrnnCompute::PrepareForRun() { + offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + new_offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(256 * sizeof(int)); + maxs_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(16 * sizeof(float)); + + idx_sorted_by_width_data_cpu.reset(new int[64]); + offset_cpu.reset(new int[64]); + new_offset_cpu.reset(new int[256]); +} + +void SearchGrnnCompute::prepare_layout(const operators::SearchGrnnParam& param, + const paddle::lite::Tensor* bottom) { + auto* idx_sorted_by_width = param.idx_sorted_by_width; + auto* layout_input = param.layout_input; + + int dim0 = bottom->dims()[0]; + int dim1 = 1; + if (bottom->dims().size() > 1) { + dim1 = bottom->dims()[1]; + } + int batch = bottom->lod()[0].size() - 1; + auto& offset = bottom->lod()[0]; + + idx_sorted_by_width->Resize({batch}); + std::vector width; + width.resize(batch); + + // sort sequences by width (descending) and find the largest width in the + // batch + for (int i = 0; i < batch; i++) { + width[i] = offset[i + 1] - offset[i]; + idx_sorted_by_width_data_cpu[i] = i; + } + std::sort(idx_sorted_by_width_data_cpu.get(), + idx_sorted_by_width_data_cpu.get() + batch, + [&width](int a, int b) { return width[a] > width[b]; }); + int max_width = width[idx_sorted_by_width_data_cpu[0]]; + + // start of reorganizing the input + std::vector new_offset; + new_offset.resize(max_width + 1); + new_offset[0] = 0; + int j = batch - 1; + int last_width = 0; + int sub_row = 0; + int sub_col = 0; + + for (int i = 1; i <= max_width;) { + for (int k = j; k >= 0; --k) { + if (width[idx_sorted_by_width_data_cpu[k]] > last_width) { + sub_row = width[idx_sorted_by_width_data_cpu[k]] - last_width; + sub_col = k + 1; + for (int s = 0; s < sub_row; s++) { + new_offset[i] = new_offset[i - 1] + sub_col; + i++; + } + // move on + last_width = width[idx_sorted_by_width_data_cpu[k]]; + j = k - 1; + break; + } + } + } + + // copying to the reorganized buffer + if (bottom->dims().size() == 1) { + } else { + LoD new_lod; + new_lod.push_back(new_offset); + layout_input->set_lod(new_lod); + layout_input->Resize({dim0, dim1}); + } + + xpu_memcpy(idx_sorted_by_width->mutable_data(TARGET(kXPU)), + idx_sorted_by_width_data_cpu.get(), + idx_sorted_by_width->numel() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); +} + +void SearchGrnnCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* bottom = param.x; + auto* wi = param.wi; + auto* wh = param.wh; + auto* top = param.out; + auto* tmp_buffer = param.tmp_buffer; + auto* idx_sorted_by_width = param.idx_sorted_by_width; + auto* layout_input = param.layout_input; + int cap_h = param.num_hidden; + int cap_e = param.num_input; + int cap_l = bottom->dims()[0]; + auto wi_max = param.__xpu__wi_max; + auto wh_max = param.__xpu__wh_max; + bool float_to_fix = param.__xpu__float_to_fix; + CHECK(float_to_fix) << "W should be fixed point"; + + int dim = 1; + if (bottom->dims().size() > 1) { + dim = bottom->dims()[1]; + } + + const auto& offset = bottom->lod()[0]; + LoD top_lod; + top_lod.push_back(offset); + top->set_lod(top_lod); + std::vector top_dims_vec{cap_l, cap_h}; + top->Resize(top_dims_vec); + auto* top_hidden = top->mutable_data(TARGET(kXPU)); + const auto* dense_e2h = 
wi->data(); + const auto* dense_h2h = wh->data(); + + // Prepare idx_sorted_by_width + prepare_layout(param, bottom); + int batch = bottom->lod()[0].size() - 1; + int max_width = layout_input->lod()[0].size() - 1; + const auto& new_offset = layout_input->lod()[0]; + auto* new_emb = layout_input->mutable_data(TARGET(kXPU)); + + // Prepare offset and new_offset + int* offset_xpu = reinterpret_cast(offset_xpu_guard_->addr_); + int* new_offset_xpu = reinterpret_cast(new_offset_xpu_guard_->addr_); + float* maxs_xpu = reinterpret_cast(maxs_xpu_guard_->addr_); + CHECK_LE(offset.size(), 64); + CHECK_LE(new_offset.size(), 256); + + for (size_t i = 0; i < offset.size(); ++i) { + offset_cpu[i] = offset[i]; + } + for (size_t i = 0; i < new_offset.size(); ++i) { + new_offset_cpu[i] = new_offset[i]; + } + xpu_memcpy(offset_xpu, + offset_cpu.get(), + offset.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + xpu_memcpy(new_offset_xpu, + new_offset_cpu.get(), + new_offset.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + + int r = xdnn::search_seq2batch(ctx.GetRawContext(), + batch, + max_width, + dim, + idx_sorted_by_width->data(), + offset_xpu, + new_offset_xpu, + bottom->data(), + new_emb); + CHECK_EQ(r, 0); + + // this buffer is used for book keeping info which will be used in bp + // buffer also needed in bp, so make it larger + tmp_buffer->Resize({20, cap_l, cap_h}); + auto* buffer_data = tmp_buffer->mutable_data(TARGET(kXPU)); + // the internal hidden + auto* hidden = buffer_data + 19 * cap_l * cap_h; + + // do-findmax + float maxs_cpu[16] = {0.0f, + 0.0f, + 0.0f, + 0.0f, + wi_max[0], + 0.0f, + 0.0f, + 0.0f, + wi_max[1], + 0.0f, + 0.0f, + 0.0f, + wi_max[2], + 0.0f, + 0.0f, + 0.0f}; + xpu_memcpy(maxs_xpu, + maxs_cpu, + 16 * sizeof(float), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + r = xdnn::findmax( + ctx.GetRawContext(), new_emb, cap_l * cap_e, maxs_xpu); + CHECK_EQ(r, 0); + + // precompute embedding to hidden + for (int i = 0; i < 3; ++i) { + const int16_t* data_b = dense_e2h + i * cap_e * cap_h; // e2h, e2hr, e2hz + float* data_c = buffer_data + i * cap_l * cap_h; // w_x_e, wr_x_e, wz_x_e + int r = xdnn::gemm_int16_maxptr( + ctx.GetRawContext(), + false, + true, // trans_a, trans_b + cap_l, + cap_h, + cap_e, // m, n, k + 1.0f, + new_emb, + cap_e, // alpha, data_a, lda + data_b, + cap_e, + 0.0f, // data_b, ldb, beta + data_c, + cap_h, // data_c, ldc + nullptr, + xdnn::Activation_t::LINEAR, // bias, act + maxs_xpu, + maxs_xpu + 4 * (i + 1)); // max_a, max_b + CHECK_EQ(r, 0); + } + + r = xdnn::search_grnn(ctx.GetRawContext(), + cap_l, + cap_h, + cap_e, + max_width, + new_offset_xpu, + buffer_data, + dense_h2h, + hidden, + wh_max[0], + wh_max[1], + wh_max[2]); + CHECK_EQ(r, 0); + + r = xdnn::search_batch2seq(ctx.GetRawContext(), + batch, + max_width, + cap_h, + idx_sorted_by_width->data(), + offset_xpu, + new_offset_xpu, + hidden, + top_hidden); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_grnn, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SearchGrnnCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("tmp_buffer", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("idx_sorted_by_width", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + 
.BindOutput("layout_input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/search_grnn_compute.h b/lite/kernels/xpu/search_grnn_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..7208e782474d39eabb41b4bc969d27a1d7d5f797 --- /dev/null +++ b/lite/kernels/xpu/search_grnn_compute.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class SearchGrnnCompute : public KernelLite { + public: + using param_t = operators::SearchGrnnParam; + + void PrepareForRun() override; + + void prepare_layout(const operators::SearchGrnnParam& param, + const paddle::lite::Tensor* bottom); + void Run() override; + + private: + XPUScratchPadGuard offset_xpu_guard_; + XPUScratchPadGuard new_offset_xpu_guard_; + XPUScratchPadGuard maxs_xpu_guard_; + + std::unique_ptr idx_sorted_by_width_data_cpu; + std::unique_ptr offset_cpu; + std::unique_ptr new_offset_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/sequence_arithmetic_compute.cc b/lite/kernels/xpu/sequence_arithmetic_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..226c615dba57ae381ed2457e588c5df32f25e04b --- /dev/null +++ b/lite/kernels/xpu/sequence_arithmetic_compute.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
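For reference, prepare_layout() in the search_grnn kernel above sorts sequences by descending length and builds a batch-major offset table in which new_offset[t] records how many rows the first t time steps occupy after reorganization. The sketch below is an assumed, equivalent host-side re-implementation using a simpler counting loop; it is not the kernel code itself:

```cpp
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

// Given LoD offsets [0, o1, ..., oN], produce (idx_sorted, new_offset):
// sequences sorted by descending length, and new_offset[t] = number of rows
// covered by the first t time steps in the batch-major layout.
void PrepareLayout(const std::vector<int>& offset,
                   std::vector<int>* idx_sorted,
                   std::vector<int>* new_offset) {
  int batch = static_cast<int>(offset.size()) - 1;
  std::vector<int> width(batch);
  idx_sorted->resize(batch);
  std::iota(idx_sorted->begin(), idx_sorted->end(), 0);
  for (int i = 0; i < batch; ++i) width[i] = offset[i + 1] - offset[i];
  std::sort(idx_sorted->begin(), idx_sorted->end(),
            [&width](int a, int b) { return width[a] > width[b]; });
  int max_width = width[(*idx_sorted)[0]];
  new_offset->assign(max_width + 1, 0);
  for (int t = 0; t < max_width; ++t) {
    // Count the sequences still alive at time step t.
    int alive = 0;
    for (int i = 0; i < batch; ++i) alive += (width[i] > t) ? 1 : 0;
    (*new_offset)[t + 1] = (*new_offset)[t] + alive;
  }
}

int main() {
  std::vector<int> idx, new_off;
  PrepareLayout({0, 2, 5, 6}, &idx, &new_off);  // sequence lengths 2, 3, 1
  for (int v : new_off) std::printf("%d ", v);  // prints: 0 3 5 6
  std::printf("\n");
  return 0;
}
```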
+ +#include "lite/kernels/xpu/sequence_arithmetic_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void SequenceArithmeticCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* bottom0 = param.X; + auto* bottom1 = param.Y; + auto* top = param.Out; + + int op_type = param.op_type; + + auto len1 = bottom0->numel(); + auto len2 = bottom1->numel(); + const auto* bottom_data0 = bottom0->data(); + const auto* bottom_data1 = bottom1->data(); + auto* top_data = top->mutable_data(TARGET(kXPU)); + + switch (op_type) { + case 1: // addition: top[0] = bottom[0] + bottom[1] + if (len1 > len2) { + xdnn::elementwise_add( + ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len2); + xdnn::memcpy_device(ctx.GetRawContext(), + &top_data[len2], + &bottom_data0[len2], + (len1 - len2) * sizeof(float)); + } else { + xdnn::elementwise_add( + ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len1); + } + break; + case 2: // substraction: top[0] = bottom[0] - bottom[1] + if (len1 > len2) { + xdnn::elementwise_sub( + ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len2); + xdnn::memcpy_device(ctx.GetRawContext(), + &top_data[len2], + &bottom_data0[len2], + (len1 - len2) * sizeof(float)); + } else { + xdnn::elementwise_sub( + ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len1); + } + break; + case 3: // multiplication: top[0] = bottom[0] * bottom[1] + if (len1 > len2) { + xdnn::elementwise_mul( + ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len2); + xdnn::memcpy_device(ctx.GetRawContext(), + &top_data[len2], + &bottom_data0[len2], + (len1 - len2) * sizeof(float)); + } else { + xdnn::elementwise_mul( + ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len1); + } + break; + default: + break; + } +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_arithmetic, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(search_seq_arithmetic, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/sequence_arithmetic_compute.h b/lite/kernels/xpu/sequence_arithmetic_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..9526587ac48cd5025022d646e31c24cac6b59a13 --- /dev/null +++ b/lite/kernels/xpu/sequence_arithmetic_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class SequenceArithmeticCompute + : public KernelLite { + public: + using param_t = operators::SequenceArithmeticParam; + + void Run() override; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/sequence_concat_compute.cc b/lite/kernels/xpu/sequence_concat_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..fd7f5999a6ccb18efbcb0e96b50f2b31884fc21c --- /dev/null +++ b/lite/kernels/xpu/sequence_concat_compute.cc @@ -0,0 +1,141 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/sequence_concat_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void SequenceConcatCompute::PrepareForRun() { + lod0_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + lod1_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + + lod0_cpu.reset(new int[64]); + lod1_cpu.reset(new int[64]); +} + +template +inline LoD ConcatLoD(const std::vector& xs, + std::vector* xs_in_order) { + std::vector result; + result.resize(xs[0]->lod()[0].size()); + + for (size_t i = 1; i < result.size(); ++i) { + size_t sum = 0; + for (size_t j = 0; j < xs.size(); ++j) { + auto& x_lod = xs[j]->lod()[0]; + if (x_lod[i - 1] < x_lod[i]) { + xs_in_order->emplace_back(xs[j]->Slice(x_lod[i - 1], x_lod[i])); + } + sum += x_lod[i]; + } + result[i] = sum; + } + LoD lod; + lod.emplace_back(result); + return lod; +} + +void SequenceConcatCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto xs = param.X; + auto out = param.Out; + + size_t lod_size = 0; + for (auto& x : xs) { + if (lod_size == 0) { + lod_size = x->lod()[0].size(); + } else { + CHECK_EQ(lod_size, x->lod()[0].size()) + << "The number of sequence must be same between each input"; + } + } + CHECK_NE(lod_size, 0) << "Each input must have sequence information"; + + // TODO(miaotianxiang): + int64_t dim0 = 0; + int64_t feature_size = 0; + std::vector out_dims; + for (const auto& tensor : param.X) { + const auto x_dims = tensor->dims(); + if (out_dims.empty()) { + out_dims = x_dims.data(); + } + dim0 += x_dims[0]; + if (feature_size == 0) { + feature_size = x_dims.production() / x_dims[0]; + } else { + CHECK_EQ(feature_size, x_dims.production() / x_dims[0]) + << "Inputs of sequence concat must have same feature size"; + } + } + out_dims[0] = dim0; + out->Resize(out_dims); + std::vector x_in_order; + out->set_lod(ConcatLoD(xs, &x_in_order)); + + CHECK(xs.size() == 2) << "XPU only support sequence_pool for 2 tensors"; + + auto lod0 = 
xs[0]->lod()[0]; + auto lod1 = xs[1]->lod()[0]; + int batch_size = lod0.size() - 1; + + int* lod0_xpu = reinterpret_cast(lod0_xpu_guard_->addr_); + int* lod1_xpu = reinterpret_cast(lod1_xpu_guard_->addr_); + for (int i = 0; i < lod0.size(); ++i) { + lod0_cpu[i] = lod0[i]; + } + for (int i = 0; i < lod1.size(); ++i) { + lod1_cpu[i] = lod1[i]; + } + xpu_memcpy(lod0_xpu, + lod0_cpu.get(), + lod0.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + xpu_memcpy(lod1_xpu, + lod1_cpu.get(), + lod1.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + + int r = xdnn::sequence_concat(ctx.GetRawContext(), + xs[0]->data(), + lod0_xpu, + xs[1]->data(), + lod1_xpu, + out->mutable_data(TARGET(kXPU)), + batch_size); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_concat, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SequenceConcatCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/sequence_concat_compute.h b/lite/kernels/xpu/sequence_concat_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..5726671975d546d1e549ecbe95790c11faafba7b --- /dev/null +++ b/lite/kernels/xpu/sequence_concat_compute.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class SequenceConcatCompute + : public KernelLite { + public: + using param_t = operators::SequenceConcatParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard lod0_xpu_guard_; + XPUScratchPadGuard lod1_xpu_guard_; + + std::unique_ptr lod0_cpu; + std::unique_ptr lod1_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/sequence_pool_compute.cc b/lite/kernels/xpu/sequence_pool_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..81d9b5873c3c42afe94acdd8eb5a292326b7a7b6 --- /dev/null +++ b/lite/kernels/xpu/sequence_pool_compute.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/sequence_pool_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void XPUSequencePoolCompute::PrepareForRun() { + lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + lod_cpu.reset(new int[64]); +} + +void XPUSequencePoolCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* in = param.X; + auto* out = param.Out; + std::string pool_type_str = param.pool_type; + + auto dims = in->dims(); + auto lod = in->lod(); + dims[0] = lod[0].size() - 1; + + xdnn::Pooling_t pool_type = xdnn::Pooling_t::MAX_WITHOUT_INDEX; + if (pool_type_str == "MAX") { + } else if (pool_type_str == "LAST") { + pool_type = xdnn::Pooling_t::LAST; + } else { + CHECK(false); + } + + int num_seq = out->dims()[0]; + int dim = out->numel() / num_seq; + + auto in_lod = in->lod()[0]; + for (size_t i = 0; i < in_lod.size(); ++i) { + lod_cpu[i] = in_lod[i]; + } + int* lod_xpu = reinterpret_cast(lod_xpu_guard_->addr_); + xpu_memcpy(lod_xpu, + lod_cpu.get(), + in_lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + + int r = + xdnn::sequence_pooling_forward(ctx.GetRawContext(), + pool_type, + num_seq, + lod_xpu, + dim, + in->data(), + nullptr /* index */, + out->mutable_data(TARGET(kXPU))); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_pool, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUSequencePoolCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("MaxIndex", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/sequence_pool_compute.h b/lite/kernels/xpu/sequence_pool_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..232634de0e387e764eccdeeda4cb8fd2d5dce598 --- /dev/null +++ b/lite/kernels/xpu/sequence_pool_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
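For reference, the sequence_arithmetic kernel earlier in this patch applies the element-wise op over the shorter of the two lengths and then copies the untouched tail of X straight into the output when X is longer than Y. A CPU reference of that behaviour for the add case (illustrative only; the helper name is hypothetical):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// top = x (+) y over min(len_x, len_y); any remaining tail of x is copied.
std::vector<float> SeqAdd(const std::vector<float>& x,
                          const std::vector<float>& y) {
  std::vector<float> top(x.size());
  size_t common = std::min(x.size(), y.size());
  for (size_t i = 0; i < common; ++i) top[i] = x[i] + y[i];
  std::copy(x.begin() + common, x.end(), top.begin() + common);
  return top;
}

int main() {
  auto out = SeqAdd({1, 2, 3, 4}, {10, 20});
  for (float v : out) std::printf("%g ", v);  // prints: 11 22 3 4
  std::printf("\n");
  return 0;
}
```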
+ +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class XPUSequencePoolCompute + : public KernelLite { + public: + using param_t = operators::SequencePoolParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard lod_xpu_guard_; + + std::unique_ptr lod_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/sequence_reverse_compute.cc b/lite/kernels/xpu/sequence_reverse_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..11e4b80570c19fa90e7846d18a88f966f9a003b7 --- /dev/null +++ b/lite/kernels/xpu/sequence_reverse_compute.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/sequence_reverse_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template +void SequenceReverseCompute::PrepareForRun() { + lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + lod_cpu.reset(new int[64]); +} + +template +void SequenceReverseCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* x = param.X; + auto* y = param.Out; + + auto lod = x->lod()[0]; + size_t limit = x->numel(); + size_t ele_cnt_in_4_byte = limit / x->dims()[0]; + auto* x_data = x->template data(); + auto* y_data = y->template mutable_data(TARGET(kXPU)); + int batch_size = lod.size() - 1; + + if (std::is_same::value) { + ele_cnt_in_4_byte /= 4; + } else if (std::is_same::value) { + // remain the same + } else if (std::is_same::value) { + ele_cnt_in_4_byte *= 2; + } else if (std::is_same::value) { + // remain the same + } else if (std::is_same::value) { + ele_cnt_in_4_byte *= 2; + } + + for (size_t i = 0; i < lod.size(); ++i) { + lod_cpu[i] = lod[i]; + } + int* lod_xpu = reinterpret_cast(lod_xpu_guard_->addr_); + xpu_memcpy(lod_xpu, + lod_cpu.get(), + lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + + int r = xdnn::sequence_reverse(ctx.GetRawContext(), + batch_size, + lod_xpu, + ele_cnt_in_4_byte, + reinterpret_cast(x_data), + reinterpret_cast(y_data)); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +namespace xpu = paddle::lite::kernels::xpu; +using SequenceReverseFp32 = + xpu::SequenceReverseCompute; +using SequenceReverseInt64 = + xpu::SequenceReverseCompute; + +REGISTER_LITE_KERNEL( + sequence_reverse, kXPU, kFloat, kNCHW, SequenceReverseFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + sequence_reverse, kXPU, kInt64, kNCHW, 
SequenceReverseInt64, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/xpu/sequence_reverse_compute.h b/lite/kernels/xpu/sequence_reverse_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..91b285de767c65f93352380df7877e53d61ccd53 --- /dev/null +++ b/lite/kernels/xpu/sequence_reverse_compute.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template +class SequenceReverseCompute : public KernelLite { + public: + using param_t = operators::SequenceReverseParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard lod_xpu_guard_; + std::unique_ptr lod_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/sequence_topk_avg_pooling_compute.cc b/lite/kernels/xpu/sequence_topk_avg_pooling_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..54c74211f9738995a8191c77e879a85762d71b3b --- /dev/null +++ b/lite/kernels/xpu/sequence_topk_avg_pooling_compute.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
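For reference, sequence_reverse above converts the per-row element count into 4-byte units so one device routine can reverse several dtypes. The dtype branches lost their template arguments in this copy of the patch; the sketch below assumes the conversion factor is simply sizeof(T) / 4:

```cpp
#include <cstdint>
#include <cstdio>

// Number of 4-byte words occupied by `elems_per_row` elements of type T.
// Mirrors the ele_cnt_in_4_byte adjustment in the XPU sequence_reverse kernel.
template <typename T>
size_t EleCntIn4Byte(size_t elems_per_row) {
  return elems_per_row * sizeof(T) / 4;
}

int main() {
  std::printf("float : %zu\n", EleCntIn4Byte<float>(8));    // 8
  std::printf("int64 : %zu\n", EleCntIn4Byte<int64_t>(8));  // 16
  std::printf("int16 : %zu\n", EleCntIn4Byte<int16_t>(8));  // 4
  return 0;
}
```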
+ +#include "lite/kernels/xpu/sequence_topk_avg_pooling_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void SequenceTopkAvgPoolingCompute::PrepareForRun() { + lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(256 * sizeof(int)); + in_lod_cpu.reset(new int[64]); + row_lod_cpu.reset(new int[64]); + col_lod_cpu.reset(new int[64]); +} + +void SequenceTopkAvgPoolingCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* in = param.X; + auto* row = param.ROW; + auto* col = param.COLUMN; + auto* out = param.Out; + auto* pos = param.pos; + + auto channel_num = param.channel_num; + auto topks = param.topks; + auto k_num = topks.size(); + auto max_k = topks[topks.size() - 1]; + auto in_lod = in->lod()[0]; + + auto row_lod = row->lod()[0]; + auto col_lod = col->lod()[0]; + int batch_size = row_lod.size() - 1; + int pos_total_size = row_lod[batch_size] * channel_num * max_k; + std::vector vec_pos_shape; + vec_pos_shape.push_back(pos_total_size); + pos->Resize(vec_pos_shape); + auto pos_data = pos->mutable_data(TARGET(kXPU)); + + int offset = 0; + std::vector vec_out_lod; + vec_out_lod.reserve(batch_size + 1); + for (int i = 0; i <= batch_size; ++i) { + offset = row_lod[i]; + vec_out_lod.push_back(offset); + } + LoD lod_temp; + lod_temp.push_back(vec_out_lod); + out->set_lod(lod_temp); + + auto in_data = in->data(); + auto out_data = out->mutable_data(TARGET(kXPU)); + + int* in_lod_xpu = reinterpret_cast(lod_xpu_guard_->addr_); + int* row_lod_xpu = in_lod_xpu + in_lod.size(); + int* col_lod_xpu = row_lod_xpu + row_lod.size(); + int* topks_xpu = col_lod_xpu + col_lod.size(); + for (int i = 0; i < in_lod.size(); ++i) { + in_lod_cpu[i] = in_lod[i]; + } + for (int i = 0; i < row_lod.size(); ++i) { + row_lod_cpu[i] = row_lod[i]; + } + for (int i = 0; i < col_lod.size(); ++i) { + col_lod_cpu[i] = col_lod[i]; + } + xpu_memcpy(in_lod_xpu, + in_lod_cpu.get(), + in_lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + xpu_memcpy(row_lod_xpu, + row_lod_cpu.get(), + row_lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + xpu_memcpy(col_lod_xpu, + col_lod_cpu.get(), + col_lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + xpu_memcpy(topks_xpu, + topks.data(), + topks.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + + int r = xdnn::sequence_topk_avg_pooling(ctx.GetRawContext(), + in_data, + out_data, + pos_data, + batch_size, + channel_num, + in_lod_xpu, + row_lod_xpu, + col_lod_xpu, + topks_xpu, + k_num); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_topk_avg_pooling, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SequenceTopkAvgPoolingCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("ROW", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("COLUMN", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("pos", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/sequence_topk_avg_pooling_compute.h b/lite/kernels/xpu/sequence_topk_avg_pooling_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..7c54ca96225ee9ec37d6d0487a526347c19fdb2d --- /dev/null +++ b/lite/kernels/xpu/sequence_topk_avg_pooling_compute.h @@ -0,0 
+1,45 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class SequenceTopkAvgPoolingCompute + : public KernelLite { + public: + using param_t = operators::SequenceTopkAvgPoolingParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard lod_xpu_guard_; + std::unique_ptr in_lod_cpu; + std::unique_ptr row_lod_cpu; + std::unique_ptr col_lod_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/subgraph_compute.cc b/lite/kernels/xpu/subgraph_compute.cc index 9c2191331c85a7f99ffb5a2e9662ed5831cb1dda..981922f8eacab57da4638e1fdcdd3df72465b379 100644 --- a/lite/kernels/xpu/subgraph_compute.cc +++ b/lite/kernels/xpu/subgraph_compute.cc @@ -27,12 +27,35 @@ namespace lite { namespace kernels { namespace xpu { -int SubgraphEngine::BuildDeviceProgram() { +bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() { + // Obtain the origin input tensors, and create the origin output + // tensors(Don't try to access them before launch the device program or the + // origin program) + PrepareWorkspaceForOriginProgram(); + // Create the device input and output tensors, but don't initialize them + // with the dimensions + device_itensors_.resize(input_names_.size()); + for (int i = 0; i < input_names_.size(); i++) { + device_itensors_[i].reset(new hiai::AiTensor); + CHECK(device_itensors_[i]); + } + device_otensors_.resize(output_names_.size()); + for (int i = 0; i < output_names_.size(); i++) { + device_otensors_[i].reset(new hiai::AiTensor); + CHECK(device_otensors_[i]); + } + return true; +} + +bool SubgraphEngine::BuildDeviceProgram() { int status = 0; // Convert all of ops and their input vars and weights and added into the XPU // IR graph subgraph::xpu::Graph graph; const auto& bridges = subgraph::Registry::Instance(); + if (origin_program_.empty()) { + BuildOriginProgram(); + } for (auto& inst : origin_program_) { auto op = const_cast(inst.op()); CHECK(op); @@ -40,13 +63,13 @@ int SubgraphEngine::BuildDeviceProgram() { op->InferShape(); std::string op_type = op->op_info()->Type(); if (!bridges.Exists(op_type, TARGET(kXPU))) { - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); status |= bridges.Select(op_type, TARGET(kXPU))( reinterpret_cast(&graph), op, const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } // Obtain the output nodes of the XPU IR graph and build the graph to the XPU @@ -86,7 +109,7 @@ int SubgraphEngine::BuildDeviceProgram() { &graph.builder_, &graph.params_, &device_onodes); if (device_program_ == nullptr) { LOG(WARNING) << "[XPU] Build model failed!"; - return subgraph::FAILED; + return false; } // Query and check the 
dimensions of input and output tensors @@ -166,10 +189,10 @@ int SubgraphEngine::BuildDeviceProgram() { device_otensors_[i].strides = nullptr; device_otensors_[i].byte_offset = 0; } - return status; + return true; } -int SubgraphEngine::LaunchDeviceProgram() { +bool SubgraphEngine::LaunchDeviceProgram() { for (size_t i = 0; i < device_itensors_.size(); i++) { // Update the data pointer of DLTensor to track the origin input tensors device_itensors_[i].data = @@ -191,7 +214,7 @@ int SubgraphEngine::LaunchDeviceProgram() { const_cast(origin_otensors_[i]->raw_data()); device_program_->CopyOutputTo(i, &device_otensors_[i]); } - return 0; + return true; } void SubgraphCompute::PrepareForRun() { @@ -203,12 +226,11 @@ void SubgraphCompute::PrepareForRun() { param.output_data_names, param.scope)); CHECK(engine_); - engine_->Build(); } void SubgraphCompute::Run() { CHECK(engine_); - engine_->Launch(); + engine_->Run(); } } // namespace xpu diff --git a/lite/kernels/xpu/subgraph_compute.h b/lite/kernels/xpu/subgraph_compute.h index 601c8821bc826e350c233573bf7eff89cdf5c1f5..f09a06a85d5382c72e9efb20cede8bea1922f2da 100644 --- a/lite/kernels/xpu/subgraph_compute.h +++ b/lite/kernels/xpu/subgraph_compute.h @@ -39,13 +39,14 @@ class SubgraphEngine : public subgraph::Engine { ctx, block_idx, block_desc, input_names, output_names, scope) {} protected: - int BuildDeviceProgram() override; - int LaunchDeviceProgram() override; + bool PrepareWorkspaceForDeviceProgram() override; + bool BuildDeviceProgram() override; + bool LaunchDeviceProgram() override; std::vector device_inames_; std::vector device_onames_; - std::vector device_itensors_; - std::vector device_otensors_; + std::vector device_itensors_{}; + std::vector device_otensors_{}; std::unique_ptr device_program_{nullptr}; }; diff --git a/lite/kernels/xpu/var_conv_2d_compute.cc b/lite/kernels/xpu/var_conv_2d_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..b573c810922db98e901c9f9a1953116f3fdfc657 --- /dev/null +++ b/lite/kernels/xpu/var_conv_2d_compute.cc @@ -0,0 +1,139 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
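For reference, the sequence_topk_avg_pooling kernel above packs its four integer tables (input lod, row lod, column lod, topks) back to back in the single 256-int scratchpad allocated in PrepareForRun, deriving each device pointer by offsetting from the buffer start. A host-side sketch of that packing (hypothetical helper, not a Lite API):

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

// Lay out in_lod | row_lod | col_lod | topks contiguously, recording the
// start offset of each table so device pointers can be derived by addition.
std::vector<int> PackLodTables(const std::vector<std::vector<int>>& tables,
                               std::vector<int>* starts) {
  std::vector<int> packed;
  starts->clear();
  for (const auto& t : tables) {
    starts->push_back(static_cast<int>(packed.size()));
    packed.insert(packed.end(), t.begin(), t.end());
  }
  assert(packed.size() <= 256u);  // scratchpad allocated as 256 * sizeof(int)
  return packed;
}

int main() {
  std::vector<int> starts;
  auto packed =
      PackLodTables({{0, 4, 9}, {0, 2, 3}, {0, 3, 5}, {1, 3}}, &starts);
  for (int s : starts) std::printf("start=%d ", s);  // start=0 start=3 start=6 start=9
  std::printf("total=%zu\n", packed.size());
  return 0;
}
```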
+ +#include "lite/kernels/xpu/var_conv_2d_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void VarConv2DCompute::PrepareForRun() { + offset_x_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + offset_y_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + offset_x_cpu.reset(new int[64]); + offset_y_cpu.reset(new int[64]); +} + +void VarConv2DCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* bottom = param.X; + auto* w = param.W; + auto* top = param.Out; + + int output_channel = param.output_channel; + int input_channel = param.input_channel; + int kernel_h = param.kernel_h; + int kernel_w = param.kernel_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + float w_max = param.__xpu__w_max; + bool fuse_relu = param.fuse_relu; + bool float_to_fix = param.__xpu__float_to_fix; + CHECK(float_to_fix) << "W should be fixed point"; + + xdnn::Activation_t act = xdnn::Activation_t::LINEAR; + if (fuse_relu) { + act = xdnn::Activation_t::RELU; + } + + int batch = bottom->lod()[0].size() - 1; + const auto& offset_x = bottom->lod()[2]; + const auto& offset_y = bottom->lod()[1]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + int top_im_y = 0; + if (width != 0) { + top_im_x = (width - 1) / stride_w + 1; + } + if (height != 0) { + top_im_y = (height - 1) / stride_h + 1; + } + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + top_offset.push_back(top_size); + } + + LoD top_lod; + top_lod.push_back(top_offset); + top_lod.push_back(bottom->lod()[1]); + top_lod.push_back(bottom->lod()[2]); + top->set_lod(top_lod); + std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + top->Resize(top_dims_vec); + auto* top_data = top->mutable_data(TARGET(kXPU)); + + auto* bottom_data = bottom->data(); + auto* w_data = w->data(); + + int* offset_x_xpu = reinterpret_cast(offset_x_xpu_guard_->addr_); + int* offset_y_xpu = reinterpret_cast(offset_y_xpu_guard_->addr_); + for (int i = 0; i < (batch + 1); ++i) { + offset_x_cpu[i] = offset_x[i]; + offset_y_cpu[i] = offset_y[i]; + } + xpu_memcpy(offset_x_xpu, + offset_x_cpu.get(), + (batch + 1) * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + xpu_memcpy(offset_y_xpu, + offset_y_cpu.get(), + (batch + 1) * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + + int r = xdnn::search_varconv(ctx.GetRawContext(), + batch, + input_channel, + output_channel, + kernel_h, + kernel_w, + stride_h, + stride_w, + bottom_data, + w_data, + offset_x_xpu, + offset_y_xpu, + top_data, + w_max, + act); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(var_conv_2d, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::VarConv2DCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Col", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/var_conv_2d_compute.h b/lite/kernels/xpu/var_conv_2d_compute.h new file mode 100644 index 
0000000000000000000000000000000000000000..4d9f0ca7a9851a0c3071e72519c4ad1f40ea3483 --- /dev/null +++ b/lite/kernels/xpu/var_conv_2d_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class VarConv2DCompute : public KernelLite { + public: + using param_t = operators::VarConv2DParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard offset_x_xpu_guard_; + XPUScratchPadGuard offset_y_xpu_guard_; + std::unique_ptr offset_x_cpu; + std::unique_ptr offset_y_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/CMakeLists.txt b/lite/model_parser/CMakeLists.txt index 34d524c5c1b86fb6b689b86089c355e3de42a34e..a83cecf4444910e710d0eb92b9c3449190f5bda2 100644 --- a/lite/model_parser/CMakeLists.txt +++ b/lite/model_parser/CMakeLists.txt @@ -1,8 +1,9 @@ if (NOT LITE_ON_TINY_PUBLISH) add_subdirectory(pb) endif() -add_subdirectory(cpp) +add_subdirectory(general) add_subdirectory(naive_buffer) +add_subdirectory(flatbuffers) #lite_cc_library(runtime_lite SRCS runtime.cc) diff --git a/lite/model_parser/desc_apis.h b/lite/model_parser/base/apis.h similarity index 95% rename from lite/model_parser/desc_apis.h rename to lite/model_parser/base/apis.h index 28d7f84b2a574a0399046636c9b809c0878f8d4d..2ad6ff47ee17fcdfab335b3a6f87229811d971ae 100644 --- a/lite/model_parser/desc_apis.h +++ b/lite/model_parser/base/apis.h @@ -17,5 +17,6 @@ #include "lite/model_parser/base/block_desc.h" #include "lite/model_parser/base/op_desc.h" #include "lite/model_parser/base/program_desc.h" +#include "lite/model_parser/base/traits.h" #include "lite/model_parser/base/var_desc.h" #include "lite/utils/all.h" diff --git a/lite/model_parser/base/block_desc.h b/lite/model_parser/base/block_desc.h index f4ddfddf406e76905f0286441d09b50402513ac6..3fd7998aa392034173f7474bc6b4d106f9fbcbd4 100644 --- a/lite/model_parser/base/block_desc.h +++ b/lite/model_parser/base/block_desc.h @@ -17,6 +17,7 @@ #include #include #include +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -46,11 +47,11 @@ class BlockDescReadAPI { class BlockDescWriteAPI { public: - virtual void SetIdx(int32_t idx) = 0; - virtual void SetParentIdx(int32_t idx) = 0; - virtual void ClearVars() = 0; - virtual void ClearOps() = 0; - virtual void SetForwardBlockIdx(int32_t idx) = 0; + virtual void SetIdx(int32_t idx) { NotImplemented(); } + virtual void SetParentIdx(int32_t idx) { NotImplemented(); } + virtual void ClearVars() { NotImplemented(); } + virtual void ClearOps() { NotImplemented(); } + virtual void SetForwardBlockIdx(int32_t idx) { NotImplemented(); } template T* AddVar(); @@ -59,6 +60,11 @@ class BlockDescWriteAPI { T* AddOp(); 
virtual ~BlockDescWriteAPI() = default; + + private: + void NotImplemented() const { + LOG(FATAL) << "BlockDescWriteAPI is not available in model read-only mode."; + } }; // The reading and writing of the model are one-time and separate. diff --git a/lite/model_parser/base/op_desc.h b/lite/model_parser/base/op_desc.h index 144f7064f07f16f58b1aa97da819862acb312a63..185f5917c46127de1e16e274d0be95073b1a37f6 100644 --- a/lite/model_parser/base/op_desc.h +++ b/lite/model_parser/base/op_desc.h @@ -15,56 +15,13 @@ #pragma once #include #include +#include "lite/model_parser/base/traits.h" +#include "lite/utils/cp_logging.h" #include "lite/utils/string.h" namespace paddle { namespace lite { -// The AttrType is used to make the proto::AttrType portable. -enum class OpAttrType { - INT = 0, - FLOAT = 1, - STRING = 2, - INTS = 3, - FLOATS = 4, - STRINGS = 5, - BOOLEAN = 6, - BOOLEANS = 7, - BLOCK = 8, - LONG = 9, - BLOCKS = 10, - LONGS = 11, - UNK, -}; - -template -struct OpAttrTypeTrait; - -template -struct OpDataTypeTrait; - -#define TYPE_TRAIT_IMPL(T, type__) \ - template <> \ - struct OpAttrTypeTrait { \ - typedef type__ DT; \ - }; \ - template <> \ - struct OpDataTypeTrait { \ - static constexpr OpAttrType AT = OpAttrType::T; \ - static constexpr const char* ATN = #T; \ - }; - -TYPE_TRAIT_IMPL(INT, int32_t); -TYPE_TRAIT_IMPL(FLOAT, float); -TYPE_TRAIT_IMPL(STRING, std::string); -TYPE_TRAIT_IMPL(BOOLEAN, bool); -TYPE_TRAIT_IMPL(LONG, int64_t); -TYPE_TRAIT_IMPL(INTS, std::vector); -TYPE_TRAIT_IMPL(FLOATS, std::vector); -TYPE_TRAIT_IMPL(STRINGS, std::vector); -TYPE_TRAIT_IMPL(LONGS, std::vector); -#undef TYPE_TRAIT_IMPL - class OpDescReadAPI { public: virtual std::string Type() const = 0; @@ -105,16 +62,25 @@ class OpDescReadAPI { class OpDescWriteAPI { public: - virtual void SetType(const std::string& type) = 0; + virtual void SetType(const std::string& type) { NotImplemented(); } virtual void SetInput(const std::string& param, - const std::vector& args) = 0; + const std::vector& args) { + NotImplemented(); + } virtual void SetOutput(const std::string& param, - const std::vector& args) = 0; + const std::vector& args) { + NotImplemented(); + } template void SetAttr(const std::string& name, const T& v); virtual ~OpDescWriteAPI() = default; + + private: + void NotImplemented() const { + LOG(FATAL) << "OpDescWriteAPI is not available in model read-only mode."; + } }; // The reading and writing of the model are one-time and separate. diff --git a/lite/model_parser/base/program_desc.h b/lite/model_parser/base/program_desc.h index f04aa1ddf6f62e2eb3129c92f53c9401b6fdefc7..c4423f288d8ea90039ffad0db08342b594415fe6 100644 --- a/lite/model_parser/base/program_desc.h +++ b/lite/model_parser/base/program_desc.h @@ -14,6 +14,8 @@ #pragma once +#include "lite/utils/cp_logging.h" + namespace paddle { namespace lite { @@ -34,13 +36,19 @@ class ProgramDescReadAPI { class ProgramDescWriteAPI { public: - virtual void ClearBlocks() = 0; - virtual void SetVersion(int64_t version) = 0; + virtual void ClearBlocks() { NotImplemented(); } + virtual void SetVersion(int64_t version) { NotImplemented(); } template T* AddBlock(); virtual ~ProgramDescWriteAPI() = default; + + private: + void NotImplemented() const { + LOG(FATAL) + << "ProgramDescWriteAPI is not available in model read-only mode."; + } }; // The reading and writing of the model are one-time and separate. 
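For reference, the base desc headers above turn the write-side setters from pure virtual methods into defaults that log fatally, so read-only backends (such as the flatbuffers view classes) only override what they actually support. A stripped-down sketch of the pattern with illustrative names (not the Lite classes):

```cpp
#include <cstdint>
#include <cstdlib>
#include <iostream>

// Write interface: every setter has a "not available in read-only mode"
// default instead of being pure virtual, mirroring ProgramDescWriteAPI.
class WriteAPI {
 public:
  virtual void SetVersion(int64_t) { NotImplemented(); }
  virtual ~WriteAPI() = default;

 private:
  void NotImplemented() const {
    std::cerr << "write API not available in read-only mode\n";
    std::abort();
  }
};

// A read-only view inherits the write interface without implementing it.
class ReadOnlyDesc : public WriteAPI {};

// A mutable desc overrides only the setters it actually supports.
class MutableDesc : public WriteAPI {
 public:
  void SetVersion(int64_t v) override { version_ = v; }
  int64_t version_ = 0;
};

int main() {
  MutableDesc m;
  m.SetVersion(2);
  std::cout << "version=" << m.version_ << "\n";
  // ReadOnlyDesc{}.SetVersion(1);  // would abort at runtime, by design
  return 0;
}
```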
diff --git a/lite/model_parser/base/traits.h b/lite/model_parser/base/traits.h new file mode 100644 index 0000000000000000000000000000000000000000..bda293686c7996abb9b0fe36edcc84407ed3b541 --- /dev/null +++ b/lite/model_parser/base/traits.h @@ -0,0 +1,82 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +namespace paddle { +namespace lite { + +// The AttrType is used to make the proto::AttrType portable. +enum class OpAttrType { + INT = 0, + FLOAT = 1, + STRING = 2, + INTS = 3, + FLOATS = 4, + STRINGS = 5, + BOOLEAN = 6, + BOOLEANS = 7, + BLOCK = 8, + LONG = 9, + BLOCKS = 10, + LONGS = 11, + UNK, +}; + +struct Standard {}; +struct Flatbuffers {}; + +template +class VectorView; + +template +struct OpDataTypeTrait; + +#define ATTR_TYPE_TRAIT_IMPL(T, type__) \ + template \ + struct OpDataTypeTrait { \ + typedef type__ ET; \ + typedef type__ RT; \ + static constexpr OpAttrType AT = OpAttrType::T; \ + static constexpr const char* ATN = #T; \ + }; +#define ATTR_VECTOR_TYPE_TRAIT_IMPL(T, type__) \ + template \ + struct OpDataTypeTrait, U> { \ + typedef type__ ET; \ + typedef VectorView RT; \ + static constexpr OpAttrType AT = OpAttrType::T; \ + static constexpr const char* ATN = #T; \ + }; + +ATTR_TYPE_TRAIT_IMPL(BLOCK, int16_t); +ATTR_TYPE_TRAIT_IMPL(INT, int32_t); +ATTR_TYPE_TRAIT_IMPL(FLOAT, float); +ATTR_TYPE_TRAIT_IMPL(STRING, std::string); +ATTR_TYPE_TRAIT_IMPL(BOOLEAN, bool); +ATTR_TYPE_TRAIT_IMPL(LONG, int64_t); + +ATTR_VECTOR_TYPE_TRAIT_IMPL(INTS, int32_t); +ATTR_VECTOR_TYPE_TRAIT_IMPL(FLOATS, float); +ATTR_VECTOR_TYPE_TRAIT_IMPL(STRINGS, std::string); +ATTR_VECTOR_TYPE_TRAIT_IMPL(LONGS, int64_t); + +#undef ATTR_TYPE_TRAIT_IMPL +#undef ATTR_VECTOR_TYPE_TRAIT_IMPL + +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/base/var_desc.h b/lite/model_parser/base/var_desc.h index 0aa88d02d540f297c2995b8e7c1ccf4eca8472c0..47596f8792a83677a036bcb3d937e67576204546 100644 --- a/lite/model_parser/base/var_desc.h +++ b/lite/model_parser/base/var_desc.h @@ -16,6 +16,7 @@ #include #include +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -62,11 +63,16 @@ class VarDescReadAPI { class VarDescWriteAPI { public: - virtual void SetName(std::string name) = 0; - virtual void SetType(VarDataType type) = 0; - virtual void SetPersistable(bool persistable) = 0; - virtual void SetShape(const std::vector& dims) = 0; + virtual void SetName(std::string name) { NotImplemented(); } + virtual void SetType(VarDataType type) { NotImplemented(); } + virtual void SetPersistable(bool persistable) { NotImplemented(); } + virtual void SetShape(const std::vector& dims) { NotImplemented(); } virtual ~VarDescWriteAPI() = default; + + private: + void NotImplemented() const { + LOG(FATAL) << "VarDescWriteAPI is not available in model read-only mode."; + } }; // The reading and writing of the model are one-time and separate. 
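For reference, traits.h above maps each attribute kind to an element type (ET) and a return type (RT), with vector attributes returning a backend-tagged VectorView rather than a concrete std::vector. A minimal analogue showing the intended compile-time mapping (all types here are stand-ins, not the Lite traits):

```cpp
#include <cstdint>
#include <type_traits>
#include <vector>

// Cut-down analogue of lite::OpDataTypeTrait: scalar attributes return the
// value type itself, vector attributes return a non-owning view type.
struct Standard {};

template <typename T, typename U = Standard>
struct View { const std::vector<T>* v; };

template <typename T, typename U = Standard>
struct AttrTrait;

template <typename U>
struct AttrTrait<int32_t, U> {
  using ET = int32_t;
  using RT = int32_t;           // scalars are returned by value
};

template <typename U>
struct AttrTrait<std::vector<int32_t>, U> {
  using ET = int32_t;
  using RT = View<int32_t, U>;  // vectors are returned as a view
};

static_assert(std::is_same<AttrTrait<int32_t>::RT, int32_t>::value, "");
static_assert(
    std::is_same<AttrTrait<std::vector<int32_t>>::RT, View<int32_t>>::value,
    "");

int main() { return 0; }
```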
diff --git a/lite/model_parser/base/vector_view.h b/lite/model_parser/base/vector_view.h new file mode 100644 index 0000000000000000000000000000000000000000..c6337faa403a2c9a2758b90a4c1f7d092554b0b2 --- /dev/null +++ b/lite/model_parser/base/vector_view.h @@ -0,0 +1,84 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/model_parser/base/traits.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace vector_view { + +template +struct ElementTraits { + typedef T element_type; +}; + +template +struct VectorTraits; + +template +struct VectorTraits { + typedef std::vector vector_type; + typedef typename vector_type::const_iterator const_iterator; + typedef typename vector_type::const_reference const_reference; + typedef const_reference subscript_return_type; +}; + +} // namespace vector_view + +// In the process of optimizing the performance of model loading, we found +// that it was necessary to reduce the copying and construction of STL +// containers. So use VectorView to simulate the operation of STL containers +// without copying, such as iteration and subscripting. +// +// Currently, VectorView is applicable to STL vector and Flatbuffers Vector. +// We used the template Traits to unify the behavior of the two, and provided +// an implicit conversion operator from VectorView to STL vector. Please use +// implicit conversion with caution because it will bring significant overhead. 
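The comment above gives the rationale; the Standard specialization that follows simply wraps a pointer to an existing std::vector. A self-contained stand-in showing the intended usage, including where the deliberately costly implicit copy kicks in (this class is an illustration, not the in-tree VectorView):

```cpp
#include <cstdio>
#include <vector>

// Minimal stand-in for lite::VectorView<T, Standard>: a non-owning wrapper
// that forwards iteration and subscripting to the underlying std::vector.
template <typename T>
class VectorViewLike {
 public:
  explicit VectorViewLike(const std::vector<T>* v) : v_(v) {}
  const T& operator[](size_t i) const { return (*v_)[i]; }
  typename std::vector<T>::const_iterator begin() const { return v_->begin(); }
  typename std::vector<T>::const_iterator end() const { return v_->end(); }
  size_t size() const { return v_->size(); }
  // Implicit conversion copies every element: cheap to write, costly to run.
  operator std::vector<T>() const { return *v_; }

 private:
  const std::vector<T>* v_;
};

int main() {
  std::vector<int> dims{1, 3, 224, 224};
  VectorViewLike<int> view(&dims);
  long prod = 1;
  for (int d : view) prod *= d;  // iteration, no copy
  std::vector<int> copy = view;  // explicit opt-in copy
  std::printf("numel=%ld copied=%zu\n", prod, copy.size());
  return 0;
}
```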
+ +template +class VectorView { + public: + typedef vector_view::VectorTraits Traits; + explicit VectorView(typename Traits::vector_type const* cvec) { + cvec_ = cvec; + } + typename Traits::subscript_return_type operator[](size_t i) const { + return cvec_->operator[](i); + } + typename Traits::const_iterator begin() const { return cvec_->begin(); } + typename Traits::const_iterator end() const { return cvec_->end(); } + size_t size() const { return cvec_->size(); } + operator std::vector() const { + VLOG(5) << "Copying elements out of VectorView will damage performance."; + std::vector tmp; + tmp.reserve(cvec_->size()); + for (auto val : *cvec_) { + tmp.push_back(val); + } + return tmp; + } + ~VectorView() = default; + + private: + typename Traits::vector_type const* cvec_; +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/compatibility.cc b/lite/model_parser/compatibility.cc index 67d7c9d69152d31d1381ea847ef859a08e4f82a7..dd43f7bd25277e34a2fd8b04aae6b705402a0436 100644 --- a/lite/model_parser/compatibility.cc +++ b/lite/model_parser/compatibility.cc @@ -20,10 +20,7 @@ #include "lite/model_parser/naive_buffer/program_desc.h" #include "lite/model_parser/naive_buffer/var_desc.h" #ifndef LITE_ON_TINY_PUBLISH -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/program_desc.h" -#include "lite/model_parser/cpp/var_desc.h" +#include "lite/model_parser/cpp_desc.h" #endif namespace paddle { diff --git a/lite/model_parser/compatibility.h b/lite/model_parser/compatibility.h index 9e421d709d1823852d6dac5cd0070b4330f56752..a47870cf9c4d8e1743f2eb749823e88f18b33900 100644 --- a/lite/model_parser/compatibility.h +++ b/lite/model_parser/compatibility.h @@ -17,7 +17,7 @@ #include #include #include "lite/api/paddle_place.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" namespace paddle { namespace lite { diff --git a/lite/model_parser/compatibility_test.cc b/lite/model_parser/compatibility_test.cc index b3cb38f1c95649567b72d73b8938420537ec7b5b..957bcb25ea68b5555c9937de4e87dc8e9c4923b1 100644 --- a/lite/model_parser/compatibility_test.cc +++ b/lite/model_parser/compatibility_test.cc @@ -17,10 +17,7 @@ #include "lite/api/paddle_lite_factory_helper.h" #include "lite/model_parser/compatible_pb.h" -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/program_desc.h" -#include "lite/model_parser/cpp/var_desc.h" +#include "lite/model_parser/cpp_desc.h" USE_LITE_KERNEL(leaky_relu, kCUDA, kFloat, kNCHW, def); diff --git a/lite/model_parser/compatible_pb.h b/lite/model_parser/compatible_pb.h index 80fee49133130b09fbdd490ed86dce0af924aac1..c9889a5879160dd60ec64c4806df8af888db99c9 100644 --- a/lite/model_parser/compatible_pb.h +++ b/lite/model_parser/compatible_pb.h @@ -21,10 +21,7 @@ * lite::pb::XXDesc/lite::naive_buffer::XXDesc. 
*/ -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/program_desc.h" -#include "lite/model_parser/cpp/var_desc.h" +#include "lite/model_parser/cpp_desc.h" namespace paddle { namespace lite { diff --git a/lite/model_parser/compatible_pb_test.cc b/lite/model_parser/compatible_pb_test.cc index 088b64bf2cd13ce0f443f962bd2cb5f709c4d4f2..d9a46e463209eb33e6f2cb53f4644056f88e7085 100644 --- a/lite/model_parser/compatible_pb_test.cc +++ b/lite/model_parser/compatible_pb_test.cc @@ -14,10 +14,7 @@ #include "lite/model_parser/compatible_pb.h" #include -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/program_desc.h" -#include "lite/model_parser/cpp/var_desc.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/model_parser/naive_buffer/block_desc.h" #include "lite/model_parser/naive_buffer/op_desc.h" #include "lite/model_parser/naive_buffer/program_desc.h" diff --git a/lite/model_parser/cpp_desc.h b/lite/model_parser/cpp_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..477f90a28d7bf1e31dbc648b18af42381e0c93d6 --- /dev/null +++ b/lite/model_parser/cpp_desc.h @@ -0,0 +1,26 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/model_parser/general/block_desc.h" +#include "lite/model_parser/general/op_desc.h" +#include "lite/model_parser/general/program_desc.h" +#include "lite/model_parser/general/var_desc.h" + +namespace paddle { +namespace lite { +namespace cpp = general; +} +} diff --git a/lite/model_parser/flatbuffers/CMakeLists.txt b/lite/model_parser/flatbuffers/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ca669bfeb512de47f3a15eb7119f12487accc8a --- /dev/null +++ b/lite/model_parser/flatbuffers/CMakeLists.txt @@ -0,0 +1,16 @@ +function(lite_fbs_library TARGET) + set(multiValueArgs SRCS FBS_DEPS) + cmake_parse_arguments(args "" "" "${multiValueArgs}" ${ARGN}) + lite_cc_library(${TARGET} SRCS ${args_SRCS}) + add_dependencies(${TARGET} ${args_FBS_DEPS}) +endfunction() + +lite_fbs_library(fbs_op_desc SRCS op_desc.cc FBS_DEPS framework_fbs_header) +lite_fbs_library(fbs_var_desc SRCS var_desc.cc FBS_DEPS framework_fbs_header) +lite_fbs_library(fbs_block_desc SRCS block_desc.cc FBS_DEPS framework_fbs_header) +lite_fbs_library(fbs_program_desc SRCS program_desc.cc FBS_DEPS framework_fbs_header) + +lite_cc_test(test_vector_view SRCS vector_view_test.cc) +if (TARGET test_vector_view) + add_dependencies(test_vector_view framework_fbs_header) +endif() diff --git a/lite/model_parser/flatbuffers/block_desc.cc b/lite/model_parser/flatbuffers/block_desc.cc new file mode 100644 index 0000000000000000000000000000000000000000..fc43af6d6273c845f00e2046ae846f044659fe57 --- /dev/null +++ b/lite/model_parser/flatbuffers/block_desc.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/model_parser/flatbuffers/block_desc.h" + +namespace paddle { +namespace lite { +namespace fbs { + +template <> +proto::VarDesc* BlockDesc::GetVar(int32_t idx) { + CHECK_LT(idx, VarsSize()) << "idx >= vars.size()"; + return const_cast(desc_->vars()->Get(idx)); +} + +template <> +proto::OpDesc* BlockDesc::GetOp(int32_t idx) { + CHECK_LT(idx, OpsSize()) << "idx >= ops.size()"; + return const_cast(desc_->ops()->Get(idx)); +} + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/block_desc.h b/lite/model_parser/flatbuffers/block_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..0bfef5a452051c37e31f9d2c6ab2504e9addd800 --- /dev/null +++ b/lite/model_parser/flatbuffers/block_desc.h @@ -0,0 +1,69 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "lite/model_parser/base/block_desc.h" +#include "lite/model_parser/flatbuffers/framework_generated.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace fbs { + +class BlockDesc : public BlockDescAPI { + public: + explicit BlockDesc(proto::BlockDesc* desc) : desc_(desc) { CHECK(desc_); } + + int32_t Idx() const override { return desc_->idx(); } + + int32_t ParentIdx() const override { return desc_->parent_idx(); } + + size_t VarsSize() const override { return desc_->vars()->size(); } + + template + T* GetVar(int32_t idx); + + template + T const* GetVar(int32_t idx) const { + return GetVar(idx); + } + + size_t OpsSize() const override { + CHECK(desc_); + CHECK(desc_->ops()); + return desc_->ops()->size(); + } + + template + T* GetOp(int32_t idx); + + template + T const* GetOp(int32_t idx) const { + return GetOp(idx); + } + + int32_t ForwardBlockIdx() const override { + return desc_->forward_block_idx(); + } + + BlockDesc() = delete; + + private: + proto::BlockDesc* desc_; // not_own +}; + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/framework.fbs b/lite/model_parser/flatbuffers/framework.fbs new file mode 100644 index 0000000000000000000000000000000000000000..90f6e626088003975f18303e47230a85c303181d --- /dev/null +++ b/lite/model_parser/flatbuffers/framework.fbs @@ -0,0 +1,172 @@ +// Generated from framework.proto + +namespace paddle.lite.fbs.proto; + +enum AttrType : int { + INT = 0, + FLOAT = 1, + STRING = 2, + INTS = 3, + FLOATS = 4, + STRINGS = 5, + BOOLEAN = 6, + BOOLEANS = 7, + BLOCK = 8, + LONG = 9, + BLOCKS = 10, + LONGS = 11, +} + +namespace paddle.lite.fbs.proto.VarType_; + +enum Type : int { + BOOL = 0, + INT16 = 1, + INT32 = 2, + INT64 = 3, + FP16 = 4, + FP32 = 5, + FP64 = 6, + LOD_TENSOR = 7, + SELECTED_ROWS = 8, + FEED_MINIBATCH = 9, + FETCH_LIST = 10, + STEP_SCOPES = 11, + LOD_RANK_TABLE = 12, + LOD_TENSOR_ARRAY = 13, + PLACE_LIST = 14, + READER = 15, + RAW = 17, + TUPLE = 18, + SIZE_T = 19, + UINT8 = 20, + INT8 = 21, +} + +namespace paddle.lite.fbs.proto.CompatibleInfo_; + +enum Type : int { + COMPATIBLE = 0, + DEFINITELY_NOT = 1, + POSSIBLE = 2, + BUG_FIX = 3, + PRECISION_CHANGE = 4, +} + +namespace paddle.lite.fbs.proto; + +table Version { + version:long; +} + +table OpDesc { + type:string (required); + inputs:[paddle.lite.fbs.proto.OpDesc_.Var]; + outputs:[paddle.lite.fbs.proto.OpDesc_.Var]; + attrs:[paddle.lite.fbs.proto.OpDesc_.Attr]; + is_target:bool; +} + +namespace paddle.lite.fbs.proto.OpDesc_; + +table Attr { + name:string (required, key); + type:paddle.lite.fbs.proto.AttrType; + i:int; + f:float; + s:string; + ints:[int]; + floats:[float]; + strings:[string]; + b:bool; + bools:[bool]; + block_idx:int; + l:long; + blocks_idx:[int]; + longs:[long]; +} + +table Var { + parameter:string (required, key); + arguments:[string]; +} + +namespace paddle.lite.fbs.proto; + +table VarType { + type:paddle.lite.fbs.proto.VarType_.Type; + selected_rows:paddle.lite.fbs.proto.VarType_.TensorDesc; + lod_tensor:paddle.lite.fbs.proto.VarType_.LoDTensorDesc; + tensor_array:paddle.lite.fbs.proto.VarType_.LoDTensorArrayDesc; + reader:paddle.lite.fbs.proto.VarType_.ReaderDesc; + tuple:paddle.lite.fbs.proto.VarType_.Tuple; +} + +namespace paddle.lite.fbs.proto.VarType_; + +table TensorDesc { + data_type:paddle.lite.fbs.proto.VarType_.Type; + dims:[long]; +} + +table LoDTensorDesc { + tensor:paddle.lite.fbs.proto.VarType_.TensorDesc (required); + lod_level:int; +} + +table 
LoDTensorArrayDesc { + tensor:paddle.lite.fbs.proto.VarType_.TensorDesc (required); + lod_level:int; +} + +table ReaderDesc { + lod_tensor:[paddle.lite.fbs.proto.VarType_.LoDTensorDesc]; +} + +table Tuple { + element_type:[paddle.lite.fbs.proto.VarType_.Type]; +} + +namespace paddle.lite.fbs.proto; + +table VarDesc { + name:string (required, key); + type:paddle.lite.fbs.proto.VarType (required); + persistable:bool; + need_check_feed:bool; +} + +table BlockDesc { + idx:int; + parent_idx:int; + vars:[paddle.lite.fbs.proto.VarDesc]; + ops:[paddle.lite.fbs.proto.OpDesc]; + forward_block_idx:int = -1; +} + +table CompatibleInfo { + version:string (required); + type:paddle.lite.fbs.proto.CompatibleInfo_.Type; +} + +table OpCompatibleMap { + pair:[paddle.lite.fbs.proto.OpCompatibleMap_.OpCompatiblePair]; + default_required_version:string; +} + +namespace paddle.lite.fbs.proto.OpCompatibleMap_; + +table OpCompatiblePair { + op_name:string (required, key); + compatible_info:paddle.lite.fbs.proto.CompatibleInfo (required); +} + +namespace paddle.lite.fbs.proto; + +table ProgramDesc { + blocks:[paddle.lite.fbs.proto.BlockDesc]; + version:paddle.lite.fbs.proto.Version; + op_compatible_map:paddle.lite.fbs.proto.OpCompatibleMap; +} + +root_type paddle.lite.fbs.proto.ProgramDesc; diff --git a/lite/model_parser/flatbuffers/op_desc.cc b/lite/model_parser/flatbuffers/op_desc.cc new file mode 100644 index 0000000000000000000000000000000000000000..9e416b020d8fed0861d1d0b02ae74a9ccc47df59 --- /dev/null +++ b/lite/model_parser/flatbuffers/op_desc.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/model_parser/flatbuffers/op_desc.h" + +namespace paddle { +namespace lite { +namespace fbs { + +template <> +std::string OpDesc::GetAttr(const std::string& name) const { + const auto& it = desc_->attrs()->LookupByKey(name.c_str()); + if (!it->s()) { + return std::string(); + } + return it->s()->str(); +} + +template <> +std::string OpDesc::GetAttr(size_t idx) const { + const auto& it = desc_->attrs()->Get(idx); + if (!it->s()) { + return std::string(); + } + return it->s()->str(); +} + +template <> +lite::VectorView +OpDesc::GetAttr>(const std::string& name) const { + const auto& it = desc_->attrs()->LookupByKey(name.c_str()); + CHECK(it) << "Attr " << name << "does not exist."; + return VectorView(it->strings()); +} + +template <> +VectorView OpDesc::GetAttr>( + size_t idx) const { + const auto& it = desc_->attrs()->Get(idx); + CHECK(it) << "Attr " << idx << "does not exist."; + return VectorView(it->strings()); +} + +#define GET_ATTR_IMPL(T, fb_f__) \ + template <> \ + typename lite::OpDataTypeTrait::RT OpDesc::GetAttr( \ + const std::string& name) const { \ + const auto& it = desc_->attrs()->LookupByKey(name.c_str()); \ + return it->fb_f__(); \ + } \ + template <> \ + typename lite::OpDataTypeTrait::RT OpDesc::GetAttr( \ + size_t idx) const { \ + const auto& it = desc_->attrs()->Get(idx); \ + return it->fb_f__(); \ + } + +#define GET_ATTRS_IMPL(T, fb_f__) \ + template <> \ + typename lite::OpDataTypeTrait::RT OpDesc::GetAttr( \ + const std::string& name) const { \ + const auto& it = desc_->attrs()->LookupByKey(name.c_str()); \ + return typename lite::OpDataTypeTrait::RT(it->fb_f__()); \ + } \ + template <> \ + typename lite::OpDataTypeTrait::RT OpDesc::GetAttr( \ + size_t idx) const { \ + const auto& it = desc_->attrs()->Get(idx); \ + return typename lite::OpDataTypeTrait::RT(it->fb_f__()); \ + } + +GET_ATTR_IMPL(int32_t, i); +GET_ATTR_IMPL(int16_t, block_idx); +GET_ATTR_IMPL(float, f); +GET_ATTR_IMPL(bool, b); +GET_ATTR_IMPL(int64_t, l); +GET_ATTRS_IMPL(std::vector, ints); +GET_ATTRS_IMPL(std::vector, floats); +GET_ATTRS_IMPL(std::vector, longs); + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/op_desc.h b/lite/model_parser/flatbuffers/op_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..b2d78ca68af3d2f0595e710d9c0f75d8cceefbb3 --- /dev/null +++ b/lite/model_parser/flatbuffers/op_desc.h @@ -0,0 +1,200 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include + +#include "lite/model_parser/base/op_desc.h" +#include "lite/model_parser/flatbuffers/framework_generated.h" +#include "lite/model_parser/flatbuffers/vector_view.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace fbs { + +class OpDesc : public OpDescAPI { + public: + explicit OpDesc(proto::OpDesc* desc) : desc_(desc) { CHECK(desc_); } + + std::string Type() const override { return desc_->type()->str(); } + + // Get the arguments of parameter called `param` + std::vector Input(const std::string& param) const override { + const auto& var = desc_->inputs()->LookupByKey(param.c_str()); + std::vector args_vec; + if (var->arguments()) { + args_vec.reserve(var->arguments()->size()); + for (const auto& in : *var->arguments()) { + args_vec.push_back(in->str()); + } + } + return args_vec; + } + + std::vector InputArgumentNames() const override { + const auto& vars = desc_->inputs(); + std::vector input_names_vec; + if (vars) { + input_names_vec.reserve(vars->size()); + for (const auto& in : *vars) { + input_names_vec.push_back(in->parameter()->str()); + } + } + return input_names_vec; + } + + std::vector Output(const std::string& param) const override { + const auto& var = desc_->outputs()->LookupByKey(param.c_str()); + std::vector args_vec; + if (var->arguments()) { + args_vec.reserve(var->arguments()->size()); + for (const auto& out : *var->arguments()) { + args_vec.push_back(out->str()); + } + } + return args_vec; + } + + std::vector OutputArgumentNames() const override { + const auto& vars = desc_->outputs(); + std::vector output_names_vec; + if (vars) { + output_names_vec.reserve(vars->size()); + for (const auto& out : *vars) { + output_names_vec.push_back(out->parameter()->str()); + } + } + return output_names_vec; + } + + bool HasAttr(const std::string& name) const override { + return desc_->attrs()->LookupByKey(name.c_str()) != nullptr; + } + + size_t AttrsSize() const { return desc_->attrs()->size(); } + + std::string AttrName(size_t idx) const { + return desc_->attrs()->Get(idx)->name()->str(); + } + + OpDescAPI::AttrType GetAttrType(const std::string& name) const override { + const auto& attr = desc_->attrs()->LookupByKey(name.c_str()); + CHECK(attr); + return static_cast(attr->type()); + } + + OpDescAPI::AttrType GetAttrType(size_t idx) const { + const auto& attr = desc_->attrs()->Get(idx); + CHECK(attr); + return static_cast(attr->type()); + } + + std::vector AttrNames() const override { + const auto& attrs = desc_->attrs(); + std::vector attr_names_vec; + if (attrs) { + attr_names_vec.reserve(attrs->size()); + for (const auto& attr : *attrs) { + attr_names_vec.push_back(attr->name()->str()); + } + } + return attr_names_vec; + } + + template + typename lite::OpDataTypeTrait::RT GetAttr( + const std::string& name) const; + + template + typename lite::OpDataTypeTrait::RT GetAttr(size_t idx) const; + + OpDesc() = delete; + + private: + proto::OpDesc* desc_; + + // To reduce overhead, we expect to use namespace aliasing to make cpp::Desc + // and flatbuffers::Desc replace each other. However, there is no direct + // inheritance relationship between the two data types, and the read-only + // version of flatbuffers lacks some write implementations. Therefore, at + // present, we are temporarily providing a default interface that triggers + // execution-time errors to avoid type ambiguity and compile-time errors + // caused by different building options. 
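As a concrete illustration of that trade-off, the sketch below shows how the read-only fbs::OpDesc is expected to be consumed. The attribute name "axis" and the function name are hypothetical; the proto::OpDesc pointer is assumed to point into a FlatBuffers model buffer.

#include <string>

#include "lite/model_parser/flatbuffers/op_desc.h"

void ReadOnlyOpDescSketch(paddle::lite::fbs::proto::OpDesc* raw) {
  paddle::lite::fbs::OpDesc op(raw);  // non-owning view over the buffer
  std::string type = op.Type();
  if (op.HasAttr("axis")) {  // "axis" is a made-up attribute name
    int32_t axis = op.GetAttr<int32_t>("axis");  // specialization from op_desc.cc
    (void)axis;
  }
  // Mutating accessors such as mutable_inputs() exist only so the class keeps
  // the same surface as general::OpDesc; calling them in read-only mode
  // terminates via LOG(FATAL) inside NotImplemented().
  (void)type;
}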
+ + public: + bool HasInput(const std::string& param) const { + return desc_->inputs()->LookupByKey(param.c_str()) != nullptr; + } + + const std::map>& inputs() const { + NotImplemented(); + return inputs_; + } + const std::map>& outputs() const { + NotImplemented(); + return outputs_; + } + std::map>* mutable_inputs() { + NotImplemented(); + return &inputs_; + } + std::map>* mutable_outputs() { + NotImplemented(); + return &outputs_; + } + + std::vector input_vars() const { + NotImplemented(); + return std::vector(); + } + + std::vector output_vars() const { + NotImplemented(); + return std::vector(); + } + + bool HasOutput(const std::string& param) const { + NotImplemented(); + return false; + } + + const std::map& attrs() const { + NotImplemented(); + return attrs_; + } + const std::map& attr_types() const { + NotImplemented(); + return attr_types_; + } + + private: + void NotImplemented() const { + LOG(FATAL) << "The additional interfaces of OpDesc is temporarily " + "unavailable in read-only mode."; + } + std::string type_; + std::map> inputs_; + std::map> outputs_; + std::map attrs_; + std::map attr_types_; +}; + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/program_desc.cc b/lite/model_parser/flatbuffers/program_desc.cc new file mode 100644 index 0000000000000000000000000000000000000000..36429103a72f7b54651aac8d30671f7b3c41956e --- /dev/null +++ b/lite/model_parser/flatbuffers/program_desc.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/model_parser/flatbuffers/program_desc.h" + +namespace paddle { +namespace lite { +namespace fbs { + +template <> +proto::BlockDesc* ProgramDesc::GetBlock(int32_t idx) { + CHECK_LT(idx, BlocksSize()) << "idx >= blocks.size()"; + return const_cast(desc_->blocks()->Get(idx)); +} + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/program_desc.h b/lite/model_parser/flatbuffers/program_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..f41fd996b2533321c2494ea6c15d53ed31a3e7c8 --- /dev/null +++ b/lite/model_parser/flatbuffers/program_desc.h @@ -0,0 +1,54 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include "lite/model_parser/base/program_desc.h" +#include "lite/model_parser/flatbuffers/framework_generated.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace fbs { + +class ProgramDesc : public ProgramDescAPI { + public: + ProgramDesc() = default; + explicit ProgramDesc(proto::ProgramDesc *desc) : desc_(desc) { CHECK(desc); } + + size_t BlocksSize() const override { return desc_->blocks()->size(); } + + template + T *GetBlock(int32_t idx); + + template + T const *GetBlock(int32_t idx) const { + return GetBlock(idx); + } + + bool HasVersion() const override { return desc_->version() != nullptr; } + + int64_t Version() const override { + CHECK(HasVersion()); + return desc_->version()->version(); + } + + private: + proto::ProgramDesc *desc_; // not_own +}; + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/var_desc.cc b/lite/model_parser/flatbuffers/var_desc.cc new file mode 100644 index 0000000000000000000000000000000000000000..a629ffd5e35223aee218a8798a597b8c684c8c62 --- /dev/null +++ b/lite/model_parser/flatbuffers/var_desc.cc @@ -0,0 +1,15 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/model_parser/flatbuffers/var_desc.h" diff --git a/lite/model_parser/flatbuffers/var_desc.h b/lite/model_parser/flatbuffers/var_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..387e52ec3150e5bc01f365934c310fb1990ce1e4 --- /dev/null +++ b/lite/model_parser/flatbuffers/var_desc.h @@ -0,0 +1,83 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include "lite/model_parser/base/var_desc.h" +#include "lite/model_parser/flatbuffers/framework_generated.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace fbs { + +class VarDesc : public VarDescAPI { + public: + explicit VarDesc(proto::VarDesc* desc) : desc_(desc) {} + + std::string Name() const override { return desc_->name()->str(); } + + VarDescAPI::Type GetType() const override { + return static_cast(desc_->type()->type()); + } + + bool Persistable() const override { return desc_->persistable(); } + + std::vector GetShape() const override { + CHECK(GetType() == VarDescAPI::Type::LOD_TENSOR); + const auto& dims = desc_->type()->lod_tensor()->tensor()->dims(); + std::vector dims_vec; + dims_vec.reserve(dims->size()); + for (const auto& dim : *dims) { + dims_vec.push_back(dim); + } + return dims_vec; + } + + VarDesc() = delete; + + private: + proto::VarDesc* desc_; + + // To reduce overhead, we expect to use namespace aliasing to make cpp::Desc + // and flatbuffers::Desc replace each other. However, there is no direct + // inheritance relationship between the two data types, and the read-only + // version of flatbuffers lacks some write implementations. Therefore, at + // present, we are temporarily providing a default interface that triggers + // execution-time errors to avoid type ambiguity and compile-time errors + // caused by different building options. + + public: + VarDescAPI::Type GetDataType() const { + NotImplemented(); + return data_type_; + } + void SetDataType(Type data_type) { NotImplemented(); } + void SetShape(const std::vector& dims) { NotImplemented(); } + + private: + void NotImplemented() const { + LOG(FATAL) << "The additional interfaces of VarDesc is temporarily " + "unavailable in read-only mode."; + } + Type data_type_; + std::vector shape_; +}; + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/vector_view.h b/lite/model_parser/flatbuffers/vector_view.h new file mode 100644 index 0000000000000000000000000000000000000000..ccb700072690c3ecfe55549a1f39d3d574686c7d --- /dev/null +++ b/lite/model_parser/flatbuffers/vector_view.h @@ -0,0 +1,131 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include "flatbuffers/flatbuffers.h" +#include "lite/model_parser/base/vector_view.h" + +namespace paddle { +namespace lite { +namespace vector_view { + +template +struct ElementTraits::value>::type> { + typedef flatbuffers::Offset element_type; +}; + +template <> +struct ElementTraits { + typedef flatbuffers::Offset element_type; +}; + +template +struct VectorTraits { + typedef flatbuffers::Vector::element_type> + vector_type; + typedef typename vector_type::const_iterator const_iterator; + typedef typename const_iterator::value_type value_type; + typedef const typename const_iterator::reference const_reference; + typedef value_type subscript_return_type; +}; + +struct FBSStrIterator { + typedef flatbuffers::VectorIterator< + flatbuffers::Offset, + typename flatbuffers::IndirectHelper< + flatbuffers::Offset>::return_type> + VI; + + explicit FBSStrIterator(const VI& iter) { iter_ = iter; } + const VI& raw_iter() const { return iter_; } + + bool operator==(const FBSStrIterator& other) const { + return iter_ == other.raw_iter(); + } + + bool operator<(const FBSStrIterator& other) const { + return iter_ < other.raw_iter(); + } + + bool operator!=(const FBSStrIterator& other) const { + return iter_ != other.raw_iter(); + } + + ptrdiff_t operator-(const FBSStrIterator& other) const { + return iter_ - other.raw_iter(); + } + + std::string operator*() const { return iter_.operator*()->str(); } + std::string operator->() const { return iter_.operator->()->str(); } + + FBSStrIterator& operator++() { + iter_++; + return *this; + } + + FBSStrIterator& operator--() { + iter_--; + return *this; + } + + FBSStrIterator operator+(const size_t& offset) { + return FBSStrIterator(iter_ + offset); + } + + FBSStrIterator operator-(const size_t& offset) { + return FBSStrIterator(iter_ - offset); + } + + private: + VI iter_; +}; + +} // namespace vector_view + +template <> +class VectorView { + public: + typedef vector_view::VectorTraits Traits; + explicit VectorView(typename Traits::vector_type const* cvec) { + cvec_ = cvec; + } + std::string operator[](size_t i) const { return cvec_->operator[](i)->str(); } + vector_view::FBSStrIterator begin() const { + return vector_view::FBSStrIterator(cvec_->begin()); + } + vector_view::FBSStrIterator end() const { + return vector_view::FBSStrIterator(cvec_->end()); + } + size_t size() const { return cvec_->size(); } + operator std::vector() const { + VLOG(5) << "Copying elements out of VectorView will damage performance."; + std::vector tmp; + tmp.reserve(cvec_->size()); + for (auto val : *cvec_) { + tmp.push_back(val->str()); + } + return tmp; + } + ~VectorView() = default; + + private: + typename Traits::vector_type const* cvec_; +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/vector_view_test.cc b/lite/model_parser/flatbuffers/vector_view_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6512ee69bd4f34c0d6e49274d478404191fd9476 --- /dev/null +++ b/lite/model_parser/flatbuffers/vector_view_test.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/model_parser/flatbuffers/vector_view.h" +#include +#include +#include +#include +#include "lite/model_parser/flatbuffers/framework_generated.h" + +namespace paddle { +namespace lite { + +TEST(VectorView, std_vector) { + std::vector vector{1, 2, 3}; + VectorView vector_view(&vector); + size_t i = 0; + for (const auto& value : vector_view) { + EXPECT_EQ(value, vector[i]); + ++i; + } + for (size_t j = 0; j < vector_view.size(); ++j) { + EXPECT_EQ(vector_view[i], vector[i]); + } +} + +TEST(VectorView, Flatbuffers) { + using namespace flatbuffers; // NOLINT + using namespace paddle::lite::fbs; // NOLINT + + auto create_desc = [](FlatBufferBuilder& fbb) { + /* --------- Set --------- */ + // Attr + std::vector ints({-1, 0, 1, 2, 3}); + auto string_0 = fbb.CreateString("string_0"); + auto string_1 = fbb.CreateString("string_1"); + std::vector> strings; + strings.push_back(string_0); + strings.push_back(string_1); + auto attr = proto::OpDesc_::CreateAttrDirect(fbb, + nullptr, + proto::AttrType_INT, + 0, + 0.0f, + nullptr, + &ints, + nullptr, + &strings); + + // OpDesc + std::vector> attrs; + attrs.push_back(attr); + auto op_desc = + proto::CreateOpDescDirect(fbb, "hello!", nullptr, nullptr, &attrs); + + // BlockDesc 0 + std::vector> ops; + ops.push_back(op_desc); + auto block_0 = proto::CreateBlockDescDirect(fbb, 0, 0, nullptr, &ops); + + // BlockDesc 1 + auto block_1 = proto::CreateBlockDescDirect(fbb, 1); + + // ProgramDesc + std::vector> block_vector; + block_vector.push_back(block_0); + block_vector.push_back(block_1); + auto orc = proto::CreateProgramDescDirect(fbb, &block_vector); + fbb.Finish(orc); + }; + + FlatBufferBuilder fbb; + create_desc(fbb); + auto program = fbs::proto::GetProgramDesc(fbb.GetBufferPointer()); + + // BlockDesc View + VectorView block_view(program->blocks()); + EXPECT_EQ(block_view.size(), static_cast(2)); + EXPECT_EQ(block_view[0]->idx(), 0); + EXPECT_EQ(block_view[1]->idx(), 1); + + // OpDesc & Attr View + VectorView op_view(block_view[0]->ops()); + EXPECT_EQ(op_view[0]->type()->str(), std::string("hello!")); + VectorView attr_view(op_view[0]->attrs()); + + // int32_t View + VectorView ints_view(attr_view[0]->ints()); + std::vector ints({-1, 0, 1, 2, 3}); + size_t cnt_0 = 0; + for (const auto& i : ints_view) { + EXPECT_EQ(i, ints[cnt_0]); + ++cnt_0; + } + for (size_t i = 0; i < ints_view.size(); ++i) { + EXPECT_EQ(ints_view[i], ints[i]); + } + std::vector ints_2(ints_view); + for (size_t i = 0; i < ints_2.size(); ++i) { + EXPECT_EQ(ints_2[i], ints[i]); + } + + // String View + VectorView strings_view(attr_view[0]->strings()); + std::vector strings({"string_0", "string_1"}); + EXPECT_EQ(strings_view.size(), strings.size()); + size_t cnt_1 = 0; + for (const auto& s : strings_view) { + EXPECT_EQ(s, strings[cnt_1]); + ++cnt_1; + } + for (size_t i = 0; i < strings_view.size(); ++i) { + EXPECT_EQ(strings_view[i], strings[i]); + } + std::vector string_2(strings_view); + for (size_t i = 0; i < string_2.size(); ++i) { + EXPECT_EQ(string_2[i], strings[i]); + } +} + +} // namespace lite +} // namespace paddle diff --git 
a/lite/model_parser/cpp/CMakeLists.txt b/lite/model_parser/general/CMakeLists.txt similarity index 100% rename from lite/model_parser/cpp/CMakeLists.txt rename to lite/model_parser/general/CMakeLists.txt diff --git a/lite/model_parser/cpp/block_desc.cc b/lite/model_parser/general/block_desc.cc similarity index 92% rename from lite/model_parser/cpp/block_desc.cc rename to lite/model_parser/general/block_desc.cc index a4dc7cd72acacb6392cecdfe9a551773c1937888..0766333d66c1299b738098a33a1a2c6433782337 100644 --- a/lite/model_parser/cpp/block_desc.cc +++ b/lite/model_parser/general/block_desc.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/model_parser/cpp/block_desc.h" +#include "lite/model_parser/general/block_desc.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { template <> VarDesc* BlockDesc::GetVar(int32_t idx) { @@ -42,6 +42,6 @@ OpDesc* BlockDesc::AddOp() { return &ops_.back(); } -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/block_desc.h b/lite/model_parser/general/block_desc.h similarity index 88% rename from lite/model_parser/cpp/block_desc.h rename to lite/model_parser/general/block_desc.h index a6cd714e60a66398bffb5ed05a3d7d7eb1da9ac2..3b1b1ff4e6616c936bd3b09bff563656f6bdbc6a 100644 --- a/lite/model_parser/cpp/block_desc.h +++ b/lite/model_parser/general/block_desc.h @@ -14,16 +14,17 @@ #pragma once #include -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/var_desc.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" +#include "lite/model_parser/general/op_desc.h" +#include "lite/model_parser/general/var_desc.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { /* - * The cpp::BlockDesc is the internal representation for Op. All the internal + * The general::BlockDesc is the internal representation for Op. All the + * internal * imprementation should use it, not the pb::BlockDesc. */ class BlockDesc : public BlockDescAPI { @@ -82,6 +83,6 @@ class BlockDesc : public BlockDescAPI { int32_t forward_block_idx_; }; -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/op_desc.cc b/lite/model_parser/general/op_desc.cc similarity index 95% rename from lite/model_parser/cpp/op_desc.cc rename to lite/model_parser/general/op_desc.cc index a816943bb9689483f1eb60575147a42594db2654..b4589a14f26b641a0e48c69ec067cd847649b67e 100644 --- a/lite/model_parser/cpp/op_desc.cc +++ b/lite/model_parser/general/op_desc.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/model_parser/cpp/op_desc.h" +#include "lite/model_parser/general/op_desc.h" #include #include namespace paddle { namespace lite { -namespace cpp { +namespace general { std::vector OpDesc::OutputArgumentNames() const { std::vector res; @@ -69,6 +69,6 @@ bool OpDesc::HasOutput(const std::string& param) const { return it != outputs_.end(); } -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/op_desc.h b/lite/model_parser/general/op_desc.h similarity index 96% rename from lite/model_parser/cpp/op_desc.h rename to lite/model_parser/general/op_desc.h index dfd60c0793af650ede4327bc37f5dccac2e9ee67..e0c2541182adde6ab9171a55d859a5bd5a1195e2 100644 --- a/lite/model_parser/cpp/op_desc.h +++ b/lite/model_parser/general/op_desc.h @@ -17,16 +17,16 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/utils/any.h" #include "lite/utils/varient.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { /* - * The cpp::OpDesc is the internal representation for Op. All the internal + * The general::OpDesc is the internal representation for Op. All the internal * imprementation should use it, not the pb::OpDesc. */ class OpDesc : public OpDescAPI { @@ -131,6 +131,6 @@ class OpDesc : public OpDescAPI { } }; -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/program_desc.cc b/lite/model_parser/general/program_desc.cc similarity index 91% rename from lite/model_parser/cpp/program_desc.cc rename to lite/model_parser/general/program_desc.cc index 3c6adcddf319db57366e5b3cdb05bc6169f229ee..670c7684312265d5a1f1eb2cbef54ed5fe62b2d2 100644 --- a/lite/model_parser/cpp/program_desc.cc +++ b/lite/model_parser/general/program_desc.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/model_parser/cpp/program_desc.h" +#include "lite/model_parser/general/program_desc.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { template <> BlockDesc* ProgramDesc::GetBlock(int32_t idx) { @@ -30,6 +30,6 @@ BlockDesc* ProgramDesc::AddBlock() { return &blocks_.back(); } -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/program_desc.h b/lite/model_parser/general/program_desc.h similarity index 87% rename from lite/model_parser/cpp/program_desc.h rename to lite/model_parser/general/program_desc.h index 63ac8e0d79c16ea6e64daa4a0b1922a3350037cc..0fbc0742fe149075d3ede2b688fd071727baafc9 100644 --- a/lite/model_parser/cpp/program_desc.h +++ b/lite/model_parser/general/program_desc.h @@ -14,15 +14,16 @@ #pragma once #include -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" +#include "lite/model_parser/general/block_desc.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { /* - * The cpp::ProgramDesc is the internal representation for Op. All the internal + * The general::ProgramDesc is the internal representation for Op. All the + * internal * imprementation should use it, not the pb::ProgramDesc. 
*/ class ProgramDesc : public ProgramDescAPI { @@ -59,6 +60,6 @@ class ProgramDesc : public ProgramDescAPI { std::vector blocks_; }; -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/var_desc.cc b/lite/model_parser/general/var_desc.cc similarity index 92% rename from lite/model_parser/cpp/var_desc.cc rename to lite/model_parser/general/var_desc.cc index e30bb3eb55d274d5287702d6247b94d5d33c4e74..f2782d1778b07ef201401a62f9c7a6295159ef5f 100644 --- a/lite/model_parser/cpp/var_desc.cc +++ b/lite/model_parser/general/var_desc.cc @@ -12,4 +12,4 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/model_parser/cpp/var_desc.h" +#include "lite/model_parser/general/var_desc.h" diff --git a/lite/model_parser/cpp/var_desc.h b/lite/model_parser/general/var_desc.h similarity index 91% rename from lite/model_parser/cpp/var_desc.h rename to lite/model_parser/general/var_desc.h index c56d7cce53180e0157913372f8b0da4c9cedd8c9..ed69d035dfbe837afa79a3f52bd2c0c925bd19ea 100644 --- a/lite/model_parser/cpp/var_desc.h +++ b/lite/model_parser/general/var_desc.h @@ -15,14 +15,14 @@ #pragma once #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { /* - * The cpp::VarDesc is the internal representation for Op. All the internal + * The general::VarDesc is the internal representation for Op. All the internal * imprementation should use it, not the pb::VarDesc. */ class VarDesc : public VarDescAPI { @@ -59,6 +59,6 @@ class VarDesc : public VarDescAPI { std::vector shape_; }; -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/model_parser.cc b/lite/model_parser/model_parser.cc index ea94ca52e8f123da5077f3b751ab03b857e8c390..640dd044174c831e4570c5e8cc81af02fa50f0c4 100644 --- a/lite/model_parser/model_parser.cc +++ b/lite/model_parser/model_parser.cc @@ -21,7 +21,7 @@ #include "lite/core/tensor.h" #include "lite/core/variable.h" #include "lite/core/version.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/combined_params_desc.h" #include "lite/model_parser/naive_buffer/param_desc.h" #include "lite/model_parser/naive_buffer/program_desc.h" diff --git a/lite/model_parser/naive_buffer/block_desc.h b/lite/model_parser/naive_buffer/block_desc.h index ea4a779fb17559d3487c07b60bd18020fc0e9cce..61c624d9593244a3e680b5541e32cd4aeee949d5 100644 --- a/lite/model_parser/naive_buffer/block_desc.h +++ b/lite/model_parser/naive_buffer/block_desc.h @@ -17,7 +17,7 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/proto/framework.nb.h" namespace paddle { diff --git a/lite/model_parser/naive_buffer/combined_params_desc.h b/lite/model_parser/naive_buffer/combined_params_desc.h index a5462ef5eea47867a737cd1eff344c696f9dc159..1131bab9615b53055d58ba962ad21e206ee70bfc 100644 --- a/lite/model_parser/naive_buffer/combined_params_desc.h +++ b/lite/model_parser/naive_buffer/combined_params_desc.h @@ -17,7 +17,7 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/param_desc.h" #include "lite/model_parser/naive_buffer/proto/framework.nb.h" diff 
--git a/lite/model_parser/naive_buffer/op_desc.h b/lite/model_parser/naive_buffer/op_desc.h index cce0c22c2e717b6d622314f31af2dc418503c78b..f4cd2d8578cf69854fc4044b739fdfa3d6516d50 100644 --- a/lite/model_parser/naive_buffer/op_desc.h +++ b/lite/model_parser/naive_buffer/op_desc.h @@ -23,7 +23,7 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/proto/framework.nb.h" namespace paddle { diff --git a/lite/model_parser/naive_buffer/param_desc.h b/lite/model_parser/naive_buffer/param_desc.h index 0a20b153312d99602ada77317e64c5934df0f070..ebbbdaf846a3550015ec97c11ccfb7d34271b6c5 100644 --- a/lite/model_parser/naive_buffer/param_desc.h +++ b/lite/model_parser/naive_buffer/param_desc.h @@ -17,7 +17,7 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/proto/framework.nb.h" namespace paddle { diff --git a/lite/model_parser/naive_buffer/program_desc.h b/lite/model_parser/naive_buffer/program_desc.h index d3926b7c629c4bf56d104ca12c1fc70fbf3c0387..1552b6bcdd7ea7f8efd3954e2625712a7684a5f2 100644 --- a/lite/model_parser/naive_buffer/program_desc.h +++ b/lite/model_parser/naive_buffer/program_desc.h @@ -15,7 +15,7 @@ #pragma once #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/proto/framework.nb.h" namespace paddle { diff --git a/lite/model_parser/naive_buffer/var_desc.h b/lite/model_parser/naive_buffer/var_desc.h index bf0845d7464f511dfb77812612c2b99c954600da..20c8e03a5433ba98c8dc3d98af25920a934ee31d 100644 --- a/lite/model_parser/naive_buffer/var_desc.h +++ b/lite/model_parser/naive_buffer/var_desc.h @@ -17,7 +17,7 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/naive_buffer_wrapper_helper.h" #include "lite/model_parser/naive_buffer/proto/framework.nb.h" diff --git a/lite/model_parser/pb/block_desc.h b/lite/model_parser/pb/block_desc.h index 2a34a51f686caab7aed6a9fb64bb405cd64a2d71..8844173798dcacf77c876f717b71c87cbc57e5e6 100644 --- a/lite/model_parser/pb/block_desc.h +++ b/lite/model_parser/pb/block_desc.h @@ -16,7 +16,7 @@ #include #include "lite/core/framework.pb.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/utils/cp_logging.h" namespace paddle { diff --git a/lite/model_parser/pb/op_desc.h b/lite/model_parser/pb/op_desc.h index f21c194a271b46c84b3a363c6f7c0d9c1f7b1f32..6f186e778298a5ae59a63188640725b3ae5322c9 100644 --- a/lite/model_parser/pb/op_desc.h +++ b/lite/model_parser/pb/op_desc.h @@ -26,7 +26,7 @@ #include #include #include "lite/core/framework.pb.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/utils/all.h" namespace paddle { diff --git a/lite/model_parser/pb/program_desc.h b/lite/model_parser/pb/program_desc.h index 9ff4c28a6d9adce85950bb7e83f15004d766d2dc..950bf5480db501289250ece88b28d1c1369e56fc 100644 --- a/lite/model_parser/pb/program_desc.h +++ b/lite/model_parser/pb/program_desc.h @@ -16,7 +16,7 @@ #include #include "lite/core/framework.pb.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/utils/cp_logging.h" namespace paddle { diff --git a/lite/model_parser/pb/var_desc.h b/lite/model_parser/pb/var_desc.h index 
eefacef4b0c90faf132b2e4ef141ac7009939db5..d36881d5892ca8b4bef754554d164409fab4b858 100644 --- a/lite/model_parser/pb/var_desc.h +++ b/lite/model_parser/pb/var_desc.h @@ -18,7 +18,7 @@ #include #include #include "lite/core/framework.pb.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/utils/cp_logging.h" namespace paddle { diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt index 192cffccb19040a5ab77feae4d8b6a5a5fe4ba00..45b49f91ace12da5934471e01afd91c2832f1d6d 100644 --- a/lite/operators/CMakeLists.txt +++ b/lite/operators/CMakeLists.txt @@ -39,6 +39,7 @@ add_operator(unsqueeze_op_lite basic SRCS unsqueeze_op.cc DEPS ${op_DEPS}) add_operator(stack_op basic SRCS stack_op.cc DEPS ${op_DEPS}) add_operator(cast_op_lite basic SRCS cast_op.cc DEPS ${op_DEPS}) add_operator(affine_channel_op basic SRCS affine_channel_op.cc DEPS ${op_DEPS}) +add_operator(affine_grid_op basic SRCS affine_grid_op.cc DEPS ${op_DEPS}) add_operator(range_op basic SRCS range_op.cc DEPS ${op_DEPS}) add_operator(reduce_mean_op basic SRCS reduce_mean_op.cc DEPS ${op_DEPS}) add_operator(relu_op basic SRCS relu_op.cc DEPS ${op_DEPS}) @@ -76,6 +77,8 @@ add_operator(reduce_max_op_lite extra SRCS reduce_max_op.cc DEPS ${op_DEPS}) add_operator(shape_op_lite extra SRCS shape_op.cc DEPS ${op_DEPS}) add_operator(sequence_expand_op_lite extra SRCS sequence_expand_op.cc DEPS ${op_DEPS}) add_operator(sequence_unpad_op_lite extra SRCS sequence_unpad_op.cc DEPS ${op_DEPS}) +add_operator(sequence_pad_op_lite extra SRCS sequence_pad_op.cc DEPS ${op_DEPS}) +add_operator(sequence_mask_op_lite extra SRCS sequence_mask_op.cc DEPS ${op_DEPS}) add_operator(im2sequence_op extra SRCS im2sequence_op.cc DEPS ${op_DEPS}) add_operator(gather_op extra SRCS gather_op.cc DEPS ${op_DEPS}) add_operator(anchor_generator_op extra SRCS anchor_generator_op.cc DEPS ${op_DEPS}) @@ -110,6 +113,8 @@ add_operator(distribute_fpn_proposals_op_lite extra SRCS distribute_fpn_proposal add_operator(crf_decoding_op_lite extra SRCS crf_decoding_op.cc DEPS ${op_DEPS}) add_operator(ctc_align_op_lite extra SRCS ctc_align_op.cc DEPS ${op_DEPS}) add_operator(max_pool_with_index_op extra SRCS max_pool_with_index_op.cc DEPS ${op_DEPS}) +add_operator(pixel_shuffle_op extra SRCS pixel_shuffle_op.cc DEPS ${op_DEPS}) +add_operator(clip_op extra SRCS clip_op.cc DEPS ${op_DEPS}) # for OCR specific add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS}) @@ -137,12 +142,15 @@ add_operator(topk_op extra SRCS topk_op.cc DEPS ${op_DEPS}) add_operator(increment_op extra SRCS increment_op.cc DEPS ${op_DEPS}) add_operator(layer_norm_op extra SRCS layer_norm_op.cc DEPS ${op_DEPS}) add_operator(sequence_softmax_op extra SRCS sequence_softmax_op.cc DEPS ${op_DEPS}) +add_operator(retinanet_detection_output_op extra SRCS retinanet_detection_output_op.cc DEPS ${op_DEPS}) +add_operator(where_index_op extra SRCS where_index_op.cc DEPS ${op_DEPS}) # for content-dnn specific add_operator(search_aligned_mat_mul_op extra SRCS search_aligned_mat_mul_op.cc DEPS ${op_DEPS}) add_operator(search_seq_fc_op extra SRCS search_seq_fc_op.cc DEPS ${op_DEPS}) add_operator(sequence_topk_avg_pooling_op basic SRCS sequence_topk_avg_pooling_op.cc DEPS ${op_DEPS}) add_operator(search_fc_op basic SRCS search_fc_op.cc DEPS ${op_DEPS}) add_operator(lstm_op extra SRCS lstm_op.cc DEPS ${op_DEPS}) +add_operator(topk_pooling_op extra SRCS topk_pooling_op.cc DEPS ${op_DEPS}) # for deformable-convNet add_operator(deformable_conv_op extra SRCS 
deformable_conv_op.cc DEPS ${op_DEPS}) @@ -160,6 +168,9 @@ add_operator(__xpu__resnet50_op extra SRCS __xpu__resnet50_op.cc DEPS ${op_DEPS} add_operator(__xpu__multi_encoder_op extra SRCS __xpu__multi_encoder_op.cc DEPS ${op_DEPS}) add_operator(__xpu__embedding_with_eltwise_add_op extra SRCS __xpu__embedding_with_eltwise_add_op.cc DEPS ${op_DEPS}) add_operator(__xpu__fc_op extra SRCS __xpu__fc_op.cc DEPS ${op_DEPS}) +add_operator(__xpu__resnet_cbam_op extra SRCS __xpu__resnet_cbam_op.cc DEPS ${op_DEPS}) +add_operator(__xpu__search_attention_op extra SRCS __xpu__search_attention_op.cc DEPS ${op_DEPS}) +add_operator(__xpu__mmdnn_op extra SRCS __xpu__mmdnn_op.cc DEPS ${op_DEPS}) if (NOT LITE_WITH_X86) lite_cc_test(test_fc_op SRCS fc_op_test.cc diff --git a/lite/operators/__xpu__mmdnn_op.cc b/lite/operators/__xpu__mmdnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..35024da911ba0659c5005a1adc641fa3adc2f282 --- /dev/null +++ b/lite/operators/__xpu__mmdnn_op.cc @@ -0,0 +1,239 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/__xpu__mmdnn_op.h" +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool XPUMmdnnBidEmbGrnnAttOp::CheckShape() const { return true; } + +bool XPUMmdnnBidEmbGrnnAttOp::InferShapeImpl() const { + auto& id_dims = param_.id0->dims(); + auto& id_lod = param_.id0->lod()[0]; + auto& emb_tbl_dims = param_.emb_tbl->dims(); + auto& grnn_wh_dims = param_.grnn_rv_wh->dims(); + + param_.grnn_fw_pool_out->Resize( + {(int64_t)id_lod.size() - 1, grnn_wh_dims[2]}); + param_.grnn_rv_pool_out->Resize( + {(int64_t)id_lod.size() - 1, grnn_wh_dims[2]}); + param_.att_pool_out->Resize( + {(int64_t)id_lod.size() - 1, 2 * grnn_wh_dims[2]}); + param_.concat_3in1_out->Resize({id_dims[0], 3 * grnn_wh_dims[2]}); + param_.concat_3in1_out->set_lod({id_lod}); + param_.emb_fw_out->Resize({id_dims[0], emb_tbl_dims[1]}); + param_.emb_fw_out->set_lod({id_lod}); + return true; +} + +bool XPUMmdnnBidEmbGrnnAttOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.id0 = + scope->FindVar(op_desc.Input("id0").front())->GetMutable(); + param_.id1 = + scope->FindVar(op_desc.Input("id1").front())->GetMutable(); + param_.emb_tbl = scope->FindVar(op_desc.Input("emb_tbl").front()) + ->GetMutable(); + param_.grnn_fw_wh = scope->FindVar(op_desc.Input("grnn_fw_wh").front()) + ->GetMutable(); + param_.grnn_fw_wi = scope->FindVar(op_desc.Input("grnn_fw_wi").front()) + ->GetMutable(); + param_.grnn_rv_wh = scope->FindVar(op_desc.Input("grnn_rv_wh").front()) + ->GetMutable(); + param_.grnn_rv_wi = scope->FindVar(op_desc.Input("grnn_rv_wi").front()) + ->GetMutable(); + param_.att_fc_w = scope->FindVar(op_desc.Input("att_fc_w").front()) + ->GetMutable(); + param_.att_fc_b = scope->FindVar(op_desc.Input("att_fc_b").front()) + ->GetMutable(); + + param_.grnn_fw_pool_out = + 
scope->FindVar(op_desc.Output("grnn_fw_pool_out").front()) + ->GetMutable(); + param_.grnn_rv_pool_out = + scope->FindVar(op_desc.Output("grnn_rv_pool_out").front()) + ->GetMutable(); + param_.att_pool_out = scope->FindVar(op_desc.Output("att_pool_out").front()) + ->GetMutable(); + param_.concat_3in1_out = + scope->FindVar(op_desc.Output("concat_3in1_out").front()) + ->GetMutable(); + param_.emb_fw_out = scope->FindVar(op_desc.Output("emb_fw_out").front()) + ->GetMutable(); + + param_.grnn_fw_wh_maxs = + op_desc.GetAttr>("grnn_fw_wh_maxs"); + param_.grnn_fw_wi_maxs = + op_desc.GetAttr>("grnn_fw_wi_maxs"); + param_.grnn_rv_wh_maxs = + op_desc.GetAttr>("grnn_rv_wh_maxs"); + param_.grnn_rv_wi_maxs = + op_desc.GetAttr>("grnn_rv_wi_maxs"); + param_.att_fc_w_max = op_desc.GetAttr("att_fc_w_max"); + return true; +} + +bool XPUMmdnnBidEmbAttOp::CheckShape() const { return true; } + +bool XPUMmdnnBidEmbAttOp::InferShapeImpl() const { + auto& id_dims = param_.id0->dims(); + auto& id_lod = param_.id0->lod()[0]; + auto& emb_tbl_dims = param_.emb_tbl->dims(); + + param_.att_pool_out->Resize({(int64_t)id_lod.size() - 1, emb_tbl_dims[1]}); + param_.emb_fw_out->Resize({id_dims[0], emb_tbl_dims[1]}); + param_.emb_fw_out->set_lod({id_lod}); + return true; +} + +bool XPUMmdnnBidEmbAttOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.id0 = + scope->FindVar(op_desc.Input("id0").front())->GetMutable(); + param_.id1 = + scope->FindVar(op_desc.Input("id1").front())->GetMutable(); + param_.emb_tbl = scope->FindVar(op_desc.Input("emb_tbl").front()) + ->GetMutable(); + param_.att_fc_w = scope->FindVar(op_desc.Input("att_fc_w").front()) + ->GetMutable(); + param_.att_fc_b = scope->FindVar(op_desc.Input("att_fc_b").front()) + ->GetMutable(); + + param_.att_pool_out = scope->FindVar(op_desc.Output("att_pool_out").front()) + ->GetMutable(); + param_.emb_fw_out = scope->FindVar(op_desc.Output("emb_fw_out").front()) + ->GetMutable(); + + param_.att_fc_w_max = op_desc.GetAttr("att_fc_w_max"); + return true; +} + +bool XPUMmdnnMatchConvTopkOp::CheckShape() const { return true; } + +bool XPUMmdnnMatchConvTopkOp::InferShapeImpl() const { + int channel_num = param_.channel_num; + std::vector topks = param_.topks; + auto row_dim = param_.input_x->dims(); + auto num_k = topks.size(); + auto row_shape_0 = row_dim[0]; + std::vector vec_out_shape; + vec_out_shape.push_back(row_shape_0); + vec_out_shape.push_back(channel_num * num_k); + + param_.topk_out->Resize(lite::DDim(vec_out_shape)); + param_.topk_out->set_lod(param_.input_x->lod()); + return true; +} + +bool XPUMmdnnMatchConvTopkOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.input_x = scope->FindVar(op_desc.Input("input_x").front()) + ->GetMutable(); + param_.input_y = scope->FindVar(op_desc.Input("input_y").front()) + ->GetMutable(); + param_.input_w = scope->FindVar(op_desc.Input("input_w").front()) + ->GetMutable(); + param_.conv_w = scope->FindVar(op_desc.Input("conv_w").front()) + ->GetMutable(); + + param_.topk_out = scope->FindVar(op_desc.Output("topk_out").front()) + ->GetMutable(); + + param_.input_w_max = op_desc.GetAttr("input_w_max"); + param_.conv_w_max = op_desc.GetAttr("conv_w_max"); + param_.topks = op_desc.GetAttr>("topks"); + param_.channel_num = op_desc.GetAttr("channel_num"); + param_.dim_t = op_desc.GetAttr("dim_t"); + return true; +} + +bool XPUMmdnnMergeAllOp::CheckShape() const { return true; } + +bool XPUMmdnnMergeAllOp::InferShapeImpl() const { + int64_t dim0 = param_.concat_7in1_x[0]->dims()[0]; 
+ int64_t dim1 = param_.fc2_w->dims()[0]; + std::vector vec_out_shape; + vec_out_shape.push_back(dim0); + vec_out_shape.push_back(dim1); + + param_.out->Resize(lite::DDim(vec_out_shape)); + return true; +} + +bool XPUMmdnnMergeAllOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.concat_7in1_x.clear(); + for (auto& name : op_desc.Input("concat_7in1_x")) { + auto t = scope->FindVar(name)->GetMutable(); + param_.concat_7in1_x.push_back(t); + } + param_.concat_2in1_x.clear(); + for (auto& name : op_desc.Input("concat_2in1_x")) { + auto t = scope->FindVar(name)->GetMutable(); + param_.concat_2in1_x.push_back(t); + } + param_.grnn_fw_wh = scope->FindVar(op_desc.Input("grnn_fw_wh").front()) + ->GetMutable(); + param_.grnn_fw_wi = scope->FindVar(op_desc.Input("grnn_fw_wi").front()) + ->GetMutable(); + param_.grnn_rv_wh = scope->FindVar(op_desc.Input("grnn_rv_wh").front()) + ->GetMutable(); + param_.grnn_rv_wi = scope->FindVar(op_desc.Input("grnn_rv_wi").front()) + ->GetMutable(); + param_.fc0_w = scope->FindVar(op_desc.Input("fc0_w").front()) + ->GetMutable(); + param_.fc0_b = scope->FindVar(op_desc.Input("fc0_b").front()) + ->GetMutable(); + param_.fc1_w = scope->FindVar(op_desc.Input("fc1_w").front()) + ->GetMutable(); + param_.fc1_b = scope->FindVar(op_desc.Input("fc1_b").front()) + ->GetMutable(); + param_.fc2_w = scope->FindVar(op_desc.Input("fc2_w").front()) + ->GetMutable(); + param_.fc2_b = scope->FindVar(op_desc.Input("fc2_b").front()) + ->GetMutable(); + + param_.out = + scope->FindVar(op_desc.Output("out").front())->GetMutable(); + + param_.grnn_fw_wh_maxs = + op_desc.GetAttr>("grnn_fw_wh_maxs"); + param_.grnn_fw_wi_maxs = + op_desc.GetAttr>("grnn_fw_wi_maxs"); + param_.grnn_rv_wh_maxs = + op_desc.GetAttr>("grnn_rv_wh_maxs"); + param_.grnn_rv_wi_maxs = + op_desc.GetAttr>("grnn_rv_wi_maxs"); + param_.fc0_w_max = op_desc.GetAttr("fc0_w_max"); + param_.fc1_w_max = op_desc.GetAttr("fc1_w_max"); + param_.fc2_w_max = op_desc.GetAttr("fc2_w_max"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(__xpu__mmdnn_bid_emb_grnn_att, + paddle::lite::operators::XPUMmdnnBidEmbGrnnAttOp); +REGISTER_LITE_OP(__xpu__mmdnn_bid_emb_att, + paddle::lite::operators::XPUMmdnnBidEmbAttOp); +REGISTER_LITE_OP(__xpu__mmdnn_match_conv_topk, + paddle::lite::operators::XPUMmdnnMatchConvTopkOp); +REGISTER_LITE_OP(__xpu__mmdnn_merge_all, + paddle::lite::operators::XPUMmdnnMergeAllOp); diff --git a/lite/operators/__xpu__mmdnn_op.h b/lite/operators/__xpu__mmdnn_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7038898cad0823746f905e4e60c06885b57a737c --- /dev/null +++ b/lite/operators/__xpu__mmdnn_op.h @@ -0,0 +1,107 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
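For readers following the InferShapeImpl implementations above, the output shapes of these fused MMDNN ops are driven by the input's LoD: a batch contains lod[0].size() - 1 sequences, pooled outputs get one row per sequence, match_conv_topk emits channel_num values per requested k per input row, and merge_all takes its second dimension from the first dimension of the fc2 weight. A minimal standalone sketch of that arithmetic (plain C++, no Lite types; names here are illustrative only):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// One row per sequence: lod0 = {0, end_of_seq_0, end_of_seq_1, ...}
std::vector<int64_t> PooledOutShape(const std::vector<uint64_t>& lod0, int64_t emb_dim) {
  return {static_cast<int64_t>(lod0.size()) - 1, emb_dim};
}

// match_conv_topk: channel_num values per requested k, per input row.
std::vector<int64_t> TopkOutShape(int64_t rows, int channel_num, size_t num_k) {
  return {rows, static_cast<int64_t>(channel_num) * static_cast<int64_t>(num_k)};
}

// merge_all: second dimension comes from fc2_w->dims()[0] in the op above.
std::vector<int64_t> MergeAllOutShape(int64_t batch, int64_t fc2_w_dim0) {
  return {batch, fc2_w_dim0};
}

int main() {
  std::vector<uint64_t> lod0 = {0, 3, 7};  // two sequences of length 3 and 4
  auto pooled = PooledOutShape(lod0, 128);
  auto topk = TopkOutShape(/*rows=*/7, /*channel_num=*/5, /*num_k=*/3);
  auto merged = MergeAllOutShape(/*batch=*/2, /*fc2_w_dim0=*/1);
  std::cout << pooled[0] << "x" << pooled[1] << ", "   // 2x128
            << topk[0] << "x" << topk[1] << ", "       // 7x15
            << merged[0] << "x" << merged[1] << "\n";  // 2x1
  return 0;
}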
+ +#pragma once +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class XPUMmdnnBidEmbGrnnAttOp : public OpLite { + public: + XPUMmdnnBidEmbGrnnAttOp() {} + + explicit XPUMmdnnBidEmbGrnnAttOp(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "XPUMmdnnBidEmbGrnnAttOp"; } + + private: + mutable XPUMmdnnBidEmbGrnnAttParam param_; +}; + +class XPUMmdnnBidEmbAttOp : public OpLite { + public: + XPUMmdnnBidEmbAttOp() {} + + explicit XPUMmdnnBidEmbAttOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "XPUMmdnnBidEmbAttOp"; } + + private: + mutable XPUMmdnnBidEmbAttParam param_; +}; + +class XPUMmdnnMatchConvTopkOp : public OpLite { + public: + XPUMmdnnMatchConvTopkOp() {} + + explicit XPUMmdnnMatchConvTopkOp(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "XPUMmdnnMatchConvTopkOp"; } + + private: + mutable XPUMmdnnMatchConvTopkParam param_; +}; + +class XPUMmdnnMergeAllOp : public OpLite { + public: + XPUMmdnnMergeAllOp() {} + + explicit XPUMmdnnMergeAllOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "XPUMmdnnMergeAllOp"; } + + private: + mutable XPUMmdnnMergeAllParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/__xpu__resnet_cbam_op.cc b/lite/operators/__xpu__resnet_cbam_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6013f4fa90033c51df7a0d3bb670e02f8bf4628d --- /dev/null +++ b/lite/operators/__xpu__resnet_cbam_op.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/__xpu__resnet_cbam_op.h" +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool XPUResNetCbamOp::CheckShape() const { return true; } + +bool XPUResNetCbamOp::InferShapeImpl() const { + auto input_shape = param_.input->dims(); + std::vector output_shape_vec{1, 64}; + paddle::lite::DDim output_shape(output_shape_vec); + output_shape[0] = input_shape[0]; + param_.output->Resize(output_shape); + return true; +} + +bool XPUResNetCbamOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.input = const_cast( + &scope->FindVar(op_desc.Input("Input").front())->Get()); + param_.output = scope->FindVar(op_desc.Output("Output").front()) + ->GetMutable(); + + param_.filter.clear(); + for (auto& name : op_desc.Input("Filter")) { + auto t = + const_cast(&scope->FindVar(name)->Get()); + param_.filter.push_back(t); + } + param_.bias.clear(); + for (auto& name : op_desc.Input("Bias")) { + if (name.substr(0, 11) == "placeholder") { + param_.bias.push_back(nullptr); + } else { + auto t = + const_cast(&scope->FindVar(name)->Get()); + param_.bias.push_back(t); + } + } + param_.max_filter.clear(); + for (auto& name : op_desc.Input("MaxFilter")) { + auto t = + const_cast(&scope->FindVar(name)->Get()); + param_.max_filter.push_back(t); + } + + param_.pool_p = op_desc.GetAttr("pool_p"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(__xpu__resnet_cbam, paddle::lite::operators::XPUResNetCbamOp); diff --git a/lite/operators/__xpu__resnet_cbam_op.h b/lite/operators/__xpu__resnet_cbam_op.h new file mode 100644 index 0000000000000000000000000000000000000000..26e5bafeae31183e9054e7e77ea46813c95db707 --- /dev/null +++ b/lite/operators/__xpu__resnet_cbam_op.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class XPUResNetCbamOp : public OpLite { + public: + XPUResNetCbamOp() {} + explicit XPUResNetCbamOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "ResNetCbam"; } + + private: + mutable XPUResNetCbamParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/__xpu__search_attention_op.cc b/lite/operators/__xpu__search_attention_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..acd8c817b0d81ef03df1c05417b8bb2f56c00812 --- /dev/null +++ b/lite/operators/__xpu__search_attention_op.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/__xpu__search_attention_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool XPUMmdnnSearchAttentionOp::CheckShape() const { return true; } + +bool XPUMmdnnSearchAttentionOp::InferShapeImpl() const { + auto& x_dims = param_.X->dims(); + param_.Out->Resize(x_dims); + param_.Out->set_lod(param_.X->lod()); + return true; +} + +bool XPUMmdnnSearchAttentionOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + auto x = op_desc.Input("X").front(); + auto w = op_desc.Input("W").front(); + auto b = op_desc.Input("b").front(); + auto out = op_desc.Output("Out").front(); + + param_.X = scope->FindVar(x)->GetMutable(); + param_.W = scope->FindVar(w)->GetMutable(); + param_.b = scope->FindVar(b)->GetMutable(); + param_.Out = scope->FindVar(out)->GetMutable(); + + param_.W_max = op_desc.GetAttr("W_max"); + param_.pad_id = op_desc.GetAttr("pad_id"); + param_.alpha0 = op_desc.GetAttr("alpha0"); + param_.alpha1 = op_desc.GetAttr("alpha1"); + param_.mask = op_desc.GetAttr("mask"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(__xpu__mmdnn_search_attention, + paddle::lite::operators::XPUMmdnnSearchAttentionOp); diff --git a/lite/operators/__xpu__search_attention_op.h b/lite/operators/__xpu__search_attention_op.h new file mode 100644 index 0000000000000000000000000000000000000000..81bd366ee8a51dc8d2d7fb4c9cb03d2199bcb4f2 --- /dev/null +++ b/lite/operators/__xpu__search_attention_op.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
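All of these fused XPU ops follow the same binding pattern in AttachImpl: each argument tensor is looked up by variable name in the scope via Scope::FindVar(...)->GetMutable<lite::Tensor>(), and each scalar attribute is read with the typed cpp::OpDesc::GetAttr<T>(...) accessor. The stand-in types below exist only to keep the sketch compilable outside the Lite class hierarchy; the real ops use the Lite APIs named above.

#include <map>
#include <string>

// Stand-ins for lite::Tensor, lite::Scope and cpp::OpDesc, only so the sketch compiles.
struct Tensor {};

struct Scope {
  std::map<std::string, Tensor> vars;
  Tensor* FindMutableTensor(const std::string& name) { return &vars[name]; }
};

struct OpDesc {
  std::map<std::string, std::string> args;  // argument slot -> variable name
  std::map<std::string, float> fattrs;
  std::map<std::string, int> iattrs;
};

// Mirrors the binding done in XPUMmdnnSearchAttentionOp::AttachImpl above.
struct SearchAttentionParam {
  Tensor *X{nullptr}, *W{nullptr}, *b{nullptr}, *Out{nullptr};
  float W_max{0.0f};
  int pad_id{0};
  float alpha0{1.0f}, alpha1{1.0f}, mask{1.0f};
};

void Attach(const OpDesc& desc, Scope* scope, SearchAttentionParam* p) {
  p->X = scope->FindMutableTensor(desc.args.at("X"));
  p->W = scope->FindMutableTensor(desc.args.at("W"));
  p->b = scope->FindMutableTensor(desc.args.at("b"));
  p->Out = scope->FindMutableTensor(desc.args.at("Out"));
  p->W_max = desc.fattrs.at("W_max");    // GetAttr<float>("W_max") in the real op
  p->pad_id = desc.iattrs.at("pad_id");  // GetAttr<int>("pad_id") in the real op
  p->alpha0 = desc.fattrs.at("alpha0");
  p->alpha1 = desc.fattrs.at("alpha1");
  p->mask = desc.fattrs.at("mask");
}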
+ +#pragma once + +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class XPUMmdnnSearchAttentionOp : public OpLite { + public: + XPUMmdnnSearchAttentionOp() {} + + explicit XPUMmdnnSearchAttentionOp(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { + return "XPUMmdnnSearchAttentionOp"; + } + + private: + mutable XPUMmdnnSearchAttentionParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/activation_grad_ops.cc b/lite/operators/activation_grad_ops.cc index b31163e5dce6d9b77d923ba44ed58952263610a5..a30231be921e2c4445bb4c7a72c9572b14c1c0f5 100644 --- a/lite/operators/activation_grad_ops.cc +++ b/lite/operators/activation_grad_ops.cc @@ -41,15 +41,11 @@ bool ActivationGradOp::AttachImpl(const cpp::OpDesc& opdesc, if (opdesc.HasInput("X")) { auto X_name = opdesc.Input("X").front(); param_.X = GetVar(scope, X_name); - } else { - param_.X = param_.X_grad; } if (opdesc.HasInput("Out")) { auto Out_name = opdesc.Input("Out").front(); param_.Out = GetVar(scope, Out_name); - } else { - param_.Out = param_.Out_grad; } return true; @@ -60,3 +56,5 @@ bool ActivationGradOp::AttachImpl(const cpp::OpDesc& opdesc, } // namespace paddle REGISTER_LITE_OP(square_grad, paddle::lite::operators::ActivationGradOp); +REGISTER_LITE_OP(relu_grad, paddle::lite::operators::ActivationGradOp); +REGISTER_LITE_OP(tanh_grad, paddle::lite::operators::ActivationGradOp); diff --git a/lite/operators/affine_grid_op.cc b/lite/operators/affine_grid_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..22c6b531ae2f4db4136c842720edf56e41900157 --- /dev/null +++ b/lite/operators/affine_grid_op.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/affine_grid_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool AffineGridOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Out); + + const auto x_dims = param_.X->dims(); + + CHECK_OR_FALSE(x_dims.size() == 3); + CHECK_OR_FALSE(x_dims[1] == 2 && x_dims[2] == 3); + + if (param_.output_shape.size() != 0) { + CHECK_OR_FALSE(param_.output_shape.size() == 4); + } + return true; +} + +bool AffineGridOpLite::InferShapeImpl() const { + int N = param_.X->dims()[0]; + int H, W; + if (param_.output_shape.size() == 0) { + const auto out_shape = param_.OutputShape->dims(); + H = out_shape[2]; + W = out_shape[3]; + + } else { + H = param_.output_shape[2]; + W = param_.output_shape[3]; + } + param_.Out->Resize(std::vector({N, H, W, 2})); + + return true; +} + +bool AffineGridOpLite::AttachImpl(const cpp::OpDesc &op_desc, + lite::Scope *scope) { + auto x = op_desc.Input("Theta").front(); + auto output = op_desc.Output("Output").front(); + + param_.X = scope->FindVar(x)->GetMutable(); + param_.output_shape = op_desc.GetAttr>("output_shape"); + + param_.Out = scope->FindVar(output)->GetMutable(); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(affine_grid, paddle::lite::operators::AffineGridOpLite); diff --git a/lite/operators/affine_grid_op.h b/lite/operators/affine_grid_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a94eb3d122b74b4e42d8714f284e478e6fb053f6 --- /dev/null +++ b/lite/operators/affine_grid_op.h @@ -0,0 +1,48 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class AffineGridOpLite : public OpLite { + public: + AffineGridOpLite() {} + + explicit AffineGridOpLite(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "affine_grid"; } + + private: + mutable AffineGridParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/assign_value_op.cc b/lite/operators/assign_value_op.cc index ff5b55735f7b58aa2eaa2274574336dadd8061e6..f6f8cb7e3c8958693dd7234b7a21b29b769aa96c 100644 --- a/lite/operators/assign_value_op.cc +++ b/lite/operators/assign_value_op.cc @@ -26,12 +26,15 @@ bool AssignValueOpLite::CheckShape() const { auto shape = param_.shape; auto int32_values = param_.int32_values; auto fp32_values = param_.fp32_values; + auto int64_values = param_.int64_values; + auto bool_values = param_.bool_values; size_t shape_num = 1; - for (int i = 0; i < shape.size(); i++) { + for (size_t i = 0; i < shape.size(); i++) { shape_num *= shape[i]; } - CHECK_OR_FALSE(shape_num == int32_values.size() || - shape_num == fp32_values.size()); + CHECK_OR_FALSE( + shape_num == int32_values.size() || shape_num == fp32_values.size() || + shape_num == int64_values.size() || shape_num == bool_values.size()); return true; } @@ -47,9 +50,18 @@ bool AssignValueOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { param_.shape = op_desc.GetAttr>("shape"); param_.dtype = op_desc.GetAttr("dtype"); - param_.fp32_values = op_desc.GetAttr>("fp32_values"); - param_.int32_values = op_desc.GetAttr>("int32_values"); - + if (op_desc.HasAttr("fp32_values")) { + param_.fp32_values = op_desc.GetAttr>("fp32_values"); + } + if (op_desc.HasAttr("int32_values")) { + param_.int32_values = op_desc.GetAttr>("int32_values"); + } + if (op_desc.HasAttr("int64_values")) { + param_.int64_values = op_desc.GetAttr>("int64_values"); + } + if (op_desc.HasAttr("bool_values")) { + param_.bool_values = op_desc.GetAttr>("bool_values"); + } auto out = op_desc.Output("Out").front(); param_.Out = scope->FindVar(out)->GetMutable(); return true; diff --git a/lite/operators/clip_op.cc b/lite/operators/clip_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ad8eef45f3b38cd176d1bd3d2d0b42620faf602c --- /dev/null +++ b/lite/operators/clip_op.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
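With the assign_value_op.cc change above, the op accepts whichever value list the model provides (fp32, int32, int64 or bool); CheckShape only requires that the element count implied by the shape attribute matches the length of one of the populated lists. A toy standalone sketch of that consistency check (dtype dispatch is omitted):

#include <cassert>
#include <cstddef>
#include <vector>

// True when the shape's element count matches one of the value lists,
// mirroring AssignValueOpLite::CheckShape after the int64/bool additions.
bool ShapesConsistent(const std::vector<int>& shape,
                      size_t n_fp32, size_t n_int32, size_t n_int64, size_t n_bool) {
  size_t shape_num = 1;
  for (int d : shape) shape_num *= static_cast<size_t>(d);
  return shape_num == n_fp32 || shape_num == n_int32 ||
         shape_num == n_int64 || shape_num == n_bool;
}

int main() {
  assert(ShapesConsistent({2, 3}, 0, 6, 0, 0));   // six int32 values: ok
  assert(!ShapesConsistent({2, 3}, 0, 0, 4, 0));  // four int64 values: mismatch
  return 0;
}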
+ +#include "lite/operators/clip_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool ClipOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.out); + return true; +} + +bool ClipOpLite::InferShapeImpl() const { + param_.out->Resize(param_.x->dims()); + param_.out->set_lod(param_.x->lod()); + return true; +} + +bool ClipOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { + AttachInput(op_desc, scope, "X", false, ¶m_.x); + AttachInput(op_desc, scope, "Min", true, ¶m_.min_tensor); + AttachInput(op_desc, scope, "Max", true, ¶m_.max_tensor); + AttachOutput(op_desc, scope, "Out", false, ¶m_.out); + + param_.min = op_desc.GetAttr("min"); + param_.max = op_desc.GetAttr("max"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(clip, paddle::lite::operators::ClipOpLite); diff --git a/lite/operators/clip_op.h b/lite/operators/clip_op.h new file mode 100644 index 0000000000000000000000000000000000000000..25c7f9a824ffc4b395a13df39811074724211f44 --- /dev/null +++ b/lite/operators/clip_op.h @@ -0,0 +1,48 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class ClipOpLite : public OpLite { + public: + ClipOpLite() {} + + explicit ClipOpLite(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "clip"; } + + private: + mutable ClipParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/conv_op.h b/lite/operators/conv_op.h index c3e375e2e44b8184e6e7e635ab2c6c1f8889f844..a1d4e2e8a038046b257b3ab5f936cc4cb2e62c67 100644 --- a/lite/operators/conv_op.h +++ b/lite/operators/conv_op.h @@ -74,7 +74,7 @@ class ConvOpLite : public OpLite { param_.output = scope->FindVar(Out)->GetMutable(); param_.strides = op_desc.GetAttr>("strides"); - auto paddings = op_desc.GetAttr>("paddings"); + std::vector paddings = op_desc.GetAttr>("paddings"); param_.groups = op_desc.GetAttr("groups"); auto dilations = op_desc.GetAttr>("dilations"); param_.dilations = std::make_shared>(dilations); @@ -130,15 +130,18 @@ class ConvOpLite : public OpLite { padding_algorithm_ = op_desc.GetAttr("padding_algorithm"); } // For Int8 - if (op_desc.HasAttr("enable_int8")) { - param_.enable_int8 = op_desc.GetAttr("enable_int8"); - if (op_desc.HasAttr("input_scale")) - param_.input_scale = op_desc.GetAttr("input_scale"); - if (op_desc.HasAttr("weight_scale")) - param_.weight_scale = - op_desc.GetAttr>("weight_scale"); - if (op_desc.HasAttr("output_scale")) { - param_.output_scale = op_desc.GetAttr("output_scale"); + const OpInfo* op_info = dynamic_cast(&op_desc); + if (op_info != nullptr && op_info->HasAttr("enable_int8")) { + param_.enable_int8 = op_info->GetAttr("enable_int8"); + auto input_name = op_info->Input("Input").front(); + auto filter_name = op_info->Input("Filter").front(); + auto output_name = op_info->Output("Output").front(); + if (op_info->HasInputScale(input_name)) + param_.input_scale = op_info->GetInputScale(input_name)[0]; + if (op_info->HasInputScale(filter_name)) + param_.weight_scale = op_info->GetInputScale(filter_name); + if (op_info->HasOutputScale(output_name)) { + param_.output_scale = op_info->GetOutputScale(output_name)[0]; } } diff --git a/lite/operators/conv_transpose_op.cc b/lite/operators/conv_transpose_op.cc index 9d098eb975ef071a4650ea547d6081d950b251f1..732f8c5056f930259655339c8d8a0b2846f29313 100644 --- a/lite/operators/conv_transpose_op.cc +++ b/lite/operators/conv_transpose_op.cc @@ -106,7 +106,7 @@ bool ConvTransposeOpLite::AttachImpl(const cpp::OpDesc& op_desc, param_.output = scope->FindVar(Out)->GetMutable(); param_.strides = op_desc.GetAttr>("strides"); - auto paddings = op_desc.GetAttr>("paddings"); + std::vector paddings = op_desc.GetAttr>("paddings"); param_.groups = op_desc.GetAttr("groups"); auto dilations = op_desc.GetAttr>("dilations"); diff --git a/lite/operators/elementwise_ops.cc b/lite/operators/elementwise_ops.cc index 6cc41f0a66cfac4a0baa0153765a59766fa045f4..5895bb667aa22507d362004627304ecf78e085f1 100644 --- a/lite/operators/elementwise_ops.cc +++ b/lite/operators/elementwise_ops.cc @@ -144,6 +144,8 @@ REGISTER_LITE_OP(elementwise_add, 
paddle::lite::operators::ElementwiseOp); REGISTER_LITE_OP(elementwise_mul, paddle::lite::operators::ElementwiseOp); REGISTER_LITE_OP(elementwise_max, paddle::lite::operators::ElementwiseOp); REGISTER_LITE_OP(elementwise_div, paddle::lite::operators::ElementwiseOp); +REGISTER_LITE_OP(elementwise_mod, paddle::lite::operators::ElementwiseOp); +REGISTER_LITE_OP(elementwise_pow, paddle::lite::operators::ElementwiseOp); // #ifdef LITE_WITH_TRAIN // REGISTER_LITE_OP(elementwise_sub_grad, diff --git a/lite/operators/fc_op.cc b/lite/operators/fc_op.cc index d4032c5e8b98ff6d5763d2d06610d2e214ad90ca..28a220da2de0920643d46f1ed9c610dfa613cf95 100644 --- a/lite/operators/fc_op.cc +++ b/lite/operators/fc_op.cc @@ -102,14 +102,18 @@ bool FcOpLite::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { } // For Int8 - if (op_desc.HasAttr("enable_int8")) { - param_.enable_int8 = op_desc.GetAttr("enable_int8"); - if (op_desc.HasAttr("input_scale")) - param_.input_scale = op_desc.GetAttr("input_scale"); - if (op_desc.HasAttr("weight_scale")) - param_.weight_scale = op_desc.GetAttr>("weight_scale"); - if (op_desc.HasAttr("output_scale")) - param_.output_scale = op_desc.GetAttr("output_scale"); + const OpInfo* op_info = dynamic_cast(&op_desc); + if (op_info != nullptr && op_info->HasAttr("enable_int8")) { + param_.enable_int8 = op_info->GetAttr("enable_int8"); + auto input_name = op_info->Input("Input").front(); + auto weight_name = op_info->Input("W").front(); + auto out_name = op_info->Output("Out").front(); + if (op_info->HasInputScale(input_name)) + param_.input_scale = op_info->GetInputScale(input_name)[0]; + if (op_info->HasInputScale(weight_name)) + param_.weight_scale = op_info->GetInputScale(weight_name); + if (op_info->HasOutputScale(out_name)) + param_.output_scale = op_info->GetOutputScale(out_name)[0]; } return true; } diff --git a/lite/operators/match_matrix_tensor_op.cc b/lite/operators/match_matrix_tensor_op.cc index 1cc751109f76a96097d363b493322dde182a715d..fd70143131b458c1d985a21a6d9d84c707ba9986 100644 --- a/lite/operators/match_matrix_tensor_op.cc +++ b/lite/operators/match_matrix_tensor_op.cc @@ -94,6 +94,18 @@ bool MatchMatrixTensorOpLite::AttachImpl(const cpp::OpDesc& op_desc, param_.dim_t = op_desc.GetAttr("dim_t"); + if (op_desc.HasAttr("fuse_relu")) { + param_.fuse_relu = op_desc.GetAttr("fuse_relu"); + } +#ifdef LITE_WITH_XPU + if (op_desc.HasAttr("__xpu__float_to_fix")) { + param_.__xpu__float_to_fix = op_desc.GetAttr("__xpu__float_to_fix"); + } + if (op_desc.HasAttr("__xpu__w_max")) { + param_.__xpu__w_max = op_desc.GetAttr("__xpu__w_max"); + } +#endif + return true; } diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h index 8b463956438c61a95af9ec6ae7e7a3230672a237..f351e8e5344424d80fa79f8d7c83be3bf367441f 100644 --- a/lite/operators/op_params.h +++ b/lite/operators/op_params.h @@ -21,10 +21,9 @@ #include "lite/core/scope.h" #include "lite/core/tensor.h" #include "lite/core/types.h" -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/utils/all.h" -#include "lite/utils/variant.h" /* * This file contains all the argument parameter data structure for operators. 
*/ @@ -1032,12 +1031,28 @@ struct SequenceExpandParam : ParamBase { int ref_level{-1}; }; +struct SequencePadParam : ParamBase { + const lite::Tensor* X{}; + const lite::Tensor* PadValue{}; + lite::Tensor* Out{}; + lite::Tensor* Length{}; + int padded_length{-1}; +}; + struct SequenceUnpadParam : ParamBase { const lite::Tensor* X{}; const lite::Tensor* Length{}; lite::Tensor* Out{}; }; +struct SequenceMaskParam : ParamBase { + const lite::Tensor* X{}; + const lite::Tensor* MaxLenTensor{nullptr}; + lite::Tensor* Y{}; + int maxlen{-1}; + int out_dtype; +}; + struct SequenceExpandAsParam : ParamBase { const lite::Tensor* x{nullptr}; const lite::Tensor* y{nullptr}; @@ -1114,6 +1129,11 @@ struct VarConv2DParam : ParamBase { int kernel_w; bool fuse_relu{false}; + +#ifdef LITE_WITH_XPU + bool __xpu__float_to_fix{false}; // Is W already converted to int16/int8 + float __xpu__w_max{0.0f}; // Abs max in W +#endif }; /// ----------------------- shape operators ---------------------- @@ -1166,6 +1186,13 @@ struct AffineChannelParam : ParamBase { lite::Tensor* Out{}; }; +struct AffineGridParam : ParamBase { + const lite::Tensor* X{}; // Theta:shape {?, 2, 3} + std::vector output_shape; + const lite::Tensor* OutputShape; + lite::Tensor* Out{}; +}; + struct AnchorGeneratorParam : ParamBase { const lite::Tensor* Input{}; std::vector anchor_sizes{}; @@ -1324,6 +1351,8 @@ struct AssignValueParam : ParamBase { int dtype{}; std::vector fp32_values{}; std::vector int32_values{}; + std::vector int64_values{}; + std::vector bool_values{}; lite::Tensor* Out{}; }; @@ -1338,6 +1367,15 @@ struct SequenceTopkAvgPoolingParam : ParamBase { std::vector topks{}; }; +/// --------------- topk_pooling operators ------------------ +struct TopkPoolingParam : ParamBase { + const lite::Tensor* X{}; + const lite::Tensor* Y{}; + lite::Tensor* Out{}; + int top_k{1}; + int feat_map_num{1}; +}; + /// --------------- search_fc operators ------------------ struct SearchFcParam : ParamBase { const lite::Tensor* X{}; @@ -1345,6 +1383,13 @@ struct SearchFcParam : ParamBase { const lite::Tensor* b{}; lite::Tensor* Out{}; int out_size{}; + + bool fuse_relu{false}; + +#ifdef LITE_WITH_XPU + bool __xpu__float_to_fix{false}; // Is W already converted to int16/int8 + float __xpu__w_max{0.0f}; // Abs max in W +#endif }; /// --------------------- match_matrix_tensor operators -------------------- struct MatchMatrixTensorParam : ParamBase { @@ -1355,6 +1400,12 @@ struct MatchMatrixTensorParam : ParamBase { lite::Tensor* tmp{}; int dim_t; + bool fuse_relu{false}; + +#ifdef LITE_WITH_XPU + bool __xpu__float_to_fix{false}; // Is w already converted to int16/int8 + float __xpu__w_max{0.0f}; // Abs max in w +#endif }; /// --------------------- search_seq_depadding operators -------------------- @@ -1376,6 +1427,12 @@ struct SearchGrnnParam : ParamBase { lite::Tensor* tmp_buffer{}; lite::Tensor* idx_sorted_by_width{}; lite::Tensor* layout_input{}; + +#ifdef LITE_WITH_XPU + bool __xpu__float_to_fix{false}; // Is wi/wh already converted to int16/int8 + std::vector __xpu__wi_max; // Abs max in wi + std::vector __xpu__wh_max; // Abs max in wh +#endif }; struct SplitLodTensorParam : ParamBase { @@ -1530,6 +1587,106 @@ struct XPUFcParam : ParamBase { std::string activation_type{""}; }; +struct XPUResNetCbamParam : ParamBase { + lite::Tensor* input{}; + std::vector filter; + std::vector bias; + std::vector max_filter; + lite::Tensor* output{}; + + float pool_p{1.0f}; +}; + +struct XPUMmdnnSearchAttentionParam : ParamBase { + lite::Tensor* X{}; + lite::Tensor* 
W{}; + lite::Tensor* b{}; + lite::Tensor* Out{}; + + float W_max{0.0f}; + int pad_id{0}; + float alpha0{1.0f}; + float alpha1{1.0f}; + float mask{1.0f}; +}; + +struct XPUMmdnnBidEmbGrnnAttParam : ParamBase { + lite::Tensor* id0{}; + lite::Tensor* id1{}; + lite::Tensor* emb_tbl{}; + lite::Tensor* grnn_fw_wh{}; + lite::Tensor* grnn_fw_wi{}; + lite::Tensor* grnn_rv_wh{}; + lite::Tensor* grnn_rv_wi{}; + lite::Tensor* att_fc_w{}; + lite::Tensor* att_fc_b{}; + + std::vector grnn_fw_wh_maxs; + std::vector grnn_fw_wi_maxs; + std::vector grnn_rv_wh_maxs; + std::vector grnn_rv_wi_maxs; + float att_fc_w_max{0.0f}; + + lite::Tensor* grnn_fw_pool_out{}; // 1 + lite::Tensor* grnn_rv_pool_out{}; // 2 + lite::Tensor* att_pool_out{}; // 3 + lite::Tensor* concat_3in1_out{}; // 4 + lite::Tensor* emb_fw_out{}; // 5 +}; + +struct XPUMmdnnBidEmbAttParam : ParamBase { + lite::Tensor* id0{}; + lite::Tensor* id1{}; + lite::Tensor* emb_tbl{}; + lite::Tensor* att_fc_w{}; + lite::Tensor* att_fc_b{}; + + float att_fc_w_max{0.0f}; + + lite::Tensor* att_pool_out{}; // 1 + lite::Tensor* emb_fw_out{}; // 2 +}; + +struct XPUMmdnnMatchConvTopkParam : ParamBase { + lite::Tensor* input_x{}; + lite::Tensor* input_y{}; + lite::Tensor* input_w{}; + lite::Tensor* conv_w{}; + + float input_w_max{0.0f}; + float conv_w_max{0.0f}; + std::vector topks; + int channel_num{0}; + int dim_t{0}; + + lite::Tensor* topk_out{}; +}; + +struct XPUMmdnnMergeAllParam : ParamBase { + std::vector concat_7in1_x; + std::vector concat_2in1_x; + lite::Tensor* grnn_fw_wh{}; + lite::Tensor* grnn_fw_wi{}; + lite::Tensor* grnn_rv_wh{}; + lite::Tensor* grnn_rv_wi{}; + lite::Tensor* fc0_w{}; + lite::Tensor* fc0_b{}; + lite::Tensor* fc1_w{}; + lite::Tensor* fc1_b{}; + lite::Tensor* fc2_w{}; + lite::Tensor* fc2_b{}; + + std::vector grnn_fw_wh_maxs; + std::vector grnn_fw_wi_maxs; + std::vector grnn_rv_wh_maxs; + std::vector grnn_rv_wi_maxs; + float fc0_w_max{0.0f}; + float fc1_w_max{0.0f}; + float fc2_w_max{0.0f}; + + lite::Tensor* out{}; +}; + // For DeformableConvolution op struct DeformableConvParam : ParamBase { lite::Tensor* x{}; @@ -1568,6 +1725,34 @@ struct PixelShuffleParam : ParamBase { lite::Tensor* output{nullptr}; int upscale_factor{1}; }; + +struct RetinanetDetectionOutputParam : ParamBase { + std::vector bboxes{}; + std::vector scores{}; + std::vector anchors{}; + Tensor* im_info{}; + Tensor* out{}; + float score_threshold{}; + int nms_top_k{}; + float nms_threshold{}; + float nms_eta{}; + int keep_top_k{}; +}; + +struct WhereIndexParam : ParamBase { + const lite::Tensor* input{nullptr}; + lite::Tensor* output{nullptr}; +}; + +struct ClipParam : ParamBase { + Tensor* x{}; + Tensor* min_tensor{}; + Tensor* max_tensor{}; + Tensor* out{}; + float min{}; + float max{}; +}; + } // namespace operators } // namespace lite } // namespace paddle diff --git a/lite/operators/pixel_shuffle_op.cc b/lite/operators/pixel_shuffle_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..40f564bdd6d2699bafe497bdfded21ea4f3956a3 --- /dev/null +++ b/lite/operators/pixel_shuffle_op.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/pixel_shuffle_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool PixelShuffleOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.output); + CHECK_OR_FALSE(param_.upscale_factor); + const auto x_dims = param_.x->dims(); + const auto upscale_factor = param_.upscale_factor; + CHECK_EQ_OR_FALSE(x_dims[1] % (upscale_factor * upscale_factor), 0); + return true; +} + +bool PixelShuffleOpLite::InferShapeImpl() const { + const auto x_dims = param_.x->dims(); + const auto upscale_factor = param_.upscale_factor; + auto output_dims = x_dims; + output_dims[0] = x_dims[0]; + output_dims[1] = x_dims[1] / (upscale_factor * upscale_factor); + output_dims[2] = x_dims[2] * upscale_factor; + output_dims[3] = x_dims[3] * upscale_factor; + param_.output->Resize(output_dims); + return true; +} + +bool PixelShuffleOpLite::AttachImpl(const cpp::OpDesc& opdesc, + lite::Scope* scope) { + auto input = opdesc.Input("X").front(); + auto out = opdesc.Output("Out").front(); + + param_.x = scope->FindVar(input)->GetMutable(); + param_.output = scope->FindVar(out)->GetMutable(); + + if (opdesc.HasAttr("upscale_factor")) { + param_.upscale_factor = opdesc.GetAttr("upscale_factor"); + } + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(pixel_shuffle, paddle::lite::operators::PixelShuffleOpLite); diff --git a/lite/operators/pixel_shuffle_op.h b/lite/operators/pixel_shuffle_op.h new file mode 100644 index 0000000000000000000000000000000000000000..63efd8df778c6d92bc448f795c19ff5bffba62c8 --- /dev/null +++ b/lite/operators/pixel_shuffle_op.h @@ -0,0 +1,45 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
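PixelShuffleOpLite::InferShapeImpl above encodes the standard pixel-shuffle relationship: an NCHW input of shape {N, C, H, W} becomes {N, C/(r*r), H*r, W*r} for upscale factor r. A standalone reference rearrangement on a flat NCHW buffer (the actual Lite kernel is not shown in this patch):

#include <cassert>
#include <vector>

// Reference pixel shuffle on a flat NCHW buffer.
// Input  shape: {N, C, H, W} with C divisible by r*r.
// Output shape: {N, C/(r*r), H*r, W*r}.
std::vector<float> PixelShuffle(const std::vector<float>& in,
                                int N, int C, int H, int W, int r) {
  assert(C % (r * r) == 0);
  const int Co = C / (r * r), Ho = H * r, Wo = W * r;
  std::vector<float> out(static_cast<size_t>(N) * Co * Ho * Wo);
  for (int n = 0; n < N; ++n)
    for (int c = 0; c < Co; ++c)
      for (int h = 0; h < Ho; ++h)
        for (int w = 0; w < Wo; ++w) {
          // The (h % r, w % r) sub-pixel offset folds back into the source channel.
          int ci = c * r * r + (h % r) * r + (w % r);
          int hi = h / r, wi = w / r;
          out[((n * Co + c) * Ho + h) * Wo + w] =
              in[((n * C + ci) * H + hi) * W + wi];
        }
  return out;
}

int main() {
  // 1x4x1x1 -> 1x1x2x2: the four input channels become one 2x2 patch.
  auto out = PixelShuffle({1.f, 2.f, 3.f, 4.f}, 1, 4, 1, 1, 2);
  assert(out[0] == 1.f && out[1] == 2.f && out[2] == 3.f && out[3] == 4.f);
  return 0;
}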
+ +#pragma once + +#include +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class PixelShuffleOpLite : public OpLite { + public: + PixelShuffleOpLite() {} + explicit PixelShuffleOpLite(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "pixel_shuffle"; } + + private: + mutable PixelShuffleParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/pool_op.h b/lite/operators/pool_op.h index 92f00a4272fddeb03abd04cba473a997cce37217..916ed1dd6f036c6c36954622abbbc1361de1b790 100644 --- a/lite/operators/pool_op.h +++ b/lite/operators/pool_op.h @@ -54,7 +54,7 @@ class PoolOpLite : public OpLite { param_.ksize = op_desc.GetAttr>("ksize"); param_.global_pooling = op_desc.GetAttr("global_pooling"); param_.strides = op_desc.GetAttr>("strides"); - auto paddings = op_desc.GetAttr>("paddings"); + std::vector paddings = op_desc.GetAttr>("paddings"); if (op_desc.HasAttr("exclusive")) { param_.exclusive = op_desc.GetAttr("exclusive"); diff --git a/lite/operators/retinanet_detection_output_op.cc b/lite/operators/retinanet_detection_output_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e27f2bfca0ab25b8f73d4c6a68d539a7c22389e0 --- /dev/null +++ b/lite/operators/retinanet_detection_output_op.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/retinanet_detection_output_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool RetinanetDetectionOutputOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.bboxes.size() > 0); + CHECK_OR_FALSE(param_.scores.size() > 0); + CHECK_OR_FALSE(param_.anchors.size() > 0); + CHECK_OR_FALSE(param_.bboxes.size() == param_.scores.size()); + CHECK_OR_FALSE(param_.bboxes.size() == param_.anchors.size()); + CHECK_OR_FALSE(param_.im_info); + CHECK_OR_FALSE(param_.out); + + DDim bbox_dims = param_.bboxes.front()->dims(); + DDim score_dims = param_.scores.front()->dims(); + DDim anchor_dims = param_.anchors.front()->dims(); + DDim im_info_dims = param_.im_info->dims(); + + CHECK_OR_FALSE(bbox_dims.size() == 3); + CHECK_OR_FALSE(score_dims.size() == 3); + CHECK_OR_FALSE(anchor_dims.size() == 2); + CHECK_OR_FALSE(bbox_dims[2] == 4); + CHECK_OR_FALSE(bbox_dims[1] == score_dims[1]); + CHECK_OR_FALSE(anchor_dims[0] == bbox_dims[1]); + CHECK_OR_FALSE(im_info_dims.size() == 2); + + return true; +} + +bool RetinanetDetectionOutputOpLite::InferShapeImpl() const { + DDim bbox_dims = param_.bboxes.front()->dims(); + param_.out->Resize({bbox_dims[1], bbox_dims[2] + 2}); + return true; +} + +bool RetinanetDetectionOutputOpLite::AttachImpl(const cpp::OpDesc &op_desc, + lite::Scope *scope) { + for (auto arg_name : op_desc.Input("BBoxes")) { + param_.bboxes.push_back( + scope->FindVar(arg_name)->GetMutable()); + } + for (auto arg_name : op_desc.Input("Scores")) { + param_.scores.push_back( + scope->FindVar(arg_name)->GetMutable()); + } + for (auto arg_name : op_desc.Input("Anchors")) { + param_.anchors.push_back( + scope->FindVar(arg_name)->GetMutable()); + } + AttachInput(op_desc, scope, "ImInfo", false, ¶m_.im_info); + AttachOutput(op_desc, scope, "Out", false, ¶m_.out); + + param_.score_threshold = op_desc.GetAttr("score_threshold"); + param_.nms_top_k = op_desc.GetAttr("nms_top_k"); + param_.nms_threshold = op_desc.GetAttr("nms_threshold"); + param_.nms_eta = op_desc.GetAttr("nms_eta"); + param_.keep_top_k = op_desc.GetAttr("keep_top_k"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(retinanet_detection_output, + paddle::lite::operators::RetinanetDetectionOutputOpLite); diff --git a/lite/operators/retinanet_detection_output_op.h b/lite/operators/retinanet_detection_output_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9969227e15941644249b46ba7372f9afc705672c --- /dev/null +++ b/lite/operators/retinanet_detection_output_op.h @@ -0,0 +1,55 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class RetinanetDetectionOutputOpLite : public OpLite { + public: + RetinanetDetectionOutputOpLite() {} + + explicit RetinanetDetectionOutputOpLite(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { + return "retinanet_detection_output"; + } + +#ifdef LITE_WITH_PROFILE + void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {} +#endif + + private: + mutable RetinanetDetectionOutputParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_fc_op.cc b/lite/operators/search_fc_op.cc index 71e62c2ae729b4e1516a219888b9af3f7d994428..8024c38f9cc4a6d3ba2d47d6c61e716dd57bb362 100644 --- a/lite/operators/search_fc_op.cc +++ b/lite/operators/search_fc_op.cc @@ -70,6 +70,18 @@ bool SearchFcOpLite::AttachImpl(const cpp::OpDesc &op_desc, param_.Out = scope->FindVar(Out)->GetMutable(); param_.out_size = op_desc.GetAttr("out_size"); + if (op_desc.HasAttr("fuse_relu")) { + param_.fuse_relu = op_desc.GetAttr("fuse_relu"); + } +#ifdef LITE_WITH_XPU + if (op_desc.HasAttr("__xpu__float_to_fix")) { + param_.__xpu__float_to_fix = op_desc.GetAttr("__xpu__float_to_fix"); + } + if (op_desc.HasAttr("__xpu__w_max")) { + param_.__xpu__w_max = op_desc.GetAttr("__xpu__w_max"); + } +#endif + return true; } diff --git a/lite/operators/search_grnn_op.cc b/lite/operators/search_grnn_op.cc index 1ced477c109d8cd93485f0193523887759939f17..6f743693bc782e636064ca398539433b497dc645 100644 --- a/lite/operators/search_grnn_op.cc +++ b/lite/operators/search_grnn_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "lite/operators/search_grnn_op.h" +#include #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" @@ -84,6 +85,18 @@ bool SearchGrnnOpLite::AttachImpl(const cpp::OpDesc& op_desc, param_.layout_input = scope->FindVar(layout_input)->GetMutable(); +#ifdef LITE_WITH_XPU + if (op_desc.HasAttr("__xpu__float_to_fix")) { + param_.__xpu__float_to_fix = op_desc.GetAttr("__xpu__float_to_fix"); + } + if (op_desc.HasAttr("__xpu__wi_max")) { + param_.__xpu__wi_max = op_desc.GetAttr>("__xpu__wi_max"); + } + if (op_desc.HasAttr("__xpu__wh_max")) { + param_.__xpu__wh_max = op_desc.GetAttr>("__xpu__wh_max"); + } +#endif + return true; } diff --git a/lite/operators/sequence_mask_op.cc b/lite/operators/sequence_mask_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..bac1dc8a26abe9a9ae2bbd77e03c2375b4814268 --- /dev/null +++ b/lite/operators/sequence_mask_op.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/sequence_mask_op.h" + +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequenceMaskOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Y); + return true; +} + +bool SequenceMaskOp::InferShapeImpl() const { return true; } + +bool SequenceMaskOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { + param_.X = const_cast( + &scope->FindVar(opdesc.Input("X").front())->Get()); + if (opdesc.HasInput("MaxLenTensor") && + !opdesc.Input("MaxLenTensor").empty()) { + auto var = scope->FindVar(opdesc.Input("MaxLenTensor").front()); + if (var != nullptr) { + param_.MaxLenTensor = var->GetMutable(); + } + } + param_.Y = + scope->FindVar(opdesc.Output("Y").front())->GetMutable(); + param_.maxlen = opdesc.GetAttr("maxlen"); + param_.out_dtype = opdesc.GetAttr("out_dtype"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_mask, paddle::lite::operators::SequenceMaskOp); diff --git a/lite/operators/sequence_mask_op.h b/lite/operators/sequence_mask_op.h new file mode 100644 index 0000000000000000000000000000000000000000..97008b865b850f3837fcc49befc5735987fb2048 --- /dev/null +++ b/lite/operators/sequence_mask_op.h @@ -0,0 +1,45 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequenceMaskOp : public OpLite { + public: + SequenceMaskOp() {} + explicit SequenceMaskOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "sequence_mask"; } + + private: + mutable SequenceMaskParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/sequence_pad_op.cc b/lite/operators/sequence_pad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..687c4a1989deaa5afea2356338630fa0ee846cb5 --- /dev/null +++ b/lite/operators/sequence_pad_op.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/sequence_pad_op.h" +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequencePadOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.PadValue); + CHECK_OR_FALSE(param_.Out); + CHECK_OR_FALSE(param_.Length); + + return true; +} + +bool SequencePadOp::InferShapeImpl() const { + auto x_dims = param_.X->dims(); + CHECK_GE(x_dims.size(), 2) << "The rank of SequencePad OP Input(x) can't be " + "less than 2. But the rank we received is " + << x_dims.size(); + auto time_step_dims = x_dims.Slice(1, x_dims.size()); + auto pad_value_dims = param_.PadValue->dims(); + CHECK_EQ((pad_value_dims == DDim({1})) || (pad_value_dims == time_step_dims), + true) + << "The SequencePad OP Input(PadValue) must be a scalar or a tensor " + "whiose shape equals to time steps in sequences"; + + auto x_lod = param_.X->lod(); + CHECK_EQ(x_lod.empty(), false) + << "The SequencePad OP Input(X) must hold lod info."; + const auto &x_lod_0 = x_lod[0]; + CHECK_GE(x_lod_0.size(), 2) + << "The size of SequencePadOp Input(X)'s lod info can't be less than 2. " + "But the size we received is " + << x_lod_0.size(); + CHECK_EQ(x_dims[0], static_cast(x_lod_0.back())) + << "The SequencePadOp Input(X)'s lod info mismatches the actual tensor " + "shape. The 1st dimension of Input(X)'s lod info is " + << x_dims[0] << ", the 1st dimension of actual tensor shape is " + << static_cast(x_lod_0.back()); + + int seq_num = x_lod_0.size() - 1; + int max_seq_len = 0; + for (int i = 0; i < seq_num; ++i) { + max_seq_len = + std::max(max_seq_len, static_cast(x_lod_0[i + 1] - x_lod_0[i])); + } + if (param_.padded_length == -1) { + param_.padded_length = max_seq_len; + } + CHECK_GE(param_.padded_length, max_seq_len) + << "The SequencePadOp Attr(padded_length) should be greater than or " + "equal to the length of the longest original sequence. 
But the " + "padded_length we received is " + << param_.padded_length + << ", the length of the longest original sequence is " << max_seq_len; + + int out_dim_0 = seq_num; + std::vector out_dims_vec{out_dim_0, param_.padded_length}; + std::vector len_dims_vec{out_dim_0}; + auto time_step_dims_vec = time_step_dims.Vectorize(); + out_dims_vec.insert( + out_dims_vec.end(), time_step_dims_vec.begin(), time_step_dims_vec.end()); + param_.Out->Resize(out_dims_vec); + param_.Length->Resize(len_dims_vec); + return true; +} + +bool SequencePadOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { + param_.X = const_cast( + &scope->FindVar(opdesc.Input("X").front())->Get()); + param_.PadValue = const_cast( + &scope->FindVar(opdesc.Input("PadValue").front())->Get()); + param_.Length = scope->FindVar(opdesc.Input("Length").front()) + ->GetMutable(); + param_.Out = + scope->FindVar(opdesc.Output("Out").front())->GetMutable(); + param_.padded_length = opdesc.GetAttr("padded_length"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_pad, paddle::lite::operators::SequencePadOp); diff --git a/lite/operators/sequence_pad_op.h b/lite/operators/sequence_pad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..bd5d732a5d8816d4f7994ee0e3175ac8a032b2d4 --- /dev/null +++ b/lite/operators/sequence_pad_op.h @@ -0,0 +1,45 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
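As a concrete example of the sequence_pad shape inference above: with X of dims {5, 3}, lod {0, 2, 5} (two sequences of length 2 and 3) and padded_length -1, the padded length resolves to 3, Out becomes {2, 3, 3}, and Length becomes {2}. A condensed standalone sketch of that arithmetic (validation omitted):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Mirrors SequencePadOp::InferShapeImpl shape arithmetic.
void SequencePadShapes(const std::vector<uint64_t>& lod0,
                       const std::vector<int64_t>& time_step_dims,
                       int padded_length,
                       std::vector<int64_t>* out_dims,
                       std::vector<int64_t>* length_dims) {
  int seq_num = static_cast<int>(lod0.size()) - 1;
  int max_seq_len = 0;
  for (int i = 0; i < seq_num; ++i)
    max_seq_len = std::max(max_seq_len, static_cast<int>(lod0[i + 1] - lod0[i]));
  if (padded_length == -1) padded_length = max_seq_len;
  *out_dims = {seq_num, padded_length};
  out_dims->insert(out_dims->end(), time_step_dims.begin(), time_step_dims.end());
  *length_dims = {seq_num};
}

int main() {
  std::vector<int64_t> out, len;
  SequencePadShapes({0, 2, 5}, {3}, -1, &out, &len);
  assert(out == (std::vector<int64_t>{2, 3, 3}));
  assert(len == (std::vector<int64_t>{2}));
  return 0;
}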
+ +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequencePadOp : public OpLite { + public: + SequencePadOp() {} + explicit SequencePadOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "sequence_pad"; } + + private: + mutable SequencePadParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/sequence_reverse_op.cc b/lite/operators/sequence_reverse_op.cc index 19a47cac9da666269fc5ef2a172ff0295b71e95d..fa2b0553aa2ac84f27d5d27d31df5ce9584d82c3 100644 --- a/lite/operators/sequence_reverse_op.cc +++ b/lite/operators/sequence_reverse_op.cc @@ -34,6 +34,7 @@ bool SequenceReverseOp::InferShapeImpl() const { const auto *input = param_.X; auto out_dims = input->dims(); param_.Out->Resize(out_dims); + param_.Out->set_lod(param_.X->lod()); return true; } @@ -45,6 +46,7 @@ bool SequenceReverseOp::AttachImpl(const cpp::OpDesc &opdesc, scope->FindVar(opdesc.Output("Y").front())->GetMutable(); CHECK(param_.X); CHECK(param_.Out); + return true; } diff --git a/lite/operators/topk_pooling_op.cc b/lite/operators/topk_pooling_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..76634d216a8a120f4e83dfe511089c6deb750cba --- /dev/null +++ b/lite/operators/topk_pooling_op.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/topk_pooling_op.h" +#include "lite/core/op_registry.h" +namespace paddle { +namespace lite { +namespace operators { + +bool TopkPoolingOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Y); + CHECK_OR_FALSE(param_.Out); + return true; +} + +bool TopkPoolingOp::InferShapeImpl() const { + auto out_dims = param_.X->dims(); + out_dims[1] *= param_.top_k; + auto out = param_.Out; + out->Resize(out_dims); + out->set_lod(param_.X->lod()); + + return true; +} + +bool TopkPoolingOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { + auto x = op_desc.Input("X").front(); + auto y = op_desc.Input("Y").front(); + param_.X = scope->FindTensor(x); + param_.Y = scope->FindTensor(y); + auto output = op_desc.Output("Out").front(); + param_.Out = scope->FindMutableTensor(output); + param_.top_k = op_desc.GetAttr("top_k"); + param_.feat_map_num = op_desc.GetAttr("feat_map_num"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(topk_pooling, paddle::lite::operators::TopkPoolingOp); diff --git a/lite/operators/topk_pooling_op.h b/lite/operators/topk_pooling_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ec48c476ca3e6854038bed591ca59402eda93736 --- /dev/null +++ b/lite/operators/topk_pooling_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+#include <string>
+#include <vector>
+#include "lite/core/op_lite.h"
+#include "lite/core/scope.h"
+#include "lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class TopkPoolingOp : public OpLite {
+ public:
+  TopkPoolingOp() {}
+  explicit TopkPoolingOp(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShapeImpl() const override;
+
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+  std::string DebugString() const override { return "topk_pooling"; }
+
+ private:
+  mutable TopkPoolingParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/transpose_op.cc b/lite/operators/transpose_op.cc
index fe40bf6fa2f84ce7c999b41435aed00cd6555887..8f1372a883a1cd54ac2368f1e7f5e30a60a6b1db 100644
--- a/lite/operators/transpose_op.cc
+++ b/lite/operators/transpose_op.cc
@@ -43,24 +43,9 @@ bool TransposeOp::CheckShape() const {
 }
 
 bool TransposeOp::InferShapeImpl() const {
-  CHECK_OR_FALSE(param_.x);
-  CHECK_OR_FALSE(param_.output);
   auto x_dims = param_.x->dims();
-  auto x_rank = x_dims.size();
   std::vector<int> axis = param_.axis;
   size_t axis_size = axis.size();
-  // "The input tensor's rank(%d) should be equal to the axis's size(%d)",
-  // x_rank, axis_size
-  CHECK_OR_FALSE(x_rank == axis_size);
-
-  std::vector<int> count(axis_size, 0);
-  for (size_t i = 0; i < axis_size; i++) {
-    // Each element of Attribute axis should be a unique value
-    // range from 0 to (dims - 1),
-    // where the dims is the axis's size
-    CHECK_OR_FALSE(axis[i] < static_cast<int>(axis_size) &&
-                   ++count[axis[i]] == 1);
-  }
   lite::DDim out_dims(x_dims);
   for (size_t i = 0; i < axis_size; i++) {
     out_dims[i] = x_dims[axis[i]];
@@ -113,24 +98,9 @@ bool Transpose2Op::CheckShape() const {
 }
 
 bool Transpose2Op::InferShapeImpl() const {
-  CHECK_OR_FALSE(param_.x);
-  CHECK_OR_FALSE(param_.output);
   auto x_dims = param_.x->dims();
-  auto x_rank = x_dims.size();
   std::vector<int> axis = param_.axis;
   size_t axis_size = axis.size();
-  // "The input tensor's rank(%d) should be equal to the axis's size(%d)",
-  // x_rank, axis_size
-  CHECK_OR_FALSE(x_rank == axis_size);
-
-  std::vector<int> count(axis_size, 0);
-  for (size_t i = 0; i < axis_size; i++) {
-    // Each element of Attribute axis should be a unique value
-    // range from 0 to (dims - 1),
-    // where the dims is the axis's size
-    CHECK_OR_FALSE(axis[i] < static_cast<int>(axis_size) &&
-                   ++count[axis[i]] == 1);
-  }
   lite::DDim out_dims(x_dims);
   for (size_t i = 0; i < axis_size; i++) {
     out_dims[i] = x_dims[axis[i]];
diff --git a/lite/operators/var_conv_2d_op.cc b/lite/operators/var_conv_2d_op.cc
index 8cf11f6465d73646ec9bf846cbe6347bdc4b9f5b..83b6cc6a24ed1537adec8fd7d54a477edf91f873 100644
--- a/lite/operators/var_conv_2d_op.cc
+++ b/lite/operators/var_conv_2d_op.cc
@@ -52,6 +52,15 @@ bool VarConv2dOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
   if (opdesc.HasAttr("fuse_relu")) {
     param_.fuse_relu = opdesc.GetAttr<bool>("fuse_relu");
   }
+#ifdef LITE_WITH_XPU
+  if (opdesc.HasAttr("__xpu__float_to_fix")) {
+    param_.__xpu__float_to_fix = opdesc.GetAttr<bool>("__xpu__float_to_fix");
+  }
+  if (opdesc.HasAttr("__xpu__w_max")) {
+    param_.__xpu__w_max = opdesc.GetAttr<float>("__xpu__w_max");
+  }
+#endif
+
   return true;
 }
diff --git a/lite/operators/where_index_op.cc b/lite/operators/where_index_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..81443b7058e0c7d68008cbe98040b3f50eac852f
--- /dev/null
+++ b/lite/operators/where_index_op.cc
@@ -0,0 +1,51 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/where_index_op.h"
+#include "lite/core/op_registry.h"
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool WhereIndexdOp::CheckShape() const {
+  CHECK_OR_FALSE(param_.input);
+  CHECK_OR_FALSE(param_.output);
+  CHECK_GE(param_.input->dims().size(), 1);
+  return true;
+}
+
+bool WhereIndexdOp::InferShapeImpl() const {
+  int64_t rank = static_cast<int64_t>(param_.input->dims().size());
+  int64_t numel = static_cast<int64_t>(param_.input->dims().production());
+  param_.output->Resize({numel, rank});
+  return true;
+}
+
+bool WhereIndexdOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
+  AttachParam(&param_);
+  auto input = opdesc.Input("Condition").front();
+  auto output = opdesc.Output("Out").front();
+  CHECK(scope->FindVar(input));
+  CHECK(scope->FindVar(output));
+  param_.input = GetVar<lite::Tensor>(scope, input);
+  param_.output = GetMutableVar<lite::Tensor>(scope, output);
+
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(where_index, paddle::lite::operators::WhereIndexdOp);
diff --git a/lite/operators/where_index_op.h b/lite/operators/where_index_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..157a3cb0be33ffad275ae55a0999095357a09948
--- /dev/null
+++ b/lite/operators/where_index_op.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+#include "lite/core/op_lite.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class WhereIndexdOp : public OpLite {
+ public:
+  WhereIndexdOp() {}
+  explicit WhereIndexdOp(const std::string &op_type) : OpLite(op_type) {}
+  bool CheckShape() const override;
+  bool InferShapeImpl() const override;
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+  std::string DebugString() const override { return "where_index_op"; }
+
+ private:
+  mutable WhereIndexParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/tests/api/CMakeLists.txt b/lite/tests/api/CMakeLists.txt
index 810a20abbc0d13897822cef2c99e5942e352a19f..844c3f2ac7146e05b2d93eac76279df022e06652 100644
--- a/lite/tests/api/CMakeLists.txt
+++ b/lite/tests/api/CMakeLists.txt
@@ -6,11 +6,25 @@ if(LITE_WITH_XPU)
   lite_cc_test(test_ernie_lite_xpu SRCS test_ernie_lite_xpu.cc
     DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
     ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
-    ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
+    ARGS --model_dir=${LITE_MODEL_DIR}/ernie)
   lite_cc_test(test_bert_lite_xpu SRCS test_bert_lite_xpu.cc
     DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
     ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
-    ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
+    ARGS --model_dir=${LITE_MODEL_DIR}/bert)
+  if(WITH_TESTING)
+    add_dependencies(test_resnet50_lite_xpu extern_lite_download_resnet50_tar_gz)
+    add_dependencies(test_ernie_lite_xpu extern_lite_download_ernie_tar_gz)
+    add_dependencies(test_bert_lite_xpu extern_lite_download_bert_tar_gz)
+  endif()
+  # TODO(miaotianxiang): enable later
+  #lite_cc_test(test_fpr_lite_xpu SRCS test_fpr_lite_xpu.cc
+    #DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
+    #${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
+    #ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
+  #lite_cc_test(test_mmdnn_lite_xpu SRCS test_mmdnn_lite_xpu.cc
+    #DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
+    #${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
+    #ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
 endif()
 
 if(LITE_WITH_RKNPU)
diff --git a/lite/tests/api/test_bert_lite_xpu.cc b/lite/tests/api/test_bert_lite_xpu.cc
index b3ee9febb3f0eabd36118680beca66ace9470de4..5d66fd0d5496e105ba97bea6c5e5387d96c9e01b 100644
--- a/lite/tests/api/test_bert_lite_xpu.cc
+++ b/lite/tests/api/test_bert_lite_xpu.cc
@@ -93,7 +93,7 @@ TEST(Ernie, test_ernie_lite_xpu) {
   for (size_t i = 0; i < results.size(); ++i) {
     for (size_t j = 0; j < results[i].size(); ++j) {
       EXPECT_NEAR(
-          out->data<float>()[j + (out->shape()[1] * i)], results[i][j], 1e-5);
+          out->data<float>()[j + (out->shape()[1] * i)], results[i][j], 3e-5);
     }
   }
 }
diff --git a/lite/tests/api/test_ernie_lite_xpu.cc b/lite/tests/api/test_ernie_lite_xpu.cc
index 0b614fec96cbcc5d9c96653681d0e8794cf4ab8f..b1db9f353657f3f09bcad25db4e777b05f15e0f7 100644
--- a/lite/tests/api/test_ernie_lite_xpu.cc
+++ b/lite/tests/api/test_ernie_lite_xpu.cc
@@ -93,7 +93,7 @@ TEST(Ernie, test_ernie_lite_xpu) {
   for (size_t i = 0; i < results.size(); ++i) {
     for (size_t j = 0; j < results[i].size(); ++j) {
       EXPECT_NEAR(
-          out->data<float>()[j + (out->shape()[1] * i)], results[i][j], 1e-5);
+          out->data<float>()[j + (out->shape()[1] * i)], results[i][j], 2e-5);
     }
   }
 }
diff --git a/lite/tests/api/test_fpr_lite_xpu.cc b/lite/tests/api/test_fpr_lite_xpu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..026c25690fe2a673be0a5a97b163d7bbe5fdb4f6
--- /dev/null
+++ b/lite/tests/api/test_fpr_lite_xpu.cc
@@ -0,0 +1,69 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include <vector>
+#include "lite/api/lite_api_test_helper.h"
+#include "lite/api/paddle_api.h"
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/api/paddle_use_passes.h"
+#include "lite/api/test_helper.h"
+#include "lite/utils/cp_logging.h"
+
+namespace paddle {
+namespace lite {
+
+TEST(ResnetCbam, test_resnet_cbam_lite_xpu) {
+  lite_api::CxxConfig config;
+  // config.set_model_dir(FLAGS_model_dir);
+  config.set_model_file(FLAGS_model_dir + "/__model__");
+  config.set_param_file(FLAGS_model_dir + "/__params__");
+  config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
+                           lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
+                           lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
+  config.set_xpu_workspace_l3_size_per_thread();
+  auto predictor = lite_api::CreatePaddlePredictor(config);
+
+  auto input_tensor = predictor->GetInput(0);
+  std::vector<int64_t> input_shape{1, 3, 224, 224};
+  input_tensor->Resize(input_shape);
+  auto* data = input_tensor->mutable_data<float>();
+  int input_num = 1;
+  for (size_t i = 0; i < input_shape.size(); ++i) {
+    input_num *= input_shape[i];
+  }
+  for (int i = 0; i < input_num; i++) {
+    data[i] = 1;
+  }
+
+  for (int i = 0; i < FLAGS_warmup; ++i) {
+    predictor->Run();
+  }
+
+  auto start = GetCurrentUS();
+  for (int i = 0; i < FLAGS_repeats; ++i) {
+    predictor->Run();
+  }
+
+  LOG(INFO) << "================== Speed Report ===================";
+  LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
+            << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
+            << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
+            << " ms in average.";
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/tests/api/test_mmdnn_lite_xpu.cc b/lite/tests/api/test_mmdnn_lite_xpu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a2a98821e70cb462b23887f851cfc4bce6b463ca
--- /dev/null
+++ b/lite/tests/api/test_mmdnn_lite_xpu.cc
@@ -0,0 +1,311 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include +#include +#include +#include "lite/api/lite_api_test_helper.h" +#include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/api/paddle_use_passes.h" +#include "lite/api/test_helper.h" +#include "lite/utils/cp_logging.h" +#include "lite/utils/string.h" + +DEFINE_bool(perf, false, "perf?"); +DEFINE_string(perf_input, "perf_input", "perf_input"); + +namespace paddle { +namespace lite { + +std::vector input0; +std::vector input0_lod = {0}; +std::vector input1; +std::vector input1_lod = {0}; +std::vector input2; +std::vector input2_lod = {0}; +std::vector input3; +std::vector input3_lod = {0}; +std::vector input4; +std::vector input4_lod = {0}; +std::vector input5; +std::vector input5_lod = {0}; + +void ParseInput() { + std::string raw_input = + "0 1;145 10251 839 3719 428 52;1050488 1050488 911898 3719 760166 " + "760166;3719 428 52 18 1102 10327 252 20 153 2897 1146 70 156 6 145 " + "10251 839 5 1779 1729 1779 1729 18 2707 6 2707 20 4742 4937 432 6 " + "3869;3719 760166 760166 18 1035176 1035176 764393 764393 1259006 767614 " + "767614 1020808 769579 793958 793958 1050488 911898 751332 751332 750336 " + "750799 750336 751575 751575 751544 751735 751397 751365 751512 751512 " + "753011 751562;3719 428 52 18 1102 10327 252 20 153 2897 1146 70 156 6 " + "145 10251 839 2 1211 3 3719 720 1540 145 10251 839 9405 4315 5998 4 2 " + "600 373 41 3719 428 52 44 10251 4302 1319 7 12 2 768 6 918 6 841 870 8 " + "843 8 271;3719 760166 760166 18 1035176 1035176 764393 764393 1259006 " + "767614 767614 1020808 769579 793958 793958 1050488 911898 2 773899 " + "773899 3719 1118420 1118420 1050488 1050488 911898 9405 4315 5998 4 2 " + "785435 785435 41 3719 760166 760166 44 10251 4302 1319 750118 750118 2 " + "750465 750465 750274 750398 750233 751252 751252 753447 752830 753112;\n" + "0 0;145 10251 839 3719 428 52;1050488 1050488 911898 3719 760166 " + "760166;2109 2467 1805 227 3719 428 52 18 1102 10327 252 20 6 242 78 6 " + "532 78;2109 2467 1805 1245431 1245431 760166 760166 18 1035176 1035176 " + "764393 764393 752116 242 750370 750370 752081 751247;2109 2467 1805 227 " + "3719 428 52 18 1102 10327 252 20 2 145 242 1050 252 3582 2212;2109 2467 " + "1805 1245431 1245431 760166 760166 18 1035176 1035176 764393 764393 2 " + "871717 871717 757921 757921 3582 2212;\n" + "0 0;145 10251 839 3719 428 52;1050488 1050488 911898 3719 760166 " + "760166;145 10251 839 76 31 1337 823 7506 567 65 170 8 21293 3719 5 43 " + "394 743 42;1050488 1050488 911898 750016 750016 1337 823 7506 762617 " + "762617 866652 8 21293 3719 5 43 914758 914758 757202;145 10251 839 76 " + "31 1337 823 7506 567 65 170 8 21293 3719 2 17580 30 523324 3 10251 4104 " + "281 3 8511 3719 2217 3 13 226 3083 4 11251 1606 357 9 2 145 10251 839 " + "76 31 1337 823 7506 567 65 170 2 7506 2445 8 145 10251 839 528 839 " + "19670 6538;1050488 1050488 911898 750016 750016 1337 823 7506 762617 " + "762617 866652 8 21293 3719 2 816626 816626 523324 3 1181698 1181698 " + "751656 780821 1063148 3719 2217 3 752498 752498 831323 753602 11251 " + "1606 357 9 2 1050488 1050488 911898 750016 750016 1337 823 7506 762617 " + "762617 866652 2 7506 753045 753045 756756 1050488 911898 528 839 19670 " + "6538;\n" + "0 0;145 10251 839 3719 428 52;1050488 1050488 911898 3719 760166 " + "760166;145 10251 839 99 4 1102 10327 2196 41 3719 428 52 44 99 4 2899 " + "229 10 10 10;1050488 1050488 911898 807966 750273 1035176 1035176 " + "1237875 41 3719 760166 760166 753645 753645 750273 2899 229 
750001 " + "750001 750001;145 10251 839 99 4 1102 10327 2196 41 3719 428 52 44 99 4 " + "2899 229 10 10 10 2 1177 8 145 10251 839 99 4 1102 10327 2196 41 3719 " + "428 52 44 99 4 2 101 8 1922 17 2184 2 1154 1922 72 1198 1266 " + "4516;1050488 1050488 911898 807966 750273 1035176 1035176 1237875 41 " + "3719 760166 760166 753645 753645 750273 2899 229 750001 750001 750001 2 " + "750257 750257 756756 1050488 911898 807966 750273 1035176 1035176 " + "1237875 41 3719 760166 760166 753645 753645 750273 2 764513 764513 " + "851213 851213 854628 2 753018 753018 754317 753328 754085 754070;\n" + "0 0;145 10251 839 3719 428 52;1050488 1050488 911898 3719 760166 " + "760166;73 5347 112 8 145 10251 839 262 169 22729 3719 6 743 6 339 1156 " + "78 136 399 693 128 571;776150 776150 112 756756 756756 1050488 911898 " + "791355 791355 22729 3719 6 758277 758277 750137 750234 750241 750178 " + "750055 750216 750212 750049;73 5347 112 8 145 10251 839 262 169 22729 " + "3719 2 588 415 549 415 115 23;776150 776150 112 756756 756756 1050488 " + "911898 791355 791355 22729 3719 2 750221 750221 750262 750277 750277 " + "750261;"; + auto raw_lines = Split(raw_input, "\n"); + for (auto& raw_line : raw_lines) { + auto inputx = Split(raw_line, ";"); + for (size_t i = 1; i < inputx.size(); ++i) { + auto tokens = Split(inputx[i], " "); + static std::vector* const input_array[] = { + &input0, &input0, &input1, &input2, &input3, &input4, &input5}; + static std::vector* const lod_array[] = {&input0_lod, + &input0_lod, + &input1_lod, + &input2_lod, + &input3_lod, + &input4_lod, + &input5_lod}; + for (auto token : tokens) { + input_array[i]->push_back((int64_t)atoi(token.c_str())); + } + lod_array[i]->push_back((uint64_t)tokens.size() + + (*lod_array[i])[lod_array[i]->size() - 1]); + } + } + return; +} + +class MmdnnReader { + std::ifstream ifs; + std::vector StringSplit(const std::string& in, + const std::string& delim) { + std::vector ret; + if (in == "") { + return ret; + } + auto begpos = in.find_first_not_of(delim); + while (begpos != std::string::npos) { + auto endpos = in.find_first_of(delim, begpos); + if (endpos == std::string::npos) { + endpos = in.size(); + } + std::string ssubstr = in.substr(begpos, endpos - begpos); + ret.push_back(ssubstr); + begpos = endpos + 1; + if (endpos >= (in.size() - 1)) { + break; + } + } + return ret; + } + + public: + std::vector data[6]; + std::vector lod[6]; + + void Init(std::string file_name) { ifs.open(file_name); } + + int Read(int maxline) { + for (int i = 0; i < 6; i++) { + data[i].clear(); + } + for (int i = 0; i < 6; i++) { + lod[i].clear(); + lod[i].push_back(0); + } + std::string line; + int cnt = 0; + while (cnt < maxline && getline(ifs, line)) { + std::vector split1 = StringSplit(line, ";"); + for (int i = 1; i < 7; i++) { + std::vector split2 = StringSplit(split1[i], " "); + if (split2.size() == 0) { + split2.push_back("1280000"); + } + for (size_t j = 0; j < split2.size(); j++) { + data[i - 1].push_back(std::stoi(split2[j].c_str(), nullptr, 0)); + } + // if (i % 2 == 1) { + // lod[i / 2].push_back(lod[i / 2].back() + split2.size()); + //} + lod[i - 1].push_back(lod[i - 1].back() + split2.size()); + } + cnt++; + } + return cnt; + } +}; + +TEST(MMDNN, test_mmdnn_lite_xpu) { + lite_api::CxxConfig config; + config.set_model_dir(FLAGS_model_dir); + config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}, + lite_api::Place{TARGET(kXPU), PRECISION(kInt64)}, + lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, + lite_api::Place{TARGET(kX86), 
PRECISION(kInt64)}, + lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); + config.set_xpu_workspace_l3_size_per_thread(); + auto predictor = lite_api::CreatePaddlePredictor(config); + + if (FLAGS_perf) { + MmdnnReader reader; + reader.Init(FLAGS_perf_input); + int UB_batch = 40; // upper bound of batch + int iter = 0; + double tsc_sum = 0; + + while (true) { + int batch = reader.Read(UB_batch); + if (batch <= 0) { + break; + } + ++iter; + for (int i = 0; i < 6; ++i) { + auto input_x = predictor->GetInput(i); + input_x->Resize({(int64_t)reader.data[i].size(), 1}); + input_x->SetLoD({reader.lod[i]}); + auto* data_x = input_x->mutable_data(); + memcpy(data_x, + reader.data[i].data(), + reader.data[i].size() * sizeof(int64_t)); + } + + auto start = GetCurrentUS(); + predictor->Run(); + auto end = GetCurrentUS(); + tsc_sum += end - start; + } + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " + << FLAGS_threads << ", warmup: " << FLAGS_warmup + << ", repeats: " << iter << ", spend " << tsc_sum / iter / 1000.0 + << " ms in average."; + + return; + } + + ParseInput(); + + { + std::vector input0_shape{(int64_t)input0.size(), 1}; + auto input_tensor0 = predictor->GetInput(0); + input_tensor0->Resize(input0_shape); + input_tensor0->SetLoD({input0_lod}); + auto* data0 = input_tensor0->mutable_data(); + memcpy(data0, input0.data(), sizeof(int64_t) * input0.size()); + } + { + std::vector input1_shape{(int64_t)input1.size(), 1}; + auto input_tensor1 = predictor->GetInput(1); + input_tensor1->Resize(input1_shape); + input_tensor1->SetLoD({input1_lod}); + auto* data1 = input_tensor1->mutable_data(); + memcpy(data1, input1.data(), sizeof(int64_t) * input1.size()); + } + { + std::vector input2_shape{(int64_t)input2.size(), 1}; + auto input_tensor2 = predictor->GetInput(2); + input_tensor2->Resize(input2_shape); + input_tensor2->SetLoD({input2_lod}); + auto* data2 = input_tensor2->mutable_data(); + memcpy(data2, input2.data(), sizeof(int64_t) * input2.size()); + } + { + std::vector input3_shape{(int64_t)input3.size(), 1}; + auto input_tensor3 = predictor->GetInput(3); + input_tensor3->Resize(input3_shape); + input_tensor3->SetLoD({input3_lod}); + auto* data3 = input_tensor3->mutable_data(); + memcpy(data3, input3.data(), sizeof(int64_t) * input3.size()); + } + { + std::vector input4_shape{(int64_t)input4.size(), 1}; + auto input_tensor4 = predictor->GetInput(4); + input_tensor4->Resize(input4_shape); + input_tensor4->SetLoD({input4_lod}); + auto* data4 = input_tensor4->mutable_data(); + memcpy(data4, input4.data(), sizeof(int64_t) * input4.size()); + } + { + std::vector input5_shape{(int64_t)input5.size(), 1}; + auto input_tensor5 = predictor->GetInput(5); + input_tensor5->Resize(input5_shape); + input_tensor5->SetLoD({input5_lod}); + auto* data5 = input_tensor5->mutable_data(); + memcpy(data5, input5.data(), sizeof(int64_t) * input5.size()); + } + + for (int i = 0; i < FLAGS_warmup; ++i) { + predictor->Run(); + } + + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeats; ++i) { + predictor->Run(); + } + + auto out = predictor->GetOutput(0); + auto out_shape = out->shape(); + auto out_size = std::accumulate( + out_shape.begin(), out_shape.end(), 1, std::multiplies()); + for (int i = 0; i < out_size; ++i) { + LOG(INFO) << "out[" << i << "] = " << out->data()[i]; + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << 
FLAGS_threads + << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats + << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 + << " ms in average."; +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index d29f88f334754720b4681042ac5693723e028ba1..9fa795ad89981c52d00772dcd86d952430782adb 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -63,6 +63,7 @@ if(LITE_BUILD_EXTRA) lite_cc_test(test_kernel_lookup_table_dequant_compute SRCS lookup_table_dequant_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_ctc_align_compute SRCS ctc_align_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_clip_compute SRCS clip_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) # for training kernel if (LITE_WITH_TRAIN) diff --git a/lite/tests/kernels/activation_compute_test.cc b/lite/tests/kernels/activation_compute_test.cc index 5824ba91c2f824dd351f8977aa497b9ce2238ec6..a62c698f83fe10409af0bba8774135d3409358ea 100644 --- a/lite/tests/kernels/activation_compute_test.cc +++ b/lite/tests/kernels/activation_compute_test.cc @@ -300,7 +300,7 @@ TEST(Activation_relu, precision) { abs_error = 1e-2; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; @@ -426,7 +426,7 @@ TEST(Activation_tanh, precision) { abs_error = 1e-2; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; @@ -572,7 +572,7 @@ TEST(Activation_gelu, precision) { LOG(INFO) << "test gelu op"; Place place; float abs_error = 2e-5; -#if defined(LITE_WITH_XPU) +#if defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/activation_grad_compute_test.cc b/lite/tests/kernels/activation_grad_compute_test.cc index 5d5046b01dee6c84f341159b68300197c20695e6..2ad5b80a910f323b34b039eabda0ceb4b49784c5 100644 --- a/lite/tests/kernels/activation_grad_compute_test.cc +++ b/lite/tests/kernels/activation_grad_compute_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/arm/activation_grad_compute.h" +#include "lite/kernels/host/activation_grad_compute.h" #include #include "lite/core/op_registry.h" #include "lite/kernels/arm/activation_compute.h" @@ -20,13 +20,11 @@ namespace paddle { namespace lite { namespace kernels { -namespace arm { using param_t = operators::ActivationParam; using grad_param_t = operators::ActivationGradParam; -using kernel_t = SquareCompute; -using grad_kernel_t = SquareGradCompute; +template class ActivationGradTester { public: explicit ActivationGradTester(DDim dims) : dims_(dims) {} @@ -71,22 +69,28 @@ class ActivationGradTester { void run_backward(grad_param_t* param, grad_kernel_t* kernel, const std::vector& in_vec, + const std::vector& out_vec, const std::vector& out_grad_vec, float* in_grad_vec) { Tensor x; + Tensor out; Tensor x_grad; Tensor out_grad; x.Resize(dims_); + out.Resize(dims_); x_grad.Resize(dims_); out_grad.Resize(dims_); auto* x_data = x.mutable_data(); + auto* out_data = out.mutable_data(); auto* out_grad_data = out_grad.mutable_data(); for (int i = 0; i < dims_.production(); i++) { x_data[i] = in_vec[i]; + out_data[i] = out_vec[i]; out_grad_data[i] = out_grad_vec[i]; } param->X = &x; + param->Out = &out; param->X_grad = &x_grad; param->Out_grad = &out_grad; kernel->SetParam(*param); @@ -102,7 +106,9 @@ class ActivationGradTester { std::vector x(dims_.production()); std::vector out(dims_.production()); for (int i = 0; i < dims_.production(); i++) { - x[i] = 1.0 * static_cast(i % 128) * 0.3f - 1.1; + x[i] = static_cast(i % 3 - 2.0) / 2.0 * 0.333 + + static_cast(i % 19 - 10.0) / 10.0 * 0.333 + + static_cast(i % 39 - 20.0) / 20.0 * 0.333 + 0.001213; } this->run_forward(¶m_, &kernel_, x, out.data()); @@ -120,7 +126,8 @@ class ActivationGradTester { for (int i = 0; i < dims_.production(); i++) { out_grad[i] = 1.0; } - this->run_backward(&grad_param_, &grad_kernel_, x, out_grad, x_grad.data()); + this->run_backward( + &grad_param_, &grad_kernel_, x, out, out_grad, x_grad.data()); for (int i = 0; i < dims_.production(); i++) { EXPECT_NEAR(x_grad[i], (out_delta[i] - out[i]) / delta, max_grad_delta); @@ -137,31 +144,58 @@ class ActivationGradTester { grad_param_t grad_param_; }; -void TestNormalCase(DDim dims) { - std::unique_ptr tester(new ActivationGradTester(dims)); +void TestSquareGrad(DDim dims) { + LOG(INFO) << "Test Square grad"; + std::unique_ptr< + ActivationGradTester> + tester( + new ActivationGradTester( + dims)); tester->prepare_kernel(); float delta = 0.001; float max_grad_delta = 0.005; tester->check_grad(delta, max_grad_delta); } -TEST(activation_grad_arm, compute) { - LOG(INFO) << "Test Square grad"; +void TestReluGrad(DDim dims) { + LOG(INFO) << "Test Relu grad"; + std::unique_ptr> + tester(new ActivationGradTester( + dims)); + tester->prepare_kernel(); + float delta = 0.001; + float max_grad_delta = 0.005; + tester->check_grad(delta, max_grad_delta); +} + +void TestTanhGrad(DDim dims) { + LOG(INFO) << "Test Tanh grad"; + std::unique_ptr> + tester(new ActivationGradTester( + dims)); + tester->prepare_kernel(); + float delta = 0.001; + float max_grad_delta = 0.005; + tester->check_grad(delta, max_grad_delta); +} + +TEST(activation_grad_host, compute) { DeviceInfo::Init(); - for (auto n : {2}) { - for (auto c : {2}) { - for (auto h : {2}) { - for (auto w : {2}) { - TestNormalCase(DDim(std::vector({n, c, h, w}))); + for (auto n : {2, 1}) { + for (auto c : {2, 9}) { + for (auto h : {2, 1}) { + for (auto w : {2, 10}) { + TestSquareGrad(DDim(std::vector({n, c, h, w}))); + 
TestReluGrad(DDim(std::vector({n, c, h, w}))); + TestTanhGrad(DDim(std::vector({n, c, h, w}))); } } } } } -} // namespace arm } // namespace kernels } // namespace lite } // namespace paddle USE_LITE_KERNEL(square, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(square_grad, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(square_grad, kHost, kFloat, kNCHW, def); diff --git a/lite/tests/kernels/batch_norm_compute_test.cc b/lite/tests/kernels/batch_norm_compute_test.cc index ae65e0e3c320ff153a99d2a1656227bad34428d4..9674f95d0b52dbc264ef78748d0c0fba1e4ebc37 100644 --- a/lite/tests/kernels/batch_norm_compute_test.cc +++ b/lite/tests/kernels/batch_norm_compute_test.cc @@ -157,7 +157,7 @@ TEST(BatchNorm, precision) { LOG(INFO) << "test BatchNorm op"; float abs_error = 2e-5; Place place; -#if defined(LITE_WITH_XPU) +#if defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #elif defined(LITE_WITH_NPU) place = TARGET(kNPU); diff --git a/lite/tests/kernels/box_clip_compute_test.cc b/lite/tests/kernels/box_clip_compute_test.cc index 72947fa4b258a894e5a73c5e8fe8cce12ef9a02c..c599e64214d3fb15a52cb14fe48de7a7d75b2868 100644 --- a/lite/tests/kernels/box_clip_compute_test.cc +++ b/lite/tests/kernels/box_clip_compute_test.cc @@ -70,9 +70,7 @@ class BoxClipComputeTester : public arena::TestCase { float sign = i % 3 == 0 ? -1.0f : 1.0f; input_data[i] = sign * static_cast((i * 7) % 20); } - SetCommonTensor(input_, input_dims_, input_data.data()); - auto input_tensor = baseline_scope()->FindMutableTensor(input_); - input_tensor->set_lod(input_lod_); + SetCommonTensor(input_, input_dims_, input_data.data(), input_lod_); std::vector im_info_data{10, 10, 1, 15, 15, 1}; SetCommonTensor(im_info_, im_info_dim_, im_info_data.data()); diff --git a/lite/tests/kernels/cast_compute_test.cc b/lite/tests/kernels/cast_compute_test.cc index 86331bb8a1cce89da76d2ebb87a9d091e34f68c5..34038dfdc797d0e5ee618b575ad532fd64809276 100644 --- a/lite/tests/kernels/cast_compute_test.cc +++ b/lite/tests/kernels/cast_compute_test.cc @@ -135,7 +135,7 @@ TEST(Cast, precision) { float abs_error = 2e-5; #if defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/clip_compute_test.cc b/lite/tests/kernels/clip_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c6149bb753b2a83813d0a129d61d7444456c399 --- /dev/null +++ b/lite/tests/kernels/clip_compute_test.cc @@ -0,0 +1,130 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" + +namespace paddle { +namespace lite { + +class ClipComputeTester : public arena::TestCase { + protected: + // common attributes for this op. 
+ std::string x_ = "x"; + std::string out_ = "out"; + std::string min_tensor_ = "min_tensor"; + std::string max_tensor_ = "max_tensor"; + float min_{}; + float max_{}; + bool use_minmax_tensor_{}; + DDim x_dims_; + + public: + ClipComputeTester(const Place& place, + const std::string& alias, + int n, + int c, + int h, + int w, + float min, + float max, + bool use_minmax_tensor) + : TestCase(place, alias) { + x_dims_ = DDim(std::vector({n, c, h, w})); + min_ = min; + max_ = max; + use_minmax_tensor_ = use_minmax_tensor; + } + + void RunBaseline(Scope* scope) override { + auto* x = scope->FindTensor(x_); + auto* out = scope->NewTensor(out_); + CHECK(out); + out->Resize(x->dims()); + const auto* x_data = x->data(); + auto* out_data = out->mutable_data(); + + for (int i = 0; i < x->numel(); i++) { + if (x_data[i] < min_) + out_data[i] = min_; + else if (x_data[i] > max_) + out_data[i] = max_; + else + out_data[i] = x_data[i]; + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + op_desc->SetType("clip"); + op_desc->SetInput("X", {x_}); + op_desc->SetOutput("Out", {out_}); + if (use_minmax_tensor_) { + op_desc->SetInput("Min", {min_tensor_}); + op_desc->SetInput("Max", {max_tensor_}); + op_desc->SetAttr("min", 0.f); + op_desc->SetAttr("max", 0.f); + } else { + op_desc->SetAttr("min", min_); + op_desc->SetAttr("max", max_); + } + } + + void PrepareData() override { + std::vector x_data(x_dims_.production()); + for (int i = 0; i < x_dims_.production(); i++) { + float sign = i % 3 == 0 ? -1.0f : 1.0f; + x_data[i] = sign * static_cast(i % 128) * 0.013f + 0.001; + } + SetCommonTensor(x_, x_dims_, x_data.data()); + + if (use_minmax_tensor_) { + std::vector min_data = {min_}; + SetCommonTensor( + min_tensor_, DDim(std::vector({1})), min_data.data()); + + std::vector max_data = {max_}; + SetCommonTensor( + max_tensor_, DDim(std::vector({1})), max_data.data()); + } + } +}; + +TEST(Clip, precision) { + LOG(INFO) << "test clip op"; +#ifdef LITE_WITH_ARM + Place place(TARGET(kARM)); + + float min = -1; + float max = 1; + for (int n : {1, 3}) { + for (int c : {3, 5}) { + for (int h : {5, 6}) { + for (int w : {6, 7}) { + for (bool use_minmax_tensor : {true, false}) { + std::unique_ptr tester(new ClipComputeTester( + place, "def", n, c, h, w, min, max, use_minmax_tensor)); + arena::Arena arena(std::move(tester), place, 2e-5); + arena.TestPrecision(); + } + } + } + } + } +#endif +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/dropout_compute_test.cc b/lite/tests/kernels/dropout_compute_test.cc index 025f02ce31505cee684fb9a21c7b26d96e1c3026..c4ecc0cf01e3da7c43294ba1249b5b4f106caa95 100644 --- a/lite/tests/kernels/dropout_compute_test.cc +++ b/lite/tests/kernels/dropout_compute_test.cc @@ -94,7 +94,7 @@ TEST(Dropout, precision) { #if defined(LITE_WITH_NPU) place = TARGET(kNPU); abs_error = 1e-2; // Using fp16 in NPU -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/elementwise_compute_test.cc b/lite/tests/kernels/elementwise_compute_test.cc index 505ab72dc125d5b527845f4695a444c215422f8b..d91c304ef7e76b9ff623ebfe1bb9ad5bb4ace2c9 100644 --- a/lite/tests/kernels/elementwise_compute_test.cc +++ b/lite/tests/kernels/elementwise_compute_test.cc @@ -228,7 +228,7 @@ TEST(Elementwise, precision) { abs_error = 1e-2; // use fp16 in npu #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = 
TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/elementwise_grad_compute_test.cc b/lite/tests/kernels/elementwise_grad_compute_test.cc index 2b5fbbb65d3d7e17bf90afb71f5c8154f0d88488..04e74e49099f13a7e5920b306f8d2e26650a2574 100644 --- a/lite/tests/kernels/elementwise_grad_compute_test.cc +++ b/lite/tests/kernels/elementwise_grad_compute_test.cc @@ -215,18 +215,6 @@ class ElementwiseAddGradTester { fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production()); this->run_forward(¶m_, &kernel_, x, y, out.data()); - for (int i = 0; i < x_dims_.production(); i++) { - LOG(INFO) << "x_" << i << ": " << x[i]; - } - - for (int i = 0; i < y_dims_.production(); i++) { - LOG(INFO) << "y_" << i << ": " << y[i]; - } - - for (int i = 0; i < out_dims_.production(); i++) { - LOG(INFO) << "out_" << i << ": " << out[i]; - } - // backward std::vector out_grad(out_dims_.production()); std::vector x_grad(x_dims_.production()); @@ -242,14 +230,6 @@ class ElementwiseAddGradTester { x_grad.data(), y_grad.data()); - for (int i = 0; i < x_grad.size(); i++) { - LOG(INFO) << "x_grad_" << i << ": " << x_grad[i]; - } - - for (int i = 0; i < y_grad.size(); i++) { - LOG(INFO) << "y_grad_" << i << ": " << y_grad[i]; - } - // get numeric gradient std::vector x_delta(x_dims_.production()); std::vector y_delta(y_dims_.production()); @@ -443,18 +423,6 @@ class ElementwiseSubGradTester { fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production()); this->run_forward(¶m_, &kernel_, x, y, out.data()); - for (int i = 0; i < x_dims_.production(); i++) { - LOG(INFO) << "x_" << i << ": " << x[i]; - } - - for (int i = 0; i < y_dims_.production(); i++) { - LOG(INFO) << "y_" << i << ": " << y[i]; - } - - for (int i = 0; i < out_dims_.production(); i++) { - LOG(INFO) << "out_" << i << ": " << out[i]; - } - // backward std::vector out_grad(out_dims_.production()); std::vector x_grad(x_dims_.production()); @@ -470,14 +438,6 @@ class ElementwiseSubGradTester { x_grad.data(), y_grad.data()); - for (int i = 0; i < x_grad.size(); i++) { - LOG(INFO) << "x_grad_" << i << ": " << x_grad[i]; - } - - for (int i = 0; i < y_grad.size(); i++) { - LOG(INFO) << "y_grad_" << i << ": " << y_grad[i]; - } - // get numeric gradient std::vector x_delta(x_dims_.production()); std::vector y_delta(y_dims_.production()); diff --git a/lite/tests/kernels/gather_compute_test.cc b/lite/tests/kernels/gather_compute_test.cc index 4d0ad1ab47a17c3e8d227b9e0482d7cbe21ab7e2..c023a12b0fb4e3118976d854114c554ca6bf6462 100644 --- a/lite/tests/kernels/gather_compute_test.cc +++ b/lite/tests/kernels/gather_compute_test.cc @@ -98,7 +98,7 @@ TEST(Gather, precision) { abs_error = 1e-2; // use fp16 in npu #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/layer_norm_compute_test.cc b/lite/tests/kernels/layer_norm_compute_test.cc index 5ea01a6cca504db230d62a63ef3a62d4f73470fa..bd4480b6127a318286b3172f53fc8a5bceb8c328 100644 --- a/lite/tests/kernels/layer_norm_compute_test.cc +++ b/lite/tests/kernels/layer_norm_compute_test.cc @@ -147,7 +147,7 @@ TEST(LayerNorm, precision) { LOG(INFO) << "test layer_norm op"; float abs_error = 2e-5; Place place; -#if defined(LITE_WITH_XPU) +#if defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #elif defined(LITE_WITH_NPU) place = TARGET(kNPU); diff --git a/lite/tests/kernels/lookup_table_compute_test.cc b/lite/tests/kernels/lookup_table_compute_test.cc index 
988077c6c319d5bcc8e50d6c8e5544331a86fe45..ae39abf1dbaf206fe0a68dd492a48a2452c8094e 100644 --- a/lite/tests/kernels/lookup_table_compute_test.cc +++ b/lite/tests/kernels/lookup_table_compute_test.cc @@ -116,7 +116,7 @@ TEST(LookupTable, precision) { abs_error = 1e-2; #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; @@ -132,7 +132,8 @@ TEST(LookupTable, precision) { std::vector>{{5, 2, 3, 1}, {2, 3, 1}, {3, 1}}) { for (auto w_dims : std::vector>{{4, 2}, {6, 8}, {12, 15}}) { -#if defined(LITE_WITH_XPU) && defined(LITE_WITH_NPU) +#if (defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)) || \ + defined(LITE_WITH_NPU) for (auto padding_idx : std::vector{-1}) { // Only -1 is supported by XPU or NPU #else diff --git a/lite/tests/kernels/matmul_compute_test.cc b/lite/tests/kernels/matmul_compute_test.cc index 59b0fde8fd18b8a2170b6fdbd42444f09843f077..9799c15622b07a8d126654c79738d29b176c2cf4 100644 --- a/lite/tests/kernels/matmul_compute_test.cc +++ b/lite/tests/kernels/matmul_compute_test.cc @@ -457,7 +457,7 @@ TEST(Matmul2x2, precision) { abs_error = 1e-2; // use fp16 in npu #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; @@ -489,7 +489,7 @@ TEST(Matmul2x2_y_transpose, precision) { abs_error = 1e-2; // use fp16 in npu #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/mul_compute_test.cc b/lite/tests/kernels/mul_compute_test.cc index d070292332b65ed577ec6cefdb220ee691eb99e9..d89b3569358034d72ac8019f2348b49764ca6b0c 100644 --- a/lite/tests/kernels/mul_compute_test.cc +++ b/lite/tests/kernels/mul_compute_test.cc @@ -127,7 +127,7 @@ TEST(Mul, precision) { #if defined(LITE_WITH_NPU) place = TARGET(kNPU); abs_error = 1e-2; // use fp16 in npu -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/multiclass_nms_compute_test.cc b/lite/tests/kernels/multiclass_nms_compute_test.cc index a1190197bffdf505fec77c6b22b7871316a2d125..dd16730ef551ddc11825936d99733f33015fd2c0 100644 --- a/lite/tests/kernels/multiclass_nms_compute_test.cc +++ b/lite/tests/kernels/multiclass_nms_compute_test.cc @@ -478,7 +478,7 @@ TEST(multiclass_nms, precision) { Place place; #if defined(LITE_WITH_ARM) place = TARGET(kHost); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/pool_compute_test.cc b/lite/tests/kernels/pool_compute_test.cc index 04894188b0bf1557000479ae18b0369997909f89..fc4d004e552e76792470f46a54afd6aa13bbc330 100644 --- a/lite/tests/kernels/pool_compute_test.cc +++ b/lite/tests/kernels/pool_compute_test.cc @@ -381,7 +381,7 @@ TEST(Pool, precision) { #if defined(LITE_WITH_NPU) place = TARGET(kNPU); abs_error = 1e-2; // Using fp16 in NPU -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/reshape_compute_test.cc b/lite/tests/kernels/reshape_compute_test.cc index 3a866b6cf22cf67c3f5a60e5a4aa8603cee6a1a3..f3fcc0bad5418624c86897bafc52dbf3a7ec0d8e 100644 --- a/lite/tests/kernels/reshape_compute_test.cc +++ 
b/lite/tests/kernels/reshape_compute_test.cc @@ -206,7 +206,7 @@ TEST(Reshape, precision) { abs_error = 1e-2; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kHost); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/roi_align_compute_test.cc b/lite/tests/kernels/roi_align_compute_test.cc index 8eb84dd0337d0635dc360e2e04aa1ad047e912c0..2bbfdcd81da951bd769ab03094a0df48f3a6e13b 100644 --- a/lite/tests/kernels/roi_align_compute_test.cc +++ b/lite/tests/kernels/roi_align_compute_test.cc @@ -106,13 +106,11 @@ class RoiAlignComputeTester : public arena::TestCase { } LOG(INFO) << "Read rois data. " << datas[0] << " " << datas.back(); reader.close(); - SetCommonTensor(rois_, dims, datas.data()); - auto rois_tensor = baseline_scope()->FindMutableTensor(rois_); std::vector lod0({0, 152, 304}); LoD lod; lod.push_back(lod0); - rois_tensor->set_lod(lod); + SetCommonTensor(rois_, dims, datas.data(), lod); } }; diff --git a/lite/tests/kernels/scale_compute_test.cc b/lite/tests/kernels/scale_compute_test.cc index efd0497002ee402426a7198bf47ec60c7f41d2fd..9d1f4403dc1a82e58d8c764933ba01c0e0b5c082 100644 --- a/lite/tests/kernels/scale_compute_test.cc +++ b/lite/tests/kernels/scale_compute_test.cc @@ -165,7 +165,7 @@ TEST(Scale, precision) { abs_error = 4e-3; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); abs_error = 3e-4; // Some operations use fp16 in XPU #elif defined(LITE_WITH_X86) diff --git a/lite/tests/kernels/sequence_conv_compute_test.cc b/lite/tests/kernels/sequence_conv_compute_test.cc index 84887b2573516d0c82cbb8c9b4cf9336f30ee41d..68afaad04f8e84995e811f81f99a2d4109c845a5 100644 --- a/lite/tests/kernels/sequence_conv_compute_test.cc +++ b/lite/tests/kernels/sequence_conv_compute_test.cc @@ -85,21 +85,31 @@ class SequenceConvComputeTester : public arena::TestCase { auto output_dims = output->dims(); auto output_data = output->mutable_data(); std::vector> res; - if (contextStart_ == -2) { + + if (contextStart_ == -2 && lod_.size() == 1 && + lod_[0] == std::vector({0, 4})) { res = {{-0.08867277f, -0.17257819f, -0.2564836f}, {0.194508f, 0.05720823f, -0.08009153f}, {0.73512584f, 0.5749428f, 0.41475973f}, {0.5635012f, 0.49485126f, 0.42620137f}}; - } else if (contextStart_ == -1) { + } else if (contextStart_ == -1 && lod_.size() == 1 && + lod_[0] == std::vector({0, 4})) { res = {{0.194508f, 0.05720823f, -0.08009153f}, {0.73512584f, 0.5749428f, 0.41475973f}, {0.5635012f, 0.49485126f, 0.42620137f}, {0.2517162f, 0.23646072f, 0.22120519f}}; - } else if (contextStart_ == 0) { + } else if (contextStart_ == 0 && lod_.size() == 1 && + lod_[0] == std::vector({0, 4})) { res = {{0.73512584f, 0.5749428f, 0.41475973f}, {0.5635012f, 0.49485126f, 0.42620137f}, {0.2517162f, 0.23646072f, 0.22120519f}, {0.02574372f, 0.03337148f, 0.04099924f}}; + } else if (contextStart_ == -1 && lod_.size() == 1 && + lod_[0] == std::vector({0, 2, 4})) { + res = {{0.194508, 0.05720823, -0.08009153}, + {0.7093821, 0.57208234, 0.43478262}, + {0.19450802, 0.17925248, 0.16399695}, + {0.2517162, 0.23646072, 0.22120519}}; } else { fprintf(stderr, "not supported contextStart_\n"); exit(-1); @@ -136,12 +146,25 @@ void TestNormalCase(Place place, float abs_error = 2e-5) { } } +void TestBatchCase(Place place, float abs_error = 2e-5) { + std::vector> lod{{0, 2, 4}}; + std::vector dims{4, 5}; + 
std::vector candidate_pad_idx{-1}; + for (int pad_idx : candidate_pad_idx) { + std::unique_ptr tester(new SequenceConvComputeTester( + place, "def", lod, DDim(dims), pad_idx, 1, 3, 3)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); + } +} + TEST(sequence_conv, precision) { #ifdef LITE_WITH_ARM float abs_error = 2e-5; Place place(TARGET(kARM)); TestNormalCase(place, abs_error); + TestBatchCase(place, abs_error); #endif } diff --git a/lite/tests/kernels/slice_compute_test.cc b/lite/tests/kernels/slice_compute_test.cc index fc96b39f010eab5eedd431cb81e881b7aadb11a2..b566bfa3e86cf6067f9914b5fc3932458a6ee186 100644 --- a/lite/tests/kernels/slice_compute_test.cc +++ b/lite/tests/kernels/slice_compute_test.cc @@ -202,20 +202,15 @@ class SliceComputeTester : public arena::TestCase { DDim({static_cast(ends_.size())}), ends_.data()); } else if (use_tensor_list_) { - Scope& scope_ = this->scope(); for (int i = 0; i < starts_.size(); ++i) { - auto* tensor = scope_.NewTensor("starts_tensor_list_" + - paddle::lite::to_string(i)); - tensor->Resize(DDim({1})); - auto* d = tensor->mutable_data(); - d[0] = starts_[i]; + SetCommonTensor("starts_tensor_list_" + paddle::lite::to_string(i), + DDim({1}), + &starts_[i]); } for (int i = 0; i < ends_.size(); ++i) { - auto* tensor = - scope_.NewTensor("ends_tensor_list_" + paddle::lite::to_string(i)); - tensor->Resize(DDim({1})); - auto* d = tensor->mutable_data(); - d[0] = ends_[i]; + SetCommonTensor("ends_tensor_list_" + paddle::lite::to_string(i), + DDim({1}), + &ends_[i]); } } } @@ -273,7 +268,7 @@ TEST(Slice, precision) { test_slice(place); test_slice_tensor(place); test_slice_tensor_list(place); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) Place place(TARGET(kXPU)); test_slice(place); #endif diff --git a/lite/tests/kernels/softmax_compute_test.cc b/lite/tests/kernels/softmax_compute_test.cc index a91f6534ffa1f8022e2005cc83255d306adf77c1..87a94aba184a055081446b4df830b72146834ed2 100644 --- a/lite/tests/kernels/softmax_compute_test.cc +++ b/lite/tests/kernels/softmax_compute_test.cc @@ -111,8 +111,12 @@ TEST(Softmax, precision) { for (auto x_dims : std::vector>{{1, 2, 3, 4}, {2, 3, 4}, {3, 4}}) { - for (auto axis : {-1, 0, 1, 2, 3}) { - if (axis >= x_dims.size()) continue; + int ndims = x_dims.size(); + for (int axis = -1; axis < ndims; axis++) { +#if defined(LITE_WITH_XPU) + if (axis != -1 && axis != ndims - 1) + continue; // -1 and dims.size() - 1 are only supported by XPU +#endif std::unique_ptr tester( new SoftmaxComputeTest(place, "def", DDim(x_dims), axis)); arena::Arena arena(std::move(tester), place, abs_error); diff --git a/lite/tests/kernels/stack_compute_test.cc b/lite/tests/kernels/stack_compute_test.cc index 10b289e41972eb6a9f332f0376393fdfaae94abe..72529cac5165badd50c086a75e882417725adb96 100644 --- a/lite/tests/kernels/stack_compute_test.cc +++ b/lite/tests/kernels/stack_compute_test.cc @@ -106,7 +106,7 @@ TEST(Stack, precision) { Place place; #ifdef LITE_WITH_ARM place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/transpose_compute_test.cc b/lite/tests/kernels/transpose_compute_test.cc index 0ec010e47fe22f0bd60f0c275696f726b6f01a68..933e9f8ec5fc7b1d9b510c71f57fda309a5477dc 100644 --- a/lite/tests/kernels/transpose_compute_test.cc +++ b/lite/tests/kernels/transpose_compute_test.cc @@ -164,7 +164,7 @@ TEST(Transpose, precision) { LOG(INFO) << "test 
Transpose op"; float abs_error = 2e-5; Place place; -#ifdef LITE_WITH_XPU +#if defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #elif defined(LITE_WITH_NPU) place = TARGET(kNPU); diff --git a/lite/tests/kernels/yolo_box_compute_test.cc b/lite/tests/kernels/yolo_box_compute_test.cc index c41c89608fd7496c5b01b1a813581f7f461ff0ee..b88f25e1e0ddb85683297c19a841a5d47b2bbccf 100644 --- a/lite/tests/kernels/yolo_box_compute_test.cc +++ b/lite/tests/kernels/yolo_box_compute_test.cc @@ -247,7 +247,7 @@ TEST(YoloBox, precision) { Place place; #if defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/math/conv_compute_test.cc b/lite/tests/math/conv_compute_test.cc index 8265f9db2f85e54dd91314ac5dc7932e7f7e842a..9ad98ce6f4566898b3821e6bf540b331a84b97bb 100644 --- a/lite/tests/math/conv_compute_test.cc +++ b/lite/tests/math/conv_compute_test.cc @@ -236,19 +236,19 @@ void test_conv_fp32(const std::vector& input_dims, double gops = 2.0 * dim_out.production() * dim_in[1] * weight_dim[2] * weight_dim[3] / param.groups; - LOG(INFO) << "conv fp32: input shape: " << dim_in << ", output shape" - << dim_out << ",running time, avg: " << t0.LapTimes().Avg() - << ", min time: " << t0.LapTimes().Min() - << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); + VLOG(4) << "conv fp32: input shape: " << dim_in << ", output shape" + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() + << ", total GOPS: " << 1e-9 * gops + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); if (FLAGS_check_result) { double max_ratio = 0; double max_diff = 0; tensor_cmp_host(tout_basic, *param.output, max_ratio, max_diff); - LOG(INFO) << "compare result, max diff: " << max_diff - << ", max ratio: " << max_ratio; + VLOG(4) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; if (std::abs(max_ratio) > 1e-3f) { if (max_diff > 5e-4f) { LOG(WARNING) << "basic result"; @@ -274,15 +274,15 @@ void test_conv_fp32(const std::vector& input_dims, } } } - LOG(INFO) << "test fp32 conv: input: " << dim_in - << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] << ", " << pads[2] - << ", " << pads[3] << ", stride: " << strides[0] << ", " - << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] - << ", group: " << group - << ", bias: " << (flag_bias ? "true" : "false") - << ", act: " << flag_act << ", threads: " << th - << ", power_mode: " << cls << " successed!!\n"; + VLOG(4) << "test fp32 conv: input: " << dim_in + << ", output: " << dim_out << ", weight dim: " << weight_dim + << ", pad: " << pads[0] << ", " << pads[1] << ", " << pads[2] + << ", " << pads[3] << ", stride: " << strides[0] << ", " + << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] + << ", group: " << group + << ", bias: " << (flag_bias ? 
"true" : "false") + << ", act: " << flag_act << ", threads: " << th + << ", power_mode: " << cls << " successed!!\n"; } } } diff --git a/lite/tools/build_android.sh b/lite/tools/build_android.sh index aba5fb706cb62e5bc9b50127f16d07e0db55d595..5713c4e21bb97d12bb840c99d1adbc7f2d781157 100755 --- a/lite/tools/build_android.sh +++ b/lite/tools/build_android.sh @@ -1,5 +1,5 @@ #!/bin/bash -set -x +set +x ##################################################################################################### # 1. global variables, you can change them according to your requirements ##################################################################################################### @@ -269,6 +269,7 @@ function main { if [ -z "$1" ]; then # compiling result contains light_api lib only, recommanded. make_tiny_publish_so $ARCH $TOOLCHAIN $ANDROID_STL + exit 0 fi # Parse command line. @@ -358,6 +359,7 @@ function main { done # compiling result contains light_api lib only, recommanded. make_tiny_publish_so + exit 0 } main $@ diff --git a/lite/tools/build_bm.sh b/lite/tools/build_bm.sh index 964da15b0b6fcf888812271b0a2c944d9efa63b8..055f6a35c3ab145e9dfe4bc5d46172a2119ffb25 100755 --- a/lite/tools/build_bm.sh +++ b/lite/tools/build_bm.sh @@ -43,7 +43,7 @@ function prepare_thirdparty { # clone bmlibs if [ ! -d ${workspace}/third-party/bmlibs ]; then git clone https://github.com/AnBaolei1984/bmlibs.git ${workspace}/third-party/bmlibs - fi + fi } # for code gen, a source file is generated after a test, but is dependended by some targets in cmake. @@ -70,6 +70,13 @@ function build_bm { mkdir -p $build_dir cd $build_dir + if [ $TARGET_NAME == "BM1684" ]; then + BM_SDK_ROOT="$workspace/third-party/bmlibs/bm_sc5_libs" + else + BM_SDK_ROOT="$workspace/third-party/bmlibs/bm_sc3_libs" + fi + echo $BM_SDK_ROOT + prepare_workspace cmake .. \ ${CMAKE_COMMON_OPTIONS} \ @@ -95,17 +102,7 @@ function main { case $i in --target_name=*) TARGET_NAME="${i#*=}" - shift - ;; - #--bm_sdk_root=*) - # BM_SDK_ROOT="${i#*=}" - # shift - # ;; - bm) build_bm - shift - ;; - *) # unknown option print_usage exit 1 diff --git a/lite/tools/build_ios.sh b/lite/tools/build_ios.sh index 2c7eeb466f3d82cf491b6a631d79918fa4fd4cd2..3d4337aa8ecc20fd078b8906a950408927ea56c8 100755 --- a/lite/tools/build_ios.sh +++ b/lite/tools/build_ios.sh @@ -152,6 +152,7 @@ function main { esac done make_ios $ARCH + exit 0 } main $@ diff --git a/lite/tools/build_mlu.sh b/lite/tools/build_mlu.sh index 01d71aaf213abb99633112664af580b897ce7454..e0fb2ab11b110cf5a29151ea7c8e544a4074c8c5 100755 --- a/lite/tools/build_mlu.sh +++ b/lite/tools/build_mlu.sh @@ -4,7 +4,7 @@ set -ex # global variables with default value NEUWARE_HOME="${NEUWARE_HOME}" TARGET_NAME="all" # default target -BUILD_EXTRA=OFF # ON(with sequence ops)/OFF +BUILD_EXTRA=ON # ON(with sequence ops)/OFF WITH_TESTING=ON # ON/OFF function print_usage { @@ -28,16 +28,13 @@ readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/t readonly workspace=$(pwd) function prepare_thirdparty { - if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then + if [ ! -d $workspace/third-party ]; then rm -rf $workspace/third-party - - if [ ! -f $workspace/third-party-05b862.tar.gz ]; then - wget $THIRDPARTY_TAR - fi - tar xzf third-party-05b862.tar.gz - else - git submodule update --init --recursive fi + if [ ! 
-f $workspace/third-party-05b862.tar.gz ]; then + wget $THIRDPARTY_TAR + fi + tar xvf third-party-05b862.tar.gz } # for code gen, a source file is generated after a test, but is dependended by some targets in cmake. diff --git a/lite/tools/check_api_approvals.sh b/lite/tools/check_api_approvals.sh old mode 100644 new mode 100755 index 6100558d68abb2b4c82c1f367078e519972546ce..b2a4659c964121b0a95961195340c296710db2de --- a/lite/tools/check_api_approvals.sh +++ b/lite/tools/check_api_approvals.sh @@ -5,13 +5,14 @@ if [ -z ${BRANCH} ]; then fi LITE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../.." && pwd )" - approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle-Lite/pulls/${GIT_PR_ID}/reviews?per_page=10000` -git_files=`git diff --numstat upstream/$BRANCH| wc -l` -git_count=`git diff --numstat upstream/$BRANCH| awk '{sum+=$1}END{print sum}'` failed_num=0 echo_list=() +# approval list +Superjomn=328693 +DannyIsFunny=45189361 + function add_failed(){ failed_num=`expr $failed_num + 1` echo_list="${echo_list[@]}$1" @@ -24,20 +25,105 @@ function check_approval(){ add_failed "${failed_num}. ${echo_line}" fi } +#################################################################################################### +# Check 1: You must have Superjomn's (Yunchunwei) approval for changing +# 20+ files or adding more than 1000+ lines of content +#################################################################################################### +function CheckModifiedFileNums() { + git_files=`git diff --numstat upstream/$BRANCH| wc -l` + git_count=`git diff --numstat upstream/$BRANCH| awk '{sum+=$1}END{print sum}'` + + if [[ $git_files -gt 19 || $git_count -gt 999 ]];then + echo_line="You must have Superjomn's (Yunchunwei) approval for changing 20+ files or adding more than 1000+ lines of content.\n" + check_approval 1 $Superjomn + fi + if [ -n "${echo_list}" ];then + echo "****************" + echo -e "${echo_list[@]}" + echo "There are ${failed_num} approved errors." + echo "****************" + fi + + if [ -n "${echo_list}" ]; then + exit 1 + fi +} +#################################################################################################### +# Check 2: You must have Superjomn's (Yunchunwei) approval for increasing +# size of dynamic lib for 10+ kb +#################################################################################################### +function CheckLibSizeDiff() { + # step1: record lib size of current branch + lite/tools/build_android.sh --arch=armv8 --toolchain=gcc --android_stl=c++_static --with_log=OFF + current_size=`stat -c%s build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/cxx/lib/libpaddle_light_api_shared.so` -if [[ $git_files -gt 19 || $git_count -gt 999 ]];then - echo_line="You must have Superjomn (Yunchunwei) approval for change 20+ files or add than 1000+ lines of content.\n" - check_approval 1 328693 -fi + # step2: record lib size of current develop branch + git checkout develop + git clean -f . && git checkout . + git fetch upstream && git merge upstream/develop -if [ -n "${echo_list}" ];then - echo "****************" - echo -e "${echo_list[@]}" - echo "There are ${failed_num} approved errors." 
- echo "****************" -fi + lite/tools/build_android.sh --arch=armv8 --toolchain=gcc --android_stl=c++_static --with_log=OFF + develop_size=`stat -c%s build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/cxx/lib/libpaddle_light_api_shared.so` + + # step3: if diff_size > 10485, special approval is needed + diff_size=$[$current_size - $develop_size] + if [ $diff_size -gt 10485 ]; then + echo_line="Your PR has increased basic inference lib for $diff_size Byte, exceeding maximum requirement of 10485 Byte (0.01M). You need Superjomn's (Yunchunwei) approval or you can contact DannyIsFunny(HuZhiqiang).\n" + echo "****************" + echo -e "${echo_line[@]}" + echo "There is an approved errors." + echo "****************" + exit 1 + fi +# Todo: Code below should be applied later. +# if [ $diff_size -gt 10485 ]; then +# echo_line="Your PR has increased basic inference lib for $diff_size Byte, exceeding maximum requirement of 10485 Byte (0.01M). You need Superjomn's (Yunchunwei) approval or you can contact DannyIsFunny(HuZhiqiang).\n" +# check_approval 1 $Superjomn +# fi +# +# if [ -n "${echo_list}" ];then +# echo "****************" +# echo -e "${echo_list[@]}" +# echo "There are ${failed_num} approved errors." +# echo "****************" +# fi +# +# if [ -n "${echo_list}" ]; then +# exit 1 +# fi +} + +#################################################################################################### +# Main functions +#################################################################################################### +function main { + if [ -z "$1" ]; then + # at least on argument is needed + echo "Error: at least on argument is needed!" + exit 1 + fi + + # Parse command line. + for i in "$@"; do + case $i in + check_modified_file_nums) + # modified files num can not exceed 20 + + CheckModifiedFileNums + exit 0 + ;; + check_lib_size_diff) + # size diff can not exceed 10K + + CheckLibSizeDiff + exit 0 + ;; + *) + # unknown option + echo "Error: unsupported input argument!" + exit 1 + ;; + esac + done +} -if [ -n "${echo_list}" ]; then - exit 1 -fi +main $@ diff --git a/lite/tools/ci_build.sh b/lite/tools/ci_build.sh index 29ed9100f932b3215e45fc2352b5f0d73b7349b1..680c865c2c8999a29ff2b351dadfc797506c87f6 100755 --- a/lite/tools/ci_build.sh +++ b/lite/tools/ci_build.sh @@ -279,7 +279,7 @@ function test_server { } function assert_api_spec_approvals() { - /bin/bash ${LITE_ROOT}/lite/tools/check_api_approvals.sh + /bin/bash ${LITE_ROOT}/lite/tools/check_api_approvals.sh check_modified_file_nums if [ "$?" != 0 ];then exit 1 fi @@ -353,7 +353,7 @@ function cmake_xpu { -DWITH_MKL=ON \ -DLITE_BUILD_EXTRA=ON \ -DLITE_WITH_XPU=ON \ - -DXPU_SDK_ROOT="$(pwd)/../../XPU_SDK" + -DXPU_SDK_ROOT="/opt/output" } function build_xpu { @@ -564,8 +564,18 @@ function test_arm_model { function test_model_optimize_tool_compile { cd $workspace cd build + # Compile opt tool cmake .. -DWITH_LITE=ON -DLITE_ON_MODEL_OPTIMIZE_TOOL=ON -DWITH_TESTING=OFF -DLITE_BUILD_EXTRA=ON make opt -j$NUM_CORES_FOR_COMPILE + # Check whether opt can transform quantized mobilenetv1 successfully. + cd lite/api && chmod +x ./opt + wget --no-check-certificate https://paddlelite-data.bj.bcebos.com/doc_models/MobileNetV1_quant.tar.gz + tar zxf MobileNetV1_quant.tar.gz + ./opt --model_dir=./MobileNetV1_quant --valid_targets=arm --optimize_out=quant_mobilenetv1 + if [ ! -f quant_mobilenetv1.nb ]; then + echo -e "Error! Resulted opt can not tramsform MobileNetV1_quant successfully!" 
+ exit 1 + fi } function _test_paddle_code_generator { diff --git a/lite/utils/all.h b/lite/utils/all.h index a0d323aa24b36dac7858f484eb1cf1d5a7bcba50..8586188b99971d04271d14ac2d3b8043b0ea4414 100644 --- a/lite/utils/all.h +++ b/lite/utils/all.h @@ -14,10 +14,16 @@ #pragma once +#include +#include +#include +#include +#include +#include + #include "lite/utils/any.h" #include "lite/utils/check.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/factory.h" #include "lite/utils/hash.h" #include "lite/utils/io.h" #include "lite/utils/macros.h" diff --git a/lite/utils/env.h b/lite/utils/env.h index 3048c84b42f6f658eaf0c8ee0d08456f53162c37..f3bb8b58e1b63ed2c0ed05792020d11ea307690c 100644 --- a/lite/utils/env.h +++ b/lite/utils/env.h @@ -22,6 +22,8 @@ #define SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE \ "SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE" +#define SUBGRAPH_DISABLE_ONLINE_MODE "SUBGRAPH_DISABLE_ONLINE_MODE" + namespace paddle { namespace lite { diff --git a/lite/utils/factory.h b/lite/utils/factory.h deleted file mode 100644 index d286ceb42ce32dba68bc68cabab2a600ad3d7789..0000000000000000000000000000000000000000 --- a/lite/utils/factory.h +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include "lite/utils/all.h" -#include "lite/utils/cp_logging.h" -#include "lite/utils/replace_stl/stream.h" - -namespace paddle { -namespace lite { - -/* - * Factor for any Type creator. - * - * Usage: - * - * struct SomeType; - * // Register a creator. - * Factory::Global().Register("some_key", [] -> - * std::unique_ptr { ... }); - * // Retrive a creator. - * auto some_type_instance = Factory::Global().Create("some_key"); - */ -template -class Factory { - public: - using item_t = ItemType; - using self_t = Factory; - using item_ptr_t = ItemTypePtr; - using creator_t = std::function; - - static Factory& Global() { - static Factory* x = new self_t; - return *x; - } - - void Register(const std::string& op_type, creator_t&& creator) { - creators_[op_type].emplace_back(std::move(creator)); - } - - item_ptr_t Create(const std::string& op_type) const { - auto res = Creates(op_type); - if (res.empty()) return nullptr; - CHECK_EQ(res.size(), 1UL) << "Get multiple Op for type " << op_type; - return std::move(res.front()); - } - - std::list Creates(const std::string& op_type) const { - std::list res; - auto it = creators_.find(op_type); - if (it == creators_.end()) return res; - for (auto& c : it->second) { - res.emplace_back(c()); - } - return res; - } - - std::string DebugString() const { - STL::stringstream ss; - for (const auto& item : creators_) { - ss << " - " << item.first << "\n"; - } - return ss.str(); - } - - protected: - std::map> creators_; -}; - -/* A helper function to help run a lambda at the start. 
- */ -template -class Registor { - public: - explicit Registor(std::function&& functor) { functor(); } - - // Touch will do nothing. - int Touch() { return 0; } -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/utils/io.h b/lite/utils/io.h index 2141364df79bb189772592a556dd9a115ae1a67e..5de95e72f06856df01189e8ae3f1c22115801094 100644 --- a/lite/utils/io.h +++ b/lite/utils/io.h @@ -120,5 +120,40 @@ static std::vector<std::string> ListDir(const std::string& path, return paths; } +static bool ReadFile(const std::string& filename, std::vector<char>* contents) { + FILE* fp = fopen(filename.c_str(), "rb"); + if (!fp) return false; + fseek(fp, 0, SEEK_END); + size_t size = ftell(fp); + fseek(fp, 0, SEEK_SET); + contents->clear(); + contents->resize(size); + size_t offset = 0; + char* ptr = reinterpret_cast<char*>(&(contents->at(0))); + while (offset < size) { + size_t already_read = fread(ptr, 1, size - offset, fp); + offset += already_read; + ptr += already_read; + } + fclose(fp); + return true; +} + +static bool WriteFile(const std::string& filename, + const std::vector<char>& contents) { + FILE* fp = fopen(filename.c_str(), "wb"); + if (!fp) return false; + size_t size = contents.size(); + size_t offset = 0; + const char* ptr = reinterpret_cast<const char*>(&(contents.at(0))); + while (offset < size) { + size_t already_written = fwrite(ptr, 1, size - offset, fp); + offset += already_written; + ptr += already_written; + } + fclose(fp); + return true; +} + } // namespace lite } // namespace paddle diff --git a/lite/utils/md5.h b/lite/utils/md5.h new file mode 100644 index 0000000000000000000000000000000000000000..c2e972dd8001a9a85e29688f460be061d64a16b5 --- /dev/null +++ b/lite/utils/md5.h @@ -0,0 +1,104 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include <string> + +namespace paddle { +namespace lite { + +std::string MD5(std::string message) { + const uint32_t shiftAmounts[] = { + 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, + 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, + 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, + 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21}; + const uint32_t partsOfSines[] = { + 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a, + 0xa8304613, 0xfd469501, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, + 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, 0xf61e2562, 0xc040b340, + 0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, + 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8, + 0x676f02d9, 0x8d2a4c8a, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, + 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, 0x289b7ec6, 0xeaa127fa, + 0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665, + 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92, + 0xffeff47d, 0x85845dd1, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, + 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391}; + + uint32_t state[4]; + state[0] = 0x67452301; + state[1] = 0xefcdab89; + state[2] = 0x98badcfe; + state[3] = 0x10325476; + + // Pad with zeros + int size = ((((message.length() + 8) / 64) + 1) * 64) - 8; + uint8_t *buf = reinterpret_cast<uint8_t *>(calloc(size + 64, 1)); + memcpy(buf, message.c_str(), message.length()); + buf[message.length()] = 128; + uint32_t bits = 8 * message.length(); + memcpy(buf + size, &bits, 4); + +// Process at each 512-bit(64 bytes) chunk +#define LEFTROTATE(x, c) (((x) << (c)) | ((x) >> (32 - (c)))) + for (int offset = 0; offset < size; offset += 64) { + uint32_t A = state[0]; + uint32_t B = state[1]; + uint32_t C = state[2]; + uint32_t D = state[3]; + uint32_t *W = reinterpret_cast<uint32_t *>(buf + offset); + for (uint32_t i = 0; i < 64; i++) { + uint32_t F, g; + if (i < 16) { + F = (B & C) | ((~B) & D); + g = i; + } else if (i < 32) { + F = (D & B) | ((~D) & C); + g = (5 * i + 1) % 16; + } else if (i < 48) { + F = B ^ C ^ D; + g = (3 * i + 5) % 16; + } else { + F = C ^ (B | (~D)); + g = (7 * i) % 16; + } + uint32_t T = D; + D = C; + C = B; + B = B + LEFTROTATE((A + F + partsOfSines[i] + W[g]), shiftAmounts[i]); + A = T; + } + state[0] += A; + state[1] += B; + state[2] += C; + state[3] += D; + } +#undef LEFTROTATE + free(buf); + + // Convert digest to string + std::string res; + res.reserve(16 << 1); + const uint8_t *digest = reinterpret_cast<const uint8_t *>(state); + char hex[3]; + for (size_t i = 0; i < 16; i++) { + snprintf(hex, sizeof(hex), "%02x", digest[i]); + res.append(hex); + } + return res; +} + +} // namespace lite +} // namespace paddle diff --git a/lite/utils/paddle_enforce.h b/lite/utils/paddle_enforce.h deleted file mode 100644 index 82534af996919ac69a8624e442f1af6a9abb2c07..0000000000000000000000000000000000000000 --- a/lite/utils/paddle_enforce.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* - * This file defines PADDLE_ENFORCE_xx, which helps to adapt the legacy fluid - * codes. - */ -#pragma once -#include "lite/utils/cp_logging.h" -#include "lite/utils/string.h" - -#define PADDLE_ENFORCE(cond, ...) \ - CHECK((cond)) << paddle::lite::string_format("" __VA_ARGS__); -#define PADDLE_ENFORCE_EQ(a, b, ...) \ - CHECK_EQ((a), (b)) << paddle::lite::string_format("" __VA_ARGS__); -#define PADDLE_ENFORCE_LE(a, b, ...) \ - CHECK_LE((a), (b)) << paddle::lite::string_format("" __VA_ARGS__); -#define PADDLE_ENFORCE_LT(a, b, ...) \ - CHECK_LT((a), (b)) << paddle::lite::string_format("" __VA_ARGS__); - -#define PADDLE_ENFORCE_GE(a, b, ...) \ - CHECK_GE((a), (b)) << paddle::lite::string_format("" __VA_ARGS__); -#define PADDLE_ENFORCE_GT(a, b, ...) \ - CHECK_GT((a), (b)) << paddle::lite::string_format("" __VA_ARGS__); - -#ifndef PADDLE_THROW -#define PADDLE_THROW(...) printf("" __VA_ARGS__); -#endif diff --git a/lite/utils/string.h b/lite/utils/string.h index ada51d0b85d7536bfc937a7b1b8368a0f0e053be..b1aaf5d6c56d8931c4ad416f9d38c947abc68dd8 100644 --- a/lite/utils/string.h +++ b/lite/utils/string.h @@ -60,6 +60,38 @@ static std::string to_string(const T& v) { return ss.str(); } +static std::string to_string(int index) { + const int BUFFER_LENGTH = 15; + char buffer[BUFFER_LENGTH]; + snprintf(buffer, sizeof(buffer), "%d", index); + return std::string(buffer); +} + +template <class T> +static T parse_string(const std::string& v) { + return v; +} + +template <> +int32_t parse_string<int32_t>(const std::string& v) { + return std::stoi(v); +} + +template <> +int64_t parse_string<int64_t>(const std::string& v) { + return std::stoll(v); +} + +template <> +float parse_string<float>(const std::string& v) { + return std::stof(v); +} + +template <> +double parse_string<double>(const std::string& v) { + return std::stod(v); +} + template <typename T> std::string Join(const std::vector<T>& vec, const std::string& delim) { if (vec.empty()) return ""; @@ -84,19 +116,20 @@ static std::string Repr(const std::vector<std::string>& v) { return "{" + Join(tmp, ",") + "}"; } -static std::vector Split(const std::string& original, - const std::string& separator) { - std::vector results; +template <class T> +static std::vector<T> Split(const std::string& original, + const std::string& separator) { + std::vector<T> results; std::string::size_type pos1, pos2; pos2 = original.find(separator); pos1 = 0; while (std::string::npos != pos2) { - results.push_back(original.substr(pos1, pos2 - pos1)); + results.push_back(parse_string<T>(original.substr(pos1, pos2 - pos1))); pos1 = pos2 + separator.size(); pos2 = original.find(separator, pos1); } if (pos1 != original.length()) { - results.push_back(original.substr(pos1)); + results.push_back(parse_string<T>(original.substr(pos1))); } return results; } diff --git a/third-party/flatbuffers b/third-party/flatbuffers new file mode 160000 index 0000000000000000000000000000000000000000..6df40a2471737b27271bdd9b900ab5f3aec746c7 --- /dev/null +++ b/third-party/flatbuffers @@ -0,0 +1 @@ +Subproject commit 6df40a2471737b27271bdd9b900ab5f3aec746c7
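
Usage note on the new lite/utils/io.h helpers: ReadFile and WriteFile move a whole file through a byte buffer, looping until the full size has been transferred. A minimal round-trip sketch follows; it assumes the buffers are std::vector<char> (as restored above) and uses a hypothetical temporary path, so treat it as illustrative rather than part of the patch.

#include <string>
#include <vector>
#include "lite/utils/io.h"

int main() {
  // Write four bytes, then read them back with the new helpers.
  const std::string path = "/tmp/lite_io_demo.bin";  // hypothetical path
  std::vector<char> src = {'l', 'i', 't', 'e'};
  if (!paddle::lite::WriteFile(path, src)) return 1;

  std::vector<char> dst;
  if (!paddle::lite::ReadFile(path, &dst)) return 1;

  // dst should now hold exactly the bytes written above.
  return dst == src ? 0 : 1;
}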
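
Usage note on lite/utils/md5.h: MD5() is a self-contained implementation that returns the digest of the input string as a 32-character lowercase hex string. A quick sanity-check sketch against the well-known RFC 1321 test vectors (the expected strings below are the standard MD5 values, not something introduced by this patch):

#include <cassert>
#include <string>
#include "lite/utils/md5.h"

int main() {
  // Standard MD5 test vectors, lowercase hex as produced by MD5().
  assert(paddle::lite::MD5("") == "d41d8cd98f00b204e9800998ecf8426e");
  assert(paddle::lite::MD5("abc") == "900150983cd24fb0d6963f7d28e17f72");
  return 0;
}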
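
Usage note on the templated Split in lite/utils/string.h: Split is now parameterized on the element type and converts every token with parse_string<T>, so callers can split a delimited string directly into numeric vectors. A small sketch, assuming the template parameters restored above:

#include <cstdint>
#include <string>
#include <vector>
#include "lite/utils/string.h"

int main() {
  // Previous behaviour: split into strings (now spelled with an explicit type).
  std::vector<std::string> ops =
      paddle::lite::Split<std::string>("conv2d,relu,pool2d", ",");

  // New behaviour: split and parse into numbers in one step.
  std::vector<int64_t> shape = paddle::lite::Split<int64_t>("1,3,224,224", ",");
  std::vector<float> scales = paddle::lite::Split<float>("0.5,0.25", ",");

  return (ops.size() == 3 && shape.size() == 4 && scales.size() == 2) ? 0 : 1;
}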