Commit 37f606d2 authored by J jiweibo

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle-Lite into stream_manage

......@@ -116,4 +116,10 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models
metal/MobileNetDemo/MobileNetDemo/Resources
#flatbuffers
lite/model_parser/flatbuffers/framework_generated.h
build*
# hiai libs
ai_ddk_lib*
......@@ -10,3 +10,6 @@
[submodule "third-party/protobuf-host"]
path = third-party/protobuf-host
url = https://github.com/protocolbuffers/protobuf.git
[submodule "third-party/flatbuffers"]
path = third-party/flatbuffers
url = https://github.com/google/flatbuffers.git
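Since flatbuffers is now tracked as a git submodule, an existing checkout needs to fetch it before building. A minimal sketch using standard git commands (the path is taken from the entry above):

```shell
# Sync the new submodule definition and fetch third-party/flatbuffers.
git submodule sync
git submodule update --init third-party/flatbuffers
```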
......@@ -106,7 +106,8 @@ lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kerne
lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF)
# cv build options
lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF)
lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." ON)
lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." OFF)
lite_option(CUDA_WITH_FP16 "Compile with cuda half support" OFF)
lite_option(LITE_WITH_ARM_CLANG "when arm lang is clang, its ON." OFF)
# TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
......@@ -168,6 +169,7 @@ if(LITE_WITH_RKNPU)
include(device/rknpu)
endif()
include(external/flatbuffers)
# for mobile
if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
......
......@@ -35,8 +35,12 @@ endif()
if(NOT DEFINED ANDROID_API_LEVEL)
set(ANDROID_API_LEVEL "23")
if(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
if(LITE_WITH_NPU AND NOT LITE_ON_TINY_PUBLISH)
set(ANDROID_API_LEVEL "24") # HIAI DDK depends on android-24
else()
set(ANDROID_API_LEVEL "22")
endif()
endif()
endif()
# then check input arm abi
......
......@@ -2,6 +2,10 @@ if(NOT LITE_WITH_CUDA)
return()
endif()
if(WITH_CUDA_FP16)
add_definitions("-DCUDA_WITH_FP16")
endif()
set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs7 "30 35 50 52")
set(paddle_known_gpu_archs8 "30 35 50 52 53 60 61 62")
......@@ -167,6 +171,10 @@ elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x
add_definitions("-DPADDLE_CUDA_BINVER=\"100\"")
endif()
if (CUDA_WITH_FP16)
STRING(REGEX REPLACE "30|35|50|52" "" paddle_known_gpu_archs ${paddle_known_gpu_archs})
endif()
include_directories(${CUDA_INCLUDE_DIRS})
if(NOT WITH_DSO)
if(WIN32)
......
......@@ -39,7 +39,7 @@ else()
endif()
find_library(XPU_SDK_XPU_RT_FILE NAMES xpurt
PATHS ${XPU_SDK_ROOT}/XTDK/shlib
PATHS ${XPU_SDK_ROOT}/XTDK/runtime/shlib ${XPU_SDK_ROOT}/XTDK/shlib # libxpurt.so may have been moved to XTDK/runtime/shlib
NO_DEFAULT_PATH)
if(NOT XPU_SDK_XPU_RT_FILE)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
INCLUDE(ExternalProject)
# Introduce variables:
# * CMAKE_INSTALL_LIBDIR
INCLUDE(GNUInstallDirs)
SET(LIBDIR "lib")
if(CMAKE_INSTALL_LIBDIR MATCHES ".*lib64$")
SET(LIBDIR "lib64")
endif()
SET(FLATBUFFERS_PREFIX_DIR ${THIRD_PARTY_PATH}/flatbuffers)
SET(FLATBUFFERS_SOURCES_DIR ${CMAKE_SOURCE_DIR}/third-party/flatbuffers)
SET(FLATBUFFERS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/flatbuffers)
SET(FLATBUFFERS_INCLUDE_DIR "${FLATBUFFERS_SOURCES_DIR}/include" CACHE PATH "flatbuffers include directory." FORCE)
IF(WIN32)
set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE)
ELSE(WIN32)
set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.a" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE)
ENDIF(WIN32)
INCLUDE_DIRECTORIES(${FLATBUFFERS_INCLUDE_DIR})
if(NOT HOST_CXX_COMPILER)
set(HOST_CXX_COMPILER ${CMAKE_CXX_COMPILER})
set(HOST_C_COMPILER ${CMAKE_C_COMPILER})
endif()
SET(OPTIONAL_ARGS "-DCMAKE_CXX_COMPILER=${HOST_CXX_COMPILER}"
"-DCMAKE_C_COMPILER=${HOST_C_COMPILER}")
ExternalProject_Add(
extern_flatbuffers
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/google/flatbuffers.git"
GIT_TAG "v1.12.0"
SOURCE_DIR ${FLATBUFFERS_SOURCES_DIR}
PREFIX ${FLATBUFFERS_PREFIX_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DBUILD_STATIC_LIBS=ON
-DCMAKE_INSTALL_PREFIX=${FLATBUFFERS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
-DFLATBUFFERS_BUILD_TESTS=OFF
${CROSS_COMPILE_CMAKE_ARGS}
${OPTIONAL_ARGS}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${FLATBUFFERS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
IF(WIN32)
IF(NOT EXISTS "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib")
add_custom_command(TARGET extern_flatbuffers POST_BUILD
COMMAND cmake -E copy ${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/flatbuffers_static.lib ${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib
)
ENDIF()
ENDIF(WIN32)
ADD_LIBRARY(flatbuffers STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET flatbuffers PROPERTY IMPORTED_LOCATION ${FLATBUFFERS_LIBRARIES})
ADD_DEPENDENCIES(flatbuffers extern_flatbuffers)
SET(FLATBUFFERS_FLATC_EXECUTABLE ${FLATBUFFERS_INSTALL_DIR}/bin/flatc)
function(register_generated_output file_name)
get_property(tmp GLOBAL PROPERTY FBS_GENERATED_OUTPUTS)
list(APPEND tmp ${file_name})
set_property(GLOBAL PROPERTY FBS_GENERATED_OUTPUTS ${tmp})
endfunction(register_generated_output)
function(compile_flatbuffers_schema_to_cpp_opt TARGET SRC_FBS OPT)
if(FLATBUFFERS_BUILD_LEGACY)
set(OPT ${OPT};--cpp-std c++0x)
else()
# --cpp-std is defined by flatc default settings.
endif()
message(STATUS "`${SRC_FBS}`: add generation of C++ code with '${OPT}'")
get_filename_component(SRC_FBS_DIR ${SRC_FBS} PATH)
message(STATUS "SRC_FBS_DIR: ${SRC_FBS_DIR}")
string(REGEX REPLACE "\\.fbs$" "_generated.h" GEN_HEADER ${SRC_FBS})
add_custom_command(
OUTPUT ${GEN_HEADER}
COMMAND "${FLATBUFFERS_FLATC_EXECUTABLE}"
--cpp --gen-mutable --gen-object-api --reflect-names
--cpp-ptr-type flatbuffers::unique_ptr # Used to test with C++98 STLs
${OPT}
-I "${CMAKE_CURRENT_SOURCE_DIR}/tests/include_test"
-o "${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS_DIR}"
"${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS}"
DEPENDS flatbuffers
COMMENT "Run generation: '${GEN_HEADER}'")
register_generated_output(${GEN_HEADER})
add_custom_target(${TARGET} ALL DEPENDS ${GEN_HEADER})
endfunction()
set(FRAMEWORK_FBS_DIR "lite/model_parser/flatbuffers")
set(FRAMEWORK_SCHEMA_PATH "${FRAMEWORK_FBS_DIR}/framework.fbs")
compile_flatbuffers_schema_to_cpp_opt(framework_fbs_header ${FRAMEWORK_SCHEMA_PATH} "--no-includes;--gen-compare;--force-empty")
include_directories(${FLATBUFFERS_INCLUDE_DIR})
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS_DIR})
# C++ Train Demo
# Introduction
As is well known, Paddle-Lite can run inference on mobile devices; in fact, Paddle-Lite also supports training models on mobile. This document presents a training example with Paddle-Lite, based on the "Boston housing price prediction" task, also known as "fit-a-line".
## Introduction
You can learn more about this task from the book repository's
As is well known, Paddle-Lite can run inference on mobile devices; in fact, Paddle-Lite also supports training models on mobile. This document presents a training example with Paddle-Lite, based on the "Boston housing price prediction" task, also known as "fit-a-line".
You can learn more about this task from the book repository's
[documentation](https://paddlepaddle.org.cn/documentation/docs/zh/user_guides/simple_case/fit_a_line/README.cn.html)
[source code](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line)
......@@ -10,18 +12,16 @@
It is modeled with Linear Regression.
This document mainly describes how to migrate it to Paddle-Lite for training.
Note: this is a tutorial on model training with the C++ API; the other APIs do not support training yet.
# Requirements
## Requirements
- An Android phone, used to run the training program
- Python with Paddle (version: 1.7.0) installed
- Python with Paddle (version >= 1.7.0) installed (a pip sketch follows this list)
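A minimal sketch of installing the Python-side requirement above (the exact version pin is illustrative; any Paddle >= 1.7.0 should work):

```shell
python -m pip install paddlepaddle==1.7.0
```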
# Quick start
## Quick start
## Step1 build paddle-lite
### Step1 build paddle-lite
Please follow the [official paddle-lite documentation](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#paddlelite) to build the full_publish paddle-lite lib. Taking a Linux build as an example, the concrete commands are:
Please follow the official paddle-lite documentation to build the full_publish paddle-lite lib. Taking a Linux build as an example, the concrete commands are:
```shell
## set up the environment
......@@ -51,7 +51,7 @@ cd Paddle-Lite
Paddle-Lite/build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/cxx/lib/libpaddle_full_api_shared.so
```
## Step2 build lr_trainer
### Step2 build lr_trainer
```shell
cd Paddle-Lite/lite/demo/cxx/train_demo/cplus_train/
......@@ -64,7 +64,7 @@ bin/
`-- demo_trainer
```
## Step3 download model and run it!
### Step3 download model and run it!
On your laptop, connect the phone via USB, enable developer mode, and run the following from any directory:
......@@ -102,7 +102,7 @@ sample 8: Loss: 248.445
sample 9: Loss: 325.135
```
# More details
## More details
The model mentioned above was obtained by direct download; if you want to generate it yourself, run the following commands:
```shell
......@@ -125,9 +125,9 @@ md5sum fc_0.w_0: 2c7b3649b2a9cf7bcd19f8b256ce795d
If you want to generate your own model for training, you can refer to the way the model is saved in `train.py`; a sketch follows.
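As a rough illustration only (this is not the actual `train.py`; the network, variable names, and the `lr_model` output directory are made up), saving a fit-a-line style program and its persistable parameters with the Paddle 1.7 fluid API could look like:

```python
import paddle.fluid as fluid

# Hypothetical fit-a-line style network; the real train.py may differ.
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
pred = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
# ... run a few training iterations here ...

# Persist the parameters so that the on-device trainer can load them later.
fluid.io.save_persistables(exe, "lr_model", fluid.default_main_program())
```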
# Cross-checking against Paddle training results
## Cross-checking against Paddle training results
## The first 10 loss values
### The first 10 loss values
To verify the consistency between Paddle and Lite, we kept the model parameters and the data identical, set batch size = 1, trained for 10 batches, and recorded the loss values of both.
......@@ -171,11 +171,11 @@ sample 8: Loss: 248.445
sample 9: Loss: 325.135
```
## Loss curves
### Loss curves
With the training batch size fixed at 20 and the training data globally shuffled each epoch, the loss curves of Paddle and Lite after training for 100 epochs are compared below.
![lr_loss](image/lr_loss.png)
![lr_loss](../images/lr_loss.png)
To reproduce the results above, the command for running Paddle with Python is:
......
......@@ -86,19 +86,28 @@ config.set_model_from_file(/YOU_MODEL_PATH/mobilenet_v1_opt.nb)
predictor = create_paddle_predictor(config)
```
(3) Set the input data
(3) Read data from an image
```python
image = Image.open('./example.jpg')
resized_image = image.resize((224, 224), Image.BILINEAR)
image_data = np.array(resized_image).flatten().tolist()
```
(4) Set the input data
```python
input_tensor = predictor.get_input(0)
input_tensor.resize([1, 3, 224, 224])
input_tensor.set_float_data([1.] * 3 * 224 * 224)
input_tensor.set_float_data(image_data)
```
(4) Run inference
(5) Run inference
```python
predictor.run()
```
(5) Get the output data
(6) Get the output data
```python
output_tensor = predictor.get_output(0)
print(output_tensor.shape())
......
......@@ -60,6 +60,13 @@ Welcome to Paddle-Lite's documentation!
demo_guides/rockchip_npu
demo_guides/mediatek_apu
.. toctree::
:maxdepth: 1
:caption: Training demos (preview)
:name: sec-train_demo_guides
demo_guides/cpp_train_demo
.. toctree::
:maxdepth: 1
:caption: API documentation
......
......@@ -61,7 +61,7 @@ inference_lite_lib.ios64.armv8 iOS预测库和头文件
- Tailoring the inference library (compile only the kernels & ops used in the model to reduce library size):
```shell
./lite/tools/build_android.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir
./lite/tools/build_ios.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir
```
```shell
--with_strip: (OFF|ON); whether to tailor the inference library according to the input model, default is OFF
......
......@@ -21,11 +21,11 @@ pip install paddlelite
- Method 2: download the opt executable
On the [releases page](https://github.com/PaddlePaddle/Paddle-Lite/releases), choose the `opt` conversion tool matching the version of your inference library
This document provides downloads of the optimization tool for `release/v2.6` and `release/v2.2.0`
This document provides downloads of the optimization tool for `release/v2.6.1` and `release/v2.2.0`
|Version | Linux | MacOS|
|---|---|---|
| `release/v2.3`| [opt](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt) | [opt_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt_mac) |
| `release/v2.6.1` | [opt](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/opt/opt) | [opt_mac](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/opt/opt_mac) |
|`release/v2.2.0` | [model_optimize_tool](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool) | [model_optimize_tool_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool_mac) |
- Method 3: build opt from source
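Whichever way you obtain `opt`, a typical invocation looks roughly like the sketch below (the model path and output name are placeholders; adjust them to your own model):

```shell
chmod +x ./opt
./opt --model_dir=./mobilenet_v1 \
      --valid_targets=arm \
      --optimize_out_type=naive_buffer \
      --optimize_out=./mobilenet_v1_opt
```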
......
......@@ -49,4 +49,4 @@ $ ./opt \
## 5. Testing tools
To help you better understand and use the Lite framework, we provide a [Debug tool](debug#debug) and a [Profile tool](debug#profiler) for users with further needs. The Lite Model Debug Tool can be used to check whether the corresponding variable values in a model differ between the Lite framework and the PaddlePaddle framework during inference, so that the problematic op can be located quickly and the issue can be reproduced and investigated easily. The Profile Monitor Tool helps you understand the time cost of each op; it automatically collects the number of times each op runs and its longest, shortest, and average execution times, providing a baseline for performance tuning. See the [related topic](debug) for more.
To help you better understand and use the Lite framework, we provide a [Debug tool](debug) and a [Profile tool](debug) for users with further needs. The Lite Model Debug Tool can be used to check whether the corresponding variable values in a model differ between the Lite framework and the PaddlePaddle framework during inference, so that the problematic op can be located quickly and the issue can be reproduced and investigated easily. The Profile Monitor Tool helps you understand the time cost of each op; it automatically collects the number of times each op runs and its longest, shortest, and average execution times, providing a baseline for performance tuning. See the [related topic](debug) for more.
......@@ -53,6 +53,8 @@ if (WITH_TESTING)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "step_rnn.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "bert.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "ernie.tar.gz")
endif()
endif()
......@@ -242,7 +244,6 @@ if (LITE_WITH_X86)
add_dependencies(publish_inference_x86_cxx_lib test_model_bin)
add_custom_target(publish_inference_x86_cxx_demos ${TARGET}
COMMAND rm -rf "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_light_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_light"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_full_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_full"
......
......@@ -2,7 +2,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR (NOT LITE_WITH_LOG))
lite_cc_library(place SRCS paddle_place.cc DEPS logging)
else()
lite_cc_library(place SRCS paddle_place.cc DEPS glog)
endif(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
endif()
if (LITE_ON_TINY_PUBLISH)
set(CMAKE_CXX_FLAGS_RELEASE "-Os -DNDEBUG")
......@@ -15,8 +15,9 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
#full api dynamic library
lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
DEPS paddle_api paddle_api_light paddle_api_full)
add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto)
target_link_libraries(paddle_full_api_shared framework_proto)
target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files})
add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto op_registry framework_fbs_header)
target_link_libraries(paddle_full_api_shared framework_proto op_registry)
if(LITE_WITH_X86)
add_dependencies(paddle_full_api_shared xxhash)
target_link_libraries(paddle_full_api_shared xxhash)
......@@ -70,7 +71,7 @@ else()
set(TARGET_COMIPILE_FLAGS "${TARGET_COMIPILE_FLAGS} -flto")
endif()
set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "${TARGET_COMIPILE_FLAGS}")
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h framework_fbs_header)
if (LITE_WITH_NPU)
# Need to add HIAI runtime libs (libhiai.so) dependency
target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs})
......@@ -368,6 +369,9 @@ endif()
if (LITE_WITH_PYTHON)
add_subdirectory(python)
# add library for opt_base
lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils)
add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h)
endif()
if (LITE_ON_TINY_PUBLISH)
......@@ -375,9 +379,6 @@ if (LITE_ON_TINY_PUBLISH)
endif()
# add library for opt_base
lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils)
add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h)
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
message(STATUS "Compiling opt")
......
......@@ -17,6 +17,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
# Unlike static library, module library has to link target to be able to work
# as a single .so lib.
target_link_libraries(paddle_lite_jni ${lib_DEPS} ${arm_kernels} ${npu_kernels})
add_dependencies(paddle_lite_jni framework_fbs_header)
if (LITE_WITH_NPU)
# Strips the symbols of our protobuf functions to fix the conflicts during
# loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so)
......@@ -31,7 +32,7 @@ else()
endif()
set_target_properties(paddle_lite_jni PROPERTIES COMPILE_FLAGS ${TARGET_COMIPILE_FLAGS})
target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc)
add_dependencies(paddle_lite_jni op_list_h kernel_list_h)
add_dependencies(paddle_lite_jni op_list_h kernel_list_h framework_fbs_header)
if (LITE_WITH_NPU)
# Need to add HIAI runtime libs (libhiai.so) dependency
target_link_libraries(paddle_lite_jni ${npu_builder_libs} ${npu_runtime_libs})
......
......@@ -13,18 +13,24 @@
// limitations under the License.
#include "lite/api/cxx_api.h"
#include <algorithm>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include "lite/api/paddle_use_passes.h"
#include "lite/utils/io.h"
namespace paddle {
namespace lite {
std::vector<std::string> GetAllOps() {
return OpLiteFactory::Global().GetAllOps();
}
void Predictor::SaveModel(const std::string &dir,
lite_api::LiteModelType model_type,
bool record_info) {
......@@ -326,10 +332,8 @@ void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc,
}
}
if (is_quantized_model) {
#ifdef LITE_WITH_ARM
inner_places.insert(inner_places.begin(),
Place{TARGET(kARM), PRECISION(kInt8)});
#endif
}
Program program(*desc.get(), scope_, inner_places);
......
......@@ -41,6 +41,8 @@ static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] =
".tailored_kernels_source_list";
static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list";
std::vector<std::string> GetAllOps();
/*
* Predictor for inference, input a model, it will optimize and execute it.
*/
......
......@@ -52,12 +52,10 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
if (!status_is_cloned_) {
#ifdef LITE_WITH_MLU
Env<TARGET(kMLU)>::Init();
lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(),
lite::TargetWrapperMlu::SetMLURunMode(config.mlu_core_version(),
config.mlu_core_number(),
config.mlu_use_first_conv(),
config.mlu_first_conv_mean(),
config.mlu_first_conv_std(),
config.mlu_input_layout());
config.mlu_input_layout(),
config.mlu_firstconv_param());
#endif // LITE_WITH_MLU
auto use_layout_preprocess_pass =
config.model_dir().find("OPENCL_PRE_PRECESS");
......@@ -75,6 +73,10 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
mode_ = config.power_mode();
threads_ = config.threads();
#ifdef LITE_WITH_NPU
Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
config.subgraph_model_cache_dir());
#endif
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL)
int num_threads = config.x86_math_library_num_threads();
......
......@@ -15,8 +15,6 @@
#include "lite/api/light_api.h"
#include <algorithm>
#include <map>
#include "paddle_use_kernels.h" // NOLINT
#include "paddle_use_ops.h" // NOLINT
namespace paddle {
namespace lite {
......
......@@ -13,6 +13,9 @@
// limitations under the License.
#include "lite/api/paddle_api.h"
#include <utility>
#include "lite/core/context.h"
#include "lite/core/device_info.h"
#include "lite/core/target_wrapper.h"
......@@ -21,6 +24,13 @@
#ifdef LITE_WITH_CUDA
#include "lite/backends/cuda/target_wrapper.h"
#endif
#ifdef LITE_WITH_XPU
#include "lite/backends/xpu/target_wrapper.h"
#endif
#ifdef LITE_WITH_MLU
#include "lite/backends/mlu/target_wrapper.h"
#endif
namespace paddle {
namespace lite_api {
......@@ -106,6 +116,13 @@ void Tensor::CopyFromCpu(const T *src_data) {
data, src_data, num * sizeof(T), lite::IoDirection::HtoD, *io_stream_);
#else
LOG(FATAL) << "Please compile the lib with CUDA.";
#endif
} else if (type == TargetType::kMLU) {
#ifdef LITE_WITH_MLU
lite::TargetWrapperMlu::MemcpySync(
data, src_data, num * sizeof(T), lite::IoDirection::HtoD);
#else
LOG(FATAL) << "Please compile the lib with MLU.";
#endif
} else {
LOG(FATAL) << "The CopyFromCpu interface just support kHost, kARM, kCUDA";
......@@ -127,6 +144,13 @@ void Tensor::CopyToCpu(T *data) const {
lite::TargetWrapperCuda::StreamSync(*io_stream_);
#else
LOG(FATAL) << "Please compile the lib with CUDA.";
#endif
} else if (type == TargetType::kMLU) {
#ifdef LITE_WITH_MLU
lite::TargetWrapperMlu::MemcpySync(
data, src_data, num * sizeof(T), lite::IoDirection::DtoH);
#else
LOG(FATAL) << "Please compile the lib with MLU.";
#endif
} else {
LOG(FATAL) << "The CopyToCpu interface just support kHost, kARM, kCUDA";
......@@ -148,6 +172,11 @@ template void Tensor::CopyFromCpu<int64_t, TargetType::kCUDA>(const int64_t *);
template void Tensor::CopyFromCpu<float, TargetType::kCUDA>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kCUDA>(const int8_t *);
template void Tensor::CopyFromCpu<int, TargetType::kMLU>(const int *);
template void Tensor::CopyFromCpu<int64_t, TargetType::kMLU>(const int64_t *);
template void Tensor::CopyFromCpu<float, TargetType::kMLU>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kMLU>(const int8_t *);
template void Tensor::CopyToCpu(float *) const;
template void Tensor::CopyToCpu(int *) const;
template void Tensor::CopyToCpu(int8_t *) const;
......@@ -238,13 +267,9 @@ void CxxConfig::set_mlu_core_number(int core_number) {
void CxxConfig::set_mlu_input_layout(DataLayoutType layout) {
mlu_input_layout_ = layout;
}
void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) {
mlu_use_first_conv_ = use_first_conv;
}
void CxxConfig::set_mlu_first_conv_mean(const std::vector<float> &mean) {
void CxxConfig::set_mlu_firstconv_param(const std::vector<float> &mean,
const std::vector<float> &std) {
mlu_first_conv_mean_ = mean;
}
void CxxConfig::set_mlu_first_conv_std(const std::vector<float> &std) {
mlu_first_conv_std_ = std;
}
lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {
......@@ -252,18 +277,15 @@ lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {
}
int CxxConfig::mlu_core_number() const { return mlu_core_number_; }
DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; }
bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; }
const std::vector<float> &CxxConfig::mlu_first_conv_mean() const {
return mlu_first_conv_mean_;
}
const std::vector<float> &CxxConfig::mlu_first_conv_std() const {
return mlu_first_conv_std_;
std::pair<std::vector<float>, std::vector<float>>
CxxConfig::mlu_firstconv_param() const {
return std::make_pair(mlu_first_conv_mean_, mlu_first_conv_std_);
}
#endif
void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) {
#ifdef LITE_WITH_XPU
lite::Context<TargetType::kXPU>::SetWorkspaceL3Size(l3_size);
lite::TargetWrapperXPU::workspace_l3_size_per_thread = l3_size;
#else
LOG(WARNING) << "The invoking of the function "
"'set_xpu_workspace_l3_size_per_thread' is ignored, please "
......@@ -273,7 +295,7 @@ void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) {
void CxxConfig::set_xpu_dev_per_thread(int dev_no) {
#ifdef LITE_WITH_XPU
lite::Context<TargetType::kXPU>::SetDev(dev_no);
lite::TargetWrapperXPU::SetDev(dev_no);
#else
LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is "
"ignored, please rebuild it with LITE_WITH_XPU=ON.";
......@@ -282,7 +304,7 @@ void CxxConfig::set_xpu_dev_per_thread(int dev_no) {
void CxxConfig::set_xpu_multi_encoder_precision(const std::string &precision) {
#ifdef LITE_WITH_XPU
lite::Context<TargetType::kXPU>::_multi_encoder_precision = precision;
lite::TargetWrapperXPU::multi_encoder_precision = precision;
#else
LOG(WARNING) << "The invoking of the function "
"'set_xpu_multi_encoder_precision' is "
......
......@@ -21,6 +21,7 @@
#define PADDLE_LITE_API_H_
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "paddle_place.h" // NOLINT
......@@ -174,9 +175,8 @@ class LITE_API CxxConfig : public ConfigBase {
lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270};
int mlu_core_number_{1};
DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)};
bool mlu_use_first_conv_{false};
std::vector<float> mlu_first_conv_mean_;
std::vector<float> mlu_first_conv_std_;
std::vector<float> mlu_first_conv_mean_{};
std::vector<float> mlu_first_conv_std_{};
#endif
public:
......@@ -232,24 +232,22 @@ class LITE_API CxxConfig : public ConfigBase {
void set_mlu_core_version(lite_api::MLUCoreVersion core_version);
// set MLU core number, which is used when compiling MLU kernels
void set_mlu_core_number(int core_number);
// set MLU input layout. User can specify layout of input data to be NHWC,
// default is NCHW
void set_mlu_input_layout(DataLayoutType layout);
// whether use MLU's first conv kernel. First conv is a special kernel
// provided by MLU, its input is uint8, and also needs two 3-dimentional
// vectors which save all inputs' mean and std values
void set_mlu_use_first_conv(bool use_first_conv);
// set the 3-dimentional mean vector used by MLU's first conv
void set_mlu_first_conv_mean(const std::vector<float>& mean);
// set the 3-dimentional std vector used by MLU's first conv
void set_mlu_first_conv_std(const std::vector<float>& std);
// set the 3-dimensional mean vector and 3-dimensional std vector used by
// MLU's first conv
void set_mlu_firstconv_param(const std::vector<float>& mean,
const std::vector<float>& std);
// set MLU input layout. User can specify layout of input data to be NHWC,
// default is NCHW
void set_mlu_input_layout(DataLayoutType layout);
lite_api::MLUCoreVersion mlu_core_version() const;
int mlu_core_number() const;
DataLayoutType mlu_input_layout() const;
bool mlu_use_first_conv() const;
const std::vector<float>& mlu_first_conv_mean() const;
const std::vector<float>& mlu_first_conv_std() const;
// std::pair<mean, std>
std::pair<std::vector<float>, std::vector<float>> mlu_firstconv_param() const;
#endif
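For illustration only, a hedged sketch of how the reworked MLU first-conv configuration above might be used from application code, assuming the library was built with LITE_WITH_MLU=ON (the mean/std values are placeholders, not recommendations):

```cpp
#include "paddle_api.h"  // shipped header; in-tree it is lite/api/paddle_api.h

void ConfigureMlu(paddle::lite_api::CxxConfig* config) {
  config->set_mlu_core_version(paddle::lite_api::MLUCoreVersion::MLU_270);
  config->set_mlu_core_number(4);
  config->set_mlu_input_layout(DATALAYOUT(kNHWC));
  // Passing non-empty mean/std vectors enables the MLU first-conv path,
  // replacing the removed set_mlu_use_first_conv(true) call.
  config->set_mlu_firstconv_param({0.485f, 0.456f, 0.406f},
                                  {0.229f, 0.224f, 0.225f});
}
```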
// XPU only, set the size of the workspace memory from L3 cache for the
......
......@@ -15,8 +15,11 @@
#include "lite/api/paddle_api.h"
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/io.h"
DEFINE_string(model_dir, "", "");
namespace paddle {
......
......@@ -55,6 +55,8 @@ USE_MIR_PASS(apu_subgraph_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(lite_scale_activation_fuse_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__resnet_cbam_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass);
USE_MIR_PASS(__xpu__fc_fuse_pass);
USE_MIR_PASS(__xpu__mmdnn_fuse_pass);
......@@ -59,9 +59,9 @@ void TestModel(const std::vector<Place>& valid_places) {
}
auto* image_tensor = predictor.GetInput(1);
image_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 2})));
data = image_tensor->mutable_data<float>();
data[0] = FLAGS_im_height;
data[1] = FLAGS_im_width;
auto* data_1 = image_tensor->mutable_data<int>();
data_1[0] = FLAGS_im_height;
data_1[1] = FLAGS_im_width;
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor.Run();
......
......@@ -127,5 +127,6 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
split_merge_lod_tenosr.cc
reduce_prod.cc
lstm.cc
clip.cc
DEPS ${lite_kernel_deps} context tensor)
endif()
......@@ -763,24 +763,6 @@ void act_thresholded_relu<float>(
}
}
#ifdef LITE_WITH_TRAIN
template <>
void act_square_grad(const float* din,
const float* dout_grad,
float* din_grad,
int size,
int threads) {
const float* ptr_out_grad = dout_grad;
float* ptr_in_grad = din_grad;
for (int i = 0; i < size; ++i) {
ptr_in_grad[0] = ptr_out_grad[0] * 2.0 * din[0];
ptr_out_grad++;
ptr_in_grad++;
din++;
}
}
#endif
} // namespace math
} // namespace arm
} // namespace lite
......
......@@ -90,12 +90,6 @@ template <typename T>
void act_thresholded_relu(
const T* din, T* dout, int size, float threshold, int threads);
#ifdef LITE_WITH_TRAIN
template <typename T>
void act_square_grad(
const T* din, const T* dout_grad, T* din_grad, int size, int threads);
#endif
} // namespace math
} // namespace arm
} // namespace lite
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/arm/math/clip.h"
#include <algorithm>
#include <limits>
#include <memory>
#include "lite/backends/arm/math/funcs.h"
#include "lite/backends/arm/math/saturate.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
void clip_kernel_fp32(
const float* input, int64_t num, float min, float max, float* output) {
float tmp;
for (int64_t i = 0; i < num; i++) {
tmp = *input;
tmp = tmp > min ? tmp : min;
*output = tmp < max ? tmp : max;
input++;
output++;
}
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
void clip_kernel_fp32(
const float* input, int64_t num, float min, float max, float* output);
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
......@@ -11,8 +11,8 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/arm/math/elementwise.h"
#include <math.h>
#include <algorithm>
#include "lite/backends/arm/math/funcs.h"
......@@ -1254,6 +1254,19 @@ void elementwise_max_relu_broadcast<float>(const float* dinx,
}
}
template <>
void elementwise_div<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num) {
for (int i = 0; i < num; i++) {
*dout = *dinx / *diny;
dout++;
dinx++;
diny++;
}
}
template <>
void elementwise_div<float>(const float* dinx,
const float* diny,
......@@ -1306,6 +1319,28 @@ void elementwise_div<float>(const float* dinx,
}
}
template <>
void elementwise_div_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num) {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const int64_t* din_ptr = dinx + offset;
const int64_t diny_data = diny[j];
int64_t* dout_ptr = dout + offset;
for (int p = 0; p < num; p++) {
*dout_ptr = *din_ptr / diny_data;
dout_ptr++;
din_ptr++;
}
}
}
}
template <>
void elementwise_div_broadcast<float>(const float* dinx,
const float* diny,
......@@ -1541,6 +1576,87 @@ void elementwise_div_relu_broadcast<float>(const float* dinx,
}
}
template <typename T>
void elementwise_mod_broadcast(
const T* dinx, const T* diny, T* dout, int batch, int channels, int num) {
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const T* din_ptr = dinx + offset;
const T diny_data = diny[j];
T* dout_ptr = dout + offset;
int cnt = num >> 2;
int remain = num % 4;
for (int k = 0; k < cnt; ++k) {
register T dinx0 = din_ptr[0];
register T dinx1 = din_ptr[1];
register T dinx2 = din_ptr[2];
register T dinx3 = din_ptr[3];
dout_ptr[0] = dinx0 % diny_data;
dout_ptr[1] = dinx1 % diny_data;
dout_ptr[2] = dinx2 % diny_data;
dout_ptr[3] = dinx3 % diny_data;
din_ptr += 4;
dout_ptr += 4;
}
if (remain > 0) {
for (int p = 0; p < remain; p++) {
*dout_ptr++ = *din_ptr++ % diny_data;
}
}
}
}
}
template <typename T>
void elementwise_mod(const T* dinx, const T* diny, T* dout, int num) {
int cnt = num >> 2;
int remain = num % 4;
#pragma omp parallel for
for (int i = 0; i < cnt; i++) {
const T* dinx_ptr = dinx + (i << 2);
const T* diny_ptr = diny + (i << 2);
T* dout_ptr = dout + (i << 2);
register T dinx0 = dinx_ptr[0];
register T dinx1 = dinx_ptr[1];
register T dinx2 = dinx_ptr[2];
register T dinx3 = dinx_ptr[3];
register T diny0 = diny_ptr[0];
register T diny1 = diny_ptr[1];
register T diny2 = diny_ptr[2];
register T diny3 = diny_ptr[3];
dout_ptr[0] = dinx0 % diny0;
dout_ptr[1] = dinx1 % diny1;
dout_ptr[2] = dinx2 % diny2;
dout_ptr[3] = dinx3 % diny3;
}
if (remain > 0) {
const T* dinx_ptr = dinx + (cnt << 2);
const T* diny_ptr = diny + (cnt << 2);
T* dout_ptr = dout + (cnt << 2);
for (int i = 0; i < remain; i++) {
*dout_ptr++ = *dinx_ptr++ % *diny_ptr++;
}
}
}
template void elementwise_mod<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num);
template void elementwise_mod_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num);
} // namespace math
} // namespace arm
} // namespace lite
......
......@@ -253,6 +253,13 @@ template <typename T>
void elementwise_div_relu_broadcast(
const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
template <typename T>
void elementwise_mod(const T* dinx, const T* diny, T* dout, int num);
template <typename T>
void elementwise_mod_broadcast(
const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
} // namespace math
} // namespace arm
} // namespace lite
......
......@@ -25,6 +25,7 @@
#include "lite/backends/arm/math/axpy.h"
#include "lite/backends/arm/math/beam_search.h"
#include "lite/backends/arm/math/box_coder.h"
#include "lite/backends/arm/math/clip.h"
#include "lite/backends/arm/math/col_im_transform.h"
#include "lite/backends/arm/math/concat.h"
#include "lite/backends/arm/math/conv_block_utils.h"
......
......@@ -531,7 +531,7 @@ void softmax_inner1_large_axis<float>(const float* din,
}
float32x2_t vhmax = vmax_f32(vget_high_f32(vmax), vget_low_f32(vmax));
float max_data = std::max(vget_lane_f32(vhmax, 0), vget_lane_f32(vhmax, 1));
for (j = 4 * j; j < axis_size; ++j) {
for (j = 4 * nn; j < axis_size; ++j) {
max_data = std::max(max_data, din_max_ptr[0]);
din_max_ptr++;
}
......@@ -557,7 +557,7 @@ void softmax_inner1_large_axis<float>(const float* din,
float32x2_t vhsum = vadd_f32(vget_high_f32(vsum), vget_low_f32(vsum));
float sum_data = vget_lane_f32(vhsum, 0) + vget_lane_f32(vhsum, 1);
for (j = 4 * j; j < axis_size; ++j) {
for (j = 4 * nn; j < axis_size; ++j) {
dout_sum_ptr[0] = expf(din_sum_ptr[0] - max_data);
sum_data += dout_sum_ptr[0];
din_sum_ptr++;
......
......@@ -41,6 +41,8 @@
<< "CUDA: " << cudaGetErrorString(e); \
}
#define CUDA_POST_KERNEL_CHECK CUDA_CALL(cudaPeekAtLastError())
#define CUBLAS_CALL(func) \
{ \
auto e = (func); \
......@@ -127,6 +129,10 @@ static const char* CudnnGetErrorInfo(cudnnStatus_t status) {
return "CUDNN_STATUS_RUNTIME_IN_PROGRESS";
case CUDNN_STATUS_RUNTIME_FP_OVERFLOW:
return "CUDNN_STATUS_RUNTIME_FP_OVERFLOW";
#endif
#if CUDNN_VERSION_MIN(8, 0, 0)
case CUDNN_STATUS_VERSION_MISMATCH:
return "CUDNN_STATUS_VERSION_MISMATCH";
#endif
}
return "Unknown cudnn status";
......
......@@ -13,6 +13,8 @@ nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps})
nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps})
nv_library(cuda_gemm SRCS gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_strided_gemm SRCS strided_gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_sequence_padding SRCS sequence_padding.cu DEPS ${cuda_static_deps})
set (
math_cuda
......@@ -25,6 +27,8 @@ set (
cudnn_pool
cuda_gemm
cuda_batched_gemm
cuda_strided_gemm
cuda_sequence_padding
)
set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda")
......@@ -161,15 +161,17 @@ bool CudnnConv2D<T, Ptype_out>::create(const operators::ConvParam& param,
search_func);
} else {
CUDNN_CHECK(
cudnnGetConvolutionForwardAlgorithm(this->handle_,
int requestedAlgoCount = 1;
int returnedAlgoCount;
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(this->handle_,
this->input_desc_,
this->filter_desc_,
this->conv_desc_,
this->output_desc_,
this->preference_,
this->workspace_limit_bytes_,
&this->fwd_algo_));
requestedAlgoCount,
&returnedAlgoCount,
&this->algo_perf_));
this->fwd_algo_ = this->algo_perf_.algo;
}
CUDNN_CHECK(
cudnnGetConvolutionForwardWorkspaceSize(this->handle_,
......
......@@ -81,6 +81,7 @@ class CudnnConv2DBase {
cudaStream_t stream_;
cudnnHandle_t handle_;
cudnnConvolutionFwdAlgo_t fwd_algo_;
cudnnConvolutionFwdAlgoPerf_t algo_perf_;
cudnnTensorDescriptor_t input_desc_;
cudnnTensorDescriptor_t output_desc_;
cudnnTensorDescriptor_t bias_desc_;
......@@ -98,8 +99,6 @@ class CudnnConv2DBase {
const bool use_tensor_core_ = true;
const size_t workspace_limit_bytes_ = 4 * 1024 * 1024;
const cudnnConvolutionFwdPreference_t preference_ =
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
// For int8
Tensor temp_tensor_;
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/backends/cuda/math/sequence_padding.h"
#include "lite/backends/cuda/math/utils.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
enum CopyType { kSeqToPad, kPadToSeq };
template <typename T, CopyType Type>
__global__ void SequencePadKernel(T* dst,
const T* src,
const T* pad_value,
bool is_constant_pad,
const size_t* seq_offsets,
const int seq_num,
const int pad_seq_len,
const int step_width) {
size_t seq_idx = blockIdx.y;
size_t seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx];
size_t step_idx = blockIdx.x * blockDim.y + threadIdx.y;
size_t seq_data_offset = (seq_offsets[seq_idx] + step_idx) * step_width;
size_t pad_data_offset = (seq_idx * pad_seq_len + step_idx) * step_width;
T* dst_data = dst + (Type == kSeqToPad ? pad_data_offset : seq_data_offset);
const T* src_data =
src + (Type == kSeqToPad ? seq_data_offset : pad_data_offset);
if (step_idx < seq_len) {
for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) {
dst_data[i] = src_data[i];
}
} else if (step_idx < pad_seq_len && Type == kSeqToPad) {
for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) {
dst_data[i] = is_constant_pad ? pad_value[0] : pad_value[i];
}
}
}
template <typename T>
void SequencePadding(T* pad_data,
const T* seq_data,
const T* pad_value_data,
bool is_constant_pad,
const size_t* seq_offsets_data,
int seq_num,
int pad_seq_len,
int step_width,
cudaStream_t* stream) {
const int kBlockSize = 512;
/* At least use 32 threads to copy sequence_width elements,
* and at least 8 elements for each thread.
*/
size_t block_dim_x =
std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
size_t block_dim_y = kBlockSize / block_dim_x;
dim3 threads(block_dim_x, block_dim_y);
size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = seq_num;
dim3 grid(grid_dim_x, grid_dim_y);
SequencePadKernel<T, kSeqToPad><<<grid, threads, 0, *stream>>>(
pad_data,
seq_data,
pad_value_data,
is_constant_pad,
seq_offsets_data,
seq_num,
pad_seq_len,
step_width);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
}
template <typename T>
void SequenceUnpadding(T* seq_data,
const T* pad_data,
const size_t* seq_offsets_data,
int seq_num,
int pad_seq_len,
int step_width,
cudaStream_t* stream) {
const int kBlockSize = 512;
/* At least use 32 threads to copy sequence_width elements,
* and at least 8 elements for each thread.
*/
size_t block_dim_x =
std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
size_t block_dim_y = kBlockSize / block_dim_x;
dim3 threads(block_dim_x, block_dim_y);
size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = seq_num;
dim3 grid(grid_dim_x, grid_dim_y);
SequencePadKernel<T, kPadToSeq><<<grid, threads, 0, *stream>>>(
seq_data,
pad_data,
nullptr,
false,
seq_offsets_data,
seq_num,
pad_seq_len,
step_width);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
}
template void SequencePadding(float* pad_data,
const float* seq_data,
const float* pad_value_data,
bool is_constant_pad,
const size_t* seq_offsets_data,
int seq_num,
int pad_seq_len,
int step_width,
cudaStream_t* stream);
template void SequencePadding(half* pad_data,
const half* seq_data,
const half* pad_value_data,
bool is_constant_pad,
const size_t* seq_offsets_data,
int seq_num,
int pad_seq_len,
int step_width,
cudaStream_t* stream);
template void SequenceUnpadding(float* seq_data,
const float* pad_data,
const size_t* seq_offsets_data,
int seq_num,
int pad_seq_len,
int step_width,
cudaStream_t* stream);
template void SequenceUnpadding(half* seq_data,
const half* pad_data,
const size_t* seq_offsets_data,
int seq_num,
int pad_seq_len,
int step_width,
cudaStream_t* stream);
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include <string>
#include <vector>
#include "lite/core/context.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename T>
void SequenceUnpadding(T* seq_data,
const T* pad_data,
const size_t* seq_offsets_data,
int seq_num,
int pad_seq_len,
int step_width,
cudaStream_t* stream);
template <typename T>
void SequencePadding(T* pad_data,
const T* seq_data,
const T* pad_value_data,
bool is_constant_pad,
const size_t* seq_offsets_data,
int seq_num,
int pad_seq_len,
int step_width,
cudaStream_t* stream);
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/cuda/math/strided_gemm.h"
#include <iostream>
#include "lite/core/device_info.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename PtypeIn, typename PtypeOut>
bool StridedGemm<PtypeIn, PtypeOut>::init(const bool trans_a,
const bool trans_b,
Context<TARGET(kCUDA)>* ctx) {
if (cu_handle_ == nullptr) {
this->exe_stream_ = ctx->exec_stream();
CUBLAS_CALL(cublasCreate(&cu_handle_));
CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_));
}
cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N;
return true;
}
template <>
bool StridedGemm<float, float>::run(const float alpha,
const float beta,
const int m,
const int n,
const int k,
const float* a_data,
const float* b_data,
float* c_data,
const int batch_size,
const int64_t stride_a,
const int64_t stride_b) {
lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m;
ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? n : k;
ldc_ = n;
m_ = m;
n_ = n;
k_ = k;
const int64_t stride_c = m_ * n_;
CUBLAS_CALL(cublasGemmStridedBatchedEx(cu_handle_,
cu_trans_b_,
cu_trans_a_,
n_,
m_,
k_,
&alpha,
b_data,
CUDA_R_32F,
ldb_,
stride_b,
a_data,
CUDA_R_32F,
lda_,
stride_a,
&beta,
c_data,
CUDA_R_32F,
ldc_,
stride_c,
batch_size,
CUDA_R_32F,
algo_));
return true;
}
template <>
bool StridedGemm<half, half>::run(const half alpha,
const half beta,
const int m,
const int n,
const int k,
const half* a_data,
const half* b_data,
half* c_data,
const int batch_size,
const int64_t stride_a,
const int64_t stride_b) {
lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m;
ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? n : k;
ldc_ = n;
m_ = m;
n_ = n;
k_ = k;
const int64_t stride_c = m_ * n_;
CUBLAS_CALL(cublasGemmStridedBatchedEx(cu_handle_,
cu_trans_b_,
cu_trans_a_,
n_,
m_,
k_,
&alpha,
b_data,
CUDA_R_16F,
ldb_,
stride_b,
a_data,
CUDA_R_16F,
lda_,
stride_a,
&beta,
c_data,
CUDA_R_16F,
ldc_,
stride_c,
batch_size,
CUDA_R_16F,
algo_));
return true;
}
template class StridedGemm<float, float>;
template class StridedGemm<half, half>;
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cudnn.h>
#include <string>
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/core/context.h"
#include "lite/core/target_wrapper.h"
#include "lite/operators/op_params.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename PtypeIn, typename PtypeOut>
class StridedGemm {
public:
StridedGemm() : cu_handle_(nullptr) {}
~StridedGemm() {}
bool init(const bool trans_a,
const bool trans_b,
Context<TARGET(kCUDA)>* ctx);
bool run(const PtypeIn alpha,
const PtypeIn beta,
const int m,
const int n,
const int k,
const PtypeIn* a_data,
const PtypeIn* b_data,
PtypeOut* c_data,
const int batch_size,
const int64_t stride_a,
const int64_t stride_b);
private:
cudaStream_t exe_stream_;
cublasHandle_t cu_handle_;
cublasOperation_t cu_trans_a_;
cublasOperation_t cu_trans_b_;
int m_{-1};
int n_{-1};
int k_{-1};
int lda_{-1};
int ldb_{-1};
int ldc_{-1};
cublasGemmAlgo_t algo_{CUBLAS_GEMM_DEFAULT_TENSOR_OP};
};
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
......@@ -174,24 +174,9 @@ void Transpose<T>::transpose(T* dst,
TransposeCUDAImpl<T>(src_dims, axes, src, dst, &Y_dims_, &strides_, stream);
}
// template <typename T>
// void Transpose<T>::transpose(T* dst,
// const T* src,
// const std::vector<int>& src_dims,
// const std::vector<int>& axes,
// cudaStream_t* stream) {
// std::vector<int64_t> _src_dims(src_dims.size(), 0);
// std::transform(
// src_dims.begin(),
// src_dims.end(),
// _src_dims.begin(),
// [](int data) -> int64_t { return static_cast<int64_t>(data); });
// TransposeCUDAImpl<T>(_src_dims, axes, src, dst, &Y_dims_, &strides_,
// stream);
//}
template class Transpose<int8_t>;
template class Transpose<float>;
template class Transpose<half>;
} // namespace math
} // namespace cuda
......
......@@ -15,6 +15,7 @@
#include "lite/backends/mlu/target_wrapper.h"
#include <memory>
#include <utility>
#include "lite/backends/mlu/mlu_utils.h"
......@@ -36,6 +37,13 @@ void cnrtMemcpyDtoH(void* dst, const void* src, size_t size) {
} // namespace mlu
thread_local cnmlCoreVersion_t TargetWrapperMlu::mlu_core_version_{CNML_MLU270};
thread_local int TargetWrapperMlu::mlu_core_number_{1};
thread_local bool TargetWrapperMlu::use_first_conv_{false};
thread_local std::vector<float> TargetWrapperMlu::mean_vec_;
thread_local std::vector<float> TargetWrapperMlu::std_vec_;
thread_local DataLayoutType TargetWrapperMlu::input_layout_{DATALAYOUT(kNCHW)};
size_t TargetWrapperMlu::num_devices() {
uint32_t dev_count = 0;
CNRT_CALL(cnrtGetDeviceCount(&dev_count)) << " cnrt get device count failed";
......@@ -77,15 +85,42 @@ void TargetWrapperMlu::MemcpySync(void* dst,
LOG(FATAL) << "Unsupported IoDirection" << static_cast<int>(dir);
}
}
void TargetWrapperMlu::SetMLURunMode(
lite_api::MLUCoreVersion core_version,
int core_number,
DataLayoutType input_layout,
std::pair<std::vector<float>, std::vector<float>> firstconv_param) {
switch (core_version) {
case (lite_api::MLUCoreVersion::MLU_220):
mlu_core_version_ = CNML_MLU220;
break;
case (lite_api::MLUCoreVersion::MLU_270):
mlu_core_version_ = CNML_MLU270;
break;
default:
mlu_core_version_ = CNML_MLU270;
break;
}
mlu_core_number_ = core_number;
mean_vec_ = firstconv_param.first;
std_vec_ = firstconv_param.second;
use_first_conv_ = !(mean_vec_.empty() || std_vec_.empty());
input_layout_ = input_layout;
}
cnmlCoreVersion_t TargetWrapperMlu::MLUCoreVersion() {
return mlu_core_version_;
}
int TargetWrapperMlu::MLUCoreNumber() { return mlu_core_number_; }
bool TargetWrapperMlu::UseFirstConv() { return use_first_conv_; }
const std::vector<float>& TargetWrapperMlu::MeanVec() { return mean_vec_; }
const std::vector<float>& TargetWrapperMlu::StdVec() { return std_vec_; }
// void TargetWrapperMlu::MemcpyAsync(void* dst,
// const void* src,
// size_t size,
// IoDirection dir,
// const stream_t& stream) {
// LOG(WARNING) << "Mlu unsupported MemcpyAsync now, use MemcpySync.";
// MemcpySync(dst, src, size, dir);
// }
DataLayoutType TargetWrapperMlu::InputLayout() { return input_layout_; }
} // namespace lite
} // namespace paddle
......@@ -13,6 +13,8 @@
// limitations under the License.
#pragma once
#include <utility>
#include <vector>
#include "lite/backends/mlu/mlu_utils.h"
#include "lite/core/target_wrapper.h"
......@@ -43,11 +45,25 @@ class TargetWrapper<TARGET(kMLU)> {
const void* src,
size_t size,
IoDirection dir);
// static void MemcpyAsync(void* dst,
// const void* src,
// size_t size,
// IoDirection dir,
// const queue_t& queue);
static void SetMLURunMode(
lite_api::MLUCoreVersion core_version,
int core_number,
DataLayoutType input_layout,
std::pair<std::vector<float>, std::vector<float>> firstconv_param);
static cnmlCoreVersion_t MLUCoreVersion();
static int MLUCoreNumber();
static bool UseFirstConv();
static const std::vector<float>& MeanVec();
static const std::vector<float>& StdVec();
static DataLayoutType InputLayout();
private:
static thread_local cnmlCoreVersion_t mlu_core_version_;
static thread_local int mlu_core_number_;
static thread_local bool use_first_conv_;
static thread_local std::vector<float> mean_vec_;
static thread_local std::vector<float> std_vec_;
static thread_local DataLayoutType input_layout_;
};
} // namespace lite
......
......@@ -20,94 +20,120 @@ namespace paddle {
namespace lite {
namespace npu {
bool WriteToOMFile(const domi::ModelBufferData& om_model_buff,
std::string om_file_path) {
FILE* fp;
fp = fopen(om_file_path.c_str(), "wb");
CHECK(fp != nullptr) << om_file_path << " open failed!";
uint32_t write_size =
(uint32_t)fwrite(om_model_buff.data, 1, om_model_buff.length, fp);
CHECK_EQ(write_size, om_model_buff.length) << "write om file failed !";
fclose(fp);
return true;
}
bool ReadFromOMFile(domi::ModelBufferData* om_model_buff,
std::string om_file_path) {
FILE* fp;
fp = fopen(om_file_path.c_str(), "rb");
CHECK(fp != nullptr) << om_file_path << " open failed!";
fseek(fp, 0, SEEK_END);
uint32_t model_length = (uint32_t)ftell(fp);
fseek(fp, 0, SEEK_SET);
om_model_buff->data = malloc(model_length);
om_model_buff->length = model_length;
uint32_t read_size =
(uint32_t)fread(om_model_buff->data, 1, model_length, fp);
CHECK_EQ(read_size, model_length) << "read om file failed !";
fclose(fp);
return true;
std::shared_ptr<hiai::AiModelMngerClient> Device::Load(
const std::string& model_name,
std::vector<char>* model_buffer,
bool* model_comp) {
// Create a HiAI model manager client to load the HiAI om model
auto model_client = std::make_shared<hiai::AiModelMngerClient>();
if (model_client->Init(nullptr) != hiai::AI_SUCCESS) {
LOG(WARNING) << "[NPU] Init hiai model client failed!";
return nullptr;
}
// Check HiAI DDK version
const char* ddk_version = model_client->GetVersion();
if (ddk_version) {
LOG(INFO) << "[NPU] HiAI DDK version: " << ddk_version;
} else {
LOG(WARNING) << "[NPU] Unable to get HiAI DDK version!";
}
// Check model compatibility
auto model_desc = std::make_shared<hiai::AiModelDescription>(
model_name, freq_level(), framework_type(), model_type(), device_type());
model_desc->SetModelBuffer(
reinterpret_cast<const void*>(model_buffer->data()),
model_buffer->size());
if (!*model_comp &&
model_client->CheckModelCompatibility(*model_desc, *model_comp) !=
hiai::AI_SUCCESS) {
*model_comp = false;
VLOG(3) << "[NPU] model is NOT compatiblitiable, setting model_comp to "
<< *model_comp;
} else {
*model_comp = true;
VLOG(3) << "[NPU] model is compatiblitiable, setting model_comp to "
<< *model_comp;
}
// Rebuild and write the data of the compatible model to the model buffer
if (!*model_comp) {
std::shared_ptr<hiai::AiModelBuilder> model_builder =
std::make_shared<hiai::AiModelBuilder>(model_client);
hiai::MemBuffer* org_model_buffer = model_builder->InputMemBufferCreate(
reinterpret_cast<void*>(model_buffer->data()), model_buffer->size());
if (org_model_buffer) {
std::vector<hiai::MemBuffer*> org_model_buffers;
org_model_buffers.push_back(org_model_buffer);
hiai::MemBuffer* new_model_buffer = model_builder->OutputMemBufferCreate(
framework_type(), org_model_buffers);
// VLOG(3) << "[NPU] new model buffer memeory size is " <<
// new_model_buffer->GetMemBufferSize();
if (new_model_buffer) {
uint32_t new_model_size = 0;
if (model_builder->BuildModel(org_model_buffers,
new_model_buffer,
new_model_size) == hiai::AI_SUCCESS) {
// need to change to new_model_size as GetMemBufferSize is not
// correct.
model_buffer->resize(new_model_size);
memcpy(reinterpret_cast<void*>(model_buffer->data()),
new_model_buffer->GetMemBufferData(),
new_model_size);
// Reset the model buffer
model_desc->SetModelBuffer(
reinterpret_cast<const void*>(model_buffer->data()),
model_buffer->size());
VLOG(3) << "[NPU] Rebuild the compatible model done.";
} else {
LOG(WARNING) << "[NPU] Rebuild the compatible model failed!";
}
model_builder->MemBufferDestroy(new_model_buffer);
} else {
LOG(WARNING) << "[NPU] OutputMemBufferCreate failed!";
}
model_builder->MemBufferDestroy(org_model_buffer);
} else {
LOG(WARNING) << "[NPU] InputMemBufferCreate failed!";
}
}
// Load the compatible model
std::vector<std::shared_ptr<hiai::AiModelDescription>> model_descs{
model_desc};
if (model_client->Load(model_descs) != hiai::AI_SUCCESS) {
LOG(WARNING) << "[NPU] AiModelMngerClient load model failed!";
return nullptr;
}
VLOG(3) << "[NPU] Load model done.";
return model_client;
}
std::shared_ptr<hiai::AiModelMngerClient> Device::Build(
const std::string model_name, // NOLINT
std::vector<ge::Operator>& input_nodes, // NOLINT
bool Device::Build(std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes, // NOLINT
const std::string model_cache_full_dir = "" // NOLINT
) {
VLOG(3) << "[NPU] Build model";
// Build the HiAI IR graph to the HiAI om model
std::vector<char>* model_buffer) {
// Convert the HiAI IR graph to the HiAI om model
ge::Graph ir_graph("graph");
ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes);
ge::Model om_model("model", "model");
om_model.SetGraph(ir_graph);
domi::HiaiIrBuild ir_build;
domi::ModelBufferData om_model_buf;
if (!model_cache_full_dir.empty() && IsFileExists(model_cache_full_dir)) {
VLOG(3) << "Will read om model from " << model_cache_full_dir;
ReadFromOMFile(&om_model_buf, model_cache_full_dir);
} else {
if (!ir_build.CreateModelBuff(om_model, om_model_buf)) {
// Build the HiAI om model, serialize and output it to the om buffer
domi::HiaiIrBuild ir_build;
domi::ModelBufferData om_buffer;
if (!ir_build.CreateModelBuff(om_model, om_buffer)) {
LOG(WARNING) << "[NPU] CreateModelBuff failed!";
return nullptr;
return false;
}
if (!ir_build.BuildIRModel(om_model, om_model_buf)) {
if (!ir_build.BuildIRModel(om_model, om_buffer)) {
LOG(WARNING) << "[NPU] BuildIRModel failed!";
ir_build.ReleaseModelBuff(om_model_buf);
return nullptr;
ir_build.ReleaseModelBuff(om_buffer);
return false;
}
if (!model_cache_full_dir.empty()) {
VLOG(3) << "Will write om model to " << model_cache_full_dir;
WriteToOMFile(om_model_buf, model_cache_full_dir);
}
}
// Create a HiAI model manager client to load the HiAI om model
std::shared_ptr<hiai::AiModelMngerClient> model_client(
new hiai::AiModelMngerClient());
if (model_client->Init(nullptr) != hiai::AI_SUCCESS) {
LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!";
ir_build.ReleaseModelBuff(om_model_buf);
return nullptr;
}
auto model_desc = std::make_shared<hiai::AiModelDescription>(
model_name, freq_level(), framework_type(), model_type(), device_type());
model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length);
std::vector<std::shared_ptr<hiai::AiModelDescription>> model_descs;
model_descs.push_back(model_desc);
if (model_client->Load(model_descs) != hiai::AI_SUCCESS) {
LOG(WARNING) << "[NPU] AiModelMngerClient load model failed!";
ir_build.ReleaseModelBuff(om_model_buf);
return nullptr;
}
ir_build.ReleaseModelBuff(om_model_buf);
VLOG(3) << "[NPU] Build done";
return model_client;
model_buffer->resize(om_buffer.length);
memcpy(reinterpret_cast<void*>(model_buffer->data()),
reinterpret_cast<void*>(om_buffer.data),
om_buffer.length);
ir_build.ReleaseModelBuff(om_buffer);
VLOG(3) << "[NPU] Build model done.";
return true;
}
} // namespace npu
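Putting the two entry points together, the flow the NPU bridges are expected to follow is roughly the sketch below (same assumptions as above; error handling trimmed).
  // Sketch: build the IR graph into an om buffer, then hand that buffer to Load().
  std::vector<char> model_buffer;
  if (!lite::npu::Device::Global().Build(input_nodes, output_nodes, &model_buffer)) {
    LOG(WARNING) << "[NPU] Build om model failed!";
    return false;
  }
  bool model_comp = true;  // freshly built, so assume it matches this device
  auto model_client =
      lite::npu::Device::Global().Load("subgraph_0", &model_buffer, &model_comp);
  return model_client != nullptr;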
......
......@@ -38,14 +38,18 @@ class Device {
int model_type() { return model_type_; }
int device_type() { return device_type_; }
// Load the HiAI om model from buffer, rebuild the model if it's incompatible
// with the current device, then create a HiAI model manager client (from HiAI
// Server) to run inference
std::shared_ptr<hiai::AiModelMngerClient> Load(
const std::string& model_name,
std::vector<char>* model_buffer,
bool* model_comp);
// Build the HiAI IR graph to the om model, serialize it and write the result
// into the model buffer.
std::shared_ptr<hiai::AiModelMngerClient> Build(
const std::string model_name, // NOLINT
std::vector<ge::Operator>& input_nodes, // NOLINT
bool Build(std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes, // NOLINT
const std::string model_cache_name // NOLINT
); // NOLINT
std::vector<char>* model_buffer);
private:
int freq_level_{3};
......
......@@ -119,7 +119,7 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) {
}
}
cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size,
cl::NDRange CLContext::LocalWorkSizeTune(cl::NDRange global_work_size,
size_t max_work_size,
int divisor) {
int preferred_lws = 0;
......@@ -157,7 +157,7 @@ cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size,
static_cast<size_t>(gws0)};
#endif
}
cl::NDRange CLContext::LocalWorkSizeTurnReverse(cl::NDRange global_work_size,
cl::NDRange CLContext::LocalWorkSizeTuneReverse(cl::NDRange global_work_size,
size_t max_work_size,
int divisor) {
int preferred_lws = 0;
......
......@@ -62,10 +62,10 @@ class CLContext {
cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size);
cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size,
cl::NDRange LocalWorkSizeTune(cl::NDRange global_work_size,
size_t max_work_size,
int divitor = 2);
cl::NDRange LocalWorkSizeTurnReverse(cl::NDRange global_work_size,
cl::NDRange LocalWorkSizeTuneReverse(cl::NDRange global_work_size,
size_t max_work_size,
int divitor = 2);
bool IsArmMali();
......
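For reference, a host-side sketch of how the renamed tuning helper might be used when enqueuing a kernel. Only LocalWorkSizeTune comes from this patch; the device, context and queue objects are assumed to exist.
  cl::NDRange global_work_size(256, 64, 4);
  size_t max_work_size =
      device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();  // assumed cl::Device
  cl::NDRange local_work_size =
      cl_context.LocalWorkSizeTune(global_work_size, max_work_size, /*divisor=*/2);
  queue.enqueueNDRangeKernel(
      kernel, cl::NullRange, global_work_size, local_work_size);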
......@@ -405,7 +405,9 @@ void fc_gemm_4x4(__global const CL_DTYPE* a,
} else {
for (int cidx = col; cidx < N; ++cidx) {
for (int ridx = row; ridx < M; ++ridx) {
CL_COMPUTE_DTYPE a0, b0, c0 = bias ? bias[cidx] : 0;
CL_COMPUTE_DTYPE a0 = 0;
CL_COMPUTE_DTYPE b0 = 0;
CL_COMPUTE_DTYPE c0 = bias ? bias[cidx] : 0;
for (int p = 0; p < K; ++p) {
a0 = *(a + ridx * K + p);
b0 = *(b + p * N + cidx),
......
......@@ -6,9 +6,7 @@ __kernel void conv2d_1x1_opt(
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
......@@ -284,9 +282,7 @@ __kernel void conv2d_1x1_simple(
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
......
......@@ -19,9 +19,7 @@ __kernel void conv2d_3x3(__private const int global_size_dim0,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
......
......@@ -19,9 +19,7 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
......@@ -264,9 +262,7 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
......
......@@ -5,9 +5,7 @@ __kernel void conv2d_5x5(__private const int global_size_dim0,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
......
......@@ -20,9 +20,7 @@ __kernel void conv2d_5x5_opt(__private const int item_ch,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
......@@ -268,9 +266,7 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
......
......@@ -5,9 +5,7 @@ __kernel void conv2d_7x7(__private const int global_size_dim0,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
......
......@@ -20,9 +20,7 @@ __kernel void conv2d_7x7_opt(__private const int item_ch,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
......@@ -268,9 +266,7 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
......
......@@ -19,9 +19,7 @@ __kernel void depth_conv2d(__private const int global_size_dim0,
__private const int global_size_dim2,
__read_only image2d_t input,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
......
......@@ -20,9 +20,7 @@ __kernel void depth_conv2d_3x3(
__private const int global_size_dim2,
__read_only image2d_t input,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
......@@ -249,9 +247,7 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
__private const int ou_nh,
__read_only image2d_t input,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void transpose_4d(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_H,
__private const int out_W,
__private const int in_W) {
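  // Each work-item writes a single CL_DTYPE4 pixel that packs four consecutive
  // output channels (out_c * 4 .. out_c * 4 + 3); the out_C range checks below
  // zero-fill the unused lanes when out_C is not a multiple of 4.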
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_n = 1;
const int out_h = out_nh % out_H;
const int out_c0 = out_c * 4;
const int out_c1 = out_c * 4 + 1;
const int out_c2 = out_c * 4 + 2;
const int out_c3 = out_c * 4 + 3;
const int in_n = out_n;
const int in_c = out_w * 0.25;
const int in_h0 = out_c0;
const int in_h1 = out_c1;
const int in_h2 = out_c2;
const int in_h3 = out_c3;
const int in_w = out_h;
int2 output_pos;
output_pos.x = out_c * out_W + out_w;
output_pos.y = out_nh;
int2 input_pos0;
int2 input_pos1;
int2 input_pos2;
int2 input_pos3;
input_pos0.x = in_W * in_c + in_w;
input_pos0.y = in_n * in_h0;
input_pos1.x = in_W * in_c + in_w;
input_pos1.y = in_n * in_h1;
input_pos2.x = in_W * in_c + in_w;
input_pos2.y = in_n * in_h2;
input_pos3.x = in_W * in_c + in_w;
input_pos3.y = in_n * in_h3;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 input0;
CL_DTYPE4 input1;
CL_DTYPE4 input2;
CL_DTYPE4 input3;
CL_DTYPE4 output;
input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos0);
if (out_w % 4 == 0) {
output.x = input0.x;
} else if (out_w % 4 == 1) {
output.x = input0.y;
} else if (out_w % 4 == 2) {
output.x = input0.z;
} else {
output.x = input0.w;
}
if (out_C - out_c * 4 >= 2) {
input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos1);
if(out_w % 4 == 0) {
output.y = input1.x;
} else if(out_w % 4 == 1) {
output.y = input1.y;
} else if(out_w % 4 == 2) {
output.y = input1.z;
} else {
output.y = input1.w;
}
} else {
output.y = 0.0f;
}
if (out_C - out_c * 4 >= 3) {
input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos2);
if (out_w % 4 == 0){
output.z = input2.x;
} else if (out_w % 4 == 1) {
output.z = input2.y;
} else if (out_w % 4 == 2) {
output.z = input2.z;
} else {
output.z = input2.w;
}
} else {
output.z = 0.0f;
}
if (out_C - out_c * 4 >= 4) {
input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos3);
if (out_w % 4 == 0) {
output.w = input3.x;
} else if (out_w % 4 == 1) {
output.w = input3.y;
} else if (out_w % 4 == 2) {
output.w = input3.z;
} else {
output.w = input3.w;
}
} else {
output.w = 0.0f;
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
}
__kernel void transpose(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_H,
__private const int out_W,
__private const int in_W) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_n = 1;
const int out_h = out_nh % out_H;
const int in_n = 1;
const int in_c = out_c;
const int in_w = out_h;
const int in_h = out_w;
int2 input_pos;
int2 output_pos;
input_pos.x = in_c * in_W + in_w;
input_pos.y = in_n * in_h;
output_pos.x = out_c * out_W + out_w;
output_pos.y = out_n * out_h;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 input;
CL_DTYPE4 output;
input = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos);
output = input;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input);
}
\ No newline at end of file
......@@ -20,8 +20,8 @@ limitations under the License. */
#include "lite/backends/x86/cupti_lib_path.h"
#include "lite/backends/x86/port.h"
#include "lite/backends/x86/warpctc_lib_path.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/env.h"
#include "lite/utils/paddle_enforce.h"
// DEFINE_string(cudnn_dir,
// "",
......@@ -178,7 +178,7 @@ auto error_msg =
#endif // !_WIN32
if (throw_on_error) {
CHECK(dso_handle != nullptr);
// PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, errorno);
// CHECK(nullptr != dso_handle, error_msg, dlPath, errorno);
} else if (nullptr == dso_handle) {
// LOG(WARNING) << string::Sprintf(error_msg, dlPath, errorno);
}
......
......@@ -319,8 +319,8 @@ void BenchKernelSgd() {
const T lr = 0.1;
auto UnDuplicatedRandomVec = [](
int n, const int64_t lower, const int64_t upper) -> std::vector<int64_t> {
PADDLE_ENFORCE_LE(static_cast<size_t>(upper - lower), n - 1);
PADDLE_ENFORCE_GT(n, 0);
CHECK_LE(static_cast<size_t>(upper - lower), n - 1);
CHECK_GT(n, 0);
std::vector<int64_t> all, out;
for (int i = 0; i < n; ++i) {
all.push_back(i);
......
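The x86 hunks that follow are largely a mechanical migration from the printf-style PADDLE_ENFORCE* macros in lite/utils/paddle_enforce.h to the stream-style CHECK* macros from lite/utils/cp_logging.h. The rewrite pattern, using two lines taken from the hunks below, is:
  // Before: condition and message arguments packed into one macro call.
  //   PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet");
  //   PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size);
  // After: comparison-only macro, with the diagnostic text streamed on failure.
  CHECK_EQ(m_, 1) << "Only support m==1 yet";
  CHECK(ptr) << "Fail to allocate GenBase CPU memory: size = " << size;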
......@@ -129,11 +129,11 @@ class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
}
std::unique_ptr<GenBase> CreateJitCode(
const emb_seq_pool_attr_t& attr) const override {
PADDLE_ENFORCE_GT(attr.table_height, 0);
PADDLE_ENFORCE_GT(attr.table_width, 0);
PADDLE_ENFORCE_GT(attr.index_height, 0);
PADDLE_ENFORCE_GT(attr.index_width, 0);
PADDLE_ENFORCE_GT(attr.out_width, 0);
CHECK_GT(attr.table_height, 0);
CHECK_GT(attr.table_width, 0);
CHECK_GT(attr.index_height, 0);
CHECK_GT(attr.index_width, 0);
CHECK_GT(attr.out_width, 0);
return make_unique<EmbSeqPoolJitCode>(attr, CodeSize(attr));
}
};
......
......@@ -17,7 +17,7 @@
#include <string>
#include "lite/backends/x86/jit/gen/jitcode.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
......
......@@ -27,7 +27,7 @@ void MatMulJitCode::genCode() {
preCode();
int block, rest;
const auto groups = packed_groups(n_, k_, &block, &rest);
PADDLE_ENFORCE_GT(groups.front(), 0);
CHECK_GT(groups.front(), 0);
const int block_len = sizeof(float) * block;
const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1;
......@@ -116,9 +116,9 @@ class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
}
std::unique_ptr<GenBase> CreateJitCode(
const matmul_attr_t& attr) const override {
PADDLE_ENFORCE_GT(attr.m, 0);
PADDLE_ENFORCE_GT(attr.n, 0);
PADDLE_ENFORCE_GT(attr.k, 0);
CHECK_GT(attr.m, 0);
CHECK_GT(attr.n, 0);
CHECK_GT(attr.k, 0);
return make_unique<MatMulJitCode>(attr, CodeSize(attr));
}
};
......
......@@ -19,7 +19,7 @@
#include <vector>
#include "lite/backends/x86/jit/gen/jitcode.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
......@@ -32,7 +32,7 @@ class MatMulJitCode : public JitCode {
size_t code_size = 256 * 1024,
void* code_ptr = nullptr)
: JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) {
PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet");
CHECK_EQ(m_, 1) << "Only support m==1 yet";
this->genCode();
}
......
......@@ -69,8 +69,8 @@ class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
}
std::unique_ptr<GenBase> CreateJitCode(
const seq_pool_attr_t& attr) const override {
PADDLE_ENFORCE_GT(attr.w, 0);
PADDLE_ENFORCE_GT(attr.h, 0);
CHECK_GT(attr.w, 0);
CHECK_GT(attr.h, 0);
return make_unique<SeqPoolJitCode>(attr, CodeSize(attr));
}
};
......
......@@ -17,7 +17,7 @@
#include <string>
#include "lite/backends/x86/jit/gen/jitcode.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
......@@ -125,8 +125,8 @@ class SeqPoolJitCode : public JitCode {
vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
reg_idx++;
}
PADDLE_ENFORCE_EQ(
reg_idx, rest_used_num_regs, "All heights should use same regs");
CHECK_EQ(reg_idx, rest_used_num_regs)
<< "All heights should use same regs";
for (int i = 0; i < reg_idx; ++i) {
vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs));
}
......
......@@ -17,7 +17,7 @@
#include <memory>
#include <vector>
#include "lite/backends/x86/jit/registry.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -113,9 +113,9 @@ class SgdCreator : public JitCodeCreator<sgd_attr_t> {
}
std::unique_ptr<GenBase> CreateJitCode(
const sgd_attr_t& attr) const override {
PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width);
PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height);
PADDLE_ENFORCE_GE(attr.selected_rows_size, 0);
CHECK_EQ(attr.param_width, attr.grad_width);
CHECK_LE(attr.selected_rows_size, attr.grad_height);
CHECK_GE(attr.selected_rows_size, 0);
return make_unique<SgdJitCode>(attr, CodeSize(attr));
}
};
......
......@@ -16,7 +16,7 @@
#include <memory>
#include <vector>
#include "lite/backends/x86/jit/registry.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -76,7 +76,7 @@ class VBroadcastCreator : public JitCodeCreator<int64_t> {
return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8;
}
std::unique_ptr<GenBase> CreateJitCode(const int64_t& w) const override {
PADDLE_ENFORCE_GT(w, 0);
CHECK_GT(w, 0);
return make_unique<VBroadcastJitCode>(w, CodeSize(w));
}
};
......
......@@ -21,8 +21,8 @@
// posix_memalign
#include "lite/backends/x86/cpu_info.h"
#include "lite/backends/x86/jit/macro.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/env.h"
#include "lite/utils/paddle_enforce.h"
#ifndef _WIN32
#define posix_memalign_free free
......@@ -62,12 +62,10 @@ void* GenBase::operator new(size_t size) {
#ifdef _WIN32
ptr = _aligned_malloc(size, alignment);
#else
PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size),
0,
"GenBase Alloc %ld error!",
size);
CHECK_EQ(posix_memalign(&ptr, alignment, size), 0) << "GenBase Alloc " << size
<< " error!";
#endif
PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size);
CHECK(ptr) << "Fail to allocate GenBase CPU memory: size = " << size;
return ptr;
}
......
......@@ -14,9 +14,10 @@
#include "lite/backends/x86/jit/helper.h"
#include <algorithm> // tolower
#include <cstring>
#include <numeric>
#include <string>
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -104,12 +105,12 @@ void pack_weights<float>(const float* src, float* dst, int n, int k) {
int block, rest;
const auto groups = packed_groups(n, k, &block, &rest);
std::for_each(groups.begin(), groups.end(), [&](int i) {
PADDLE_ENFORCE_GT(i, 0, "each element of groups should be larger than 0.");
CHECK_GT(i, 0) << "each element of groups should be larger than 0.";
});
int sum = std::accumulate(groups.begin(), groups.end(), 0);
std::memset(dst, 0, k * sum * block * sizeof(float));
PADDLE_ENFORCE_GE(
sum * block, n, "The packed n should be equal to or larger than n");
CHECK_GE(sum * block, n)
<< "The packed n should be equal to or larger than n";
const int block_len = sizeof(float) * block;
int n_offset = 0;
......
......@@ -23,7 +23,7 @@
#include "lite/backends/x86/jit/kernel_base.h"
#include "lite/backends/x86/jit/kernel_key.h"
#include "lite/backends/x86/jit/kernel_pool.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -78,8 +78,8 @@ inline const Kernel* GetReferKernel() {
auto& ref_pool = ReferKernelPool::Instance().AllKernels();
KernelKey kkey(KernelTuple::kernel_type, lite::fluid::CPUPlace());
auto ref_iter = ref_pool.find(kkey);
PADDLE_ENFORCE(ref_iter != ref_pool.end(),
"Every Kernel should have reference function.");
CHECK(ref_iter != ref_pool.end())
<< "Every Kernel should have reference function.";
auto& ref_impls = ref_iter->second;
for (auto& impl : ref_impls) {
auto i = dynamic_cast<const ReferKernel<KernelTuple>*>(impl.get());
......@@ -94,7 +94,7 @@ template <typename KernelTuple>
inline typename KernelTuple::func_type GetReferFunc() {
auto ker = GetReferKernel<KernelTuple>();
auto p = dynamic_cast<const ReferKernel<KernelTuple>*>(ker);
PADDLE_ENFORCE(p, "The Refer kernel should exsit");
CHECK(p) << "The Refer kernel should exsit";
return p->GetFunc();
}
......@@ -125,7 +125,7 @@ std::vector<const Kernel*> GetAllCandidateKernels(
// The last implementation should be reference function on CPUPlace.
auto ref = GetReferKernel<KernelTuple>();
PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty.");
CHECK(ref != nullptr) << "Refer Kernel can not be empty.";
res.emplace_back(ref);
return res;
}
......@@ -140,11 +140,11 @@ GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) {
std::string name = k->ImplType();
if (name == "JitCode") {
auto i = dynamic_cast<const GenBase*>(k);
PADDLE_ENFORCE(i, "jitcode kernel cast can not fail.");
CHECK(i) << "jitcode kernel cast can not fail.";
res.emplace_back(std::make_pair(name, i->template getCode<Func>()));
} else {
auto i = dynamic_cast<const KernelMore<KernelTuple>*>(k);
PADDLE_ENFORCE(i, "kernel cast can not fail.");
CHECK(i) << "kernel cast can not fail.";
res.emplace_back(std::make_pair(name, i->GetFunc()));
}
}
......@@ -166,7 +166,7 @@ template <typename KernelTuple, typename PlaceType = lite::fluid::CPUPlace>
typename KernelTuple::func_type GetDefaultBestFunc(
const typename KernelTuple::attr_type& attr) {
auto funcs = GetAllCandidateFuncs<KernelTuple, PlaceType>(attr);
PADDLE_ENFORCE_GE(funcs.size(), 1UL);
CHECK_GE(funcs.size(), 1UL);
// Here could do some runtime benchmark of this attr and return the best one.
// But yet just get the first one as the default best one,
// which is searched in order and tuned by offline.
......
......@@ -14,7 +14,7 @@
#include "lite/backends/x86/jit/kernel_key.h"
#include <xxhash.h> // XXH64: 13.8 GB/s
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......
......@@ -18,7 +18,7 @@
#include <type_traits>
#include <vector>
#include "lite/backends/x86/jit/kernel_base.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -104,11 +104,11 @@ void EmbSeqPool(const T* table,
const int64_t* idx,
T* out,
const emb_seq_pool_attr_t* attr) {
PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width);
CHECK_EQ(attr->table_width * attr->index_width, attr->out_width);
auto check_idx_value_valid = [&](int64_t i) {
PADDLE_ENFORCE_LT(
idx[i], attr->table_height, "idx value: %d, i: %d", idx[i], i);
PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i);
CHECK_LT(idx[i], attr->table_height) << "idx value: " << idx[i]
<< " i: " << i;
CHECK_GE(idx[i], 0) << "idx value: " << idx[i] << " i: " << i;
};
for (int64_t w = 0; w != attr->index_width; ++w) {
......@@ -175,22 +175,22 @@ void Sgd(const T* lr,
const int64_t* rows,
T* out,
const sgd_attr_t* attr) {
PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width);
PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height);
CHECK_EQ(attr->param_width, attr->grad_width);
CHECK_LE(attr->selected_rows_size, attr->grad_height);
T scalar = -lr[0];
int width = attr->grad_width;
if (out == param) {
for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
auto h_idx = rows[i];
PADDLE_ENFORCE_LT(h_idx, attr->param_height);
PADDLE_ENFORCE_GE(h_idx, 0);
CHECK_LT(h_idx, attr->param_height);
CHECK_GE(h_idx, 0);
VAXPY(scalar, grad + i * width, out + h_idx * width, width);
}
} else {
for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
auto h_idx = rows[i];
PADDLE_ENFORCE_LT(h_idx, attr->param_height);
PADDLE_ENFORCE_GE(h_idx, 0);
CHECK_LT(h_idx, attr->param_height);
CHECK_GE(h_idx, 0);
VScal(&scalar, grad + i * width, out + h_idx * width, width);
VAdd(param + h_idx * width,
out + h_idx * width,
......
......@@ -22,7 +22,6 @@
#include "lite/backends/x86/jit/kernel_base.h"
#include "lite/backends/x86/jit/macro.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/paddle_enforce.h"
namespace paddle {
namespace lite {
......@@ -480,12 +479,12 @@ void EmbSeqPool(const T* table,
const int64_t* idx,
T* out,
const emb_seq_pool_attr_t* attr) {
PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width);
CHECK_EQ(attr->table_width * attr->index_width, attr->out_width);
auto check_idx_value_valid = [&](int64_t i) {
PADDLE_ENFORCE_LT(
idx[i], attr->table_height, "idx value: %d, i: %d", idx[i], i);
PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i);
CHECK_LT(idx[i], attr->table_height) << "idx value: " << idx[i]
<< " i: " << i;
CHECK_GE(idx[i], 0) << "idx value: " << idx[i] << " i: " << i;
};
for (int64_t w = 0; w != attr->index_width; ++w) {
......@@ -527,12 +526,12 @@ void Sgd(const T* lr,
const int64_t* rows,
T* out,
const lite::jit::sgd_attr_t* attr) {
PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width);
PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height);
CHECK_EQ(attr->param_width, attr->grad_width);
CHECK_LE(attr->selected_rows_size, attr->grad_height);
for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
auto h_idx = rows[i];
PADDLE_ENFORCE_LT(h_idx, attr->param_height);
PADDLE_ENFORCE_GE(h_idx, 0);
CHECK_LT(h_idx, attr->param_height);
CHECK_GE(h_idx, 0);
for (int64_t j = 0; j < attr->grad_width; ++j) {
out[h_idx * attr->grad_width + j] =
param[h_idx * attr->grad_width + j] -
......
......@@ -910,8 +910,8 @@ void TestKernelSgd() {
const T lr = 0.1;
auto UnDuplicatedRandomVec = [](
int n, const int64_t lower, const int64_t upper) -> std::vector<int64_t> {
PADDLE_ENFORCE_LE(static_cast<size_t>(upper - lower), n - 1);
PADDLE_ENFORCE_GT(n, 0);
CHECK_LE(static_cast<size_t>(upper - lower), n - 1);
CHECK_GT(n, 0);
std::vector<int64_t> all, out;
for (int i = 0; i < n; ++i) {
all.push_back(i);
......
......@@ -116,7 +116,7 @@ class BeamSearchFunctor<TARGET(kX86), T> {
lod[0].assign(high_level.begin(), high_level.end());
lod[1].assign(low_level.begin(), low_level.end());
// if (!lite::fluid::CheckLoD(lod)) {
// //PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
// //LOG(FATAL)<<"lod %s is not right", framework::LoDToString(lod));
//}
selected_ids->set_lod(lod);
selected_scores->set_lod(lod);
......
......@@ -23,7 +23,7 @@ namespace math {
MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim,
int num_flatten_cols,
bool trans) {
PADDLE_ENFORCE_GT(tensor_dim.size(), 1u);
CHECK_GT(tensor_dim.size(), 1u);
MatDescriptor retv;
if (num_flatten_cols > 1) {
auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols);
......
......@@ -287,22 +287,22 @@ struct CBlas<double> {
template <>
struct CBlas<lite::fluid::float16> {
static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); }
static void GEMM(...) { LOG(FATAL) << "float16 GEMM not supported on CPU"; }
static void SMM_GEMM(...) {
PADDLE_THROW("float16 SMM_GEMM not supported on CPU");
LOG(FATAL) << "float16 SMM_GEMM not supported on CPU";
}
static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); }
static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); }
static void VMUL(...) { LOG(FATAL) << "float16 VMUL not supported on CPU"; }
static void VEXP(...) { LOG(FATAL) << "float16 VEXP not supported on CPU"; }
static void VSQUARE(...) {
PADDLE_THROW("float16 VSQUARE not supported on CPU");
LOG(FATAL) << "float16 VSQUARE not supported on CPU";
}
static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); }
static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); };
static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); };
static void ASUM(...) { PADDLE_THROW("float16 ASUM not supported on CPU"); };
static void VPOW(...) { LOG(FATAL) << "float16 VPOW not supported on CPU"; }
static void DOT(...) { LOG(FATAL) << "float16 DOT not supported on CPU"; };
static void SCAL(...) { LOG(FATAL) << "float16 SCAL not supported on CPU"; };
static void ASUM(...) { LOG(FATAL) << "float16 ASUM not supported on CPU"; };
#ifdef PADDLE_WITH_MKLML
static void GEMM_BATCH(...) {
PADDLE_THROW("float16 GEMM_BATCH not supported on CPU");
LOG(FATAL) << "float16 GEMM_BATCH not supported on CPU";
}
#endif
};
......@@ -461,11 +461,11 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
auto dim_a = mat_a.dims();
auto dim_b = mat_b.dims();
auto dim_out = mat_out->dims();
PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
"The input and output of matmul be matrix");
// PADDLE_ENFORCE(
// mat_a.target() == mat_b.target() && mat_a.target() == mat_out->target(),
// "The targets of matrices must be same");
CHECK(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2)
<< "The input and output of matmul be matrix";
// CHECK(
// mat_a.target() == mat_b.target() && mat_a.target() == mat_out->target())
// << "The targets of matrices must be same";
int M = dim_out[0];
int N = dim_out[1];
......@@ -746,7 +746,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
T alpha,
lite::Tensor *mat_out,
T beta) const {
PADDLE_ENFORCE_EQ(dim_a.width_, dim_b.height_);
CHECK_EQ(dim_a.width_, dim_b.height_);
CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans;
CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans;
if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) {
......@@ -761,8 +761,8 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
beta,
mat_out->template mutable_data<T>());
} else {
PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ ||
dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0);
CHECK(dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 ||
dim_b.batch_size_ == 0);
this->template BatchedGEMM<T>(
transA,
transB,
......
......@@ -146,7 +146,7 @@ class ContextProjectFunctor {
}
}
if (padding_trainable) {
PADDLE_ENFORCE(padding_data != nullptr);
CHECK(padding_data != nullptr);
for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
if (lod_level_0[i] == lod_level_0[i + 1]) continue;
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include <functional>
#include <string>
#include "lite/backends/x86/cpu_info.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
#ifdef PADDLE_WITH_MKLML
#include "lite/backends/x86/mklml.h"
......@@ -652,7 +652,7 @@ class VecActivations {
} else if (type == "identity" || type == "") {
return vec_identity<T, isa>;
}
PADDLE_THROW("Not support type: %s", type);
LOG(FATAL) << "Not support type: " << type;
}
};
......
......@@ -57,7 +57,7 @@ class CrossEntropyFunctor<lite::TargetType::kX86, T> {
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < num_remain; j++) {
int lbl = label_data[i * num_remain + j];
PADDLE_ENFORCE((lbl >= 0 && lbl < axis_dim) || lbl == ignore_index);
CHECK((lbl >= 0 && lbl < axis_dim) || lbl == ignore_index);
int index = i * num_classes + lbl * num_remain + j;
int loss_idx = i * num_remain + j;
loss_data[loss_idx] =
......
......@@ -27,7 +27,7 @@ namespace math {
template <typename T>
struct TolerableValue {
HOSTDEVICE T operator()(const T& x) const {
PADDLE_ENFORCE(static_cast<bool>(std::is_floating_point<T>::value));
CHECK(static_cast<bool>(std::is_floating_point<T>::value));
const T kApproInf = 1e20;
if (x == INFINITY) return kApproInf;
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <math.h>
#include <string>
#include "lite/backends/x86/cpu_info.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -46,8 +46,6 @@ inline ActivationType GetActivationType(const std::string &type) {
return ActivationType::kIdentity;
}
LOG(ERROR) << "Not support type " << type;
// PADDLE_ENFORCE(false, "Not support type %s", type);
// PADDLE_THROW("Not support type %s.", type);
return ActivationType();
}
......
......@@ -13,7 +13,7 @@ limitations under the License. */
#include "lite/backends/x86/math/detail/activation_functions.h"
#include "lite/core/context.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#include "lite/backends/x86/math/im2col.h"
#include <vector>
#include "lite/backends/x86/math/im2col_cfo_cpu.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -38,8 +38,8 @@ class Im2ColFunctor<lite::x86::math::ColFormat::kCFO,
const std::vector<int>& stride,
const std::vector<int>& padding,
lite::Tensor* col) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col->dims().size() == 5);
CHECK_EQ(im.dims().size(), 3);
CHECK_EQ(col->dims().size(), 5);
if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 &&
dilation[1] == 1) {
......@@ -72,8 +72,8 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kCFO,
const std::vector<int>& stride,
const std::vector<int>& padding,
lite::Tensor* im) {
PADDLE_ENFORCE(im->dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
CHECK_EQ(im->dims().size(), 3);
CHECK_EQ(col.dims().size(), 5);
int im_channels = im->dims()[0];
int im_height = im->dims()[1];
int im_width = im->dims()[2];
......@@ -82,20 +82,20 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kCFO,
int col_height = col.dims()[3];
int col_width = col.dims()[4];
PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
CHECK_EQ((im_height + padding[0] + padding[2] -
((dilation[0] * (filter_height - 1) + 1))) /
stride[0] +
1,
col_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
col_height)
<< "Output_height and padding(padding_up, padding_down) are "
"inconsistent.";
CHECK_EQ((im_width + padding[1] + padding[3] -
((dilation[1] * (filter_width - 1) + 1))) /
stride[1] +
1,
col_width,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
col_width)
<< "Output_height and padding(padding_up, padding_down) are "
"inconsistent.";
int channels_col = im_channels * filter_height * filter_width;
......@@ -150,8 +150,8 @@ class Im2ColFunctor<lite::x86::math::ColFormat::kOCF,
const std::vector<int>& stride,
const std::vector<int>& padding,
lite::Tensor* col) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col->dims().size() == 5);
CHECK_EQ(im.dims().size(), 3);
CHECK_EQ(col->dims().size(), 5);
int im_channels = im.dims()[0];
int im_height = im.dims()[1];
int im_width = im.dims()[2];
......@@ -214,8 +214,8 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kOCF,
const std::vector<int>& stride,
const std::vector<int>& padding,
lite::Tensor* im) {
PADDLE_ENFORCE(im->dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
CHECK_EQ(im->dims().size(), 3);
CHECK_EQ(col.dims().size(), 5);
int im_channels = im->dims()[0];
int im_height = im->dims()[1];
int im_width = im->dims()[2];
......@@ -224,16 +224,16 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kOCF,
int col_height = col.dims()[0];
int col_width = col.dims()[1];
PADDLE_ENFORCE_EQ(
CHECK_EQ(
(im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1,
col_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ(
col_height)
<< "Output_height and padding(padding_up, padding_down) are "
"inconsistent.";
CHECK_EQ(
(im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1,
col_width,
"col_width and padding(padding_left, padding_right) are "
"inconsistent.");
col_width)
<< "col_width and padding(padding_left, padding_right) are "
"inconsistent.";
T* im_data = im->template mutable_data<T>();
const T* col_data = col.data<T>();
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include "lite/backends/x86/math/detail/activation_functions.h"
#include "lite/core/context.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......
......@@ -121,8 +121,8 @@ struct RowwiseAdd<lite::TargetType::kX86, T> {
lite::Tensor* output) {
const auto& in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
CHECK_EQ(vector.numel(), size);
CHECK_EQ(output->dims(), in_dims);
const T* input_data = input.data<T>();
const T* vector_data = vector.data<T>();
......
......@@ -20,8 +20,8 @@ limitations under the License. */
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
#include "lite/fluid/float16.h"
#include "lite/utils/paddle_enforce.h"
//#include "lite/tensor_util.h"
#include "lite/utils/cp_logging.h"
// #include "lite/tensor_util.h"
namespace paddle {
namespace lite {
......
......@@ -59,7 +59,7 @@ void ColwiseSum<Target, T>::operator()(const lite::Context<Target>& context,
lite::TensorLite* out) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(out->numel(), size);
CHECK_EQ(out->numel(), size);
auto in = lite::fluid::EigenMatrix<T>::From(input);
auto vec = lite::fluid::EigenVector<T>::Flatten(*out);
......@@ -81,7 +81,7 @@ class ColwiseSum<lite::TargetType::kX86, T> {
auto& in_dims = input.dims();
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), size);
CHECK_EQ(out->numel(), size);
T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>();
......@@ -103,8 +103,8 @@ void RowwiseMean<Target, T>::operator()(const lite::Context<Target>& context,
const lite::TensorLite& input,
lite::TensorLite* out) {
auto in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
CHECK_EQ(in_dims.size(), 2U);
CHECK_EQ(out->numel(), in_dims[0]);
auto in = lite::fluid::EigenMatrix<T>::From(input);
auto vec = lite::fluid::EigenVector<T>::Flatten(*out);
......@@ -124,10 +124,10 @@ class RowwiseMean<lite::TargetType::kX86, T> {
const lite::TensorLite& input,
lite::TensorLite* out) {
auto& in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
CHECK_EQ(in_dims.size(), 2U);
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height);
CHECK_EQ(out->numel(), height);
auto inv_size = 1.0 / size;
T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>();
......@@ -147,8 +147,8 @@ void RowwiseSum<Target, T>::operator()(const lite::Context<Target>& context,
const lite::TensorLite& input,
lite::TensorLite* out) {
auto in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
CHECK_EQ(in_dims.size(), 2U);
CHECK_EQ(out->numel(), in_dims[0]);
auto in = lite::fluid::EigenMatrix<T>::From(input);
auto vec = lite::fluid::EigenVector<T>::Flatten(*out);
......@@ -168,10 +168,10 @@ class RowwiseSum<lite::TargetType::kX86, T> {
const lite::TensorLite& input,
lite::TensorLite* out) {
auto& in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
CHECK_EQ(in_dims.size(), 2U);
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height);
CHECK_EQ(out->numel(), height);
T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>();
......
......@@ -273,7 +273,7 @@ TEST(math_funciton, set_constant) {
auto* ctx = new paddle::platform::CPUDeviceContext();
paddle::operators::math::set_constant(*ctx, &t, 10);
for (int64_t i = 0; i < t.numel(); ++i) {
PADDLE_ENFORCE_EQ(10, t.data<int>()[i]);
CHECK_EQ(10, t.data<int>()[i]);
}
delete ctx;
}
......
......@@ -32,7 +32,7 @@ namespace math {
class Sampler {
public:
explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) {
// PADDLE_ENFORCE_GT(range, 0, "Range should be greater than 0.");
// CHECK_GT(range, 0, "Range should be greater than 0.");
if (seed == 0) {
std::random_device r;
seed_ = r();
......
......@@ -31,7 +31,7 @@ struct SelectedRowsAdd<lite::TargetType::kX86, T> {
const fluid::SelectedRows& input2,
fluid::SelectedRows* output) {
auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2.height());
CHECK_EQ(in1_height, input2.height());
output->set_height(in1_height);
auto& in1_rows = input1.rows();
......@@ -49,8 +49,8 @@ struct SelectedRowsAdd<lite::TargetType::kX86, T> {
auto& in2_value = input2.value();
auto in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size());
CHECK_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
CHECK_EQ(in1_row_numel, out_value->numel() / out_rows.size());
auto* out_data = out_value->template mutable_data<T>();
auto* in1_data = in1_value.data<T>();
......@@ -73,15 +73,15 @@ struct SelectedRowsAddTensor<lite::TargetType::kX86, T> {
auto in1_height = input1.height();
auto in2_dims = input2.dims();
auto out_dims = output->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
CHECK_EQ(in1_height, in2_dims[0]);
CHECK_EQ(in1_height, out_dims[0]);
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height);
CHECK_EQ(in1_row_numel, input2.numel() / in1_height);
CHECK_EQ(in1_row_numel, output->numel() / in1_height);
SetConstant<lite::TargetType::kX86, T> functor;
functor(context, output, 0.0);
......@@ -113,7 +113,7 @@ struct SelectedRowsAddTo<lite::TargetType::kX86, T> {
const int64_t input2_offset,
fluid::SelectedRows* input2) {
auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2->height());
CHECK_EQ(in1_height, input2->height());
auto& in1_rows = input1.rows();
auto& in2_rows = *(input2->mutable_rows());
......@@ -149,7 +149,7 @@ struct SelectedRowsSumTo<lite::TargetType::kX86, T> {
auto& in_rows = (*iter)->rows();
size += in_rows.end() - in_rows.begin();
auto in1_height = (*iter)->height();
PADDLE_ENFORCE_EQ(in1_height, input2->height());
CHECK_EQ(in1_height, input2->height());
}
// concat rows
std::vector<int64_t> in2_rows;
......@@ -185,13 +185,13 @@ struct SelectedRowsAddToTensor<lite::TargetType::kX86, T> {
auto in1_height = input1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
CHECK_EQ(in1_height, in2_dims[0]);
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
CHECK_EQ(in1_row_numel, input2->numel() / in1_height);
auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->template mutable_data<T>();
......@@ -291,12 +291,11 @@ struct MergeAdd<lite::TargetType::kX86, T> {
if (input->rows().size() == 0) {
continue;
}
PADDLE_ENFORCE_EQ(input_width,
input->value().dims()[1],
"all input should have same "
"dimension except for the first one");
PADDLE_ENFORCE_EQ(
input_height, input->height(), "all input should have same height");
CHECK_EQ(input_width, input->value().dims()[1])
<< "all input should have same "
"dimension except for the first one";
CHECK_EQ(input_height, input->height())
<< "all input should have same height";
row_num += input->rows().size();
merged_row_set.insert(input->rows().begin(), input->rows().end());
}
......@@ -376,13 +375,13 @@ struct UpdateToTensor<lite::TargetType::kX86, T> {
lite::Tensor* input2) {
auto in1_height = input1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
CHECK_EQ(in1_height, in2_dims[0]);
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
CHECK_EQ(in1_row_numel, input2->numel() / in1_height);
auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->template data<T>();
......
......@@ -30,12 +30,10 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> {
const uint64_t* index = index_lod.data();
const auto& src_dims = src.dims();
const auto& dst_dims = dst->dims();
PADDLE_ENFORCE_EQ(
src_dims.size(), 2UL, "The src must be matrix with rank 2.");
PADDLE_ENFORCE_EQ(
dst_dims.size(), 2UL, "The dst must be matrix with rank 2.");
PADDLE_ENFORCE_EQ(
src_dims[1], dst_dims[1], "The width of src and dst must be same.");
CHECK_EQ(src_dims.size(), 2UL) << "The src must be matrix with rank 2.";
CHECK_EQ(dst_dims.size(), 2UL) << "The dst must be matrix with rank 2.";
CHECK_EQ(src_dims[1], dst_dims[1])
<< "The width of src and dst must be same.";
auto height = dst_dims[0];
auto width = dst_dims[1];
auto* src_data = src.data<T>();
......
......@@ -19,7 +19,7 @@ limitations under the License. */
#include "lite/core/context.h"
#include "lite/core/tensor.h"
#include "lite/fluid/eigen.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -66,21 +66,18 @@ class LoDTensor2BatchFunctor {
bool is_reverse = false) const {
if (!is_cal_batch_lod) {
auto lods = batch->lod();
PADDLE_ENFORCE_GT(lods.size(),
2UL,
"The LoD of LoDTensor should inlcude at least 2-level "
"sequence information.");
PADDLE_ENFORCE_EQ(
lods[1].size(),
static_cast<size_t>(lod_tensor.dims()[0]),
"The LoD information should be consistent with the dims.");
CHECK_GT(lods.size(), 2UL)
<< "The LoD of LoDTensor should inlcude at least 2-level "
"sequence information.";
CHECK_EQ(lods[1].size(), static_cast<size_t>(lod_tensor.dims()[0]))
<< "The LoD information should be consistent with the dims.";
CopyMatrixRowsFunctor<Target, T> to_batch;
to_batch(context, lod_tensor, lods[1], batch, true);
return;
}
auto lods = lod_tensor.lod();
PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
CHECK_EQ(lods.size(), 1UL) << "Only support one level sequence now.";
const auto& lod = lods[0];
......@@ -165,14 +162,11 @@ class Batch2LoDTensorFunctor {
const lite::Tensor& batch,
lite::Tensor* lod_tensor) const {
auto in_lod = batch.lod();
PADDLE_ENFORCE_GT(in_lod.size(),
2UL,
"The LoD of LoDTensor should inlcude at least 2-level "
"sequence information.");
PADDLE_ENFORCE_EQ(
in_lod[1].size(),
static_cast<size_t>(lod_tensor->dims()[0]),
"The LoD information should be consistent with the dims.");
CHECK_GT(in_lod.size(), 2UL)
<< "The LoD of LoDTensor should inlcude at least 2-level "
"sequence information.";
CHECK_EQ(in_lod[1].size(), static_cast<size_t>(lod_tensor->dims()[0]))
<< "The LoD information should be consistent with the dims.";
CopyMatrixRowsFunctor<Target, T> to_seq;
to_seq(context, batch, in_lod[1], lod_tensor, false);
}
......
......@@ -37,10 +37,9 @@ void CopyValidData(lite::Tensor* dst_tensor,
layout == kBatchLengthWidth ? step_width : seq_num * step_width;
for (int seq_idx = 0; seq_idx < seq_num; ++seq_idx) {
int valid_seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx];
PADDLE_ENFORCE_GE(
pad_seq_len,
valid_seq_len,
"The padded sequence length can not be less than its original length.");
CHECK_GE(pad_seq_len, valid_seq_len) << "The padded sequence length can "
"not be less than its original "
"length.";
int seq_data_offset = seq_offsets[seq_idx] * step_width;
int pad_data_offset = layout == kBatchLengthWidth
? seq_idx * pad_seq_len * step_width
......@@ -108,9 +107,9 @@ class PaddingLoDTensorFunctor<lite::TargetType::kX86, T> {
pad_seq_len,
step_width,
layout);
PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width,
"The numel of 'pad_value' can only be 1 or be equal to the "
"'step_width'.");
CHECK(pad_value.numel() == 1 || pad_value.numel() == step_width)
<< "The numel of 'pad_value' can only be 1 or be equal to the "
"'step_width'.";
// fill padding value
T* pad_data = pad_tensor->template mutable_data<T>();
......
......@@ -19,7 +19,7 @@ limitations under the License. */
#include "lite/core/context.h"
#include "lite/core/tensor.h"
#include "lite/fluid/lod.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -46,15 +46,14 @@ inline static void CheckDims(const lite::DDim& seq_tensor_dims,
int64_t padded_seq_len,
int64_t step_width,
const PadLayout& layout) {
PADDLE_ENFORCE_EQ(static_cast<size_t>(seq_tensor_dims[0]),
seq_offset.back(),
"Value of 1st dimension of the sequence tensor should be "
"equal to sum of lengths of all sequences.");
CHECK_EQ(static_cast<size_t>(seq_tensor_dims[0]), seq_offset.back())
<< "Value of 1st dimension of the sequence tensor should be "
"equal to sum of lengths of all sequences.";
PADDLE_ENFORCE(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() ||
seq_tensor_dims.size() == pad_tensor_dims.size(),
"pad_tensor's rank should be 1 greater than seq_tensor's "
"rank, or be equal with it.");
CHECK(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() ||
seq_tensor_dims.size() == pad_tensor_dims.size())
<< "pad_tensor's rank should be 1 greater than seq_tensor's "
"rank, or be equal with it.";
}
/*
......
......@@ -46,12 +46,12 @@ class MaxSeqPoolFunctor {
auto in_dims = input.dims();
auto out_dims = output->dims();
auto idx_dims = index->dims();
PADDLE_ENFORCE_GT(in_dims.size(), 1u);
PADDLE_ENFORCE_GT(out_dims.size(), 1u);
CHECK_GT(in_dims.size(), 1u);
CHECK_GT(out_dims.size(), 1u);
for (size_t i = 1; i < in_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
CHECK_EQ(in_dims[i], out_dims[i]);
}
PADDLE_ENFORCE_EQ(idx_dims, out_dims);
CHECK_EQ(idx_dims, out_dims);
auto starts = input.lod()[0];
const T* in_data = input.data<T>();
......@@ -95,10 +95,10 @@ class MaxSeqPoolFunctor<T, true> {
lite::Tensor* index) {
auto in_dims = input.dims();
auto out_dims = output->dims();
PADDLE_ENFORCE_GT(in_dims.size(), 1u);
PADDLE_ENFORCE_GT(out_dims.size(), 1u);
CHECK_GT(in_dims.size(), 1u);
CHECK_GT(out_dims.size(), 1u);
for (size_t i = 1; i < in_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
CHECK_EQ(in_dims[i], out_dims[i]);
}
auto starts = input.lod()[0];
......@@ -136,12 +136,12 @@ class MaxSeqPoolGradFunctor {
auto og_dims = out_grad.dims();
auto ig_dims = in_grad->dims();
auto idx_dims = index.dims();
PADDLE_ENFORCE_GT(og_dims.size(), 1);
PADDLE_ENFORCE_GT(ig_dims.size(), 1);
CHECK_GT(og_dims.size(), 1);
CHECK_GT(ig_dims.size(), 1);
for (size_t i = 1; i < og_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]);
CHECK_EQ(og_dims[i], ig_dims[i]);
}
PADDLE_ENFORCE_EQ(idx_dims, og_dims);
CHECK_EQ(idx_dims, og_dims);
const T* og_data = out_grad.data<T>();
const int* max_index = index.data<int>();
......@@ -236,7 +236,7 @@ class SumSeqPoolGradFunctor {
auto lod = in_grad->lod()[0];
int64_t out_w = out_grad.numel() / out_grad.dims()[0];
int64_t in_w = in_grad->numel() / in_grad->dims()[0];
PADDLE_ENFORCE(in_w == out_w);
CHECK(in_w == out_w);
const T* out_g_data = out_grad.data<T>();
T* in_g_data = in_grad->template mutable_data<T>(TARGET(kX86));
auto blas = math::GetBlas<TARGET(kX86), T>(context);
......@@ -330,7 +330,7 @@ class SequencePoolFunctor<TARGET(kX86), T> {
out_e.device(eigen_device) = in_e.sum(Eigen::array<int, 1>({{0}})) /
std::sqrt(static_cast<T>(h));
} else {
PADDLE_THROW("unsupported pooling pooltype");
LOG(FATAL) << "unsupported pooling pooltype";
}
}
}
......@@ -389,7 +389,7 @@ class SequencePoolGradFunctor<TARGET(kX86), T> {
} else if (pooltype == "FIRST") {
in_g_e.chip(0, 0).device(eigen_device) = out_g_e_v;
} else {
PADDLE_THROW("unsupported pooling pooltype");
LOG(FATAL) << "unsupported pooling pooltype";
}
}
}
......