Unverified · Commit 99deb7d9 authored by hong19860320, committed by GitHub

[Core][XPU] Add XPU op kernels (#3274)

* [LITE][XPU] bind xpu resnet50 kernels

* [LITE][XPU] fuse resnet50 and encoder

* [LITE][XPU] bind xpu bert kernels

* [LITE][XPU] refine xpu_resnet_fuse_pass.cc

* [LITE][XPU] add xpu stack kernel

* [LITE][XPU] add xpu slice/tanh kernel

* [LITE][XPU] refine resnet50 and encoder fuser

* [LITE][XPU] split resnet50 and multi_encoder op from subgraph_op.h

* [LITE][XPU] clean workspace

* [LITE][XPU] add build script

* [LITE][XPU] fix compilation errors

* [LITE][XPU] fix kernel matmul

* [LITE][XPU] fix kernel ewadd ewsub

* [LITE][XPU] add xpu cast kernel

* [LITE][XPU] fix kernel slice

* [LITE][XPU] switch dev by LITE_XPU_DEV env

* [LITE][XPU] eliminate useless cast op

* [LITE][XPU] add PerThread Ops

* [LITE][X86] add SequenceUnpad op and kernel

* [LITE][XPU] add LITE_WITH_XTCL option

* [LITE][X86] add SequenceConv kernel

* [LITE][XPU] fix cmake dependency

* [LITE][XPU] add xpu sigmoid kernel

* [XPU] Remove the dependencies of framework.pb.h
test=develop

Change-Id: Icfb44efb0482a6369b365b5c09017765328fc10d

* [XPU] Fix the precision of cast kernel
test=develop

Change-Id: Icb18be47d7ab490de9fb9c92eae1165f49dbf492

* [Core] Fix the compiling error when building for targets that disable XPU
test=develop

Change-Id: I38ec53f222391d3bf06b70512e6c3ad1282e4683

* [XPU] Add io_copy kernel for xpu<->arm
test=develop

Change-Id: Iec7ea066f040534285557f9948b73e6a1970aed7

* fix
test=develop

Change-Id: I4db1c93df48e22afbba904ce6c3b0babd9fda4c3

* fix target matching of type_target_cast_pass and remove the unnecessary registration of io_copy kernel
test=develop

Change-Id: I432c10c9d1064e778d43fd0d12d8cf0599252f7a

* [X86] Add the keyword 'template' to avoid the compiling errors
test=develop

Change-Id: I015d5d323adafb3884029c8287ced66c90ad931e

* Fix the build.sh for XPU and x86
test=develop

Change-Id: I7d9575243669ce02af69a8ddbd6421db31902bd6

* [XPU] Add the keyword 'template' to avoid the compiling errors
test=develop

Change-Id: I46d0b3b6861286a73ee2999934b8e185e453e749

* [XPU] Add XTCL compiling option in build.sh
test=develop

Change-Id: I8b3fd998ca5f898d5bd2e665646e3874b3b73c80

* fix namespace conflicts, test=develop

* [API][XPU] Move the XPU related APIs into CxxConfig
test=develop

Change-Id: I75ac35e8bae96bcb835683f413f01b9db45afbf9

* [API][XPU] Remove the LITE_WITH_XPU in paddle_api.h
test=develop

Change-Id: Idbd64013bdf331ad876919511c1c349332d46f93

* [API][XPU] Remove XPUSetWorkspaceL3SizePerThread and XPUSetDevPerThread
test=develop

Change-Id: I515958f56f8e129280bae61c923513cc91fb9728

* [API][Core][XPU] Refine the test case and remove the unnecessary modifications
test=develop

Change-Id: I1e0e2957a2f9d5f4207b06c0bc98a5ab611fee56

* [Core] Remove useless code
test=develop

Change-Id: I6293faa10424aea2836d09d85ddb6a30f7811678

* [XPU] Refine the test cases
test=develop

Change-Id: I6818fc3addf1bca5b96a7d66ee99263242e3374f

* [XPU] Remove useless scripts and code
test=develop

Change-Id: I965ba6712d3cf881d0038f0473fec27d4c1bc684

* [XPU] Use InferShapeImpl in sequence_unpad, resnet50 and multi_encoder op
test=develop

Change-Id: I5375f524d36836a394d426b4b2bc9fb44be0b59c

* test=develop

Change-Id: I42ee68c8a5e891dd0f3e95d6cfbc498be7cf1519

* test=develop

Change-Id: If679e5aa73e1368e0ee5bd5f286d2e1b4c2f354e

* [XPU] Add __xpu__ prefix to the op and graph pass name of resnet50 and multi_encoder
test=develop

Change-Id: Idb61c99b4b8429cb87665bfd6835ab4d7d263be2

* [XPU] Fix and refine the xpu fuse pass
test=develop

Change-Id: If1c5b6788d994e2809c1a00d9384685a89440907

* test=develop

Change-Id: Icfa333e322fc4351700103692c46cfcb3d4f9a89

* [XPU] Remove the dependency on xpu api for xpu fuse passes
test=develop

Change-Id: I6094b5536f58ae18bab068284b32f9bd10a2ab92

* [XPU] Move unit tests from lite/api to lite/tests/api
test=develop

Change-Id: I7ba27abb23abeffb0c95fdbbefec7ac16cdbd250

* test=develop

Change-Id: I33230c84d6c4e61bf19f46668bae2baa3ef68794

* [XPU] Refine code
test=develop

Change-Id: I37bc5b948b4927e44cd3ea2594ebe3fd7671be06

* [XPU] Add env XPU_ENABLE_XTCL to enable xpu_subgraph_pass
test=develop

Change-Id: Ifb8e07e86f307f562adaca3ce792015a6f2a2204

* [XPU] refine code
test=develop

Change-Id: I1380654b930d51ae704dbc0cd855464d9c3b5b79

* [XPU] Refine code
test=develop

Change-Id: I73285c2718ccd3612490eb2635bef4fd608c9bde

* [XPU] Add comments for the XPU APIs
test=develop

Change-Id: Ieb5015f37984f8869b90c4c625c5894bb26164fd
Co-authored-by: miaotianxiang <miaotianxiang@baidu.com>
Co-authored-by: Shixiaowei02 <39303645+Shixiaowei02@users.noreply.github.com>
Parent: 293d2d38
@@ -60,6 +60,7 @@ lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
 lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
 lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
 lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
+lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
 lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
 lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF)
 lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON)
......
@@ -136,6 +136,9 @@ endif()

 if (LITE_WITH_XPU)
   add_definitions("-DLITE_WITH_XPU")
+  if (LITE_WITH_XTCL)
+    add_definitions("-DLITE_WITH_XTCL")
+  endif()
 endif()

 if (LITE_WITH_OPENCL)
......
@@ -22,42 +22,10 @@ if(NOT DEFINED XPU_SDK_ROOT)
     message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON")
   endif()
 endif()

 message(STATUS "XPU_SDK_ROOT: ${XPU_SDK_ROOT}")

-find_path(XPU_SDK_INC NAMES xtcl.h
-  PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl
-  NO_DEFAULT_PATH)
-if(NOT XPU_SDK_INC)
-  message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include")
-endif()
-include_directories("${XPU_SDK_ROOT}/XTCL/include")
 include_directories("${XPU_SDK_ROOT}/XTDK/include")

-find_library(XPU_SDK_XTCL_FILE NAMES xtcl
-  PATHS ${XPU_SDK_ROOT}/XTCL/so
-  NO_DEFAULT_PATH)
-if(NOT XPU_SDK_XTCL_FILE)
-  message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}")
-else()
-  message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}")
-  add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL)
-  set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE})
-endif()
-
-find_library(XPU_SDK_TVM_FILE NAMES tvm
-  PATHS ${XPU_SDK_ROOT}/XTCL/so
-  NO_DEFAULT_PATH)
-if(NOT XPU_SDK_TVM_FILE)
-  message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}")
-else()
-  message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}")
-  add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL)
-  set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE})
-endif()
-
 find_library(XPU_SDK_XPU_API_FILE NAMES xpuapi
   PATHS ${XPU_SDK_ROOT}/XTDK/shlib
   NO_DEFAULT_PATH)
@@ -82,23 +50,55 @@ else()
   set_property(TARGET xpu_sdk_xpu_rt PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_RT_FILE})
 endif()

-find_library(XPU_SDK_XPU_JITC_FILE NAMES xpujitc
-  PATHS ${XPU_SDK_ROOT}/XTDK/shlib
-  NO_DEFAULT_PATH)
-
-find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8
-  PATHS ${XPU_SDK_ROOT}/XTDK/shlib
-  NO_DEFAULT_PATH)
-
-if(NOT XPU_SDK_LLVM_FILE)
-  message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}")
-else()
-  message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}")
-  add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL)
-  set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE})
-endif()
-
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=0")
-
-set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs")
-set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs")
+set(xpu_runtime_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu runtime libs")
+set(xpu_builder_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu builder libs")
+
+if(LITE_WITH_XTCL)
+  find_path(XPU_SDK_INC NAMES xtcl.h
+    PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl NO_DEFAULT_PATH)
+  if(NOT XPU_SDK_INC)
+    message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include")
+  endif()
+  include_directories("${XPU_SDK_ROOT}/XTCL/include")
+
+  find_library(XPU_SDK_XTCL_FILE NAMES xtcl
+    PATHS ${XPU_SDK_ROOT}/XTCL/so
+    NO_DEFAULT_PATH)
+  if(NOT XPU_SDK_XTCL_FILE)
+    message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}")
+  else()
+    message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}")
+    add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL)
+    set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE})
+  endif()
+
+  find_library(XPU_SDK_TVM_FILE NAMES tvm
+    PATHS ${XPU_SDK_ROOT}/XTCL/so
+    NO_DEFAULT_PATH)
+  if(NOT XPU_SDK_TVM_FILE)
+    message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}")
+  else()
+    message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}")
+    add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL)
+    set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE})
+  endif()
+
+  find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8
+    PATHS ${XPU_SDK_ROOT}/XTDK/shlib
+    NO_DEFAULT_PATH)
+  if(NOT XPU_SDK_LLVM_FILE)
+    message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}")
+  else()
+    message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}")
+    add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL)
+    set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE})
+  endif()
+
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1")
+
+  set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs")
+  set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs")
+endif()
@@ -8,6 +8,7 @@ message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}")
 message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
 message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
 message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
+message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
 message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
 message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
 message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}") message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
......
@@ -20,7 +20,7 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
   if(LITE_WITH_X86)
     add_dependencies(paddle_full_api_shared xxhash)
     target_link_libraries(paddle_full_api_shared xxhash)
     if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL)
      add_dependencies(paddle_full_api_shared dynload_mklml)
     endif()
   endif()
......
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "lite/api/paddle_api.h"
+#include "lite/core/context.h"
 #include "lite/core/device_info.h"
 #include "lite/core/target_wrapper.h"
 #include "lite/core/tensor.h"
@@ -203,6 +204,25 @@ void ConfigBase::set_threads(int threads) {
 #endif
 }

+void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) {
+#ifdef LITE_WITH_XPU
+  lite::Context<TargetType::kXPU>::SetWorkspaceL3Size(l3_size);
+#else
+  LOG(WARNING) << "The invoking of the function "
+                  "'set_xpu_workspace_l3_size_per_thread' is ignored, please "
+                  "rebuild it with LITE_WITH_XPU=ON.";
+#endif
+}
+
+void CxxConfig::set_xpu_dev_per_thread(int dev_no) {
+#ifdef LITE_WITH_XPU
+  lite::Context<TargetType::kXPU>::SetDev(dev_no);
+#else
+  LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is "
+                  "ignored, please rebuild it with LITE_WITH_XPU=ON.";
+#endif
+}
+
 // set model data in combined format, `set_model_from_file` refers to loading
 // model from file, set_model_from_buffer refers to loading model from memory
 // buffer
......
@@ -163,6 +163,11 @@ class LITE_API CxxConfig : public ConfigBase {
     return x86_math_library_math_threads_;
   }
 #endif
+  // XPU only, set the size of the workspace memory drawn from L3 cache for
+  // the current thread.
+  void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00);
+  // XPU only, specify the target device ID for the current thread.
+  void set_xpu_dev_per_thread(int dev_no = 0);
 };

 /// MobileConfig is the config for the light weight predictor, it will skip
......
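Note: taken together, the two hunks above route the new per-thread XPU knobs through CxxConfig. A minimal usage sketch, assuming a full-API build with LITE_WITH_XPU=ON (the model path and valid-place list here are illustrative, not part of this commit):

#include <memory>
#include "lite/api/paddle_api.h"

std::shared_ptr<paddle::lite_api::PaddlePredictor> BuildXPUPredictor() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("resnet50_model_dir");  // hypothetical model path
  config.set_valid_places(
      {paddle::lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
       paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)}});
  // Reserve the default ~16 MB (0xfffc00 bytes) of L3 cache as this thread's
  // workspace; forwarded to Context<kXPU>::SetWorkspaceL3Size().
  config.set_xpu_workspace_l3_size_per_thread();
  // Run this thread's kernels on XPU device 0, unless the LITE_XPU_DEV
  // environment variable overrides it (see Context<kXPU>::SetDev() below).
  config.set_xpu_dev_per_thread(0);
  return paddle::lite_api::CreatePaddlePredictor(config);
}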
@@ -47,3 +47,5 @@ USE_MIR_PASS(npu_subgraph_pass);
 USE_MIR_PASS(xpu_subgraph_pass);
 USE_MIR_PASS(weight_quantization_preprocess_pass);
 USE_MIR_PASS(quantized_op_attributes_inference_pass);
+USE_MIR_PASS(__xpu__resnet_fuse_pass);
+USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
@@ -2,4 +2,7 @@ if(NOT LITE_WITH_XPU)
   return()
 endif()

-lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
+if(LITE_WITH_XTCL)
+  lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
+endif()
+lite_cc_library(target_wrapper_xpu SRCS target_wrapper.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
@@ -14,12 +14,12 @@

 #pragma once

-#include <xtcl/xtcl.h>
 #include <cstdlib>
 #include <memory>
 #include <string>
 #include <utility>
 #include <vector>
+#include "lite/backends/xpu/xpu_header_sitter.h"

 namespace paddle {
 namespace lite {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include <cmath>
#include <cstdlib>
#include <utility>
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace xpu {
namespace math {
// Round to the nearest integer; ties (fractional part exactly .5) go to the
// nearest even integer (banker's rounding). Used by the quantizers below.
static inline long round_half_to_even(const float src) { // NOLINT
long ret = llround(src); // NOLINT
if (fabs(fabs(round(src) - src) - 0.5) > 0) {
return ret;
} else {
if (abs(ret) % 2 == 0) {
return ret;
} else {
return ret + (ret > 0 ? -1 : 1);
}
}
}
static float ieee_compliance_0(float f) {
uint32_t *ptr = reinterpret_cast<uint32_t *>(&f);
uint32_t sign = (*ptr) & 0x80000000;
uint32_t uf = 0;
// nan -> inf
if (std::isnan(f)) {
uf = (sign | 0x7F800000);
float *ptr = reinterpret_cast<float *>(&uf);
return *ptr;
} else if (std::isnormal(f) || (std::isinf(f)) || (f == 0)) {
return f;
} else {
// denormal -> +-0
uf = 0x0;
float *ptr = reinterpret_cast<float *>(&uf);
return *ptr;
}
}
template <typename T, int RMAX>
static inline T fp32_to_intx(const float f, float max) {
max = ieee_compliance_0(max);
float input = ieee_compliance_0(f);
// +0 and -0 -> +0
if (input == 0) {
input = 0.0f;
}
float tmp = RMAX / max;
if (std::isinf(tmp)) {
uint32_t *ptr = reinterpret_cast<uint32_t *>(&input);
if ((*ptr) >> 31 & 1) {
return T(-RMAX);
} else {
return T(RMAX);
}
}
tmp = input * tmp;
if (std::isnan(tmp)) {
return T(RMAX);
}
tmp = ieee_compliance_0(tmp);
// early check to avoid INF or big value get into convertor func.
if (tmp > RMAX) {
return T(RMAX);
}
if (tmp < -RMAX) {
return T(-RMAX);
}
T ret = (T)round_half_to_even(tmp);
if (ret > RMAX) {
ret = T(RMAX);
}
if (ret < -RMAX) {
ret = T(-RMAX);
}
return ret;
}
static inline int16_t fp32_to_int16(const float f, float max) {
int16_t v1 = fp32_to_intx<int16_t, 32767>(f, max);
return v1;
}
static inline int ConvertFP32ToInt16(const void *input,
void *output,
float max_val,
int len) {
for (int i = 0; i < len; i++) {
static_cast<int16_t *>(output)[i] =
fp32_to_int16(static_cast<const float *>(input)[i], max_val);
}
return 0;
}
static inline float FindMaxAbs(const float *data, int len) {
float max_f = 0.0f;
for (int i = 0; i < len; ++i) {
float max = std::abs(data[i]);
if (max > max_f) {
max_f = max;
}
}
return max_f;
}
template <typename T>
static inline void Transpose(const T *in, T *out, int h, int w) {
for (int h1 = 0; h1 < w; ++h1) {
for (int w1 = 0; w1 < h; ++w1) {
out[h1 * h + w1] = in[w1 * w + h1];
}
}
}
/**
* Get row matrix shape from a vector shape. If the rank of x_dim > 1, the
* original x_dim is returned.
*/
static lite::DDim RowMatrixFromVector(const lite::DDim &x_dim) {
if (x_dim.size() > 1) {
return x_dim;
}
return lite::DDim({1, x_dim[0]});
}
/**
* Get column matrix shape from a vector shape. If the rank of y_dim > 1, the
* original y_dim is returned.
*/
static lite::DDim ColumnMatrixFromVector(const lite::DDim &y_dim) {
if (y_dim.size() > 1) {
return y_dim;
}
return lite::DDim({y_dim[0], 1});
}
/**
 * Matrix descriptor of a memory buffer.
 *
 * It is used for Blas::MatMul. The MatMul operator can be batched: if Mat A
 * is [BatchSize, H, W] and Mat B is [BatchSize, H, W], it amounts to
 * `batch_size` GEMMs. The batched GEMM could be faster depending on the
 * implementation of the BLAS library. The batch size could be zero. If any
 * matrix of `matmul` has a batch size, the result will be a batched GEMM,
 * too. E.g., if Mat A is [BatchSize, H1, W1] and Mat B is [W1, W2], the
 * result matrix will be [BatchSize, H1, W2].
 *
 * The boolean flag `trans_` describes whether the memory holds the transpose
 * of the matrix. If trans_ is true, the last two dims of the matrix are
 * transposed, i.e. the memory layout is [Width, Height] or
 * [BatchSize, Width, Height].
 *
 * The MatDescriptor is not only the dimension or shape of a matrix; it also
 * contains the layout and stride. It is clearer to have a dedicated
 * structure than to reuse `DDim`.
 */
struct MatDescriptor {
int64_t height_;
int64_t width_;
int64_t stride_{0};
int64_t batch_size_{0};
bool trans_;
};
static MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim,
int num_flatten_cols,
bool trans) {
MatDescriptor retv;
if (num_flatten_cols > 1) {
auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols);
retv.height_ = flatten_dim[0];
retv.width_ = flatten_dim[1];
} else {
if (tensor_dim.size() == 2) {
retv.height_ = tensor_dim[0];
retv.width_ = tensor_dim[1];
} else {
auto dim_vec = tensor_dim.Vectorize();
retv.batch_size_ = 1;
for (size_t i = 0; i < dim_vec.size() - 2; ++i) {
retv.batch_size_ *= dim_vec[i];
}
retv.height_ = dim_vec[dim_vec.size() - 2];
retv.width_ = dim_vec[dim_vec.size() - 1];
retv.stride_ = retv.height_ * retv.width_;
}
}
if (trans) {
std::swap(retv.width_, retv.height_);
}
retv.trans_ = trans;
return retv;
}
} // namespace math
} // namespace xpu
} // namespace lite
} // namespace paddle
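The helpers above are exactly the sequence the multi-encoder fuser applies to each FC weight later in this commit: find the absolute max, quantize FP32 to int16 against it, then transpose. A self-contained sketch of that round trip on a dummy 2x3 weight (the values and expected output are illustrative):

#include <cstdio>
#include <memory>
#include "lite/backends/xpu/math.h"

int main() {
  const int h = 2, w = 3, len = h * w;
  float weight[len] = {0.5f, -1.5f, 2.0f, -4.0f, 3.25f, 0.0f};

  // 1. Per-tensor scale: the largest absolute value, here 4.0f.
  float max_f = paddle::lite::xpu::math::FindMaxAbs(weight, len);

  // 2. Quantize: each element maps to round_half_to_even(x * 32767 / max_f).
  std::unique_ptr<int16_t[]> quant(new int16_t[len]);
  paddle::lite::xpu::math::ConvertFP32ToInt16(weight, quant.get(), max_f, len);

  // 3. Transpose [h, w] -> [w, h], the layout the fused FC expects.
  std::unique_ptr<int16_t[]> trans(new int16_t[len]);
  paddle::lite::xpu::math::Transpose(quant.get(), trans.get(), h, w);

  std::printf("max=%f quant[3]=%d (expect -32767)\n", max_f, quant[3]);
  return 0;
}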
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/xpu/target_wrapper.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
namespace paddle {
namespace lite {
void* TargetWrapperXPU::Malloc(size_t size) {
void* ptr{nullptr};
xpu_malloc(&ptr, size);
return ptr;
}
void TargetWrapperXPU::Free(void* ptr) { xpu_free(ptr); }
void TargetWrapperXPU::MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir) {
switch (dir) {
case IoDirection::HtoD:
xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE);
break;
case IoDirection::DtoH:
xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST);
break;
default:
LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
}
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/target_wrapper.h"
namespace paddle {
namespace lite {
using TargetWrapperXPU = TargetWrapper<TARGET(kXPU)>;
template <>
class TargetWrapper<TARGET(kXPU)> {
public:
static size_t num_devices() { return 1; }
static size_t maximum_stream() { return 0; }
static void* Malloc(size_t size);
static void Free(void* ptr);
static void MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir);
};
} // namespace lite
} // namespace paddle
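A minimal sketch of how the new wrapper moves data across the host/XPU boundary, assuming an XPU device and runtime are available; this mirrors what the io_copy kernels added by this commit do internally:

#include <vector>
#include "lite/backends/xpu/target_wrapper.h"

void RoundTrip() {
  using paddle::lite::IoDirection;
  using paddle::lite::TargetWrapperXPU;

  std::vector<float> host_src(1024, 1.0f), host_dst(1024, 0.0f);
  size_t bytes = host_src.size() * sizeof(float);

  void* dev = TargetWrapperXPU::Malloc(bytes);  // backed by xpu_malloc
  TargetWrapperXPU::MemcpySync(dev, host_src.data(), bytes,
                               IoDirection::HtoD);  // host -> device
  TargetWrapperXPU::MemcpySync(host_dst.data(), dev, bytes,
                               IoDirection::DtoH);  // device -> host
  TargetWrapperXPU::Free(dev);  // backed by xpu_free
}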
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#pragma GCC system_header
#include <xpu/api.h>
#include <xpu/golden.h>
#include <xpu/runtime.h>
#if defined(LITE_WITH_XTCL)
#include <xtcl/xtcl.h>
#endif
namespace paddle {
namespace lite {
namespace xdnn = baidu::xpu::api;
} // namespace lite
} // namespace paddle
@@ -5,6 +5,7 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc
     DEPS target_wrapper_host place
     X86_DEPS target_wrapper_x86
     CUDA_DEPS target_wrapper_cuda
+    XPU_DEPS target_wrapper_xpu
     CL_DEPS cl_target_wrapper
     FPGA_DEPS fpga_target_wrapper
     BM_DEPS target_wrapper_bm)
......
@@ -15,5 +15,11 @@

 #include "lite/core/context.h"

 namespace paddle {
-namespace lite {}  // namespace lite
+namespace lite {
+
+#ifdef LITE_WITH_XPU
+thread_local xdnn::Context* Context<TargetType::kXPU>::_tls_raw_ctx{nullptr};
+#endif
+
+}  // namespace lite
 }  // namespace paddle
@@ -24,6 +24,9 @@
 #include "lite/backends/opencl/cl_context.h"
 #include "lite/backends/opencl/cl_runtime.h"
 #endif
+#ifdef LITE_WITH_XPU
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#endif

 #include <map>
 #include <memory>
@@ -103,11 +106,38 @@ class Context<TargetType::kXPU> {
  public:
   Context() {}
   explicit Context(const XPUContext& ctx);

   // NOTE: InitOnce should only be used by ContextScheduler
   void InitOnce() {}

   void CopySharedTo(XPUContext* ctx) {}

+  static xdnn::Context* GetRawContext() {
+    if (_tls_raw_ctx == nullptr) {
+      _tls_raw_ctx = xdnn::create_context();
+      CHECK(_tls_raw_ctx);
+    }
+    return _tls_raw_ctx;
+  }
+
+  static void SetWorkspaceL3Size(int l3_size = 0xfffc00) {
+    xdnn::set_workspace_l3_size(GetRawContext(), l3_size);
+  }
+
+  static void SetDev(int dev_no = 0) {
+    const char* dev_env = getenv("LITE_XPU_DEV");
+    if (dev_env) {
+      xpu_set_device(atoi(dev_env));
+      return;
+    }
+
+    xpu_set_device(dev_no);
+  }
+
   std::string name() const { return "XPUContext"; }
+
+ private:
+  static thread_local xdnn::Context* _tls_raw_ctx;
 };
 #endif
......
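Because _tls_raw_ctx is thread_local, each worker thread lazily gets its own xdnn::Context. A sketch of the intended call order on a worker thread, assuming a binary built with LITE_WITH_XPU=ON:

#include "lite/core/context.h"

void ConfigureXPUWorkerThread() {
  using XPUCtx = paddle::lite::Context<paddle::lite::TargetType::kXPU>;
  // Pick the device first; a LITE_XPU_DEV environment variable takes
  // precedence over the argument, per SetDev() above.
  XPUCtx::SetDev(0);
  // The first call on each thread creates that thread's xdnn::Context via
  // xdnn::create_context() and caches it in the thread_local _tls_raw_ctx.
  paddle::lite::xdnn::Context* raw = XPUCtx::GetRawContext();
  (void)raw;  // handed to the XPU DNN kernels by their implementations
  // Carve out ~16 MB of L3 cache as scratch space for this thread.
  XPUCtx::SetWorkspaceL3Size(0xfffc00);
}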
@@ -45,6 +45,11 @@ void* TargetMalloc(TargetType target, size_t size) {
       data = TargetWrapper<TARGET(kBM)>::Malloc(size);
       break;
 #endif
+#ifdef LITE_WITH_XPU
+    case TargetType::kXPU:
+      data = TargetWrapperXPU::Malloc(size);
+      break;
+#endif  // LITE_WITH_XPU
     default:
       LOG(FATAL) << "Unknown supported target " << TargetToStr(target);
   }
@@ -83,6 +88,11 @@ void TargetFree(TargetType target, void* data, std::string free_flag) {
       TargetWrapper<TARGET(kBM)>::Free(data);
       break;
 #endif
+#ifdef LITE_WITH_XPU
+    case TargetType::kXPU:
+      TargetWrapperXPU::Free(data);
+      break;
+#endif  // LITE_WITH_XPU
     default:
       LOG(FATAL) << "Unknown type";
   }
......
@@ -31,6 +31,10 @@
 #include "lite/backends/bm/target_wrapper.h"
 #endif  // LITE_WITH_BM

+#ifdef LITE_WITH_XPU
+#include "lite/backends/xpu/target_wrapper.h"
+#endif  // LITE_WITH_XPU
+
 namespace paddle {
 namespace lite {
......
@@ -21,6 +21,8 @@ lite_cc_library(mir_passes
     fusion/elementwise_add_activation_fuse_pass.cc
     fusion/quant_dequant_fuse_pass.cc
     fusion/sequence_pool_concat_fuse_pass.cc
+    fusion/__xpu__resnet_fuse_pass.cc
+    fusion/__xpu__multi_encoder_fuse_pass.cc
     elimination/identity_scale_eliminate_pass.cc
     elimination/elementwise_mul_constant_eliminate_pass.cc
     static_kernel_pick_pass.cc
@@ -69,10 +71,10 @@ set(pattern_deps mir_node mir_ssa_graph op)
 if (WITH_TESTING)
     list(APPEND pattern_deps gtest)
 endif()
-lite_cc_library(pattern_matcher SRCS pattern_matcher.cc DEPS ${pattern_deps})
+lite_cc_library(pattern_matcher SRCS pattern_matcher.cc xpu_pattern_matcher.cc DEPS ${pattern_deps})

 lite_cc_test(test_pattern_matcher SRCS pattern_matcher_test.cc DEPS pattern_matcher)

-lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc DEPS pattern_matcher)
+lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc xpu_pattern_matcher_high_api.cc DEPS pattern_matcher)

 # for mobile, unnecessary to compile the following testings.
......
@@ -27,8 +27,8 @@
 #include "lite/utils/string.h"

 namespace paddle {
-namespace inference {
-namespace analysis {
+namespace lite {
+namespace mir {

 static size_t dot_node_counter{0};
@@ -162,6 +162,6 @@ class Dot {
   std::vector<Attr> attrs_;
 };

-}  // namespace analysis
-}  // namespace inference
+}  // namespace mir
+}  // namespace lite
 }  // namespace paddle
@@ -27,10 +27,10 @@ lite_cc_library(fuse_transpose_softmax_transpose
     DEPS pattern_matcher_high_api)
 lite_cc_library(fuse_interpolate
     SRCS interpolate_fuser.cc
     DEPS pattern_matcher_high_api)
 lite_cc_library(fuse_sequence_pool_concat
     SRCS sequence_pool_concat_fuser.cc
     DEPS pattern_matcher_high_api)

 set(mir_fusers
     fuse_fc
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cstring>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "lite/backends/xpu/math.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/xpu_pattern_matcher_high_api.h"
#include "lite/operators/subgraph_op.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class XPUSingleEncoderFuser : public FuseBase {
public:
explicit XPUSingleEncoderFuser(const std::string& act_type = "gelu")
: act_type_(act_type) {}
void BuildPattern() override {
auto* input = VarNode("input")
->assert_is_op_input("mul", "X")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* q_mul_y =
VarNode("q_mul_y")->assert_is_op_input("mul", "Y")->AsInput();
auto* q_mul = OpNode("q_mul", "mul");
auto* q_mul_out = VarNode("q_mul_out")
->assert_is_op_output("mul", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* q_add_y = VarNode("q_add_y")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* q_add = OpNode("q_add", "elementwise_add")->AsIntermediate();
auto* q_add_out = VarNode("q_add_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("reshape2", "X")
->AsIntermediate();
auto* q_reshape2 = OpNode("q_reshape2", "reshape2")->AsIntermediate();
auto* q_reshape2_out = VarNode("q_reshape2_out")
->assert_is_op_output("reshape2", "Out")
->assert_is_op_input("transpose2", "X")
->AsIntermediate();
auto* q_reshape2_xshape = VarNode("q_reshape2_xshape")
->assert_is_op_output("reshape2", "XShape")
->AsIntermediate();
auto* q_transpose2 = OpNode("q_transpose2", "transpose2")->AsIntermediate();
auto* q_transpose2_out = VarNode("q_transpose2_out")
->assert_is_op_output("transpose2", "Out")
->assert_is_op_input("scale", "X")
->AsIntermediate();
auto* q_transpose2_xshape =
VarNode("q_transpose2_xshape")
->assert_is_op_output("transpose2", "XShape")
->AsIntermediate();
auto* q_scale = OpNode("q_scale", "scale")->AsIntermediate();
auto* q_scale_out = VarNode("q_scale_out")
->assert_is_op_output("scale", "Out")
->assert_is_op_input("matmul", "X")
->AsIntermediate();
auto* k_mul_y =
VarNode("k_mul_y")->assert_is_op_input("mul", "Y")->AsInput();
auto* k_mul = OpNode("k_mul", "mul")->AsIntermediate();
auto* k_mul_out = VarNode("k_mul_out")
->assert_is_op_output("mul", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* k_add_y = VarNode("k_add_y")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* k_add = OpNode("k_add", "elementwise_add")->AsIntermediate();
auto* k_add_out = VarNode("k_add_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("reshape2", "X")
->AsIntermediate();
auto* k_reshape2 = OpNode("k_reshape2", "reshape2")->AsIntermediate();
auto* k_reshape2_out = VarNode("k_reshape2_out")
->assert_is_op_output("reshape2", "Out")
->assert_is_op_input("transpose2", "X")
->AsIntermediate();
auto* k_reshape2_xshape = VarNode("k_reshape2_xshape")
->assert_is_op_output("reshape2", "XShape")
->AsIntermediate();
auto* k_transpose2 = OpNode("k_transpose2", "transpose2")->AsIntermediate();
auto* k_transpose2_out = VarNode("k_transpose2_out")
->assert_is_op_output("transpose2", "Out")
->assert_is_op_input("matmul", "Y")
->AsIntermediate();
auto* k_transpose2_xshape =
VarNode("k_transpose2_xshape")
->assert_is_op_output("transpose2", "XShape")
->AsIntermediate();
auto* qk_matmul = OpNode("qk_matmul", "matmul")->AsIntermediate();
auto* qk_matmul_out = VarNode("qk_matmul_out")
->assert_is_op_output("matmul", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* qk_mask = VarNode("qk_mask")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* qk_add = OpNode("qk_add", "elementwise_add")->AsIntermediate();
auto* qk_add_out = VarNode("qk_add_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("softmax", "X")
->AsIntermediate();
auto* qk_softmax = OpNode("qk_softmax", "softmax")->AsIntermediate();
auto* qk_softmax_out = VarNode("qk_softmax_out")
->assert_is_op_output("softmax", "Out")
->AsIntermediate();
auto* qk_dropout = OpNode("qk_dropout", "dropout")->AsIntermediate();
auto* qk_dropout_out = VarNode("qk_dropout_out")
->assert_is_op_output("dropout", "Out")
->assert_is_op_input("matmul", "X")
->AsIntermediate();
auto* qk_dropout_mask = VarNode("qk_dropout_mask")
->assert_is_op_output("dropout", "Mask")
->AsIntermediate();
auto* v_mul_y =
VarNode("v_mul_y")->assert_is_op_input("mul", "Y")->AsInput();
auto* v_mul = OpNode("v_mul", "mul")->AsIntermediate();
auto* v_mul_out = VarNode("v_mul_out")
->assert_is_op_output("mul", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* v_add_y = VarNode("v_add_y")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* v_add = OpNode("v_add", "elementwise_add")->AsIntermediate();
auto* v_add_out = VarNode("v_add_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("reshape2", "X")
->AsIntermediate();
auto* v_reshape2 = OpNode("v_reshape2", "reshape2")->AsIntermediate();
auto* v_reshape2_out = VarNode("v_reshape2_out")
->assert_is_op_output("reshape2", "Out")
->assert_is_op_input("transpose2", "X")
->AsIntermediate();
auto* v_reshape2_xshape = VarNode("v_reshape2_xshape")
->assert_is_op_output("reshape2", "XShape")
->AsIntermediate();
auto* v_transpose2 = OpNode("v_transpose2", "transpose2")->AsIntermediate();
auto* v_transpose2_out = VarNode("v_transpose2_out")
->assert_is_op_output("transpose2", "Out")
->assert_is_op_input("matmul", "Y")
->AsIntermediate();
auto* v_transpose2_xshape =
VarNode("v_transpose2_xshape")
->assert_is_op_output("transpose2", "XShape")
->AsIntermediate();
auto* qkv_matmul = OpNode("qkv_matmul", "matmul")->AsIntermediate();
auto* qkv_matmul_out = VarNode("qkv_matmul_out")
->assert_is_op_output("matmul", "Out")
->assert_is_op_input("transpose2", "X")
->AsIntermediate();
auto* qkv_transpose2 =
OpNode("qkv_transpose2", "transpose2")->AsIntermediate();
auto* qkv_transpose2_out = VarNode("qkv_transpose2_out")
->assert_is_op_output("transpose2", "Out")
->assert_is_op_input("reshape2", "X")
->AsIntermediate();
auto* qkv_transpose2_xshape =
VarNode("qkv_transpose2_xshape")
->assert_is_op_output("transpose2", "XShape")
->AsIntermediate();
auto* qkv_reshape2 = OpNode("qkv_reshape2", "reshape2")->AsIntermediate();
auto* qkv_reshape2_out = VarNode("qkv_reshape2_out")
->assert_is_op_output("reshape2", "Out")
->assert_is_op_input("mul", "X")
->AsIntermediate();
auto* qkv_reshape2_xshape = VarNode("qkv_reshape2_xshape")
->assert_is_op_output("reshape2", "XShape")
->AsIntermediate();
auto* qkv_mul_y =
VarNode("qkv_mul_y")->assert_is_op_input("mul", "Y")->AsInput();
auto* qkv_mul = OpNode("qkv_mul", "mul")->AsIntermediate();
auto* qkv_mul_out = VarNode("qkv_mul_out")
->assert_is_op_output("mul", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* qkv_add_y = VarNode("qkv_add_y")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* qkv_add = OpNode("qkv_add", "elementwise_add")->AsIntermediate();
auto* qkv_add_out = VarNode("qkv_add_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("dropout", "X")
->AsIntermediate();
auto* qkv_dropout = OpNode("qkv_dropout", "dropout")->AsIntermediate();
auto* qkv_dropout_out = VarNode("qkv_dropout_out")
->assert_is_op_output("dropout", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* qkv_dropout_mask = VarNode("qkv_dropout_mask")
->assert_is_op_output("dropout", "Mask")
->AsIntermediate();
auto* qkv_add_2 = OpNode("qkv_add_2", "elementwise_add")->AsIntermediate();
auto* qkv_add_2_out = VarNode("qkv_add_2_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("layer_norm", "X")
->AsIntermediate();
auto* qkv_ln_2_scale = VarNode("qkv_ln_2_scale")
->assert_is_op_input("layer_norm", "Scale")
->AsInput();
auto* qkv_ln_2_bias = VarNode("qkv_ln_2_bias")
->assert_is_op_input("layer_norm", "Bias")
->AsInput();
auto* qkv_ln_2 = OpNode("qkv_ln_2", "layer_norm")->AsIntermediate();
auto* qkv_ln_2_out = VarNode("qkv_ln_2_out")
->assert_is_op_output("layer_norm", "Y")
->assert_is_op_input("mul", "X")
->assert_is_op_input("elementwise_add", "Y")
->AsIntermediate();
auto* qkv_ln_2_mean = VarNode("qkv_ln_2_mean")
->assert_is_op_output("layer_norm", "Mean")
->AsIntermediate();
auto* qkv_ln_2_var = VarNode("qkv_ln_2_var")
->assert_is_op_output("layer_norm", "Variance")
->AsIntermediate();
auto* qkv_mul_3_y =
VarNode("qkv_mul_3_y")->assert_is_op_input("mul", "Y")->AsInput();
auto* qkv_mul_3 = OpNode("qkv_mul_3", "mul")->AsIntermediate();
auto* qkv_mul_3_out = VarNode("qkv_mul_3_out")
->assert_is_op_output("mul", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* qkv_add_3_y = VarNode("qkv_add_3_y")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* qkv_add_3 = OpNode("qkv_add_3", "elementwise_add")->AsIntermediate();
auto* qkv_add_3_out = VarNode("qkv_add_3_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input(act_type_, "X")
->AsIntermediate();
auto* qkv_act = OpNode("qkv_act", act_type_)->AsIntermediate();
auto* qkv_act_out = VarNode("qkv_act_out")
->assert_is_op_output(act_type_, "Out")
->assert_is_op_input("mul", "X")
->AsIntermediate();
auto* qkv_mul_4_y =
VarNode("qkv_mul_4_y")->assert_is_op_input("mul", "Y")->AsInput();
auto* qkv_mul_4 = OpNode("qkv_mul_4", "mul")->AsIntermediate();
auto* qkv_mul_4_out = VarNode("qkv_mul_4_out")
->assert_is_op_output("mul", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* qkv_add_4_y = VarNode("qkv_add_4_y")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* qkv_add_4 = OpNode("qkv_add_4", "elementwise_add")->AsIntermediate();
auto* qkv_add_4_out = VarNode("qkv_add_4_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("dropout", "X")
->AsIntermediate();
auto* qkv_dropout_4 = OpNode("qkv_dropout_4", "dropout")->AsIntermediate();
auto* qkv_dropout_4_out = VarNode("qkv_dropout_4_out")
->assert_is_op_output("dropout", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* qkv_dropout_4_mask = VarNode("qkv_dropout_4_mask")
->assert_is_op_output("dropout", "Mask")
->AsIntermediate();
auto* qkv_add_5 = OpNode("qkv_add_5", "elementwise_add")->AsIntermediate();
auto* qkv_add_5_out = VarNode("qkv_add_5_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("layer_norm", "X")
->AsIntermediate();
auto* qkv_ln_5_scale = VarNode("qkv_ln_5_scale")
->assert_is_op_input("layer_norm", "Scale")
->AsInput();
auto* qkv_ln_5_bias = VarNode("qkv_ln_5_bias")
->assert_is_op_input("layer_norm", "Bias")
->AsInput();
auto* qkv_ln_5 = OpNode("qkv_ln_5", "layer_norm")->AsIntermediate();
auto* qkv_ln_5_out = VarNode("qkv_ln_5_out")
->assert_is_op_output("layer_norm", "Y")
->AsOutput();
auto* qkv_ln_5_mean = VarNode("qkv_ln_5_mean")
->assert_is_op_output("layer_norm", "Mean")
->AsIntermediate();
auto* qkv_ln_5_var = VarNode("qkv_ln_5_var")
->assert_is_op_output("layer_norm", "Variance")
->AsIntermediate();
// TODO(miaotianxiang): use LinksFrom/LinksTo() instead
*input >> *q_mul >> *q_mul_out >> *q_add >> *q_add_out >> *q_reshape2 >>
*q_reshape2_out >> *q_transpose2 >> *q_transpose2_out >> *q_scale >>
*q_scale_out >> *qk_matmul;
*q_mul_y >> *q_mul;
*q_add_y >> *q_add;
*q_reshape2 >> *q_reshape2_xshape;
*q_transpose2 >> *q_transpose2_xshape;
*input >> *k_mul >> *k_mul_out >> *k_add >> *k_add_out >> *k_reshape2 >>
*k_reshape2_out >> *k_transpose2 >> *k_transpose2_out >> *qk_matmul;
*k_mul_y >> *k_mul;
*k_add_y >> *k_add;
*k_reshape2 >> *k_reshape2_xshape;
*k_transpose2 >> *k_transpose2_xshape;
*qk_matmul >> *qk_matmul_out >> *qk_add >> *qk_add_out >> *qk_softmax >>
*qk_softmax_out >> *qk_dropout >> *qk_dropout_out >> *qkv_matmul;
*qk_mask >> *qk_add;
*qk_dropout >> *qk_dropout_mask;
*input >> *v_mul >> *v_mul_out >> *v_add >> *v_add_out >> *v_reshape2 >>
*v_reshape2_out >> *v_transpose2 >> *v_transpose2_out >> *qkv_matmul;
*v_mul_y >> *v_mul;
*v_add_y >> *v_add;
*v_reshape2 >> *v_reshape2_xshape;
*v_transpose2 >> *v_transpose2_xshape;
*qkv_matmul >> *qkv_matmul_out >> *qkv_transpose2 >> *qkv_transpose2_out >>
*qkv_reshape2 >> *qkv_reshape2_out >> *qkv_mul >> *qkv_mul_out >>
*qkv_add >> *qkv_add_out >> *qkv_dropout >> *qkv_dropout_out >>
*qkv_add_2;
*qkv_transpose2 >> *qkv_transpose2_xshape;
*qkv_reshape2 >> *qkv_reshape2_xshape;
*qkv_mul_y >> *qkv_mul;
*qkv_add_y >> *qkv_add;
*qkv_dropout >> *qkv_dropout_mask;
*input >> *qkv_add_2 >> *qkv_add_2_out >> *qkv_ln_2 >> *qkv_ln_2_out;
*qkv_ln_2_scale >> *qkv_ln_2;
*qkv_ln_2_bias >> *qkv_ln_2;
*qkv_ln_2 >> *qkv_ln_2_mean;
*qkv_ln_2 >> *qkv_ln_2_var;
*qkv_ln_2_out >> *qkv_mul_3 >> *qkv_mul_3_out >> *qkv_add_3 >>
*qkv_add_3_out >> *qkv_act >> *qkv_act_out >> *qkv_mul_4 >>
*qkv_mul_4_out >> *qkv_add_4 >> *qkv_add_4_out >> *qkv_dropout_4 >>
*qkv_dropout_4_out >> *qkv_add_5;
*qkv_mul_3_y >> *qkv_mul_3;
*qkv_add_3_y >> *qkv_add_3;
*qkv_mul_4_y >> *qkv_mul_4;
*qkv_add_4_y >> *qkv_add_4;
*qkv_dropout_4 >> *qkv_dropout_4_mask;
*qkv_ln_2_out >> *qkv_add_5 >> *qkv_add_5_out >> *qkv_ln_5 >> *qkv_ln_5_out;
*qkv_ln_5_scale >> *qkv_ln_5;
*qkv_ln_5_bias >> *qkv_ln_5;
*qkv_ln_5 >> *qkv_ln_5_mean;
*qkv_ln_5 >> *qkv_ln_5_var;
}
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
cpp::OpDesc op_desc;
op_desc.SetType("single_encoder");
op_desc.SetInput("Inputs", {matched.at("input")->arg()->name});
op_desc.SetInput("Mask", {matched.at("qk_mask")->arg()->name});
op_desc.SetInput("FCWeight",
{
matched.at("q_mul_y")->arg()->name,
matched.at("k_mul_y")->arg()->name,
matched.at("v_mul_y")->arg()->name,
matched.at("qkv_mul_y")->arg()->name,
matched.at("qkv_mul_3_y")->arg()->name,
matched.at("qkv_mul_4_y")->arg()->name,
});
op_desc.SetInput("FCBias",
{
matched.at("q_add_y")->arg()->name,
matched.at("k_add_y")->arg()->name,
matched.at("v_add_y")->arg()->name,
matched.at("qkv_add_y")->arg()->name,
matched.at("qkv_add_3_y")->arg()->name,
matched.at("qkv_add_4_y")->arg()->name,
});
op_desc.SetInput("LNScale",
{
matched.at("qkv_ln_2_scale")->arg()->name,
matched.at("qkv_ln_5_scale")->arg()->name,
});
op_desc.SetInput("LNBias",
{
matched.at("qkv_ln_2_bias")->arg()->name,
matched.at("qkv_ln_5_bias")->arg()->name,
});
op_desc.SetOutput("Outputs", {matched.at("qkv_ln_5_out")->arg()->name});
// XXX: keep these to fool SubgraphOp::AttachImpl()
op_desc.SetAttr<int>("sub_block", 0);
op_desc.SetAttr<std::vector<std::string>>("input_data_names", {});
op_desc.SetAttr<std::vector<std::string>>("output_data_names", {});
// extra traits to distill
auto* reshape_op_info = matched.at("q_reshape2")->stmt()->op_info();
auto reshape_dim = reshape_op_info->GetAttr<std::vector<int>>("shape");
op_desc.SetAttr<int>("head_num", reshape_dim[2]);
op_desc.SetAttr<int>("size_per_head", reshape_dim[3]);
op_desc.SetAttr<std::string>("act_type", act_type_);
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
auto* single_encoder_stmt = matched.at("q_mul")->stmt();
fake_subgraph_op->Attach(op_desc, single_encoder_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(single_encoder_stmt->op()->valid_places());
single_encoder_stmt->SetOp(fake_subgraph_op);
std::vector<std::string> froms = {
"qk_mask",
"k_mul_y",
"v_mul_y",
"qkv_mul_y",
"qkv_mul_3_y",
"qkv_mul_4_y",
"q_add_y",
"k_add_y",
"v_add_y",
"qkv_add_y",
"qkv_add_3_y",
"qkv_add_4_y",
"qkv_ln_2_scale",
"qkv_ln_2_bias",
"qkv_ln_5_scale",
"qkv_ln_5_bias",
};
for (auto& from : froms) {
IR_NODE_LINK_TO(matched.at(from), matched.at("q_mul"));
}
IR_OP_VAR_LINK(matched.at("q_mul"), matched.at("qkv_ln_5_out"));
}
private:
std::string act_type_;
};
class XPUMultiEncoderFuser {
public:
bool IsDirectPredecessorOf(Node* op1, Node* op2) {
for (auto* out : op1->outlinks) {
for (auto* in : op2->inlinks) {
if (out == in) return true;
}
}
return false;
}
void operator()(SSAGraph* graph) {
std::vector<Node*> all_encoders;
for (auto* node : graph->StmtTopologicalOrder()) {
CHECK(node->IsStmt());
if (node->stmt()->op_info()->Type() == "single_encoder") {
all_encoders.push_back(node);
}
}
VLOG(3) << "Found " << all_encoders.size() << " single_encoder";
if (all_encoders.size() == 0) {
return;
}
// TODO(miaotianxiang): more verification
for (size_t i = 0; i < all_encoders.size() - 1; ++i) {
CHECK(IsDirectPredecessorOf(all_encoders[i], all_encoders[i + 1]));
}
std::string mask_name;
for (auto* encoder : all_encoders) {
auto* op_info = encoder->stmt()->op_info();
if (mask_name.empty()) {
mask_name = op_info->Input("Mask").front();
} else {
// CHECK(mask_name == op_info->Input("Mask").front());
}
}
std::unordered_set<const Node*> to_remove;
Node* first_encoder = all_encoders[0];
std::string in_name, out_name;
std::vector<std::string> arg_names{
"FCWeight", "FCBias", "LNScale", "LNBias"};
std::unordered_map<std::string, std::vector<std::string>> arg_map;
for (size_t i = 0; i < all_encoders.size(); ++i) {
Node* cur_encoder = all_encoders[i];
auto* op_info = cur_encoder->stmt()->op_info();
for (auto arg_name : arg_names) {
auto real_names = op_info->Input(arg_name);
for (auto name : real_names) {
auto* arg_node = graph->RetrieveArgument(name);
DirectedLink(arg_node, first_encoder);
arg_map[arg_name].push_back(name);
}
}
auto* cur_out =
graph->RetrieveArgument(op_info->Output("Outputs").front());
if (i == 0) {
// first encoder
to_remove.insert(cur_out);
in_name = op_info->Input("Inputs").front();
mask_name = op_info->Input("Mask").front();
} else if (i == all_encoders.size() - 1) {
// last encoder
to_remove.insert(cur_encoder);
DirectedLink(first_encoder, cur_out);
out_name = op_info->Output("Outputs").front();
} else {
to_remove.insert(cur_encoder);
to_remove.insert(cur_out);
}
}
GraphSafeRemoveNodes(graph, to_remove);
auto* multi_encoder_stmt = first_encoder->stmt();
cpp::OpDesc op_desc;
op_desc.SetType("__xpu__multi_encoder");
op_desc.SetInput("Input", {in_name});
for (auto kv : arg_map) {
op_desc.SetInput(kv.first, kv.second);
}
op_desc.SetInput("Mask", {mask_name});
op_desc.SetOutput("Output", {out_name});
op_desc.SetAttr<int>("xpu", 1);
auto* first_encoder_op_info = multi_encoder_stmt->op_info();
op_desc.SetAttr<int>("head_num",
first_encoder_op_info->GetAttr<int>("head_num"));
op_desc.SetAttr<int>("size_per_head",
first_encoder_op_info->GetAttr<int>("size_per_head"));
op_desc.SetAttr<int>("n_layers", all_encoders.size());
op_desc.SetAttr<std::string>(
"act_type", first_encoder_op_info->GetAttr<std::string>("act_type"));
auto* scope = multi_encoder_stmt->op()->scope();
std::vector<float> fc_weight_max(arg_map["FCWeight"].size());
auto& fc_weight_names = arg_map["FCWeight"];
for (size_t i = 0; i < fc_weight_names.size(); ++i) {
auto* weight_t = scope->FindMutableTensor(fc_weight_names[i]);
auto weight_dims = weight_t->dims();
int weight_len = weight_t->numel();
float* weight_on_host = weight_t->mutable_data<float>();
float max_f =
paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len);
std::unique_ptr<int16_t[]> weight_int16(new int16_t[weight_len]);
std::unique_ptr<int16_t[]> weight_trans_int16(new int16_t[weight_len]);
paddle::lite::xpu::math::ConvertFP32ToInt16(
weight_on_host, weight_int16.get(), max_f, weight_len);
paddle::lite::xpu::math::Transpose(weight_int16.get(),
weight_trans_int16.get(),
weight_dims[0],
weight_dims[1]);
memcpy(weight_on_host,
weight_trans_int16.get(),
weight_len * sizeof(int16_t));
fc_weight_max[i] = max_f;
}
std::string max_name = "encoder_max";
auto* max_filter_node = graph->NewArgumentNode(max_name);
max_filter_node->arg()->is_weight = true;
max_filter_node->arg()->type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
DirectedLink(max_filter_node, first_encoder);
auto* max_filter_tensor = scope->NewTensor(max_name);
max_filter_tensor->Resize({static_cast<int>(fc_weight_max.size())});
memcpy(max_filter_tensor->mutable_data<float>(),
&fc_weight_max[0],
sizeof(float) * fc_weight_max.size());
op_desc.SetInput("FCWeightMax", {max_name});
auto multi_encoder_op = LiteOpRegistry::Global().Create(op_desc.Type());
multi_encoder_op->Attach(op_desc, scope);
multi_encoder_op->SetValidPlaces(multi_encoder_stmt->op()->valid_places());
auto kernels =
multi_encoder_op->CreateKernels(multi_encoder_op->valid_places());
multi_encoder_stmt->SetOp(multi_encoder_op);
multi_encoder_stmt->SetKernels(std::move(kernels));
// temp remove useless cast
std::unordered_set<const Node*> to_remove2;
Node* stack = nullptr;
for (auto* node : graph->StmtTopologicalOrder()) {
CHECK(node->IsStmt());
if (node->stmt()->op_info()->Type() == "stack") {
stack = node;
}
}
    if (stack == nullptr) return;  // no stack op found, nothing to clean up
    Node* stack_out = stack->outlinks.front();
for (Node* cast : stack_out->outlinks) {
Node* cast_out = cast->outlinks.front();
if (cast_out->outlinks.size() == 0) {
// remove
to_remove2.insert(cast_out);
to_remove2.insert(cast);
}
}
GraphSafeRemoveNodes(graph, to_remove2);
}
};
} // namespace fusion
class XPUMultiEncoderFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
// TODO(miaotianxiang): backup graph, recover from failed match
std::vector<std::string> act_types{"gelu", "relu"};
for (auto& act_type : act_types) {
fusion::XPUSingleEncoderFuser single_encoder_fuser(act_type);
single_encoder_fuser(graph.get());
fusion::XPUMultiEncoderFuser multi_encoder_fuser;
multi_encoder_fuser(graph.get());
}
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(__xpu__multi_encoder_fuse_pass,
paddle::lite::mir::XPUMultiEncoderFusePass)
.BindTargets({TARGET(kXPU)})
.BindKernel("matmul");
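For orientation, every VarNode/OpNode declaration and `>>` chain above follows the same shape; a stripped-down sketch of a FuseBase subclass that matches just a mul followed by elementwise_add, using only the pattern API shown in this file (illustrative, not part of this commit):

class ToyMulAddFuser : public FuseBase {
 public:
  void BuildPattern() override {
    // Declare pattern nodes: the variables and the ops between them.
    auto* x = VarNode("x")->assert_is_op_input("mul", "X")->AsInput();
    auto* y = VarNode("y")->assert_is_op_input("mul", "Y")->AsInput();
    auto* mul = OpNode("mul", "mul");
    auto* mul_out = VarNode("mul_out")
                        ->assert_is_op_output("mul", "Out")
                        ->assert_is_op_input("elementwise_add", "X")
                        ->AsIntermediate();
    auto* bias = VarNode("bias")
                     ->assert_is_op_input("elementwise_add", "Y")
                     ->AsInput();
    auto* add = OpNode("add", "elementwise_add")->AsIntermediate();
    auto* out = VarNode("out")
                    ->assert_is_op_output("elementwise_add", "Out")
                    ->AsOutput();
    // Wire the dataflow; each '>>' adds a directed edge to the pattern.
    *x >> *mul >> *mul_out >> *add >> *out;
    *y >> *mul;
    *bias >> *add;
  }
};

Once the pattern matches, InsertNewNode() rewrites the matched subgraph, exactly as XPUSingleEncoderFuser does above with its fake subgraph op.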
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <vector>
#include "lite/backends/xpu/math.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/xpu_pattern_matcher_high_api.h"
#include "lite/operators/subgraph_op.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class XPUResNetBlock0Fuser : public FuseBase {
public:
XPUResNetBlock0Fuser() {}
void BuildPattern() override {
auto* input =
VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput();
auto* left_conv1_weight = VarNode("left_conv1_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* left_conv1 = OpNode("left_conv1", "conv2d");
auto* left_conv1_out = VarNode("left_conv1_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* left_bn1_scale = VarNode("left_bn1_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* left_bn1_bias = VarNode("left_bn1_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* left_bn1_mean = VarNode("left_bn1_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* left_bn1_var = VarNode("left_bn1_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* left_bn1 = OpNode("left_bn1", "batch_norm")->AsIntermediate();
auto* left_bn1_out = VarNode("left_bn1_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("relu", "X")
->AsIntermediate();
auto* left_bn1_mean_out = VarNode("left_bn1_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* left_bn1_var_out =
VarNode("left_bn1_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* left_bn1_saved_mean =
VarNode("left_bn1_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* left_bn1_saved_var =
VarNode("left_bn1_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* left_relu1 = OpNode("left_relu1", "relu")->AsIntermediate();
auto* left_relu1_out = VarNode("left_relu1_out")
->assert_is_op_output("relu", "Out")
->assert_is_op_input("conv2d", "Input")
->AsIntermediate();
auto* left_conv2_weight = VarNode("left_conv2_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* left_conv2 = OpNode("left_conv2", "conv2d")->AsIntermediate();
auto* left_conv2_out = VarNode("left_conv2_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* left_bn2_scale = VarNode("left_bn2_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* left_bn2_bias = VarNode("left_bn2_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* left_bn2_mean = VarNode("left_bn2_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* left_bn2_var = VarNode("left_bn2_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* left_bn2 = OpNode("left_bn2", "batch_norm")->AsIntermediate();
auto* left_bn2_out = VarNode("left_bn2_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("relu", "X")
->AsIntermediate();
auto* left_bn2_mean_out = VarNode("left_bn2_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* left_bn2_var_out =
VarNode("left_bn2_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* left_bn2_saved_mean =
VarNode("left_bn2_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* left_bn2_saved_var =
VarNode("left_bn2_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* left_relu2 = OpNode("left_relu2", "relu")->AsIntermediate();
auto* left_relu2_out = VarNode("left_relu2_out")
->assert_is_op_output("relu", "Out")
->assert_is_op_input("conv2d", "Input")
->AsIntermediate();
auto* left_conv3_weight = VarNode("left_conv3_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* left_conv3 = OpNode("left_conv3", "conv2d")->AsIntermediate();
auto* left_conv3_out = VarNode("left_conv3_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* left_bn3_scale = VarNode("left_bn3_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* left_bn3_bias = VarNode("left_bn3_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* left_bn3_mean = VarNode("left_bn3_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* left_bn3_var = VarNode("left_bn3_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* left_bn3 = OpNode("left_bn3", "batch_norm")->AsIntermediate();
auto* left_bn3_out = VarNode("left_bn3_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("elementwise_add", "Y")
->AsIntermediate();
auto* left_bn3_mean_out = VarNode("left_bn3_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* left_bn3_var_out =
VarNode("left_bn3_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* left_bn3_saved_mean =
VarNode("left_bn3_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* left_bn3_saved_var =
VarNode("left_bn3_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* right_conv1_weight = VarNode("right_conv1_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* right_conv1 = OpNode("right_conv1", "conv2d")->AsIntermediate();
auto* right_conv1_out = VarNode("right_conv1_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* right_bn1_scale = VarNode("right_bn1_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* right_bn1_bias = VarNode("right_bn1_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* right_bn1_mean = VarNode("right_bn1_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* right_bn1_var = VarNode("right_bn1_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate();
auto* right_bn1_out = VarNode("right_bn1_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* right_bn1_mean_out =
VarNode("right_bn1_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* right_bn1_var_out =
VarNode("right_bn1_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* right_bn1_saved_mean =
VarNode("right_bn1_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* right_bn1_saved_var =
VarNode("right_bn1_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* add = OpNode("add", "elementwise_add")->AsIntermediate();
auto* add_out = VarNode("add_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("relu", "X")
->AsIntermediate();
auto* relu = OpNode("relu", "relu")->AsIntermediate();
auto* relu_out =
VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput();
*input >> *left_conv1 >> *left_conv1_out >> *left_bn1 >> *left_bn1_out >>
*left_relu1 >> *left_relu1_out >> *left_conv2 >> *left_conv2_out >>
*left_bn2 >> *left_bn2_out >> *left_relu2 >> *left_relu2_out >>
*left_conv3 >> *left_conv3_out >> *left_bn3 >> *left_bn3_out >> *add;
*left_conv1_weight >> *left_conv1;
*left_bn1_scale >> *left_bn1;
*left_bn1_bias >> *left_bn1;
*left_bn1_mean >> *left_bn1;
*left_bn1_var >> *left_bn1;
*left_bn1 >> *left_bn1_mean_out;
*left_bn1 >> *left_bn1_var_out;
*left_bn1 >> *left_bn1_saved_mean;
*left_bn1 >> *left_bn1_saved_var;
*left_conv2_weight >> *left_conv2;
*left_bn2_scale >> *left_bn2;
*left_bn2_bias >> *left_bn2;
*left_bn2_mean >> *left_bn2;
*left_bn2_var >> *left_bn2;
*left_bn2 >> *left_bn2_mean_out;
*left_bn2 >> *left_bn2_var_out;
*left_bn2 >> *left_bn2_saved_mean;
*left_bn2 >> *left_bn2_saved_var;
*left_conv3_weight >> *left_conv3;
*left_bn3_scale >> *left_bn3;
*left_bn3_bias >> *left_bn3;
*left_bn3_mean >> *left_bn3;
*left_bn3_var >> *left_bn3;
*left_bn3 >> *left_bn3_mean_out;
*left_bn3 >> *left_bn3_var_out;
*left_bn3 >> *left_bn3_saved_mean;
*left_bn3 >> *left_bn3_saved_var;
*input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >>
*right_bn1_out >> *add;
*right_conv1_weight >> *right_conv1;
*right_bn1_scale >> *right_bn1;
*right_bn1_bias >> *right_bn1;
*right_bn1_mean >> *right_bn1;
*right_bn1_var >> *right_bn1;
*right_bn1 >> *right_bn1_mean_out;
*right_bn1 >> *right_bn1_var_out;
*right_bn1 >> *right_bn1_saved_mean;
*right_bn1 >> *right_bn1_saved_var;
*add >> *add_out >> *relu >> *relu_out;
}
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
cpp::OpDesc op_desc;
op_desc.SetType("resnet_block0");
op_desc.SetInput("Inputs", {matched.at("input")->arg()->name});
op_desc.SetInput("Filter",
{
matched.at("left_conv1_weight")->arg()->name,
matched.at("left_conv2_weight")->arg()->name,
matched.at("left_conv3_weight")->arg()->name,
matched.at("right_conv1_weight")->arg()->name,
});
op_desc.SetInput("Scale",
{
matched.at("left_bn1_scale")->arg()->name,
matched.at("left_bn2_scale")->arg()->name,
matched.at("left_bn3_scale")->arg()->name,
matched.at("right_bn1_scale")->arg()->name,
});
op_desc.SetInput("Bias",
{
matched.at("left_bn1_bias")->arg()->name,
matched.at("left_bn2_bias")->arg()->name,
matched.at("left_bn3_bias")->arg()->name,
matched.at("right_bn1_bias")->arg()->name,
});
op_desc.SetInput("Mean",
{
matched.at("left_bn1_mean")->arg()->name,
matched.at("left_bn2_mean")->arg()->name,
matched.at("left_bn3_mean")->arg()->name,
matched.at("right_bn1_mean")->arg()->name,
});
op_desc.SetInput("Var",
{
matched.at("left_bn1_variance")->arg()->name,
matched.at("left_bn2_variance")->arg()->name,
matched.at("left_bn3_variance")->arg()->name,
matched.at("right_bn1_variance")->arg()->name,
});
op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name});
// XXX: keep these to fool SubgraphOp::AttachImpl()
op_desc.SetAttr<int>("sub_block", 0);
op_desc.SetAttr<std::vector<std::string>>("input_data_names", {});
op_desc.SetAttr<std::vector<std::string>>("output_data_names", {});
auto block0_stmt = matched.at("left_conv1")->stmt();
// block0_stmt->ResetOp(op_desc, graph->valid_places());
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places());
block0_stmt->SetOp(fake_subgraph_op);
std::vector<std::string> froms = {
"left_conv2_weight",
"left_conv3_weight",
"right_conv1_weight",
"left_bn1_bias",
"left_bn2_bias",
"left_bn3_bias",
"right_bn1_bias",
};
for (auto& from : froms) {
IR_NODE_LINK_TO(matched.at(from), matched.at("left_conv1"));
}
IR_OP_VAR_LINK(matched.at("left_conv1"), matched.at("relu_out"));
}
};
class XPUResNetBlock1Fuser : public FuseBase {
public:
XPUResNetBlock1Fuser() {}
void BuildPattern() override {
auto* input = VarNode("input")
->assert_is_op_input("conv2d", "Input")
->assert_is_op_input("elementwise_add", "X")
->AsInput();
auto* right_conv1_weight = VarNode("right_conv1_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* right_conv1 = OpNode("right_conv1", "conv2d");
auto* right_conv1_out = VarNode("right_conv1_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* right_bn1_scale = VarNode("right_bn1_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* right_bn1_bias = VarNode("right_bn1_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* right_bn1_mean = VarNode("right_bn1_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* right_bn1_var = VarNode("right_bn1_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate();
auto* right_bn1_out = VarNode("right_bn1_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("relu", "X")
->AsIntermediate();
auto* right_bn1_mean_out =
VarNode("right_bn1_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* right_bn1_var_out =
VarNode("right_bn1_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* right_bn1_saved_mean =
VarNode("right_bn1_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* right_bn1_saved_var =
VarNode("right_bn1_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* right_relu1 = OpNode("right_relu1", "relu")->AsIntermediate();
auto* right_relu1_out = VarNode("right_relu1_out")
->assert_is_op_output("relu", "Out")
->assert_is_op_input("conv2d", "Input")
->AsIntermediate();
auto* right_conv2_weight = VarNode("right_conv2_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* right_conv2 = OpNode("right_conv2", "conv2d")->AsIntermediate();
auto* right_conv2_out = VarNode("right_conv2_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* right_bn2_scale = VarNode("right_bn2_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* right_bn2_bias = VarNode("right_bn2_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* right_bn2_mean = VarNode("right_bn2_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* right_bn2_var = VarNode("right_bn2_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* right_bn2 = OpNode("right_bn2", "batch_norm")->AsIntermediate();
auto* right_bn2_out = VarNode("right_bn2_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("relu", "X")
->AsIntermediate();
auto* right_bn2_mean_out =
VarNode("right_bn2_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* right_bn2_var_out =
VarNode("right_bn2_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* right_bn2_saved_mean =
VarNode("right_bn2_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* right_bn2_saved_var =
VarNode("right_bn2_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* right_relu2 = OpNode("right_relu2", "relu")->AsIntermediate();
auto* right_relu2_out = VarNode("right_relu2_out")
->assert_is_op_output("relu", "Out")
->assert_is_op_input("conv2d", "Input")
->AsIntermediate();
auto* right_conv3_weight = VarNode("right_conv3_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* right_conv3 = OpNode("right_conv3", "conv2d")->AsIntermediate();
auto* right_conv3_out = VarNode("right_conv3_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* right_bn3_scale = VarNode("right_bn3_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* right_bn3_bias = VarNode("right_bn3_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* right_bn3_mean = VarNode("right_bn3_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* right_bn3_var = VarNode("right_bn3_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* right_bn3 = OpNode("right_bn3", "batch_norm")->AsIntermediate();
auto* right_bn3_out = VarNode("right_bn3_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("elementwise_add", "Y")
->AsIntermediate();
auto* right_bn3_mean_out =
VarNode("right_bn3_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* right_bn3_var_out =
VarNode("right_bn3_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* right_bn3_saved_mean =
VarNode("right_bn3_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* right_bn3_saved_var =
VarNode("right_bn3_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* add = OpNode("add", "elementwise_add")->AsIntermediate();
auto* add_out = VarNode("add_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("relu", "X")
->AsIntermediate();
auto* relu = OpNode("relu", "relu")->AsIntermediate();
auto* relu_out =
VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput();
*input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >>
*right_bn1_out >> *right_relu1 >> *right_relu1_out >> *right_conv2 >>
*right_conv2_out >> *right_bn2 >> *right_bn2_out >> *right_relu2 >>
*right_relu2_out >> *right_conv3 >> *right_conv3_out >> *right_bn3 >>
*right_bn3_out >> *add;
*right_conv1_weight >> *right_conv1;
*right_bn1_scale >> *right_bn1;
*right_bn1_bias >> *right_bn1;
*right_bn1_mean >> *right_bn1;
*right_bn1_var >> *right_bn1;
*right_bn1 >> *right_bn1_mean_out;
*right_bn1 >> *right_bn1_var_out;
*right_bn1 >> *right_bn1_saved_mean;
*right_bn1 >> *right_bn1_saved_var;
*right_conv2_weight >> *right_conv2;
*right_bn2_scale >> *right_bn2;
*right_bn2_bias >> *right_bn2;
*right_bn2_mean >> *right_bn2;
*right_bn2_var >> *right_bn2;
*right_bn2 >> *right_bn2_mean_out;
*right_bn2 >> *right_bn2_var_out;
*right_bn2 >> *right_bn2_saved_mean;
*right_bn2 >> *right_bn2_saved_var;
*right_conv3_weight >> *right_conv3;
*right_bn3_scale >> *right_bn3;
*right_bn3_bias >> *right_bn3;
*right_bn3_mean >> *right_bn3;
*right_bn3_var >> *right_bn3;
*right_bn3 >> *right_bn3_mean_out;
*right_bn3 >> *right_bn3_var_out;
*right_bn3 >> *right_bn3_saved_mean;
*right_bn3 >> *right_bn3_saved_var;
*input >> *add;
*add >> *add_out >> *relu >> *relu_out;
}
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
cpp::OpDesc op_desc;
op_desc.SetType("resnet_block1");
op_desc.SetInput("Inputs", {matched.at("input")->arg()->name});
op_desc.SetInput("Filter",
{
matched.at("right_conv1_weight")->arg()->name,
matched.at("right_conv2_weight")->arg()->name,
matched.at("right_conv3_weight")->arg()->name,
});
op_desc.SetInput("Scale",
{
matched.at("right_bn1_scale")->arg()->name,
matched.at("right_bn2_scale")->arg()->name,
matched.at("right_bn3_scale")->arg()->name,
});
op_desc.SetInput("Bias",
{
matched.at("right_bn1_bias")->arg()->name,
matched.at("right_bn2_bias")->arg()->name,
matched.at("right_bn3_bias")->arg()->name,
});
op_desc.SetInput("Mean",
{
matched.at("right_bn1_mean")->arg()->name,
matched.at("right_bn2_mean")->arg()->name,
matched.at("right_bn3_mean")->arg()->name,
});
op_desc.SetInput("Var",
{
matched.at("right_bn1_variance")->arg()->name,
matched.at("right_bn2_variance")->arg()->name,
matched.at("right_bn3_variance")->arg()->name,
});
op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name});
// XXX: keep these to fool SubgraphOp::AttachImpl()
op_desc.SetAttr<int>("sub_block", 0);
op_desc.SetAttr<std::vector<std::string>>("input_data_names", {});
op_desc.SetAttr<std::vector<std::string>>("output_data_names", {});
auto block1_stmt = matched.at("right_conv1")->stmt();
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places());
block1_stmt->SetOp(fake_subgraph_op);
std::vector<std::string> froms = {
"right_conv2_weight",
"right_conv3_weight",
"right_bn1_bias",
"right_bn2_bias",
"right_bn3_bias",
};
for (auto& from : froms) {
IR_NODE_LINK_TO(matched.at(from), matched.at("right_conv1"));
}
IR_OP_VAR_LINK(matched.at("right_conv1"), matched.at("relu_out"));
}
};
class XPUResNet50Fuser : public xpu::XPUFuseBase {
public:
XPUResNet50Fuser() {}
void BuildPattern() override {
auto* input =
VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput();
auto* top_conv_weight = VarNode("top_conv_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* top_conv = OpNode("top_conv", "conv2d");
auto* top_conv_out = VarNode("top_conv_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* top_bn_scale = VarNode("top_bn_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* top_bn_bias = VarNode("top_bn_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* top_bn_mean = VarNode("top_bn_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* top_bn_var = VarNode("top_bn_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* top_bn = OpNode("top_bn", "batch_norm")->AsIntermediate();
auto* top_bn_out = VarNode("top_bn_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("relu", "X")
->AsIntermediate();
auto* top_bn_mean_out = VarNode("top_bn_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* top_bn_var_out =
VarNode("top_bn_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* top_bn_saved_mean =
VarNode("top_bn_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* top_bn_saved_var =
VarNode("top_bn_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* top_relu = OpNode("top_relu", "relu")->AsIntermediate();
auto* top_relu_out = VarNode("top_relu_out")
->assert_is_op_output("relu", "Out")
->assert_is_op_input("pool2d", "X")
->AsIntermediate();
auto* top_pool = OpNode("top_pool", "pool2d")->AsIntermediate();
auto* top_pool_out = VarNode("top_pool_out")
->assert_is_op_output("pool2d", "Out")
->assert_is_op_input("resnet_block0", "Inputs")
->AsIntermediate();
// args are left out
auto* resnet_block0_1 =
OpNode("resnet_block0_1", "resnet_block0")->AsIntermediate();
auto* resnet_block0_1_out =
VarNode("resnet_block0_1_out")
->assert_is_op_output("resnet_block0", "Outputs")
->AsIntermediate();
auto* resnet_block1_1_1 =
OpNode("resnet_block1_1_1", "resnet_block1")->AsIntermediate();
auto* resnet_block1_1_1_out =
VarNode("resnet_block1_1_1_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_1_2 =
OpNode("resnet_block1_1_2", "resnet_block1")->AsIntermediate();
auto* resnet_block1_1_2_out =
VarNode("resnet_block1_1_2_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block0_2 =
OpNode("resnet_block0_2", "resnet_block0")->AsIntermediate();
auto* resnet_block0_2_out =
VarNode("resnet_block0_2_out")
->assert_is_op_output("resnet_block0", "Outputs")
->AsIntermediate();
auto* resnet_block1_2_1 =
OpNode("resnet_block1_2_1", "resnet_block1")->AsIntermediate();
auto* resnet_block1_2_1_out =
VarNode("resnet_block1_2_1_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_2_2 =
OpNode("resnet_block1_2_2", "resnet_block1")->AsIntermediate();
auto* resnet_block1_2_2_out =
VarNode("resnet_block1_2_2_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_2_3 =
OpNode("resnet_block1_2_3", "resnet_block1")->AsIntermediate();
auto* resnet_block1_2_3_out =
VarNode("resnet_block1_2_3_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block0_3 =
OpNode("resnet_block0_3", "resnet_block0")->AsIntermediate();
auto* resnet_block0_3_out =
VarNode("resnet_block0_3_out")
->assert_is_op_output("resnet_block0", "Outputs")
->AsIntermediate();
auto* resnet_block1_3_1 =
OpNode("resnet_block1_3_1", "resnet_block1")->AsIntermediate();
auto* resnet_block1_3_1_out =
VarNode("resnet_block1_3_1_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_3_2 =
OpNode("resnet_block1_3_2", "resnet_block1")->AsIntermediate();
auto* resnet_block1_3_2_out =
VarNode("resnet_block1_3_2_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_3_3 =
OpNode("resnet_block1_3_3", "resnet_block1")->AsIntermediate();
auto* resnet_block1_3_3_out =
VarNode("resnet_block1_3_3_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_3_4 =
OpNode("resnet_block1_3_4", "resnet_block1")->AsIntermediate();
auto* resnet_block1_3_4_out =
VarNode("resnet_block1_3_4_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_3_5 =
OpNode("resnet_block1_3_5", "resnet_block1")->AsIntermediate();
auto* resnet_block1_3_5_out =
VarNode("resnet_block1_3_5_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block0_4 =
OpNode("resnet_block0_4", "resnet_block0")->AsIntermediate();
auto* resnet_block0_4_out =
VarNode("resnet_block0_4_out")
->assert_is_op_output("resnet_block0", "Outputs")
->AsIntermediate();
auto* resnet_block1_4_1 =
OpNode("resnet_block1_4_1", "resnet_block1")->AsIntermediate();
auto* resnet_block1_4_1_out =
VarNode("resnet_block1_4_1_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_4_2 =
OpNode("resnet_block1_4_2", "resnet_block1")->AsIntermediate();
auto* resnet_block1_4_2_out =
VarNode("resnet_block1_4_2_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* bottom_pool = OpNode("bottom_pool", "pool2d")->AsIntermediate();
auto* bottom_pool_out = VarNode("bottom_pool_out")
->assert_is_op_output("pool2d", "Out")
->AsOutput();
*input >> *top_conv >> *top_conv_out >> *top_bn >> *top_bn_out >>
*top_relu >> *top_relu_out >> *top_pool >> *top_pool_out >>
*resnet_block0_1 >> *resnet_block0_1_out >> *resnet_block1_1_1 >>
*resnet_block1_1_1_out >> *resnet_block1_1_2 >>
*resnet_block1_1_2_out >> *resnet_block0_2 >> *resnet_block0_2_out >>
*resnet_block1_2_1 >> *resnet_block1_2_1_out >> *resnet_block1_2_2 >>
*resnet_block1_2_2_out >> *resnet_block1_2_3 >>
*resnet_block1_2_3_out >> *resnet_block0_3 >> *resnet_block0_3_out >>
*resnet_block1_3_1 >> *resnet_block1_3_1_out >> *resnet_block1_3_2 >>
*resnet_block1_3_2_out >> *resnet_block1_3_3 >>
*resnet_block1_3_3_out >> *resnet_block1_3_4 >>
*resnet_block1_3_4_out >> *resnet_block1_3_5 >>
*resnet_block1_3_5_out >> *resnet_block0_4 >> *resnet_block0_4_out >>
*resnet_block1_4_1 >> *resnet_block1_4_1_out >> *resnet_block1_4_2 >>
*resnet_block1_4_2_out >> *bottom_pool >> *bottom_pool_out;
*top_conv_weight >> *top_conv;
*top_bn_scale >> *top_bn;
*top_bn_bias >> *top_bn;
*top_bn_mean >> *top_bn;
*top_bn_var >> *top_bn;
*top_bn >> *top_bn_mean_out;
*top_bn >> *top_bn_var_out;
*top_bn >> *top_bn_saved_mean;
*top_bn >> *top_bn_saved_var;
}
void InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched,
const std::vector<Node*>& extra_input_vars) override {
cpp::OpDesc op_desc;
op_desc.SetType("__xpu__resnet50");
op_desc.SetInput("Input", {matched.at("input")->arg()->name});
std::vector<std::string> filter_name = {
matched.at("top_conv_weight")->arg()->name};
std::vector<std::string> scale_name = {
matched.at("top_bn_scale")->arg()->name};
std::vector<std::string> bias_name = {
matched.at("top_bn_bias")->arg()->name};
std::vector<std::string> mean_name = {
matched.at("top_bn_mean")->arg()->name};
std::vector<std::string> var_name = {
matched.at("top_bn_variance")->arg()->name};
std::vector<std::string> max_filter_name;
std::vector<std::string> resnet_block_vec = {
"resnet_block0_1",
"resnet_block1_1_1",
"resnet_block1_1_2",
"resnet_block0_2",
"resnet_block1_2_1",
"resnet_block1_2_2",
"resnet_block1_2_3",
"resnet_block0_3",
"resnet_block1_3_1",
"resnet_block1_3_2",
"resnet_block1_3_3",
"resnet_block1_3_4",
"resnet_block1_3_5",
"resnet_block0_4",
"resnet_block1_4_1",
"resnet_block1_4_2",
};
for (auto& block : resnet_block_vec) {
auto* block_op_info = matched.at(block)->stmt()->op_info();
auto block_filter_name = block_op_info->Input("Filter");
std::copy(block_filter_name.begin(),
block_filter_name.end(),
std::back_inserter(filter_name));
auto block_scale_name = block_op_info->Input("Scale");
std::copy(block_scale_name.begin(),
block_scale_name.end(),
std::back_inserter(scale_name));
auto block_bias_name = block_op_info->Input("Bias");
std::copy(block_bias_name.begin(),
block_bias_name.end(),
std::back_inserter(bias_name));
auto block_mean_name = block_op_info->Input("Mean");
std::copy(block_mean_name.begin(),
block_mean_name.end(),
std::back_inserter(mean_name));
auto block_var_name = block_op_info->Input("Var");
std::copy(block_var_name.begin(),
block_var_name.end(),
std::back_inserter(var_name));
}
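    // Scale, Mean and Var are not passed to the fused op: the
    // weight-preprocessing loop below folds them into Filter and Bias.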
op_desc.SetInput("Filter", filter_name);
op_desc.SetInput("Bias", bias_name);
op_desc.SetOutput("Output", {matched.at("bottom_pool_out")->arg()->name});
op_desc.SetAttr<int>("xpu", 1);
auto* resnet50_stmt = matched.at("top_conv")->stmt();
auto* scope = resnet50_stmt->op()->scope();
for (size_t i = 0; i < filter_name.size(); ++i) {
auto* filter_t = scope->FindMutableTensor(filter_name[i]);
auto* scale_t = scope->FindMutableTensor(scale_name[i]);
auto* bias_t = scope->FindMutableTensor(bias_name[i]);
auto* mean_t = scope->FindMutableTensor(mean_name[i]);
auto* var_t = scope->FindMutableTensor(var_name[i]);
int mean_len = mean_t->numel();
int filter_len = filter_t->numel();
int filter_stride = filter_len / mean_len;
float* filter_on_host = filter_t->mutable_data<float>();
float* scale_on_host = scale_t->mutable_data<float>();
float* bias_on_host = bias_t->mutable_data<float>();
float* mean_on_host = mean_t->mutable_data<float>();
float* var_on_host = var_t->mutable_data<float>();
      // Fold batch_norm into the preceding conv2d:
      //   scale' = scale / sqrt(var + eps)
      //   filter' = filter * scale'  (broadcast per output channel)
      //   bias'  = bias - mean * scale'
      for (int c = 0; c < mean_len; ++c) {
        scale_on_host[c] = scale_on_host[c] / sqrtf(var_on_host[c] + 0.00001f);
      }
      for (int c = 0; c < mean_len; ++c) {
        for (int j = 0; j < filter_stride; ++j) {
          filter_on_host[c * filter_stride + j] *= scale_on_host[c];
        }
      }
      for (int c = 0; c < mean_len; ++c) {
        bias_on_host[c] += -mean_on_host[c] * scale_on_host[c];
      }
float max_f =
paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len);
std::unique_ptr<int16_t[]> filter_int16(new int16_t[filter_len]);
paddle::lite::xpu::math::ConvertFP32ToInt16(
filter_on_host, filter_int16.get(), max_f, filter_len);
memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t));
// create new arg in graph and scope
std::string max_name = filter_name[i] + "_max";
max_filter_name.push_back(max_name);
auto* max_filter_node = graph->NewArgumentNode(max_name);
max_filter_node->arg()->is_weight = true;
max_filter_node->arg()->type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
DirectedLink(max_filter_node, matched.at("top_conv"));
auto* max_filter_t = scope->NewTensor(max_name);
max_filter_t->Resize({4});
float* max_ptr = max_filter_t->mutable_data<float>();
max_ptr[0] = max_f;
max_ptr[1] = max_f;
max_ptr[2] = max_f;
max_ptr[3] = max_f;
}
op_desc.SetInput("MaxFilter", max_filter_name);
auto resnet50_op = LiteOpRegistry::Global().Create(op_desc.Type());
resnet50_op->Attach(op_desc, scope);
resnet50_op->SetValidPlaces(resnet50_stmt->op()->valid_places());
auto kernels = resnet50_op->CreateKernels(resnet50_op->valid_places());
resnet50_stmt->SetOp(resnet50_op);
resnet50_stmt->SetKernels(std::move(kernels));
IR_NODE_LINK_TO(matched.at("top_bn_bias"), matched.at("top_conv"));
for (auto* node : extra_input_vars) {
IR_NODE_LINK_TO(node, matched.at("top_conv"));
}
IR_OP_VAR_LINK(matched.at("top_conv"), matched.at("bottom_pool_out"));
}
};
} // namespace fusion
class XPUResNet50FusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
fusion::XPUResNetBlock0Fuser block0_fuser;
block0_fuser(graph.get());
fusion::XPUResNetBlock1Fuser block1_fuser;
block1_fuser(graph.get());
fusion::XPUResNet50Fuser resnet50_fuser;
resnet50_fuser(graph.get());
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(__xpu__resnet_fuse_pass,
paddle::lite::mir::XPUResNet50FusePass)
.BindTargets({TARGET(kXPU)})
.BindKernel("conv2d");
@@ -26,15 +26,13 @@ namespace paddle {
 namespace lite {
 namespace mir {
 
-using inference::analysis::Dot;
-
 void GraphVisualizePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
   VLOG(5) << "\n" << Visualize(graph.get());
 }
 
 std::string Visualize(mir::SSAGraph* graph) {
   std::ostringstream os;
-  inference::analysis::Dot dot;
+  Dot dot;
   auto string_trunc = [](const std::string& str) -> std::string {
     const int max_disp_size = 100;
     if (str.length() > max_disp_size)
......
@@ -322,7 +322,6 @@ void PatternMatcher::RemoveOverlappedMatch(std::vector<subgraph_t> *subgraphs) {
 }
 
 std::string PMPattern::DotString() const {
-  using inference::analysis::Dot;
   Dot dot;
   int id = 0;
   // Create Nodes
......
@@ -64,7 +64,6 @@ class FuseBase {
  protected:
   virtual void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) = 0;
 
- private:
   void PerformPatternMatcher(SSAGraph* graph);
 
   // Delete nodes that are marked as Intermediate
......
@@ -213,9 +213,10 @@ std::vector<mir::Node *> SSAGraph::outputs() {
 }
 
 mir::Node *SSAGraph::RetrieveArgument(const std::string &arg) {
-  auto it = arguments_.find(arg);
-  if (it != arguments_.end()) {
-    return it->second;
+  for (auto &node : node_storage_) {
+    if (node.IsArg() && node.arg()->name == arg) {
+      return &node;
+    }
   }
   return nullptr;
 }
......
@@ -30,10 +30,8 @@ namespace paddle {
 namespace lite {
 namespace mir {
 
-using inference::analysis::Dot;
-
 std::string SubgraphVisualizer::operator()() {
-  inference::analysis::Dot dot;
+  Dot dot;
   const std::vector<std::string> subgraph_colors{
       "red",    "green",         "cyan",       "bisque3",
       "coral",  "darkseagreen1", "goldenrod1", "darkorchid",
......
@@ -200,7 +200,7 @@ TEST(Subgraph, detect_custom_model) {
 #ifdef LITE_WITH_NPU
       Place{TARGET(kNPU), PRECISION(kFloat)},
 #endif
-#ifdef LITE_WITH_XPU
+#ifdef LITE_WITH_XTCL
       Place{TARGET(kXPU), PRECISION(kFloat)},
 #endif
   });
......
@@ -20,6 +20,7 @@
 #include <vector>
 #include "lite/core/mir/pass_registry.h"
 #include "lite/core/mir/subgraph/subgraph_detector.h"
+#include "lite/utils/env.h"
 
 namespace paddle {
 namespace lite {
@@ -40,6 +41,7 @@ void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
 }
 
 void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
+  if (!GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
   std::unordered_set<std::string> supported_lists;
 #define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
 #include "lite/kernels/xpu/bridges/paddle_use_bridges.h"
......
@@ -180,7 +180,7 @@ TEST(Subgraph, generate_model_and_check_precision) {
 #ifdef LITE_WITH_NPU
   valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)});
 #endif
-#ifdef LITE_WITH_XPU
+#ifdef LITE_WITH_XTCL
   valid_places.push_back(lite_api::Place{TARGET(kXPU), PRECISION(kFloat)});
 #endif
   auto tar_predictor = TestModel(FLAGS_model_dir,
......
@@ -180,7 +180,7 @@ void TypeTargetTransformPass::AddIoCopyInst(
       VLOG(4) << "picked, opencl found";
       is_found = true;
     } else if (TypeCompatible(*in_arg_ty, from) &&
-               out_arg_ty->target() == to.target()) {
+               TargetCompatibleTo(*out_arg_ty, to)) {
       VLOG(4) << "picked";
       is_found = true;
     }
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <array>
#include <string>
#include <vector>
#include "lite/core/mir/dot.h"
#include "lite/core/mir/xpu_pattern_matcher.h"
#include "lite/core/op_lite.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
namespace mir {
namespace xpu {
void XPUPatternMatcher::operator()(SSAGraph *graph,
XPUPatternMatcher::handle_t handler) {
if (!MarkPMNodesInGraph(graph)) {
return;
}
auto subgraphs = DetectPatterns();
UniquePatterns(&subgraphs);
RemoveOverlappedMatch(&subgraphs);
ValidateByNodeRole(&subgraphs);
if (subgraphs.empty()) return;
LOG(INFO) << "detected " << subgraphs.size() << " subgraph";
int id = 0;
for (auto &g : subgraphs) {
VLOG(3) << "optimizing #" << id++ << " subgraph";
handler(g, graph);
}
}
bool XPUPatternMatcher::MarkPMNodesInGraph(SSAGraph *graph) {
VLOG(3) << "mark pmnodes in graph";
if (graph->nodes().empty()) return false;
for (auto &node : graph->mutable_nodes()) {
for (const auto &pmnode : pattern_.nodes()) {
if (pmnode->Tell(&node)) {
pmnodes2nodes_[pmnode.get()].insert(&node);
}
}
}
// Check to early stop if some PMNode can't find matched Node.
for (auto &pmnode : pattern_.nodes()) {
if (!pmnodes2nodes_.count(pmnode.get())) {
VLOG(4) << pmnode->name() << " can't find matched Node, early stop";
// return false;
}
}
VLOG(3) << pmnodes2nodes_.size() << " nodes marked";
return !pmnodes2nodes_.empty();
}
// An intermediate node may only link to nodes inside the pattern; otherwise
// the whole subgraph is dropped. Unlike the base PatternMatcher, external
// input variables do not disqualify a match here: ValidateByNodeRole collects
// them into extra_input_vars_ so the fuser can re-link them to the fused op.
void XPUPatternMatcher::ValidateByNodeRole(
std::vector<PatternMatcher::subgraph_t> *subgraphs) {
subgraphs->erase(
std::remove_if(subgraphs->begin(),
subgraphs->end(),
[](const XPUPatternMatcher::subgraph_t &subgraph) -> bool {
// Collect the inlinks and outlinks.
std::unordered_set<Node *> ios;
for (auto &item : subgraph) {
ios.insert(item.second);
}
for (auto &item : subgraph) {
if (item.first->IsIntermediate()) {
for (auto *x : item.second->outlinks) {
if (!ios.count(x)) {
return true;
}
}
}
}
return false;
}),
subgraphs->end());
for (auto &subgraph : *subgraphs) {
std::unordered_set<Node *> ios;
for (auto &item : subgraph) {
ios.insert(item.second);
}
extra_input_vars_.emplace_back();
for (auto &item : subgraph) {
for (auto *x : item.second->inlinks) {
if (x->IsArg() && ios.count(x) == 0) {
// extra weight var
extra_input_vars_.back().push_back(x);
}
}
}
}
}
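// A HitGroup is one partial match: a consistent assignment of pattern nodes
// (PMNode) to graph nodes that DetectPatterns() extends edge by edge.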
struct HitGroup {
std::unordered_map<PMNode *, Node *> roles;
bool Match(Node *node, PMNode *pat) {
if (nodes_.count(node)) {
if (roles.count(pat) && roles[pat] == node) return true;
return false;
} else {
if (roles.count(pat) && roles[pat] != node) return false;
return true;
}
}
void Register(Node *node, PMNode *pat) {
roles[pat] = node;
nodes_.insert(node);
}
private:
std::unordered_set<Node *> nodes_;
};
// Tell whether Node a links to b.
bool IsNodesLink(Node *a, Node *b) {
for (auto *node : a->outlinks) {
if (b == node) {
return true;
}
}
return false;
}
std::vector<PatternMatcher::subgraph_t> XPUPatternMatcher::DetectPatterns() {
// Init empty subgraphs.
std::vector<PatternMatcher::subgraph_t> result;
std::vector<HitGroup> init_groups;
std::array<std::vector<HitGroup>, 2> bi_records;
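  // bi_records ping-pongs between two buffers: bi_records[step % 2] holds the
  // partial matches accumulated so far, and the other slot receives those
  // matches extended along the edge currently being checked.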
auto *first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get()
: pattern_.edges().front().first;
if (!pmnodes2nodes_.count(first_pnode)) return result;
for (auto *node : pmnodes2nodes_[first_pnode]) {
HitGroup group;
group.roles[first_pnode] = node;
init_groups.emplace_back(group);
}
int step = 0;
bi_records[0] = std::move(init_groups);
// Extend a PMNode to subgraphs by deducing the connection relations defined
// in edges of PMNodes.
for (const auto &edge : pattern_.edges()) {
VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name();
// TODO(Superjomn) Fix bug here, the groups might be duplicate here.
// Each role has two PMNodes, which indicates two roles.
// Detect two Nodes that can match these two roles and they are connected.
auto &pre_groups = bi_records[step % 2];
auto &cur_groups = bi_records[1 - (step++ % 2)];
cur_groups.clear();
if (pre_groups.empty()) break;
// source -> target
for (Node *source : pmnodes2nodes_[edge.first]) {
for (Node *target : pmnodes2nodes_[edge.second]) {
// TODO(Superjomn) add some prune strategies.
for (const auto &group : pre_groups) {
if (IsNodesLink(source, target)) {
HitGroup new_group = group;
bool flag = new_group.Match(source, edge.first) &&
new_group.Match(target, edge.second);
if (flag) {
new_group.Register(source, edge.first);
new_group.Register(target, edge.second);
cur_groups.push_back(new_group);
// TODO(Superjomn) need to unique
}
}
}
}
}
VLOG(3) << "step " << step << " get records: " << cur_groups.size();
}
for (auto &group : bi_records[step % 2]) {
XPUPatternMatcher::subgraph_t subgraph;
for (auto &role : group.roles) {
subgraph.emplace(role.first, role.second);
}
result.emplace_back(subgraph);
}
return result;
}
struct GraphItemLessThan {
bool operator()(const std::pair<PMNode *, Node *> &a,
const std::pair<PMNode *, Node *> &b) {
if (a.first != b.first) {
return a.first < b.first;
} else {
return a.second < b.second;
}
}
};
// TODO(Superjomn) enhance the function as it may mark unique subgraphs as
// duplicates; see https://github.com/PaddlePaddle/Paddle/issues/13550
void XPUPatternMatcher::UniquePatterns(
std::vector<PatternMatcher::subgraph_t> *subgraphs) {
if (subgraphs->empty()) return;
std::vector<PatternMatcher::subgraph_t> result;
std::unordered_set<size_t> set;
std::hash<std::string> hasher;
for (auto &g : *subgraphs) {
// Sort the items in the sub-graph, and transform to a string key.
std::vector<std::pair<PMNode *, Node *>> sorted_keys(g.begin(), g.end());
std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemLessThan());
STL::stringstream ss;
for (auto &item : sorted_keys) {
ss << reinterpret_cast<size_t>(item.first) << ":"
<< reinterpret_cast<size_t>(item.second);
}
auto key = hasher(ss.str());
if (!set.count(key)) {
result.emplace_back(g);
set.insert(key);
}
}
*subgraphs = result;
}
void XPUPatternMatcher::RemoveOverlappedMatch(
std::vector<subgraph_t> *subgraphs) {
std::vector<subgraph_t> result;
std::unordered_set<Node *> node_set;
for (const auto &subgraph : *subgraphs) {
bool valid = true;
for (auto &item : subgraph) {
if (item.first->IsIntermediate() && node_set.count(item.second)) {
valid = false;
break;
}
}
if (valid) {
for (auto &item : subgraph) {
node_set.insert(item.second);
}
result.push_back(subgraph);
}
}
*subgraphs = result;
}
} // namespace xpu
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "lite/core/mir/pattern_matcher.h"
namespace paddle {
namespace lite {
namespace mir {
namespace xpu {
/*
 * XPUPatternMatcher helps to detect specific patterns in the graph.
 * Input a pattern, output a list of the matched subgraphs/nodes.
 * This helper can be used to support fusion (e.g. conv + batch_norm).
 *
 * The algorithm has three phases:
 * 1. Mark the nodes that match the defined PMNodes in a PMPattern,
 * 2. Extend a PMNode to subgraphs by deducing the connection relations
 *    defined in the PMPattern (the edges),
 * 3. Get the filtered subgraphs and treat them with a pre-defined handler.
 *
 * Usage:
 *   // Create a matcher
 *   XPUPatternMatcher matcher;
 *   // Define the matcher's pattern by adding PMNodes and defining the edges.
 *   auto* node0 = matcher.mutable_pattern()->AddNode(...);
 *   auto* node1 = matcher.mutable_pattern()->AddNode(...);
 *   node0->teller = some lambda;
 *   node1->teller = some lambda;
 *   matcher.mutable_pattern()->AddEdge(node0, node1);
 *   // Create a handler to define how to treat the matched subgraphs that
 *   // comply with the pattern.
 *   XPUPatternMatcher::handle_t handler = some lambda;
 *   // Execute the matcher.
 *   matcher(&graph, handler);
*/
struct XPUPatternMatcher {
using subgraph_t = std::unordered_map<PMNode*, Node*>;
// Operate on the detected pattern.
using handle_t =
      std::function<void(const subgraph_t& /*matched pattern*/, SSAGraph*)>;
void operator()(SSAGraph* graph, handle_t handler);
const PMPattern& pattern() const { return pattern_; }
PMPattern* mutable_pattern() { return &pattern_; }
  // Mark the nodes that fit the pattern.
  bool MarkPMNodesInGraph(SSAGraph* graph);
  // Detect all the patterns and output the hit records.
  std::vector<subgraph_t> DetectPatterns();
  // Remove duplicate patterns.
  void UniquePatterns(std::vector<subgraph_t>* subgraphs);
  // Remove overlapped matches; when two subgraphs overlap, keep the earlier
  // one. The intermediate nodes are deleted after fusion, so they cannot be
  // shared by multiple patterns.
  void RemoveOverlappedMatch(std::vector<subgraph_t>* subgraphs);
  // Validate that intermediate nodes are linked only to nodes inside the
  // pattern; collect external input vars into extra_input_vars_.
  void ValidateByNodeRole(std::vector<subgraph_t>* subgraphs);
using hit_rcd_t =
std::pair<Node* /*node in graph*/, PMNode* /*node in pattern*/>;
PMPattern pattern_;
std::unordered_map<const PMNode*, std::unordered_set<Node*>> pmnodes2nodes_;
std::vector<std::vector<Node*>> extra_input_vars_;
};
} // namespace xpu
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/xpu_pattern_matcher_high_api.h"
#include <set>
#include <unordered_set>
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace mir {
namespace xpu {
void XPUFuseBase::PerformPatternMatcher(SSAGraph *graph) {
VLOG(4) << "\n" << matcher_.pattern().DotString();
// Get subgraphs and record the mir::Node pointers for each PMNode.
auto handler = [&](const PatternMatcher::subgraph_t &subgraph, SSAGraph *g) {
    // get all the registered nodes.
key2nodes_.emplace_back();
for (auto &item : nodes_) {
key2nodes_.back()[item.first] = subgraph.at(item.second);
}
};
matcher_(graph, handler);
}
void XPUFuseBase::DeleteInterNodes(SSAGraph *graph) {
std::set<std::string> keys;
for (auto &node : nodes_) {
if (node.second->IsIntermediate()) {
keys.insert(node.first);
}
}
VLOG(4) << "keys: " << key2nodes_.size();
std::unordered_set<const Node *> nodes2rm;
for (auto &matched : key2nodes_) {
for (const auto &key : keys) {
nodes2rm.insert(matched.at(key));
}
}
VLOG(3) << "clean nodes " << nodes2rm.size();
GraphSafeRemoveNodes(graph, nodes2rm);
}
PMNode *XPUFuseBase::GetOrCreateNode(const std::string &key) {
auto it = nodes_.find(key);
if (it != nodes_.end()) {
return it->second;
}
nodes_.emplace(key,
matcher_.mutable_pattern()->NewNode(patterns::UniqueKey(key)));
it = nodes_.find(key);
return it->second;
}
PMNode *XPUFuseBase::OpNode(const std::string &key,
const std::string &op_type) {
GetOrCreateNode(key)->set_op_type(op_type);
GetOrCreateNode(key)->AsOp(op_type);
return GetOrCreateNode(key);
}
PMNode *XPUFuseBase::VarNode(const std::string &key) {
GetOrCreateNode(key)->AsVar();
return GetOrCreateNode(key);
}
} // namespace xpu
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <vector>
#include "lite/core/mir/pattern_matcher_high_api.h"
#include "lite/core/mir/xpu_pattern_matcher.h"
namespace paddle {
namespace lite {
namespace mir {
namespace xpu {
class XPUFuseBase {
public:
using key2nodes_t = std::map<std::string, Node*>;
virtual ~XPUFuseBase() = default;
void operator()(SSAGraph* graph) {
BuildPattern();
PerformPatternMatcher(graph);
for (size_t i = 0; i < key2nodes_.size(); ++i) {
InsertNewNode(graph, key2nodes_[i], matcher_.extra_input_vars_[i]);
}
DeleteInterNodes(graph);
}
// Build a PMPattern using PMNode.
virtual void BuildPattern() = 0;
// Generate an operator desc with a matched subgraph.
virtual cpp::OpDesc GenOpDesc(const key2nodes_t& matched) {
return cpp::OpDesc();
}
PMNode* OpNode(const std::string& key) {
return GetOrCreateNode(key)->assert_is_op();
}
PMNode* OpNode(const std::string& key, const std::string& op_type);
PMNode* VarNode(const std::string& key);
protected:
virtual void InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched,
const std::vector<Node*>& extra_input_vars) = 0;
void PerformPatternMatcher(SSAGraph* graph);
// Delete nodes that are marked as Intermediate
void DeleteInterNodes(SSAGraph* graph);
PMNode* GetOrCreateNode(const std::string& key);
protected:
XPUPatternMatcher matcher_;
std::map<std::string, PMNode*> nodes_;
std::vector<key2nodes_t> key2nodes_;
};
} // namespace xpu
} // namespace mir
} // namespace lite
} // namespace paddle
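A minimal sketch of an XPUFuseBase subclass (the op layout and the fused type
"my_fused_mul_relu" are hypothetical, not part of this patch); it follows the
same recipe as the ResNet fusers above: keep the first op node alive so its
Stmt can host the fused op, mark everything else intermediate, and re-link the
extra input vars collected by the XPU matcher:

class MulReluFuser : public xpu::XPUFuseBase {
 public:
  void BuildPattern() override {
    auto* x = VarNode("x")->assert_is_op_input("mul", "X")->AsInput();
    auto* w = VarNode("w")->assert_is_op_input("mul", "Y")->AsInput();
    auto* mul = OpNode("mul", "mul");  // survives; its Stmt hosts the fused op
    auto* mul_out = VarNode("mul_out")
                        ->assert_is_op_output("mul", "Out")
                        ->assert_is_op_input("relu", "X")
                        ->AsIntermediate();
    auto* relu = OpNode("relu", "relu")->AsIntermediate();
    auto* out = VarNode("out")->assert_is_op_output("relu", "Out")->AsOutput();
    *x >> *mul >> *mul_out >> *relu >> *out;
    *w >> *mul;
  }
  void InsertNewNode(SSAGraph* graph,
                     const key2nodes_t& matched,
                     const std::vector<Node*>& extra_input_vars) override {
    cpp::OpDesc op_desc;
    op_desc.SetType("my_fused_mul_relu");  // hypothetical fused op
    op_desc.SetInput("X", {matched.at("x")->arg()->name});
    op_desc.SetInput("W", {matched.at("w")->arg()->name});
    op_desc.SetOutput("Out", {matched.at("out")->arg()->name});
    auto* stmt = matched.at("mul")->stmt();
    auto op = LiteOpRegistry::Global().Create(op_desc.Type());
    op->Attach(op_desc, stmt->op()->scope());
    op->SetValidPlaces(stmt->op()->valid_places());
    stmt->SetOp(op);
    for (auto* node : extra_input_vars) {
      IR_NODE_LINK_TO(node, matched.at("mul"));
    }
    IR_OP_VAR_LINK(matched.at("mul"), matched.at("out"));
  }
};

As with XPUResNet50FusePass, such a fuser would be invoked from a
ProgramPass::Apply via MulReluFuser()(graph.get()).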
@@ -75,6 +75,8 @@ class Optimizer {
     (defined LITE_WITH_ARM)
          "lite_elementwise_add_activation_fuse_pass",  //
 #endif
+         "__xpu__resnet_fuse_pass",
+         "__xpu__multi_encoder_fuse_pass",
          "quantized_op_attributes_inference_pass",  // Only for fully
                                                     // quantized model, infer
                                                     // the output scale and
......
-if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XPU AND NOT LITE_WITH_BM)
+if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM)
   return()
 endif()
......
@@ -30,6 +30,8 @@ add_kernel(fc_compute_x86 X86 basic SRCS fc_compute.cc DEPS ${lite_kernel_deps}
 add_kernel(gru_compute_x86 X86 basic SRCS gru_compute.cc DEPS ${lite_kernel_deps} blas math_function sequence2batch gru_compute)
 #add_kernel(gru_compute_x86 X86 basic SRCS gru_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(sequence_expand_as_compute_x86 X86 basic SRCS sequence_expand_as_compute.cc DEPS ${lite_kernel_deps})
+add_kernel(sequence_unpad_compute_x86 X86 basic SRCS sequence_unpad_compute.cc DEPS ${lite_kernel_deps} sequence_padding)
+add_kernel(sequence_conv_compute_x86 X86 basic SRCS sequence_conv_compute.cc DEPS ${lite_kernel_deps} math_function blas context_project)
 # lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86)
 add_kernel(gather_compute_x86 X86 basic SRCS gather_compute.cc DEPS ${lite_kernel_deps} fluid_data_type)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/x86/sequence_conv_compute.h"
REGISTER_LITE_KERNEL(sequence_conv,
kX86,
kFloat,
kNCHW,
paddle::lite::kernels::x86::SequenceConvCompute<float>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <vector>
#include "lite/backends/x86/math/blas.h"
#include "lite/backends/x86/math/context_project.h"
#include "lite/backends/x86/math/math_function.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
namespace math = paddle::lite::x86::math;
template <typename T>
class SequenceConvCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::SequenceConvParam;
void Run() override {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<X86Context>();
auto* in = param.X;
auto* filter = param.Filter;
auto* out = param.Out;
out->template mutable_data<T>();
    CHECK(in->lod().size() == 1) << "Only one-level sequence is supported now";
int context_start = param.contextStart;
int context_stride = param.contextStride;
int context_length = param.contextLength;
bool padding_trainable = false;
const Tensor* padding_data = nullptr;
int up_pad = std::max(0, -context_start);
int down_pad = std::max(0, context_start + context_length - 1);
auto sequence_width = static_cast<int64_t>(in->dims()[1]);
std::vector<int64_t> col_shape{in->dims()[0],
context_length * sequence_width};
Tensor col;
col.Resize(col_shape);
col.mutable_data<T>();
    // Since padding_trainable is false, the padding data must be zeros, so
    // zero-fill the im2col buffer first.
math::SetConstant<TARGET(kX86), T> set_zero;
auto blas = math::GetBlas<TARGET(kX86), T>(ctx);
set_zero(ctx, &col, static_cast<T>(0));
math::ContextProjectFunctor<TARGET(kX86), T> seq_project_functor;
seq_project_functor(ctx,
*in,
padding_data,
padding_trainable,
context_start,
context_length,
context_stride,
up_pad,
down_pad,
&col);
blas.MatMul(col, *filter, out);
}
virtual ~SequenceConvCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
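To make the context-window arithmetic in SequenceConvCompute::Run() concrete,
a small standalone check (the window and width values are hypothetical): with
context_start = -1 and context_length = 3 the window covers one step back and
one step ahead, so one zero row is padded on each side and each im2col row
concatenates context_length input rows.

#include <algorithm>
#include <cassert>

int main() {
  const int context_start = -1, context_length = 3;  // hypothetical window
  const int up_pad = std::max(0, -context_start);    // zero rows above: 1
  const int down_pad =
      std::max(0, context_start + context_length - 1);  // zero rows below: 1
  assert(up_pad == 1 && down_pad == 1);
  const int sequence_width = 8;  // hypothetical feature width
  // The im2col buffer has context_length * sequence_width columns per row.
  assert(context_length * sequence_width == 24);
  return 0;
}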
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/x86/sequence_unpad_compute.h"
REGISTER_LITE_KERNEL(sequence_unpad,
kX86,
kFloat,
kNCHW,
paddle::lite::kernels::x86::SequenceUnpadCompute<float>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
.BindInput("Length",
{LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/backends/x86/math/sequence_padding.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
namespace math = paddle::lite::x86::math;
template <typename T>
class SequenceUnpadCompute
: public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::SequenceUnpadParam;
void Run() override {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<X86Context>();
param.Out->template mutable_data<T>();
int64_t padded_length = param.X->dims()[1];
math::UnpaddingLoDTensorFunctor<lite::TargetType::kX86, T>()(
ctx,
*param.X,
param.Out,
padded_length,
0,
false,
math::kBatchLengthWidth);
}
virtual ~SequenceUnpadCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
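
// For reference, sequence_unpad with the kBatchLengthWidth layout simply
// concatenates the valid leading rows of each padded sequence. The sketch
// below (standalone C++, made-up shapes and lengths) mimics what
// UnpaddingLoDTensorFunctor computes; it is illustrative, not the Lite code.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Padded input: batch of 2 sequences, padded_length = 3, width = 2;
  // real lengths are 2 and 1.
  std::vector<float> padded = {1, 2, 3, 4, 0, 0,   // seq 0: rows {1,2},{3,4}
                               5, 6, 0, 0, 0, 0};  // seq 1: row {5,6}
  std::vector<int64_t> length = {2, 1};
  int64_t padded_length = 3, width = 2;

  // Unpadding concatenates the valid rows of every sequence.
  std::vector<float> out;
  for (size_t i = 0; i < length.size(); ++i) {
    const float* seq = padded.data() + i * padded_length * width;
    out.insert(out.end(), seq, seq + length[i] * width);
  }
  for (float v : out) std::cout << v << " ";  // 1 2 3 4 5 6
  std::cout << "\n";
  return 0;
}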
if(NOT LITE_WITH_XPU)
return()
endif()
if(LITE_WITH_XTCL)
add_subdirectory(bridges)
add_kernel(subgraph_compute_xpu XPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_xpu subgraph_bridge_engine ${xpu_subgraph_bridges})
else()
add_kernel(conv_compute_xpu XPU basic SRCS conv_compute.cc DEPS ${lite_kernel_deps})
add_kernel(io_copy_compute_xpu XPU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} target_wrapper_xpu)
add_kernel(batch_norm_compute_xpu XPU basic SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps})
add_kernel(activation_compute_xpu XPU basic SRCS activation_compute.cc DEPS ${lite_kernel_deps})
add_kernel(pool_compute_xpu XPU basic SRCS pool_compute.cc DEPS ${lite_kernel_deps})
add_kernel(elementwise_compute_xpu XPU basic SRCS elementwise_compute.cc DEPS ${lite_kernel_deps})
add_kernel(mul_compute_xpu XPU basic SRCS mul_compute.cc DEPS ${lite_kernel_deps})
add_kernel(softmax_compute_xpu XPU basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps})
add_kernel(scale_compute_xpu XPU basic SRCS scale_compute.cc DEPS ${lite_kernel_deps})
add_kernel(lookup_table_compute_xpu XPU basic SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps})
add_kernel(layer_norm_compute_xpu XPU basic SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps})
add_kernel(dropout_compute_xpu XPU basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps})
add_kernel(matmul_compute_xpu XPU basic SRCS matmul_compute.cc DEPS ${lite_kernel_deps})
add_kernel(stack_compute_xpu XPU basic SRCS stack_compute.cc DEPS ${lite_kernel_deps})
add_kernel(slice_compute_xpu XPU basic SRCS slice_compute.cc DEPS ${lite_kernel_deps})
add_kernel(cast_compute_xpu XPU basic SRCS cast_compute.cc DEPS ${lite_kernel_deps})
add_kernel(__xpu__resnet50_compute_xpu XPU extra SRCS __xpu__resnet50_compute.cc DEPS ${lite_kernel_deps})
add_kernel(__xpu__multi_encoder_compute_xpu XPU extra SRCS __xpu__multi_encoder_compute.cc DEPS ${lite_kernel_deps})
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/__xpu__multi_encoder_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void XPUMultiEncoderCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
for (auto* fc_weight : param.fc_weight) {
arg_fc_weight_.push_back(
reinterpret_cast<const int16_t*>(fc_weight->data<float>()));
}
for (auto* fc_bias : param.fc_bias) {
arg_fc_bias_.push_back(fc_bias->data<float>());
}
for (auto* ln_scale : param.ln_scale) {
arg_ln_scale_.push_back(ln_scale->data<float>());
}
for (auto* ln_bias : param.ln_bias) {
arg_ln_bias_.push_back(ln_bias->data<float>());
}
if (param.act_type == "relu") {
act_type_ = xdnn::Activation_t::RELU;
}
}
void XPUMultiEncoderCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int batch_size = param.input->dims()[0];
int seq_len = param.input->dims()[1];
int r = xdnn::bert_encoder_transformer_int16<int16_t>(
ctx.GetRawContext(), /* context */
batch_size, /* batch_size */
seq_len, /* from_seq_len */
seq_len, /* to_seq_len */
param.head_num, /* head_num */
param.size_per_head, /* size_per_head */
param.n_layers, /* n_layers */
param.input->data<float>(), /* from_tensor */
param.input->data<float>(), /* to_tensor */
param.mask->data<float>(), /* att_mask */
&arg_fc_weight_[0], /* fc_weights */
&arg_fc_bias_[0], /* fc_biass */
&arg_ln_scale_[0], /* ln_scales */
&arg_ln_bias_[0], /* ln_biass */
param.output->mutable_data<float>(TARGET(kXPU)), /* output */
param.fc_weight_max->data<float>(), /* fc_weights_max */
true, /* pretrans_b */
true, /* use_l3 */
act_type_ /* act_type */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(__xpu__multi_encoder,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::XPUMultiEncoderCompute,
def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("FCWeight", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("FCBias", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("LNScale", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("LNBias", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Mask", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("FCWeightMax", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
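
// The PrepareForRun/Run split above gathers raw per-layer parameter pointers
// once, so the hot path only passes flat pointer arrays such as
// &arg_fc_weight_[0]. A generic sketch of that pattern follows (hypothetical
// Layer/weight names, no xdnn dependency; illustrative only).
#include <iostream>
#include <vector>

struct Layer { std::vector<float> weight; };

int main() {
  std::vector<Layer> layers(3, Layer{{1.f, 2.f}});

  // "PrepareForRun": collect stable raw pointers once.
  std::vector<const float*> weight_ptrs;
  for (const auto& l : layers) weight_ptrs.push_back(l.weight.data());

  // "Run": the fused kernel receives a single pointer-array argument, the
  // same way bert_encoder_transformer_int16 receives &arg_fc_weight_[0].
  const float** fc_weights = weight_ptrs.data();
  std::cout << "first weight of layer 0: " << fc_weights[0][0] << "\n";
  return 0;
}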
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class XPUMultiEncoderCompute
: public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::XPUMultiEncoderParam;
virtual void PrepareForRun();
virtual void Run();
private:
std::vector<const int16_t *> arg_fc_weight_;
std::vector<const float *> arg_fc_bias_;
std::vector<const float *> arg_ln_scale_;
std::vector<const float *> arg_ln_bias_;
xdnn::Activation_t act_type_{xdnn::Activation_t::GELU};
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/__xpu__resnet50_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void XPUResNet50Compute::PrepareForRun() {
auto& param = this->Param<param_t>();
for (auto* filter : param.filter) {
arg_filter_.push_back(
reinterpret_cast<const int16_t*>(filter->data<float>()));
}
for (auto* bias : param.bias) {
arg_bias_.push_back(bias->data<float>());
}
for (auto* max_filter : param.max_filter) {
arg_max_filter_.push_back(max_filter->data<float>());
}
}
void XPUResNet50Compute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int batch_size = param.input->dims()[0];
int r = xdnn::conv2d_int16_resnet<float, int16_t>(
ctx.GetRawContext(), /* context */
batch_size, /* num */
param.input->data<float>(), /* bottom */
&arg_filter_[0], /* weight_list */
param.output->mutable_data<float>(TARGET(kXPU)), /* top */
&arg_bias_[0], /* bias_list */
&arg_max_filter_[0] /* max_filter_list */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(__xpu__resnet50,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::XPUResNet50Compute,
def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("MaxFilter", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class XPUResNet50Compute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::XPUResNet50Param;
virtual void PrepareForRun();
virtual void Run();
private:
std::vector<const int16_t *> arg_filter_;
std::vector<const float *> arg_max_filter_;
std::vector<const float *> arg_bias_;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/activation_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void ReluCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int r = xdnn::activation_forward(
ctx.GetRawContext(), /* context */
xdnn::Activation_t::RELU, /* type */
param.X->numel(), /* len */
param.X->data<float>(), /* x */
param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
CHECK_EQ(r, 0);
}
void TanhCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int r = xdnn::activation_forward(
ctx.GetRawContext(), /* context */
xdnn::Activation_t::TANH, /* type */
param.X->numel(), /* len */
param.X->data<float>(), /* x */
param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
CHECK_EQ(r, 0);
}
void SigmoidCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int r = xdnn::activation_forward(
ctx.GetRawContext(), /* context */
xdnn::Activation_t::SIGMOID, /* type */
param.X->numel(), /* len */
param.X->data<float>(), /* x */
param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
relu, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::ReluCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(
tanh, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::TanhCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(sigmoid,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::SigmoidCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class ReluCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
virtual void Run();
virtual ~ReluCompute() = default;
};
class TanhCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
virtual void Run();
virtual ~TanhCompute() = default;
};
class SigmoidCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
virtual void Run();
virtual ~SigmoidCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/batch_norm_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void BatchNormCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
float epsilon = param.epsilon;
auto& x_dims = param.x->dims();
int r = xdnn::batch_norm_infer_forward(
ctx.GetRawContext(), /* context */
epsilon, /* epsilon */
x_dims[0], /* img_n */
x_dims[1], /* img_c */
x_dims[2], /* img_h */
x_dims[3], /* img_w */
param.x->data<float>(), /* img_gm */
param.y->mutable_data<float>(TARGET(kXPU)), /* out_gm */
param.scale->data<float>(), /* scale_gm */
param.bias->data<float>(), /* bias_gm */
param.mean->data<float>(), /* mean_gm */
param.variance->data<float>() /* var_gm */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(batch_norm,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::BatchNormCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Mean", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Variance", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class BatchNormCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::BatchNormParam;
virtual void Run();
virtual ~BatchNormCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
-if(NOT LITE_WITH_XPU)
+if(NOT LITE_WITH_XTCL)
 return()
 endif()
...
@@ -14,12 +14,12 @@
 #pragma once
-#include <xtcl/xtcl.h>
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <utility>
 #include <vector>
+#include "lite/backends/xpu/xpu_header_sitter.h"
 #include "lite/core/op_lite.h"
 #include "lite/core/tensor.h"
...
@@ -14,10 +14,10 @@
 #pragma once
-#include <xtcl/xtcl.h>
 #include <memory>
 #include <string>
 #include <vector>
+#include "lite/backends/xpu/xpu_header_sitter.h"
 #include "lite/core/op_lite.h"
 #include "lite/core/tensor.h"
...
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/cast_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
template <typename InType>
void CastCompute<InType>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<XPUContext>();
auto* x = param.X;
auto* out = param.Out;
int out_dtype = param.out_dtype;
auto* in_data = x->template data<InType>();
int numel = x->numel();
int r = 0;
// BOOL = 0; INT16 = 1; INT32 = 2; INT64 = 3; FP16 = 4; FP32 = 5; FP64 = 6;
// SIZE_T = 19; UINT8 = 20; INT8 = 21;
if (out_dtype == 5) {
auto* out_data = out->template mutable_data<float>(TARGET(kXPU));
r = xdnn::cast<InType, float>(
ctx.GetRawContext(), in_data, out_data, numel);
} else if (out_dtype == 2) {
auto* out_data = out->template mutable_data<int>(TARGET(kXPU));
r = xdnn::cast<InType, int>(ctx.GetRawContext(), in_data, out_data, numel);
} else if (out_dtype == 3) {
auto* out_data = out->template mutable_data<int64_t>(TARGET(kXPU));
r = xdnn::cast<InType, int64_t>(
ctx.GetRawContext(), in_data, out_data, numel);
} else {
CHECK(false);
}
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(cast,
kXPU,
kAny,
kNCHW,
paddle::lite::kernels::xpu::CastCompute<float>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kAny))})
.Finalize();
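
// The magic numbers in CastCompute<InType>::Run() are the framework dtype
// codes listed in the comment above. A standalone sketch of the same dispatch
// (OutTypeForDtypeCode is a hypothetical helper, not part of the sources)
// shows which codes the kernel accepts before CHECK(false) fires.
#include <iostream>
#include <stdexcept>
#include <string>

std::string OutTypeForDtypeCode(int out_dtype) {
  switch (out_dtype) {
    case 5: return "float";    // FP32 branch -> xdnn::cast<InType, float>
    case 2: return "int";      // INT32 branch -> xdnn::cast<InType, int>
    case 3: return "int64_t";  // INT64 branch -> xdnn::cast<InType, int64_t>
    default:
      // Any other code trips CHECK(false) in the kernel.
      throw std::runtime_error("unsupported out_dtype for XPU cast");
  }
}

int main() {
  for (int code : {2, 3, 5}) {
    std::cout << "out_dtype=" << code << " -> " << OutTypeForDtypeCode(code)
              << "\n";
  }
  return 0;
}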
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
template <typename InType>
class CastCompute : public KernelLite<TARGET(kXPU), PRECISION(kAny)> {
public:
using param_t = operators::CastParam;
void Run() override;
virtual ~CastCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/conv_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
template <>
void Conv2dCompute<PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto& x_dims = param.x->dims();
auto& w_dims = param.filter->dims();
int groups = param.groups;
auto& strides = param.strides;
auto paddings = *param.paddings;
auto dilations = *param.dilations;
int r = xdnn::conv2d_forward_int16<float, float, float, float>(
ctx.GetRawContext(), /* context */
x_dims[0], /* num */
x_dims[1], /* input_c */
x_dims[2], /* input_h */
x_dims[3], /* input_w */
w_dims[0], /* num_filter */
w_dims[2], /* kernel_h */
w_dims[3], /* kernel_w */
strides[0], /* stride_h */
strides[1], /* stride_w */
paddings[0], /* pad_h */
paddings[1], /* pad_w */
dilations[0], /* dilation_h */
dilations[1], /* dilation_w */
groups, /* group */
param.x->data<float>(), /* bottom */
param.filter->data<float>(), /* weight */
param.output->mutable_data<float>(TARGET(kXPU)), /* top */
nullptr, /* bias */
nullptr, /* branch */
xdnn::Activation_t::LINEAR, /* type */
nullptr, /* max_image_ptr */
nullptr, /* max_filter_ptr */
nullptr /* max_result_ptr */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
namespace xpu = paddle::lite::kernels::xpu;
using Conv2dFp32 = xpu::Conv2dCompute<PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(conv2d, kXPU, kFloat, kNCHW, Conv2dFp32, def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
template <PrecisionType FilterPtype>
class Conv2dCompute : public KernelLite<TARGET(kXPU), FilterPtype> {
public:
using param_t = operators::ConvParam;
virtual void Run();
virtual ~Conv2dCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/dropout_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void DropoutCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int size = param.x->numel() * sizeof(float);
int r = xdnn::memcpy_device(
ctx.GetRawContext(), /* context */
param.output->mutable_data<float>(TARGET(kXPU)), /* dst */
param.x->data<float>(), /* src */
size /* size */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(dropout,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::DropoutCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Mask", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class DropoutCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::DropoutParam;
virtual void Run();
virtual ~DropoutCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/elementwise_compute.h"
#include <functional>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void ElementwiseAddCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto& x_dims = param.X->dims().data();
auto& y_dims = param.Y->dims();
int axis = param.axis;
if (param.axis == -1) {
axis = x_dims.size() - y_dims.size();
}
int iter = std::accumulate(
x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies<int>());
int stride = param.Y->numel();
for (int i = 0; i < iter; ++i) {
const float* x_ptr = param.X->data<float>() + i * stride;
const float* y_ptr = param.Y->data<float>();
float* o_ptr = param.Out->mutable_data<float>(TARGET(kXPU)) + i * stride;
int r = xdnn::elementwise_add(ctx.GetRawContext(), /* context */
x_ptr, /* x */
y_ptr, /* y */
o_ptr, /* z */
stride /* len */);
CHECK_EQ(r, 0);
}
}
void ElementwiseSubCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto& x_dims = param.X->dims().data();
auto& y_dims = param.Y->dims();
int axis = param.axis;
if (param.axis == -1) {
axis = x_dims.size() - y_dims.size();
}
int iter = std::accumulate(
x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies<int>());
int stride = param.Y->numel();
for (int i = 0; i < iter; ++i) {
const float* x_ptr = param.X->data<float>() + i * stride;
const float* y_ptr = param.Y->data<float>();
float* o_ptr = param.Out->mutable_data<float>(TARGET(kXPU)) + i * stride;
int r = xdnn::elementwise_sub(ctx.GetRawContext(), /* context */
x_ptr, /* x */
y_ptr, /* y */
o_ptr, /* z */
stride /* len */);
CHECK_EQ(r, 0);
}
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(elementwise_add,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::ElementwiseAddCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_sub,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::ElementwiseSubCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
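
// Both elementwise kernels above tile Y along the leading dimensions of X:
// the input is viewed as `iter` contiguous blocks of `stride = Y->numel()`
// elements. The standalone sketch below (plain arrays, CPU-only, illustrative
// shapes) reproduces the same iter/stride decomposition.
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  // X is [2, 3], Y is [3]. With param.axis == -1 the kernel uses
  // axis = x_rank - y_rank = 1, so Y is broadcast over axis 0.
  std::vector<int> x_dims = {2, 3};
  std::vector<float> x = {1, 2, 3, 4, 5, 6};
  std::vector<float> y = {10, 20, 30};
  int y_rank = 1;
  int axis = static_cast<int>(x_dims.size()) - y_rank;

  // Same decomposition as ElementwiseAddCompute::Run().
  int iter = std::accumulate(
      x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies<int>());
  int stride = static_cast<int>(y.size());

  std::vector<float> out(x.size());
  for (int i = 0; i < iter; ++i) {
    for (int j = 0; j < stride; ++j) {
      // The XPU kernel replaces this inner loop with one
      // xdnn::elementwise_add call per block.
      out[i * stride + j] = x[i * stride + j] + y[j];
    }
  }
  for (float v : out) std::cout << v << ' ';  // prints: 11 22 33 14 25 36
  std::cout << '\n';
  return 0;
}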
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class ElementwiseAddCompute
: public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ElementwiseParam;
virtual void Run();
virtual ~ElementwiseAddCompute() = default;
};
class ElementwiseSubCompute
: public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ElementwiseParam;
virtual void Run();
virtual ~ElementwiseSubCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/xpu/target_wrapper.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
/*
* This kernel copies a tensor from host to XPU.
*/
class IoCopyHostToXPUCompute
: public KernelLite<TARGET(kXPU), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
void Run() override {
auto& param = Param<operators::IoCopyParam>();
CHECK(param.x->target() == TARGET(kHost) ||
param.x->target() == TARGET(kX86) ||
param.x->target() == TARGET(kARM));
auto mem_size = param.x->memory_size();
VLOG(4) << "host to xpu, copy size " << mem_size;
auto* data = param.y->mutable_data(TARGET(kXPU), mem_size);
TargetWrapperXPU::MemcpySync(
data, param.x->raw_data(), mem_size, IoDirection::HtoD);
}
std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
std::unique_ptr<type_infer_handler_t> res(new type_infer_handler_t);
*res = [](const std::map<std::string, const Type*>& inputs,
const std::string& out) -> const Type* {
CHECK(!inputs.empty());
auto* type = inputs.at("Input");
CHECK(type->target() == TARGET(kHost));
auto out_place = type->place();
out_place.target = TARGET(kXPU);
auto* out_type = Type::Get(type->id(),
out_place.target,
out_place.precision,
out_place.layout,
out_place.device);
return out_type;
};
return res;
}
std::string doc() const override { return "Copy IO from HOST to XPU"; }
};
/*
* This kernel copies a tensor from XPU to host.
*/
class IoCopyXPUToHostCompute
: public KernelLite<TARGET(kXPU), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
void Run() override {
auto& param = Param<operators::IoCopyParam>();
CHECK(param.x->target() == TARGET(kXPU));
auto mem_size = param.x->memory_size();
VLOG(4) << "xpu to host, copy size " << mem_size;
auto* data = param.y->mutable_data(TARGET(kHost), mem_size);
TargetWrapperXPU::MemcpySync(
data, param.x->raw_data(), mem_size, IoDirection::DtoH);
}
std::string doc() const override { return "Copy IO from XPU to HOST"; }
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(io_copy,
kXPU,
kAny,
kAny,
paddle::lite::kernels::xpu::IoCopyHostToXPUCompute,
host_to_device)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kXPU),
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(io_copy,
kXPU,
kAny,
kAny,
paddle::lite::kernels::xpu::IoCopyXPUToHostCompute,
device_to_host)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kXPU),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(io_copy_once,
kXPU,
kAny,
kAny,
paddle::lite::kernels::xpu::IoCopyHostToXPUCompute,
host_to_device)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kXPU),
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(io_copy_once,
kXPU,
kAny,
kAny,
paddle::lite::kernels::xpu::IoCopyXPUToHostCompute,
device_to_host)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kXPU),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/layer_norm_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void LayerNormCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto x_dims = param.X->dims();
auto axis = param.begin_norm_axis;
auto matrix_dim = x_dims.Flatten2D(axis);
float epsilon = param.epsilon;
int r = xdnn::layer_norm(ctx.GetRawContext(), /* context */
matrix_dim[0], /* m */
matrix_dim[1], /* n */
param.X->data<float>(), /* in */
param.Y->mutable_data<float>(TARGET(kXPU)), /* out */
param.Scale->data<float>(), /* scale */
param.Bias->data<float>(), /* bias */
epsilon /* epsilon */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(layer_norm,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::LayerNormCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Mean", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Variance", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class LayerNormCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::LayerNormParam;
virtual void Run();
virtual ~LayerNormCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/lookup_table_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void LookupTableCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int num = param.Ids->numel();
int embed_dim = param.W->dims()[1];
int r = xdnn::embedding<float, int64_t>(
ctx.GetRawContext(), /* context */
num, /* num */
param.Ids->data<int64_t>(), /* indices */
embed_dim, /* embed_dim */
param.W->data<float>(), /* table */
param.Out->mutable_data<float>(TARGET(kXPU)) /* top */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(lookup_table,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::LookupTableCompute,
def)
.BindInput("W", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class LookupTableCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::LookupTableParam;
virtual void Run();
virtual ~LookupTableCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/matmul_compute.h"
#include "lite/backends/xpu/math.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
namespace math = paddle::lite::xpu::math;
void MatMulCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto* x = param.X;
auto* y = param.Y;
auto* out = param.Out;
auto mat_dim_a = math::CreateMatrixDescriptor(
math::RowMatrixFromVector(x->dims()), 0, param.transpose_X);
auto mat_dim_b = math::CreateMatrixDescriptor(
math::ColumnMatrixFromVector(y->dims()), 0, param.transpose_Y);
int lda = (mat_dim_a.trans_ ? mat_dim_a.height_ : mat_dim_a.width_);
int ldb = (mat_dim_b.trans_ ? mat_dim_b.height_ : mat_dim_b.width_);
int ldc = mat_dim_b.width_;
int r = 0;
if (mat_dim_a.batch_size_ == 0 || mat_dim_a.batch_size_ == 1) {
r = xdnn::fc_int16(ctx.GetRawContext(), /* context */
mat_dim_a.trans_, /* TransA */
mat_dim_b.trans_, /* TransB */
mat_dim_a.height_, /* m */
mat_dim_b.width_, /* n */
mat_dim_a.width_, /* k */
param.alpha, /* alpha */
x->data<float>(), /* A */
y->data<float>(), /* B */
0.0f, /* beta */
out->mutable_data<float>(TARGET(kXPU)) /* C */);
} else {
// batch matmul
r = xdnn::gemm_strided_batched_int16<float, float, float>(
ctx.GetRawContext(), /* context */
mat_dim_a.trans_, /* TransA */
mat_dim_b.trans_, /* TransB */
mat_dim_a.batch_size_, /* batch_size */
mat_dim_a.height_, /* M */
mat_dim_b.width_, /* N */
mat_dim_a.width_, /* K */
param.alpha, /* alpha */
x->data<float>(), /* A */
lda, /* lda */
mat_dim_a.stride_, /* stride_a */
y->data<float>(), /* B */
ldb, /* ldb */
mat_dim_b.stride_, /* stride_b */
0.0f, /* beta */
out->mutable_data<float>(TARGET(kXPU)), /* C */
ldc, /* ldc */
mat_dim_a.height_ * mat_dim_b.width_ /* stride_c */);
}
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
matmul, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::MatMulCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
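
// The leading-dimension rule in MatMulCompute::Run() assumes row-major
// storage: the stored row length is width_ unless the operand is transposed,
// in which case the stored matrix is flipped and the row length is height_.
// A standalone sketch (MatDim is a hypothetical stand-in for the
// math::MatDescriptor fields the kernel reads):
#include <iostream>

struct MatDim {
  int height_;  // logical (post-transpose) rows
  int width_;   // logical (post-transpose) cols
  bool trans_;
};

int main() {
  // X: logical 4x3, stored transposed; Y: logical 3x5, stored as-is.
  MatDim a{4, 3, true};
  MatDim b{3, 5, false};

  // Same rule as the kernel.
  int lda = a.trans_ ? a.height_ : a.width_;
  int ldb = b.trans_ ? b.height_ : b.width_;
  int ldc = b.width_;

  std::cout << "lda=" << lda << " ldb=" << ldb << " ldc=" << ldc << "\n";
  // Expected: lda=4 ldb=5 ldc=5.
  return 0;
}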
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class MatMulCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::MatMulParam;
virtual void Run();
virtual ~MatMulCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/mul_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void MulCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto& origin_x = *param.x;
auto& origin_y = *param.y;
auto& x_dims = origin_x.dims();
auto& y_dims = origin_y.dims();
Tensor x_matrix, y_matrix;
if (x_dims.size() > 2) {
x_matrix = ReshapeToMatrix(origin_x, param.x_num_col_dims);
} else {
x_matrix = origin_x;
}
if (y_dims.size() > 2) {
y_matrix = ReshapeToMatrix(origin_y, param.y_num_col_dims);
} else {
y_matrix = origin_y;
}
int m = x_matrix.dims()[0];
int k = x_matrix.dims()[1];
int n = y_matrix.dims()[1];
int r =
xdnn::fc_int16(ctx.GetRawContext(), /* context */
false, /* TransA */
false, /* TransB */
m,
n,
k,
1.0f, /* alpha */
x_matrix.data<float>(), /* A */
y_matrix.data<float>(), /* B */
0.0f, /* beta */
param.output->mutable_data<float>(TARGET(kXPU)) /* C */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
mul, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::MulCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
static inline lite::Tensor ReshapeToMatrix(const lite::Tensor& src,
int num_col_dims) {
int rank = src.dims().size();
if (rank == 2) {
return src;
}
lite::Tensor res;
res.ShareDataWith(src);
res.Resize(src.dims().Flatten2D(num_col_dims));
return res;
}
class MulCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::MulParam;
virtual void Run();
virtual ~MulCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
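
// ReshapeToMatrix collapses the first num_col_dims dimensions into rows and
// the remaining dimensions into columns. The standalone sketch below uses a
// plain re-implementation of that collapse (illustrative only; the real
// kernel delegates to DDim::Flatten2D).
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> Flatten2D(const std::vector<int64_t>& dims, int col) {
  int64_t rows = 1, cols = 1;
  for (int i = 0; i < static_cast<int>(dims.size()); ++i) {
    (i < col ? rows : cols) *= dims[i];
  }
  return {rows, cols};
}

int main() {
  // mul with x_num_col_dims = 2: a [2, 3, 4, 5] input becomes a [6, 20]
  // matrix, so m = 6 and k = 20 in the fc_int16 call.
  auto m2d = Flatten2D({2, 3, 4, 5}, 2);
  std::cout << "[" << m2d[0] << ", " << m2d[1] << "]\n";  // [6, 20]
  return 0;
}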
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/pool_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void Pool2DCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto& x_dims = param.x->dims();
CHECK_EQ(x_dims.size(), 4);
auto& o_dims = param.output->dims();
CHECK_EQ(param.ksize.size(), 2);
if (param.global_pooling) {
param.ksize[0] = x_dims[2];
param.ksize[1] = x_dims[3];
}
CHECK_EQ(param.strides.size(), 2);
CHECK_EQ(param.paddings->size(), 4);
auto& paddings = *param.paddings;
auto type = xdnn::MAX_WITHOUT_INDEX;
if (param.pooling_type == "avg") {
if (paddings[0] == 0 && paddings[1] == 0 && paddings[2] == 0 &&
paddings[3] == 0) {
type = xdnn::AVG_WITHOUT_PAD;
} else {
type = xdnn::AVG_WITH_PAD;
}
}
int r = xdnn::pooling_forward<float, float>(
ctx.GetRawContext(), /* context */
param.x->data<float>(), /* x */
param.output->mutable_data<float>(TARGET(kXPU)), /* y */
nullptr, /* y_index */
type, /* type */
x_dims[0] * x_dims[1], /* c */
x_dims[2], /* in_h */
x_dims[3], /* in_w */
paddings[0], /* pad_left */
paddings[1], /* pad_right */
paddings[2], /* pad_up */
paddings[3], /* pad_down */
param.ksize[0], /* win_h */
param.ksize[1], /* win_w */
param.strides[0], /* stride_h */
param.strides[1], /* stride_w */
o_dims[2], /* out_h */
o_dims[3] /* out_w */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
pool2d, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::Pool2DCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
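
// The pooling-type dispatch above picks MAX_WITHOUT_INDEX unless
// pooling_type is "avg", in which case the choice between AVG_WITHOUT_PAD
// and AVG_WITH_PAD depends on whether all four paddings are zero. A minimal
// sketch of that decision (PoolType is a hypothetical helper; strings stand
// in for the xdnn enum values):
#include <iostream>
#include <string>
#include <vector>

std::string PoolType(const std::string& pooling_type,
                     const std::vector<int>& paddings) {
  if (pooling_type != "avg") return "MAX_WITHOUT_INDEX";
  for (int p : paddings) {
    if (p != 0) return "AVG_WITH_PAD";
  }
  return "AVG_WITHOUT_PAD";
}

int main() {
  std::cout << PoolType("max", {0, 0, 0, 0}) << "\n";  // MAX_WITHOUT_INDEX
  std::cout << PoolType("avg", {0, 0, 0, 0}) << "\n";  // AVG_WITHOUT_PAD
  std::cout << PoolType("avg", {1, 1, 1, 1}) << "\n";  // AVG_WITH_PAD
  return 0;
}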
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class Pool2DCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::PoolParam;
virtual void Run();
virtual ~Pool2DCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/scale_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void ScaleCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto& x_dims = param.x->dims();
int r = xdnn::scale(ctx.GetRawContext(), /* context */
x_dims.production(), /* len */
param.scale, /* alpha */
param.bias, /* beta */
param.bias_after_scale, /* bias_after_scale */
param.x->data<float>(), /* x */
param.output->mutable_data<float>(TARGET(kXPU)) /* y */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
scale, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::ScaleCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class ScaleCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ScaleParam;
virtual void Run();
virtual ~ScaleCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/slice_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void SliceCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
auto x_dims = param.X->dims();
// Use resize() rather than reserve(): Run() writes these vectors through
// operator[], which requires the elements to actually exist.
x_shape_.resize(x_dims.size());
x_dim_begin_.resize(x_dims.size());
x_dim_end_.resize(x_dims.size());
}
void SliceCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto x_dims = param.X->dims();
for (size_t i = 0; i < x_dims.size(); ++i) {
x_shape_[i] = x_dims[i];
x_dim_begin_[i] = 0;
x_dim_end_[i] = x_dims[i];
}
for (size_t i = 0; i < param.axes.size(); ++i) {
int axis = param.axes[i];
x_dim_begin_[axis] = param.starts[i];
x_dim_end_[axis] = param.ends[i];
}
int ndim = param.X->dims().size();
int r = xdnn::slice_forward(
ctx.GetRawContext(), /* context */
&x_shape_[0], /* shape */
&x_dim_begin_[0], /* starts */
&x_dim_end_[0], /* ends */
ndim, /* n */
param.X->data<float>(), /* in */
param.Out->mutable_data<float>(TARGET(kXPU)) /* out */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
slice, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::SliceCompute, def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
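// Worked example (hypothetical shapes): for an input of dims [3, 4, 5] with
// axes = [1], starts = [1], ends = [3], the kernel passes
//   shape = {3, 4, 5}, starts = {0, 1, 0}, ends = {3, 3, 5}
// to xdnn::slice_forward and produces an output of dims [3, 2, 5].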
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class SliceCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::SliceParam;
virtual void PrepareForRun();
virtual void Run();
virtual ~SliceCompute() = default;
private:
std::vector<int> x_shape_;
std::vector<int> x_dim_begin_;
std::vector<int> x_dim_end_;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/softmax_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void SoftmaxCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto& x_dims = param.x->dims();
int axis = CanonicalAxis(param.axis, x_dims.size());
int rows = SizeToAxis(axis, x_dims);
int cols = SizeFromAxis(axis, x_dims);
int r = xdnn::softmax2d_forward(
ctx.GetRawContext(), /* context */
param.x->data<float>(), /* x */
param.output->mutable_data<float>(TARGET(kXPU)), /* y */
rows, /* rows */
cols /* cols */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(softmax,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::SoftmaxCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
static inline int CanonicalAxis(const int axis, const int rank) {
if (axis < 0) {
return axis + rank;
}
return axis;
}
static inline int SizeToAxis(const int axis, const lite::DDim& dims) {
  int size = 1;
  for (int i = 0; i < axis; i++) {
    size *= dims[i];
  }
  return size;
}
static inline int SizeFromAxis(const int axis, const lite::DDim& dims) {
  int size = 1;
  for (size_t i = static_cast<size_t>(axis); i < dims.size(); i++) {
    size *= dims[i];
  }
  return size;
}
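// The helpers above flatten the input into a [rows, cols] matrix around
// `axis`. Example (hypothetical dims): for x_dims = [2, 3, 4] and
// axis = -1, CanonicalAxis yields 2, so rows = 2 * 3 = 6 and cols = 4;
// xdnn::softmax2d_forward then normalizes each of the 6 rows of length 4.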
class SoftmaxCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::SoftmaxParam;
virtual void Run();
virtual ~SoftmaxCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/stack_compute.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void StackCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
int n = param.X.size();
void* x_ptr = nullptr;
xpu_malloc(&x_ptr, n * 8 /* sizeof(__global__ float*) */);
x_ptr_guard_.reset(x_ptr);
  // resize (not reserve): Run() writes through operator[].
  x_ptr_cpu_.resize(n);
}
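// Run() gathers the device pointers of all inputs into a host-side table,
// copies that table into the device buffer allocated in PrepareForRun(),
// and passes it to xdnn::stack_forward as a single pointer-array argument.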
void StackCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int n = param.X.size();
auto x_dims = param.X[0]->dims();
int axis = param.axis;
  // stack creates a new axis, so a negative axis is canonicalized against
  // the output rank (input rank + 1) rather than the input rank.
  if (axis < 0) axis += (x_dims.size() + 1);
auto matrix = x_dims.Flatten2D(axis);
int height = matrix[0];
int width = matrix[1];
for (int i = 0; i < n; ++i) {
x_ptr_cpu_[i] = param.X[i]->data<float>();
}
xpu_memcpy(x_ptr_guard_.get(), &x_ptr_cpu_[0], n * 8, XPU_HOST_TO_DEVICE);
int r = xdnn::stack_forward(
ctx.GetRawContext(), /* context */
height, /* height */
width, /* width */
n, /* n */
x_ptr_guard_.get(), /* x_ptr */
param.Out->mutable_data<float>(TARGET(kXPU)) /* out */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
stack, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::StackCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
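// Worked example (hypothetical shapes): stacking n = 2 inputs of dims
// [3, 4] along axis = 1 flattens each input into a height x width = 3 x 4
// matrix, and the stacked output has dims [3, 2, 4].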
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
struct XPUFreeDeleter {
void operator()(void* p) const { xpu_free(p); }
};
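// RAII guard for device memory: a std::unique_ptr<void, XPUFreeDeleter>
// releases an xpu_malloc'ed buffer when it is reset or destroyed.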
class StackCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::StackParam;
virtual void PrepareForRun();
virtual void Run();
virtual ~StackCompute() = default;
private:
std::unique_ptr<void, XPUFreeDeleter> x_ptr_guard_;
std::vector<const float*> x_ptr_cpu_;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
@@ -14,10 +14,10 @@
 #pragma once
-#include <xtcl/xtcl.h>
 #include <memory>
 #include <string>
 #include <vector>
+#include "lite/backends/xpu/xpu_header_sitter.h"
 #include "lite/core/kernel.h"
 #include "lite/kernels/npu/bridges/engine.h"
 #include "lite/kernels/npu/bridges/registry.h"
...
@@ -73,6 +73,7 @@ add_operator(calib_once_op extra SRCS calib_once_op.cc DEPS ${op_DEPS})
 add_operator(reduce_max_op_lite extra SRCS reduce_max_op.cc DEPS ${op_DEPS})
 add_operator(shape_op_lite extra SRCS shape_op.cc DEPS ${op_DEPS})
 add_operator(sequence_expand_op_lite extra SRCS sequence_expand_op.cc DEPS ${op_DEPS})
+add_operator(sequence_unpad_op_lite extra SRCS sequence_unpad_op.cc DEPS ${op_DEPS})
 add_operator(im2sequence_op extra SRCS im2sequence_op.cc DEPS ${op_DEPS})
 add_operator(gather_op extra SRCS gather_op.cc DEPS ${op_DEPS})
 add_operator(anchor_generator_op extra SRCS anchor_generator_op.cc DEPS ${op_DEPS})
@@ -148,6 +149,10 @@ add_operator(elementwise_grad_op train SRCS elementwise_grad_ops.cc DEPS ${op_DEPS})
 add_operator(mul_grad_op train SRCS mul_grad_op.cc DEPS ${op_DEPS})
 add_operator(sgd_op train SRCS sgd_op.cc DEPS ${op_DEPS})
+
+# Only for XPU
+add_operator(__xpu__resnet50_op extra SRCS __xpu__resnet50_op.cc DEPS ${op_DEPS})
+add_operator(__xpu__multi_encoder_op extra SRCS __xpu__multi_encoder_op.cc DEPS ${op_DEPS})
 if (NOT LITE_WITH_X86)
   lite_cc_test(test_fc_op SRCS fc_op_test.cc
     DEPS fc_op memory
...
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/__xpu__multi_encoder_op.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool XPUMultiEncoderOp::CheckShape() const { return true; }
bool XPUMultiEncoderOp::InferShapeImpl() const {
auto input_shape = param_.input->dims();
param_.output->Resize(input_shape);
return true;
}
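// The fused encoder is shape-preserving: the output keeps the input dims
// (typically [batch, seq_len, hidden] for BERT/ERNIE-style models).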
bool XPUMultiEncoderOp::AttachImpl(const cpp::OpDesc& op_desc,
lite::Scope* scope) {
param_.input = const_cast<lite::Tensor*>(
&scope->FindVar(op_desc.Input("Input").front())->Get<lite::Tensor>());
param_.mask = const_cast<lite::Tensor*>(
&scope->FindVar(op_desc.Input("Mask").front())->Get<lite::Tensor>());
param_.fc_weight_max = const_cast<lite::Tensor*>(
&scope->FindVar(op_desc.Input("FCWeightMax").front())
->Get<lite::Tensor>());
param_.output = scope->FindVar(op_desc.Output("Output").front())
->GetMutable<lite::Tensor>();
param_.fc_weight.clear();
for (auto& name : op_desc.Input("FCWeight")) {
auto t =
const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
param_.fc_weight.push_back(t);
}
param_.fc_bias.clear();
for (auto& name : op_desc.Input("FCBias")) {
auto t =
const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
param_.fc_bias.push_back(t);
}
param_.ln_scale.clear();
for (auto& name : op_desc.Input("LNScale")) {
auto t =
const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
param_.ln_scale.push_back(t);
}
param_.ln_bias.clear();
for (auto& name : op_desc.Input("LNBias")) {
auto t =
const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
param_.ln_bias.push_back(t);
}
param_.n_layers = op_desc.GetAttr<int>("n_layers");
param_.head_num = op_desc.GetAttr<int>("head_num");
param_.size_per_head = op_desc.GetAttr<int>("size_per_head");
param_.act_type = op_desc.GetAttr<std::string>("act_type");
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(__xpu__multi_encoder,
paddle::lite::operators::XPUMultiEncoderOp);
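// FCWeight/FCBias/LNScale/LNBias are repeated inputs: each is expected to
// carry n_layers groups of per-layer parameters, in the order emitted by
// the multi-encoder fuse pass.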
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "lite/core/op_lite.h"
namespace paddle {
namespace lite {
namespace operators {
class XPUMultiEncoderOp : public OpLite {
public:
XPUMultiEncoderOp() {}
explicit XPUMultiEncoderOp(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "MultiEncoder"; }
private:
mutable XPUMultiEncoderParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/__xpu__resnet50_op.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool XPUResNet50Op::CheckShape() const { return true; }
bool XPUResNet50Op::InferShapeImpl() const {
auto input_shape = param_.input->dims();
input_shape[1] = 2048;
input_shape[2] = 1;
input_shape[3] = 1;
param_.output->Resize(input_shape);
return true;
}
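// The fused subgraph spans the ResNet50 backbone through global pooling,
// so the output is a [N, 2048, 1, 1] feature map for any input batch.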
bool XPUResNet50Op::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) {
param_.input = const_cast<lite::Tensor*>(
&scope->FindVar(op_desc.Input("Input").front())->Get<lite::Tensor>());
param_.output = scope->FindVar(op_desc.Output("Output").front())
->GetMutable<lite::Tensor>();
param_.filter.clear();
for (auto& name : op_desc.Input("Filter")) {
auto t =
const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
param_.filter.push_back(t);
}
param_.bias.clear();
for (auto& name : op_desc.Input("Bias")) {
auto t =
const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
param_.bias.push_back(t);
}
param_.max_filter.clear();
for (auto& name : op_desc.Input("MaxFilter")) {
auto t =
const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
param_.max_filter.push_back(t);
}
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(__xpu__resnet50, paddle::lite::operators::XPUResNet50Op);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "lite/core/op_lite.h"
namespace paddle {
namespace lite {
namespace operators {
class XPUResNet50Op : public OpLite {
public:
XPUResNet50Op() {}
explicit XPUResNet50Op(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "ResNet50"; }
private:
mutable XPUResNet50Param param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
@@ -1019,6 +1019,12 @@ struct SequenceExpandParam : ParamBase {
   int ref_level{-1};
 };

+struct SequenceUnpadParam : ParamBase {
+  const lite::Tensor* X{};
+  const lite::Tensor* Length{};
+  lite::Tensor* Out{};
+};
+
 struct SequenceExpandAsParam : ParamBase {
   const lite::Tensor* x{nullptr};
   const lite::Tensor* y{nullptr};
@@ -1438,6 +1444,30 @@ struct CrfDecodingParam : ParamBase {
   lite::Tensor* viterbi_path{};
 };

+struct XPUResNet50Param : ParamBase {
+  lite::Tensor* input{};
+  std::vector<lite::Tensor*> filter;
+  std::vector<lite::Tensor*> bias;
+  std::vector<lite::Tensor*> max_filter;
+  lite::Tensor* output{};
+};
+
+struct XPUMultiEncoderParam : ParamBase {
+  lite::Tensor* input{};
+  std::vector<lite::Tensor*> fc_weight;
+  std::vector<lite::Tensor*> fc_bias;
+  std::vector<lite::Tensor*> ln_scale;
+  std::vector<lite::Tensor*> ln_bias;
+  lite::Tensor* fc_weight_max{};
+  lite::Tensor* mask{};
+  lite::Tensor* output{};
+  int n_layers{};
+  int head_num{};
+  int size_per_head{};
+  std::string act_type{};
+};
+
 }  // namespace operators
 }  // namespace lite
 }  // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/sequence_unpad_op.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool SequenceUnpadOp::CheckShape() const {
CHECK_OR_FALSE(param_.X);
CHECK_OR_FALSE(param_.Length);
CHECK_OR_FALSE(param_.Out);
auto x_dims = param_.X->dims();
auto len_dims = param_.Length->dims();
CHECK(x_dims.size() >= 2) << "Rank of X can't be less than 2";
CHECK(len_dims.size() == 1) << "Rank of Length should be 1";
CHECK(x_dims[0] == len_dims[0])
<< "X and Length should have the same 1st dim";
return true;
}
bool SequenceUnpadOp::InferShapeImpl() const {
auto x_dims = param_.X->dims();
auto len_dims = param_.Length->dims();
auto *seq_len_ptr = param_.Length->data<int64_t>();
int64_t batch_size = len_dims[0];
std::vector<uint64_t> out_lod0(batch_size + 1, 0);
for (int64_t i = 0; i < batch_size; ++i) {
out_lod0[i + 1] = out_lod0[i] + seq_len_ptr[i];
}
paddle::lite::LoD out_lod;
out_lod.push_back(out_lod0);
int64_t out_dim0 = out_lod0.back();
std::vector<int64_t> out_dims{out_dim0};
if (x_dims.size() == 2) {
out_dims.push_back(1);
} else {
for (size_t i = 2; i < x_dims.size(); ++i) {
out_dims.push_back(x_dims[i]);
}
}
param_.Out->Resize(out_dims);
param_.Out->set_lod(out_lod);
return true;
}
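// Worked example (hypothetical values): for X with dims [3, 5] and
// Length = {2, 3, 4}, out_lod0 becomes {0, 2, 5, 9}, so Out is resized to
// [9, 1] with LoD {{0, 2, 5, 9}}.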
bool SequenceUnpadOp::AttachImpl(const cpp::OpDesc &opdesc,
lite::Scope *scope) {
param_.X = const_cast<lite::Tensor *>(
&scope->FindVar(opdesc.Input("X").front())->Get<lite::Tensor>());
param_.Length = const_cast<lite::Tensor *>(
&scope->FindVar(opdesc.Input("Length").front())->Get<lite::Tensor>());
param_.Out =
scope->FindVar(opdesc.Output("Out").front())->GetMutable<lite::Tensor>();
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(sequence_unpad, paddle::lite::operators::SequenceUnpadOp);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
class SequenceUnpadOp : public OpLite {
public:
SequenceUnpadOp() {}
explicit SequenceUnpadOp(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "sequence_unpad"; }
private:
mutable SequenceUnpadParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
@@ -47,6 +47,7 @@ bool StackOp::InferShapeImpl() const {
 bool StackOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
   auto X = op_desc.Input("X");
   auto Out = op_desc.Output("Y").front();
+  param_.X.clear();
   for (auto var : X) {
     param_.X.emplace_back(scope->FindVar(var)->GetMutable<lite::Tensor>());
   }
...
add_subdirectory(kernels)
add_subdirectory(math)
add_subdirectory(cv)
add_subdirectory(api)
if(LITE_WITH_XPU)
lite_cc_test(test_resnet50_lite_xpu SRCS test_resnet50_lite_xpu.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
lite_cc_test(test_ernie_lite_xpu SRCS test_ernie_lite_xpu.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
lite_cc_test(test_bert_lite_xpu SRCS test_bert_lite_xpu.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
endif()
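# The resulting binaries take the shared test flags; a typical invocation
# (hypothetical paths) is:
#   ./test_resnet50_lite_xpu --model_dir=/path/to/resnet50 --warmup=1 --repeats=10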
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "lite/api/lite_api_test_helper.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
template <typename T>
lite::Tensor GetTensorWithShape(std::vector<int64_t> shape) {
lite::Tensor ret;
ret.Resize(shape);
T* ptr = ret.mutable_data<T>();
for (int i = 0; i < ret.numel(); ++i) {
ptr[i] = (T)1;
}
return ret;
}
TEST(Ernie, test_ernie_lite_xpu) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
config.set_xpu_workspace_l3_size_per_thread();
auto predictor = lite_api::CreatePaddlePredictor(config);
int64_t batch_size = 1;
int64_t seq_len = 64;
Tensor sample_input = GetTensorWithShape<int64_t>({batch_size, seq_len, 1});
std::vector<int64_t> input_shape{batch_size, seq_len, 1};
predictor->GetInput(0)->Resize(input_shape);
predictor->GetInput(1)->Resize(input_shape);
predictor->GetInput(2)->Resize(input_shape);
predictor->GetInput(3)->Resize(input_shape);
memcpy(predictor->GetInput(0)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(1)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(2)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(3)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor->Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor->Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
std::vector<std::vector<float>> results;
results.emplace_back(std::vector<float>({0.278893, 0.330888, 0.39022}));
auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2);
ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 3);
for (size_t i = 0; i < results.size(); ++i) {
for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(
out->data<float>()[j + (out->shape()[1] * i)], results[i][j], 1e-5);
}
}
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "lite/api/lite_api_test_helper.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
template <typename T>
lite::Tensor GetTensorWithShape(std::vector<int64_t> shape) {
lite::Tensor ret;
ret.Resize(shape);
T* ptr = ret.mutable_data<T>();
for (int i = 0; i < ret.numel(); ++i) {
ptr[i] = (T)1;
}
return ret;
}
TEST(Bert, test_bert_lite_xpu) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
config.set_xpu_workspace_l3_size_per_thread();
auto predictor = lite_api::CreatePaddlePredictor(config);
int64_t batch_size = 1;
int64_t seq_len = 64;
Tensor sample_input = GetTensorWithShape<int64_t>({batch_size, seq_len, 1});
std::vector<int64_t> input_shape{batch_size, seq_len, 1};
predictor->GetInput(0)->Resize(input_shape);
predictor->GetInput(1)->Resize(input_shape);
predictor->GetInput(2)->Resize(input_shape);
predictor->GetInput(3)->Resize(input_shape);
memcpy(predictor->GetInput(0)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(1)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(2)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(3)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor->Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor->Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
std::vector<std::vector<float>> results;
results.emplace_back(std::vector<float>({0.108398}));
auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2);
ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1);
for (size_t i = 0; i < results.size(); ++i) {
for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(
out->data<float>()[j + (out->shape()[1] * i)], results[i][j], 1e-5);
}
}
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "lite/api/lite_api_test_helper.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
TEST(Resnet50, test_resnet50_lite_xpu) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
config.set_xpu_workspace_l3_size_per_thread();
auto predictor = lite_api::CreatePaddlePredictor(config);
auto input_tensor = predictor->GetInput(0);
std::vector<int64_t> input_shape{1, 3, 224, 224};
input_tensor->Resize(input_shape);
auto* data = input_tensor->mutable_data<float>();
int input_num = 1;
for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i];
}
for (int i = 0; i < input_num; i++) {
data[i] = 1;
}
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor->Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor->Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
std::vector<std::vector<float>> results;
results.emplace_back(std::vector<float>(
{0.000268651, 0.000174053, 0.000213181, 0.000396771, 0.000591516,
0.00018169, 0.000289721, 0.000855934, 0.000732185, 9.2055e-05,
0.000220664, 0.00235289, 0.00571265, 0.00357688, 0.00129667,
0.000465392, 0.000143775, 0.000211628, 0.000617144, 0.000265033}));
auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2);
ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1000);
int step = 50;
for (size_t i = 0; i < results.size(); ++i) {
for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)],
results[i][j],
1e-5);
}
}
}
} // namespace lite
} // namespace paddle
@@ -25,6 +25,7 @@ SHUTDOWN_LOG=ON
 BUILD_NPU=OFF
 NPU_DDK_ROOT="$(pwd)/ai_ddk_lib/" # Download HiAI DDK from https://developer.huawei.com/consumer/cn/hiai/
 BUILD_XPU=OFF
+BUILD_XTCL=OFF
 XPU_SDK_ROOT="$(pwd)/xpu_sdk_lib/"
 LITE_WITH_ARM_LANG=OFF
@@ -138,6 +139,7 @@ function make_tiny_publish_so {
       -DLITE_WITH_NPU=$BUILD_NPU \
       -DNPU_DDK_ROOT=$NPU_DDK_ROOT \
       -DLITE_WITH_XPU=$BUILD_XPU \
+      -DLITE_WITH_XTCL=$BUILD_XTCL \
       -DXPU_SDK_ROOT=$XPU_SDK_ROOT \
       -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
@@ -226,6 +228,7 @@ function make_full_publish_so {
       -DLITE_WITH_NPU=$BUILD_NPU \
       -DNPU_DDK_ROOT=$NPU_DDK_ROOT \
       -DLITE_WITH_XPU=$BUILD_XPU \
+      -DLITE_WITH_XTCL=$BUILD_XTCL \
       -DXPU_SDK_ROOT=$XPU_SDK_ROOT \
       -DLITE_WITH_TRAIN=$BUILD_TRAIN \
       -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
@@ -260,6 +263,7 @@ function make_all_tests {
       -DLITE_WITH_NPU=$BUILD_NPU \
       -DNPU_DDK_ROOT=$NPU_DDK_ROOT \
       -DLITE_WITH_XPU=$BUILD_XPU \
+      -DLITE_WITH_XTCL=$BUILD_XTCL \
       -DXPU_SDK_ROOT=$XPU_SDK_ROOT \
       -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
@@ -330,7 +334,10 @@ function make_cuda {
       -DWITH_TESTING=OFF \
       -DLITE_WITH_ARM=OFF \
       -DLITE_WITH_PYTHON=${BUILD_PYTHON} \
-      -DLITE_BUILD_EXTRA=ON
+      -DLITE_BUILD_EXTRA=ON \
+      -DLITE_WITH_XPU=$BUILD_XPU \
+      -DLITE_WITH_XTCL=$BUILD_XTCL \
+      -DXPU_SDK_ROOT=$XPU_SDK_ROOT
   make publish_inference -j$NUM_PROC
   cd -
@@ -362,9 +369,10 @@ function make_x86 {
       -DWITH_GPU=OFF \
       -DLITE_WITH_PYTHON=${BUILD_PYTHON} \
       -DLITE_BUILD_EXTRA=ON \
-      -DCMAKE_BUILD_TYPE=Release \
-      -DLITE_WITH_XPU=$BUID_XPU \
-      -DXPU_SDK_ROOT=$XPU_SDK_ROOT
+      -DLITE_WITH_XPU=$BUILD_XPU \
+      -DLITE_WITH_XTCL=$BUILD_XTCL \
+      -DXPU_SDK_ROOT=$XPU_SDK_ROOT \
+      -DCMAKE_BUILD_TYPE=Release
   make publish_inference -j$NUM_PROC
   cd -
@@ -483,6 +491,10 @@ function main {
             BUILD_XPU="${i#*=}"
             shift
             ;;
+        --build_xtcl=*)
+            BUILD_XTCL="${i#*=}"
+            shift
+            ;;
         --xpu_sdk_root=*)
             XPU_SDK_ROOT="${i#*=}"
             shift
...
#!/bin/bash
set -ex
# global variables with default value
XPU_SDK_ROOT="$(pwd)/../XPU_SDK" # XPU SDK
TARGET_NAME="test_subgraph_pass" # default target
BUILD_EXTRA=ON # ON(with sequence ops)/OFF
WITH_TESTING=ON # ON/OFF
function print_usage {
echo -e "\nUSAGE:"
echo
echo "----------------------------------------"
echo -e "--xpu_sdk_root=<xpu sdk directory>"
echo -e "--target_name=<target name>"
echo "----------------------------------------"
echo
}
# readonly variables with default value
readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \
-DWITH_PYTHON=OFF \
-DLITE_WITH_ARM=OFF"
readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THREADS:-1}
readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
readonly workspace=$(pwd)
function prepare_thirdparty {
if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then
rm -rf $workspace/third-party
if [ ! -f $workspace/third-party-05b862.tar.gz ]; then
wget $THIRDPARTY_TAR
fi
tar xzf third-party-05b862.tar.gz
else
git submodule update --init --recursive
fi
}
# for code gen, a source file is generated after a test, but is dependended by some targets in cmake.
# here we fake an empty file to make cmake works.
function prepare_workspace {
# in build directory
# 1. Prepare gen_code file
GEN_CODE_PATH_PREFIX=lite/gen_code
mkdir -p ./${GEN_CODE_PATH_PREFIX}
touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
# 2.Prepare debug tool
DEBUG_TOOL_PATH_PREFIX=lite/tools/debug
mkdir -p ./${DEBUG_TOOL_PATH_PREFIX}
cp ../${DEBUG_TOOL_PATH_PREFIX}/analysis_tool.py ./${DEBUG_TOOL_PATH_PREFIX}/
# clone submodule
# git submodule update --init --recursive
prepare_thirdparty
}
function build_xpu {
build_dir=${workspace}/build.lite.xpu
mkdir -p $build_dir
cd $build_dir
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib"
prepare_workspace
cmake .. \
${CMAKE_COMMON_OPTIONS} \
-DWITH_GPU=OFF \
-DWITH_MKLDNN=OFF \
-DLITE_WITH_X86=ON \
-DWITH_MKL=ON \
-DLITE_WITH_XPU=ON \
-DLITE_BUILD_EXTRA=${BUILD_EXTRA} \
-DWITH_TESTING=${WITH_TESTING} \
-DXPU_SDK_ROOT=${XPU_SDK_ROOT}
make $TARGET_NAME -j$NUM_CORES_FOR_COMPILE
cd -
echo "Done"
}
function main {
# Parse command line.
for i in "$@"; do
case $i in
--target_name=*)
TARGET_NAME="${i#*=}"
shift
;;
--build_extra=*)
BUILD_EXTRA="${i#*=}"
shift
;;
--xpu_sdk_root=*)
XPU_SDK_ROOT="${i#*=}"
shift
;;
build)
build_xpu
shift
;;
full_publish)
TARGET_NAME=publish_inference
build_xpu
shift
;;
*)
# unknown option
print_usage
exit 1
;;
esac
done
}
main $@
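# Example invocations (script and SDK paths are hypothetical):
#   ./build_xpu.sh --xpu_sdk_root=../XPU_SDK --target_name=test_subgraph_pass build
#   ./build_xpu.sh --xpu_sdk_root=../XPU_SDK full_publish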