Commit bc8a8042 authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into all_new_design_exec

@@ -69,15 +69,21 @@ if(NOT DEFINED CBLAS_PROVIDER)
     PATHS ${OPENBLAS_LIB_SEARCH_PATHS})
   if(OPENBLAS_LAPACKE_INC_DIR AND OPENBLAS_INC_DIR AND OPENBLAS_LIB)
-    set(CBLAS_PROVIDER OPENBLAS)
-    set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR})
-    set(CBLAS_LIBRARIES ${OPENBLAS_LIB})
-    add_definitions(-DPADDLE_USE_OPENBLAS)
-    add_definitions(-DLAPACK_FOUND)
-    message(STATUS "Found OpenBLAS (include: ${OPENBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
-    message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})")
+    file(READ "${OPENBLAS_INC_DIR}/openblas_config.h" config_file)
+    string(REGEX MATCH "OpenBLAS ([0-9]+\.[0-9]+\.[0-9]+)" tmp ${config_file})
+    string(REGEX MATCH "([0-9]+\.[0-9]+\.[0-9]+)" ver ${tmp})
+    if (${ver} VERSION_EQUAL "0.3.7")
+      set(CBLAS_PROVIDER OPENBLAS)
+      set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR})
+      set(CBLAS_LIBRARIES ${OPENBLAS_LIB})
+      add_definitions(-DPADDLE_USE_OPENBLAS)
+      add_definitions(-DLAPACK_FOUND)
+      message(STATUS "Found OpenBLAS (include: ${OPENBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+      message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})")
+    endif()
   endif()
 endif()
...
@@ -20,7 +20,7 @@ SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn)
 SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
 SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
 SET(MKLDNN_REPOSITORY ${GIT_URL}/oneapi-src/oneDNN.git)
-SET(MKLDNN_TAG f3999b71d8e4415c1985a0dfb812a3ed77ee21fa)
+SET(MKLDNN_TAG 748528a2d3204b5f401c14a9aacdec16accd5ead)
 # Introduce variables:
...
@@ -7,52 +7,70 @@ SET(XPU_PROJECT "extern_xpu")
 SET(XPU_API_LIB_NAME "libxpuapi.so")
 SET(XPU_RT_LIB_NAME "libxpurt.so")
-if(NOT XPU_SDK_ROOT)
-  if (WITH_AARCH64)
-    SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/aarch64/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE)
-  elseif(WITH_SUNWAY)
-    SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE)
-  else()
-    SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_05_19.tar.gz" CACHE STRING "" FORCE)
-  endif()
-
-  SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
-  SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
-  SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
-  SET(XPU_API_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include")
-  SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib")
-
-  SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}")
-  SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}")
-
-  SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib")
-
-  FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt
-    "PROJECT(XPU)\n"
-    "cmake_minimum_required(VERSION 3.0)\n"
-    "install(DIRECTORY xpu/include xpu/lib \n"
-    "        DESTINATION ${XPU_INSTALL_DIR})\n")
-
-  ExternalProject_Add(
-      ${XPU_PROJECT}
-      ${EXTERNAL_PROJECT_LOG_ARGS}
-      PREFIX            ${XPU_SOURCE_DIR}
-      DOWNLOAD_DIR      ${XPU_DOWNLOAD_DIR}
-      DOWNLOAD_COMMAND  wget --no-check-certificate ${XPU_URL} -c -q -O xpu.tar.gz
-                        && tar xvf xpu.tar.gz
-      DOWNLOAD_NO_PROGRESS  1
-      UPDATE_COMMAND    ""
-      CMAKE_ARGS        -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT}
-      CMAKE_CACHE_ARGS  -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT}
-  )
-else()
-  SET(XPU_API_INC_DIR "${XPU_SDK_ROOT}/XTDK/include/")
-  SET(XPU_API_LIB "${XPU_SDK_ROOT}/XTDK/shlib/libxpuapi.so")
-  SET(XPU_RT_LIB "${XPU_SDK_ROOT}/XTDK/runtime/shlib/libxpurt.so")
-  SET(XPU_LIB_DIR "${XPU_SDK_ROOT}/XTDK/shlib/")
-endif()
+IF(WITH_AARCH64)
+  SET(XPU_XRE_DIR_NAME "xre-kylin_aarch64")
+  SET(XPU_XDNN_DIR_NAME "xdnn-kylin_aarch64")
+  SET(XPU_XCCL_DIR_NAME "xccl-kylin_aarch64")
+ELSEIF(WITH_SUNWAY)
+  SET(XPU_XRE_DIR_NAME "xre-deepin_sw6_64")
+  SET(XPU_XDNN_DIR_NAME "xdnn-deepin_sw6_64")
+  SET(XPU_XCCL_DIR_NAME "xccl-deepin_sw6_64")
+ELSEIF(WITH_BDCENTOS)
+  SET(XPU_XRE_DIR_NAME "xre-bdcentos_x86_64")
+  SET(XPU_XDNN_DIR_NAME "xdnn-bdcentos_x86_64")
+  SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
+ELSEIF(WITH_UBUNTU)
+  SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64")
+  SET(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64")
+  SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
+ELSEIF(WITH_CENTOS)
+  SET(XPU_XRE_DIR_NAME "xre-centos7_x86_64")
+  SET(XPU_XDNN_DIR_NAME "xdnn-centos7_x86_64")
+  SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
+ELSE ()
+  SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64")
+  SET(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64")
+  SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
+ENDIF()
+
+SET(XPU_BASE_URL "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev/20210527")
+SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
+SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
+SET(XPU_XCCL_URL "${XPU_BASE_URL}/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
+SET(XPU_PACK_DEPENCE_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/pack_paddle_depence.sh" CACHE STRING "" FORCE)
+
+SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
+SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
+SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
+SET(XPU_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include")
+SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib")
+
+SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}")
+SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}")
+
+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib")
+
+FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt
+  "PROJECT(XPU)\n"
+  "cmake_minimum_required(VERSION 3.0)\n"
+  "install(DIRECTORY xpu/include xpu/lib \n"
+  "        DESTINATION ${XPU_INSTALL_DIR})\n")
+
+ExternalProject_Add(
+    ${XPU_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX            ${XPU_SOURCE_DIR}
+    DOWNLOAD_DIR      ${XPU_DOWNLOAD_DIR}
+    DOWNLOAD_COMMAND  wget ${XPU_PACK_DEPENCE_URL}
+                      && bash pack_paddle_depence.sh ${XPU_XRE_URL} ${XPU_XRE_DIR_NAME} ${XPU_XDNN_URL} ${XPU_XDNN_DIR_NAME} ${XPU_XCCL_URL} ${XPU_XCCL_DIR_NAME}
+    DOWNLOAD_NO_PROGRESS  1
+    UPDATE_COMMAND    ""
+    CMAKE_ARGS        -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT}
+    CMAKE_CACHE_ARGS  -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT}
+)
-INCLUDE_DIRECTORIES(${XPU_API_INC_DIR})
+INCLUDE_DIRECTORIES(${XPU_INC_DIR})
 ADD_LIBRARY(shared_xpuapi SHARED IMPORTED GLOBAL)
 set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}")
@@ -62,7 +80,7 @@ generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake")
 TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB})
-if (WITH_XPU_BKCL)
+IF(WITH_XPU_BKCL)
   MESSAGE(STATUS "Compile with XPU BKCL!")
   ADD_DEFINITIONS(-DPADDLE_WITH_XPU_BKCL)
@@ -71,9 +89,9 @@ if (WITH_XPU_BKCL)
   SET(XPU_BKCL_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include")
   INCLUDE_DIRECTORIES(${XPU_BKCL_INC_DIR})
   TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_BKCL_LIB})
-else(WITH_XPU_BKCL)
-  TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} )
-endif(WITH_XPU_BKCL)
+ELSE(WITH_XPU_BKCL)
+  TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB})
+ENDIF(WITH_XPU_BKCL)

 if(NOT XPU_SDK_ROOT)
   ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
...
@@ -215,6 +215,8 @@ list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boos
 list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool)

 include(cblas)      # find first, then download, build, install openblas
+
+message(STATUS "CBLAS_PROVIDER: ${CBLAS_PROVIDER}")
 if(${CBLAS_PROVIDER} STREQUAL MKLML)
   list(APPEND third_party_deps extern_mklml)
 elseif(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS)
...
@@ -19,6 +19,7 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/imperative/type_defs.h"
 #include "paddle/fluid/platform/place.h"

 namespace paddle {
@@ -30,9 +31,28 @@ void CheckVarHasNanOrInf(const std::string& op_type,
                          const std::string& var_name,
                          const platform::Place& place);

+void CheckVarHasNanOrInf(const std::string& op_type,
+                         const std::string& var_name,
+                         const framework::Variable* var,
+                         const platform::Place& place);
+
 void CheckOpHasNanOrInf(const framework::OperatorBase& op,
                         const framework::Scope& scope,
                         const platform::Place& place);

+template <typename VarType>
+void CheckOpHasNanOrInfInDygraph(const std::string& op_type,
+                                 const imperative::NameVarMap<VarType>& op_outs,
+                                 platform::Place place) {
+  for (const auto& pair : op_outs) {
+    for (const auto& ivar : pair.second) {
+      auto* var = ivar->MutableVar();
+      if (var == nullptr) continue;
+      CheckVarHasNanOrInf(op_type, ivar->Name(), var, place);
+    }
+  }
+}
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
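Note that the dygraph helper above only walks the op's output map and forwards each live variable to the pointer-based CheckVarHasNanOrInf overload; the element-wise scan happens later in tensor_check. A minimal standalone sketch of what that scan ultimately tests, in plain C++ (illustrative only, not a Paddle API):

#include <cmath>
#include <cstddef>

// True if any element is NaN or Inf -- the condition that makes the
// checker abort the run and report the offending op/variable.
bool HasNanOrInf(const float* data, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    if (std::isnan(data[i]) || std::isinf(data[i])) return true;
  }
  return false;
}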
@@ -297,13 +297,12 @@ void tensor_check<platform::CPUDeviceContext>(const std::string& op_type,
 }

 void CheckVarHasNanOrInf(const std::string& op_type,
-                         const framework::Scope& scope,
                          const std::string& var_name,
+                         const framework::Variable* var,
                          const platform::Place& place) {
-  auto* var = scope.FindVar(var_name);
   PADDLE_ENFORCE_NOT_NULL(
-      var, platform::errors::NotFound("In op=%s, can't find var:%s", op_type,
-                                      var_name));
+      var, platform::errors::NotFound("Cannot find var: `%s` in op `%s`.",
+                                      var_name, op_type));

   const Tensor* tensor{nullptr};
   if (var->IsType<framework::LoDTensor>()) {
@@ -393,6 +392,14 @@ void CheckVarHasNanOrInf(const std::string& op_type,
   tensor_check<platform::CPUDeviceContext>(op_type, var_name, *tensor, place);
 }

+void CheckVarHasNanOrInf(const std::string& op_type,
+                         const framework::Scope& scope,
+                         const std::string& var_name,
+                         const platform::Place& place) {
+  auto* var = scope.FindVar(var_name);
+  CheckVarHasNanOrInf(op_type, var_name, var, place);
+}
+
 bool IsSkipOp(const framework::OperatorBase& op) {
   if (op_type_nan_inf_white_list().count(op.Type()) != 0) return true;
...
@@ -176,6 +176,7 @@ message DistributedStrategy {
   optional bool find_unused_parameters = 28 [ default = false ];
   optional bool tensor_parallel = 29 [ default = false ];
   optional bool without_graph_optimization = 30 [ default = false ];
+  optional int32 fuse_grad_size_in_num = 31 [ default = 1 ];

   optional RecomputeConfig recompute_configs = 101;
   optional AMPConfig amp_configs = 102;
...
@@ -14,7 +14,6 @@
 #include "paddle/fluid/framework/ir/fc_fuse_pass.h"

 #include <string>

-#include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"

@@ -38,6 +37,7 @@ FCFusePass::FCFusePass() {
       .IsNumGE(1)
       .End()
       .AddAttr("y_num_col_dims")
+      .IsNumEQ(1)
       .End();

   AddOpCompat(OpCompat("elementwise_add"))
@@ -51,6 +51,7 @@ FCFusePass::FCFusePass() {
       .IsTensor()
       .End()
       .AddAttr("axis")
+      .IsNumGE(1)
       .End();

   AddOpCompat(OpCompat("relu"))
...
@@ -58,12 +58,12 @@ TEST(FCFusePass, basic) {
   auto* weights_0 = layers.data("weights_0", {}, true);
   auto* mul_out_0 = layers.mul(relu_out_0, weights_0);
   auto* bias_1 = layers.data("bias_1", {}, true);
-  auto* add_out_0 = layers.elementwise_add(mul_out_0, bias_1);
+  auto* add_out_0 = layers.elementwise_add(mul_out_0, bias_1, nullptr, 1);
   auto* relu_out_1 = layers.relu(add_out_0);
   auto* weights_1 = layers.data("weights_1", {}, true);
   auto* mul_out_1 = layers.mul(relu_out_1, weights_1);
   auto* bias_2 = layers.data("bias_2", {}, true);
-  auto* add_out_1 = layers.elementwise_add(mul_out_1, bias_2);
+  auto* add_out_1 = layers.elementwise_add(mul_out_1, bias_2, nullptr, 1);
   VLOG(4) << add_out_1;

   std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program()));
...
@@ -250,6 +250,32 @@ OpCompat& OpCompatSensiblePass::AddOpCompat(OpCompat&& op_compat) {
   return *(op_compat_judgers_[name]);
 }

+//! Tell the Op compatibility of a subgraph.
+bool OpCompatSensiblePass::IsCompat(
+    const GraphPatternDetector::subgraph_t& subgraph, Graph*) const {
+  PADDLE_ENFORCE_EQ(op_compat_judgers_.empty(), false,
+                    platform::errors::InvalidArgument(
+                        "At least one OpCompat instance should be added"));
+  // Check that all the ops in the subgraph are contained in the
+  // op_compat.
+  for (auto& node_pair : subgraph) {
+    if (!node_pair.second->IsOp()) continue;
+    auto op_type = node_pair.second->Op()->Type();
+    if (!op_compat_judgers_.count(op_type)) {
+      if (HasOpDef(op_type)) {
+        LOG(WARNING) << op_type << " compat not registered!";
+        return false;
+      }
+      continue;
+    }
+    auto& judger = *op_compat_judgers_.at(op_type);
+    if (!judger.Judge(*(node_pair.second->Op()))) {
+      return false;
+    }
+  }
+  return true;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
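For context, a hedged sketch of how a fuse pass typically consumes this method from its pattern-detector handler (the call-site shape below is illustrative; see fc_fuse_pass.cc in this repo for a real one):

// Inside a pass's ApplyImpl: bail out of a matched subgraph early when
// any op in it fails the registered OpCompat constraints.
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                   Graph* g) {
  if (!IsCompat(subgraph, g)) {
    LOG(WARNING) << "op compat check failed, skip this fusion.";
    return;  // leave the graph unchanged for this match
  }
  // ... rewrite the matched subgraph into the fused op ...
};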
@@ -195,26 +195,7 @@ class OpCompatSensiblePass : public Pass {
   //! Tell the Op compatibility of a subgraph.
   bool IsCompat(const GraphPatternDetector::subgraph_t& subgraph,
-                Graph* g) const {
-    CHECK(!op_compat_judgers_.empty())
-        << "At least one OpCompat instance should be added in the "
-           "OpCompatSensiblePass.";
-    // Check the all the ops in the subgraph are contained in the
-    // op_compat.
-    for (auto& node_pair : subgraph) {
-      if (!node_pair.second->IsOp()) continue;
-      auto op_type = node_pair.second->Op()->Type();
-      if (!op_compat_judgers_.count(op_type)) {
-        LOG(WARNING) << op_type << "compat not registered!";
-        return false;
-      }
-      auto& judger = *op_compat_judgers_.at(op_type);
-      if (!judger.Judge(*(node_pair.second->Op()))) {
-        return false;
-      }
-    }
-    return true;
-  }
+                Graph* g) const;

   //! Tell the op compatibility of a single Op.
   bool IsCompat(const OpDesc& op_desc) const {
...
@@ -151,6 +151,10 @@ class OpCompatSensiblePassTest : public OpCompatSensiblePass {
  public:
   OpCompatSensiblePassTest();
   bool TestIsCompat(const OpDesc& op_desc) { return IsCompat(op_desc); }
+  bool TestIsCompat(const GraphPatternDetector::subgraph_t& subgraph,
+                    Graph* g) {
+    return IsCompat(subgraph, g);
+  }
 };

 OpCompatSensiblePassTest::OpCompatSensiblePassTest() {
@@ -192,6 +196,23 @@ TEST(OpCompatSensiblePass, IsCompat) {
   EXPECT_TRUE(test.TestIsCompat(fc_op));
 }

+TEST(OpCompatSensiblePass, IsCompatFail) {
+  OpCompatSensiblePassTest test;
+  GraphPatternDetector::subgraph_t subgraph;
+  PDPattern pattern;
+  PDNode* pd_node = pattern.NewNode();
+  ProgramDesc prog;
+  Graph g(prog);
+  OpDesc fc_op;
+  fc_op.SetType("op1");
+  subgraph[pd_node] = g.CreateOpNode(&fc_op);
+  EXPECT_TRUE(test.TestIsCompat(subgraph, &g));
+  fc_op.SetType("mul");
+  subgraph[pd_node] = g.CreateOpNode(&fc_op);
+  EXPECT_FALSE(test.TestIsCompat(subgraph, &g));
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
@@ -194,14 +194,18 @@ struct Layers {
   }

   VarDesc* mul(VarDesc* x, VarDesc* y, VarDesc* out = nullptr,
-               int x_num_col_dims = 1) {
+               int x_num_col_dims = 1, int y_num_col_dims = 1) {
     AttributeMap attrs;
-    attrs["x_num_col_dims"] = 1;
+    attrs["x_num_col_dims"] = x_num_col_dims;
+    attrs["y_num_col_dims"] = y_num_col_dims;
     return binary_op("mul", x, y, out, &attrs);
   }

-  VarDesc* elementwise_add(VarDesc* x, VarDesc* y, VarDesc* out = nullptr) {
-    return binary_op("elementwise_add", x, y, out);
+  VarDesc* elementwise_add(VarDesc* x, VarDesc* y, VarDesc* out = nullptr,
+                           int axis = -1) {
+    AttributeMap attrs;
+    attrs["axis"] = axis;
+    return binary_op("elementwise_add", x, y, out, &attrs);
   }

   VarDesc* elementwise_mul(VarDesc* x, VarDesc* y, VarDesc* out = nullptr,
...
@@ -66,9 +66,13 @@ static bool IsFCWithPaddingWeights(Node* n) {
 }

 static bool IsParamOfFC(Node* n, const std::string& param_name) {
-  if (IsInputOfFC(n) && n->inputs.empty() &&
-      (n->Name() == n->outputs[0]->Op()->Input(param_name)[0])) {
-    return true;
+  if (IsInputOfFC(n) && n->inputs.empty()) {
+    for (auto* out : n->outputs) {
+      if (out->Op()->Type() == "fc" &&
+          n->Name() == out->Op()->Input(param_name)[0]) {
+        return true;
+      }
+    }
   }
   return false;
 }
...
@@ -276,7 +276,7 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor) {
   SerializeToStream(os, tensor, *dev_ctx);
 }

-void DeserializeFromStream(std::ifstream &os, LoDTensor *tensor) {
+void DeserializeFromStream(std::istream &os, LoDTensor *tensor) {
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   const platform::DeviceContext *dev_ctx;
   dev_ctx = pool.Get(platform::CPUPlace());
...
@@ -257,7 +257,7 @@ LoD ConvertToOffsetBasedLoD(const LoD& length_lod);

 void SerializeToStream(std::ostream& os, const LoDTensor& tensor);

-void DeserializeFromStream(std::ifstream& os, LoDTensor* tensor);
+void DeserializeFromStream(std::istream& os, LoDTensor* tensor);

 }  // namespace framework
 }  // namespace paddle
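Relaxing the parameter from std::ifstream to std::istream (here and in the SelectedRows overloads below) means deserialization no longer requires a file on disk; any input stream works. A sketch of the in-memory round trip this enables, assuming a CPU LoDTensor inside a Paddle build:

#include <sstream>
#include "paddle/fluid/framework/lod_tensor.h"

void RoundTrip(const paddle::framework::LoDTensor& src,
               paddle::framework::LoDTensor* dst) {
  std::stringstream buffer;  // in-memory stream; no file needed
  paddle::framework::SerializeToStream(buffer, src);
  paddle::framework::DeserializeFromStream(buffer, dst);  // now accepted
}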
@@ -68,5 +68,9 @@ const proto::OpDef& GetOpDef(const std::string& op_name) {
   }
   return ops_definition.at(op_name);
 }

+bool HasOpDef(const std::string& op_name) {
+  return op_def_map.find(op_name) != op_def_map.end();
+}
+
 }  // namespace framework
 }  // namespace paddle
@@ -19,5 +19,7 @@

 namespace paddle {
 namespace framework {
 const proto::OpDef& GetOpDef(const std::string& op_name);
+
+bool HasOpDef(const std::string& op_name);
 }
 }
@@ -121,7 +121,7 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows) {
   SerializeToStream(os, selected_rows, *dev_ctx);
 }

-void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows) {
+void DeserializeFromStream(std::istream& os, SelectedRows* selected_rows) {
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   const platform::DeviceContext* dev_ctx;
   dev_ctx = pool.Get(platform::CPUPlace());
...
@@ -175,7 +175,7 @@ void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows,

 void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows);

-void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows);
+void DeserializeFromStream(std::istream& os, SelectedRows* selected_rows);

 }  // namespace framework
 }  // namespace paddle
 cc_library(imperative_flag SRCS flags.cc DEPS gflags)
-cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform)
+cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils)
 cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
 add_subdirectory(jit)
 cc_library(amp SRCS amp_auto_cast.cc DEPS layer )
...
@@ -15,8 +15,11 @@
 #include "paddle/fluid/imperative/prepared_operator.h"

 #include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/imperative/infer_shape_context.h"

+DECLARE_bool(check_nan_inf);
+
 namespace paddle {
 namespace imperative {

@@ -175,6 +178,11 @@ static void PreparedOpRunImpl(
   func(DygraphExecutionContext<VarType>(op, scope, *dev_ctx, ctx, ins, outs,
                                         attrs));

+  if (FLAGS_check_nan_inf) {
+    framework::details::CheckOpHasNanOrInfInDygraph<VarType>(
+        op.Type(), outs, dev_ctx->GetPlace());
+  }
+
   /**
    * [ Why need handle complex gradient to real gradient? ]
    *
...
@@ -17,6 +17,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/imperative/layer.h"
+#include "paddle/fluid/imperative/prepared_operator.h"
 #include "paddle/fluid/imperative/tracer.h"

 #include "paddle/fluid/framework/op_registry.h"
@@ -32,7 +33,17 @@ bool RequiredGrad(const NameVarBaseMap& ins, const NameVarBaseMap& outs) {
   for (const auto& name_pair : ins) {
     for (const auto& var_base : name_pair.second) {
       if (!var_base->OverridedStopGradient()) {
-        PassStopGradient(outs, var_base->OverridedStopGradient());
+        for (const auto& pair : outs) {
+          for (const auto& var : pair.second) {
+            if (var) {
+              var->SetOverridedStopGradient(false);
+              SetForwardDataTypeOfGradVar(var);
+              VLOG(3) << "Set output: " << var->Name()
+                      << "'s OverridedStopGradient as "
+                      << var->OverridedStopGradient();
+            }
+          }
+        }
         return true;
       }
     }
   }
@@ -78,28 +89,36 @@ py::object PyLayerApply(const platform::Place& place, const py::handle& cls,
   // process args, `input_vars` only collects `imperative::VarBase`
   if (!args.empty()) {
     for (auto ptr = args.begin(); ptr != args.end(); ptr++) {
-      try {
-        if (Py_None != ptr->ptr()) {
+      // Only collect Tensor type in 'args' and pass them to backward. Ignore
+      // other types of input temporarily.
+      if (py::isinstance<imperative::VarBase>(*ptr)) {
+        try {
           auto a = ptr->cast<std::shared_ptr<VarBase>>();
           input_vars.push_back(a);
+        } catch (py::cast_error& err) {
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "The `PyLayer.forward` function contains invalid argument, the "
+              "`%s` type argument can not be cast into `Tensor`.",
+              ptr->ptr()->ob_type->tp_name));
         }
-      } catch (py::cast_error& err) {
-        // Only collect Tensor type in 'args' and pass them to backward. Ignore
-        // other types of input temporarily.
       }
     }
   }

   // process kwargs, only collect `imperative::VarBase`
   if (!kwargs.empty()) {
     for (auto ptr = kwargs.begin(); ptr != kwargs.end(); ptr++) {
-      try {
-        if (Py_None != ptr->second.ptr()) {
+      // Only collect Tensor type in 'kwargs' and pass them to backward.
+      // Ignore other types of input temporarily.
+      if (py::isinstance<imperative::VarBase>(*ptr->second)) {
+        try {
           auto a = ptr->second.cast<std::shared_ptr<VarBase>>();
           input_vars.push_back(a);
+        } catch (py::cast_error&) {
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "The `PyLayer.forward` function contains invalid argument, the "
+              "`%s` type argument can not be cast into `Tensor`.",
+              ptr->second.ptr()->ob_type->tp_name));
         }
-      } catch (py::cast_error&) {
-        // Only collect Tensor type in 'kwargs' and pass them to backward.
-        // Ignore other types of input temporarily.
       }
     }
   }
@@ -110,33 +129,35 @@ py::object PyLayerApply(const platform::Place& place, const py::handle& cls,
       PyList_Check(result_forward.ptr())) {
     auto tuple_result = result_forward.cast<py::tuple>();
     for (size_t i = 0; i < tuple_result.size(); i++) {
-      if (Py_None != tuple_result[i].ptr()) {
+      // Only collect Tensor type of output and pass them to backward.
+      // Ignore other types of input temporarily.
+      if (py::isinstance<imperative::VarBase>(tuple_result[i])) {
         try {
           auto temp_out =
               tuple_result[i].cast<std::shared_ptr<imperative::VarBase>>();
           output_vars.push_back(temp_out);
         } catch (py::cast_error&) {
-          // Only collect Tensor type in 'kwargs' and pass them to backward.
-          // Ignore other types of input temporarily.
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "The `PyLayer.forward` function returns invalid argument, the "
+              "`%s` type argument can not be cast into `Tensor`.",
+              tuple_result[i].ptr()->ob_type->tp_name));
         }
-      } else {
-        // Only collect Tensor type in 'kwargs' and pass them to backward.
-        // Ignore other types of input temporarily.
       }
     }
   } else {
-    if (Py_None != result_forward.ptr()) {
+    // Only collect Tensor type of output and pass them to backward.
+    // Ignore other types of input temporarily.
+    if (py::isinstance<imperative::VarBase>(result_forward)) {
       try {
         auto temp_out =
             result_forward.cast<std::shared_ptr<imperative::VarBase>>();
         output_vars.push_back(temp_out);
       } catch (py::cast_error&) {
-        // Only collect Tensor type in 'kwargs' and pass them to backward.
-        // Ignore other types of input temporarily.
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "The `PyLayer.forward` function returns invalid argument, the `%s` "
+            "type argument can not be cast into `Tensor`.",
+            result_forward.ptr()->ob_type->tp_name));
       }
-    } else {
-      // Only collect Tensor type in 'kwargs' and pass them to backward.
-      // Ignore other types of input temporarily.
     }
   }
   if (output_vars.size() == 0) {
...
@@ -303,7 +303,9 @@ static void DisablePrepareDataOpt(
                              disable_opt || pre_disable_opt);
     }
     // disable prepare data if unfriendly op is found
-    disable_opt = IsPrepareDataOptTargetOp(op);
+    if (!disable_opt) {
+      disable_opt = IsPrepareDataOptTargetOp(op);
+    }
   }
 }
...
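The added guard turns disable_opt into a latch: before this fix every op overwrote the flag, so a friendly op appearing after an unfriendly one would silently re-enable the optimization. A minimal standalone illustration of the sticky-flag behavior, using a hypothetical op sequence:

#include <vector>

int main() {
  std::vector<bool> is_unfriendly = {false, true, false};  // hypothetical ops
  bool disable_opt = false;
  for (bool op : is_unfriendly) {
    if (!disable_opt) disable_opt = op;  // latches once set, never resets
  }
  return disable_opt ? 0 : 1;  // stays disabled after the unfriendly op
}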
@@ -46,13 +46,6 @@ class LayerNormOpConverter : public OpConverter {
   auto* Bias_t = Bias_v->GetMutable<framework::LoDTensor>();
   auto* Scale_t = Scale_v->GetMutable<framework::LoDTensor>();

-  int input_num = 1;
-  for (int i = 0; i < X->getDimensions().nbDims; i++) {
-    input_num *= X->getDimensions().d[i];
-  }
-  std::vector<int64_t> mean_shape{input_num};
-  std::vector<int64_t> variance_shape{input_num};
-
   std::unique_ptr<framework::LoDTensor> bias_tensor(
       new framework::LoDTensor());
   std::unique_ptr<framework::LoDTensor> scale_tensor(
@@ -68,10 +61,33 @@ class LayerNormOpConverter : public OpConverter {
   auto* bias_data = bias_tensor->mutable_data<float>(platform::CPUPlace());
   auto* scale_data = scale_tensor->mutable_data<float>(platform::CPUPlace());

-  plugin::LayerNormPlugin* plugin = new plugin::LayerNormPlugin(
-      bias_data, bias_tensor->numel(), scale_data, scale_tensor->numel(),
-      begin_norm_axis, eps, mean_shape, variance_shape);
-  nvinfer1::IPluginLayer* layernorm_layer = engine_->AddPlugin(&X, 1, plugin);
+  nvinfer1::ILayer* layernorm_layer = nullptr;
+  if (engine_->with_dynamic_shape()) {
+    int input_num = 1;
+    for (int i = begin_norm_axis; i < X->getDimensions().nbDims; i++) {
+      input_num *= X->getDimensions().d[i];
+    }
+    std::vector<int64_t> mean_shape{input_num};
+    std::vector<int64_t> variance_shape{input_num};
+    plugin::LayerNormPluginDynamic* plugin =
+        new plugin::LayerNormPluginDynamic(bias_data, bias_tensor->numel(),
+                                           scale_data, scale_tensor->numel(),
+                                           begin_norm_axis, eps, mean_shape,
+                                           variance_shape);
+    layernorm_layer = engine_->AddDynamicPlugin(&X, 1, plugin);
+  } else {
+    int input_num = 1;
+    for (int i = begin_norm_axis - 1; i < X->getDimensions().nbDims; i++) {
+      input_num *= X->getDimensions().d[i];
+    }
+    std::vector<int64_t> mean_shape{input_num};
+    std::vector<int64_t> variance_shape{input_num};
+    plugin::LayerNormPlugin* plugin = new plugin::LayerNormPlugin(
+        bias_data, bias_tensor->numel(), scale_data, scale_tensor->numel(),
+        begin_norm_axis, eps, mean_shape, variance_shape);
+    layernorm_layer = engine_->AddPlugin(
+        &X, 1, reinterpret_cast<plugin::PluginTensorRT*>(plugin));
+  }

   auto output_name = op_desc.Output("Y").front();
   engine_->SetWeights(op_desc.Input("Bias").front(), std::move(bias_tensor));
...
@@ -703,7 +703,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
         return false;
         // Paddle-TRT does not support the input tensors: Shape and ShapeTensor
       } else if (desc.Input("Shape").size() >= 1 ||
-                 desc.Input("ShapeTensor").size() >= 1) {
+                 desc.Input("ShapeTensor").size() >= 1 || with_dynamic_shape) {
         return false;
       } else {
         std::vector<int> shape =
...
@@ -57,8 +57,18 @@ int LayerNormPlugin::enqueue(int batch_size, const void *const *inputs,
     input_shape.push_back(input_dims.d[i]);
   }
   const auto input_ddim = framework::make_ddim(input_shape);
-  auto matrix_dim = framework::flatten_to_2d(input_ddim, begin_norm_axis - 1);
+  auto matrix_dim = framework::flatten_to_2d(input_ddim, begin_norm_axis);
   int feature_size = static_cast<int>(matrix_dim[1]);
+  PADDLE_ENFORCE_EQ(feature_size, scale_.size(),
+                    platform::errors::InvalidArgument(
+                        "scale's size should be equal to the feature_size,"
+                        "but got feature_size:%d, scale's size:%d.",
+                        feature_size, scale_.size()));
+  PADDLE_ENFORCE_EQ(feature_size, bias_.size(),
+                    platform::errors::InvalidArgument(
+                        "bias's size should be equal to the feature_size,"
+                        "but got feature_size:%d, bias's size:%d.",
+                        feature_size, bias_.size()));

   scale_t.Resize(framework::make_ddim({feature_size}));
   bias_t.Resize(framework::make_ddim({feature_size}));
@@ -82,6 +92,163 @@ int LayerNormPlugin::enqueue(int batch_size, const void *const *inputs,
   return cudaGetLastError() != cudaSuccess;
 }
nvinfer1::DimsExprs LayerNormPluginDynamic::getOutputDimensions(
int output_index, const nvinfer1::DimsExprs *inputDims, int nb_inputs,
nvinfer1::IExprBuilder &expr_builder) {
return inputDims[0];
}
bool LayerNormPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs,
int nb_outputs) {
PADDLE_ENFORCE_NOT_NULL(
in_out, platform::errors::InvalidArgument(
"The input of layernorm plugin shoule not be nullptr."));
PADDLE_ENFORCE_LT(
pos, nb_inputs + nb_outputs,
platform::errors::InvalidArgument("The pos(%d) should be less than the "
"num(%d) of the input and the output.",
pos, nb_inputs + nb_outputs));
const nvinfer1::PluginTensorDesc &in = in_out[pos];
if (pos == 0) {
// TODO(Shangzhizhou) FP16 support
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
}
const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1];
// output
return in.type == prev.type && in.format == prev.format;
}
nvinfer1::DataType LayerNormPluginDynamic::getOutputDataType(
int index, const nvinfer1::DataType *input_types, int nb_inputs) const {
PADDLE_ENFORCE_EQ(index, 0,
platform::errors::InvalidArgument(
"The LayerNormPlugin only has one input, so the "
"index value should be 0, but get %d.",
index));
return input_types[0];
}
int LayerNormPluginDynamic::enqueue(
const nvinfer1::PluginTensorDesc *input_desc,
const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) {
const auto &input_dims = input_desc[0].dims;
int begin_norm_axis = begin_norm_axis_;
float eps = eps_;
std::vector<int> input_shape;
for (int i = 0; i < input_dims.nbDims; i++) {
input_shape.push_back(input_dims.d[i]);
}
const auto input_ddim = framework::make_ddim(input_shape);
auto matrix_dim = framework::flatten_to_2d(input_ddim, begin_norm_axis);
int feature_size = static_cast<int>(matrix_dim[1]);
PADDLE_ENFORCE_EQ(feature_size, scale_.size(),
platform::errors::InvalidArgument(
"scale's size should be equal to the feature_size,"
"but got feature_size:%d, scale's size:%d.",
feature_size, scale_.size()));
PADDLE_ENFORCE_EQ(feature_size, bias_.size(),
platform::errors::InvalidArgument(
"bias's size should be equal to the feature_size,"
"but got feature_size:%d, bias's size:%d.",
feature_size, bias_.size()));
int device_id;
cudaGetDevice(&device_id);
auto input_type = input_desc[0].type;
if (input_type == nvinfer1::DataType::kFLOAT) {
VLOG(1) << "TRT Plugin DataType selected. LayerNorm-->fp32";
const float *input = reinterpret_cast<const float *>(inputs[0]);
float *output = static_cast<float *>(outputs[0]);
scale_t.Resize(framework::make_ddim({feature_size}));
bias_t.Resize(framework::make_ddim({feature_size}));
mean_t.Resize(framework::make_ddim(mean_shape_));
variance_t.Resize(framework::make_ddim(variance_shape_));
float *scale_d =
scale_t.mutable_data<float>(platform::CUDAPlace(device_id));
float *bias_d = bias_t.mutable_data<float>(platform::CUDAPlace(device_id));
float *mean_d = mean_t.mutable_data<float>(platform::CUDAPlace(device_id));
float *variance_d =
variance_t.mutable_data<float>(platform::CUDAPlace(device_id));
cudaMemcpyAsync(scale_d, scale_.data(), sizeof(float) * feature_size,
cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(bias_d, bias_.data(), sizeof(float) * feature_size,
cudaMemcpyHostToDevice, stream);
paddle::operators::LayerNormDirectCUDAFunctor<float> layer_norm;
layer_norm(stream, input, input_shape, bias_d, scale_d, output, mean_d,
variance_d, begin_norm_axis, eps);
} else if (input_type == nvinfer1::DataType::kHALF) {
#ifdef TRT_PLUGIN_FP16_AVALIABLE
VLOG(1) << "TRT Plugin DataType selected. LayerNorm-->fp16";
const half *input = reinterpret_cast<const half *>(inputs[0]);
half *output = static_cast<half *>(outputs[0]);
size_t mean_shape_product = 1;
for (auto s : mean_shape_) {
mean_shape_product *= s;
}
size_t variance_shape_product = 1;
for (auto s : variance_shape_) {
variance_shape_product *= s;
}
if (!scale_gpu_half_d_) {
cudaMalloc(&scale_gpu_half_d_, feature_size * sizeof(half));
}
if (!bias_gpu_half_d_) {
cudaMalloc(&bias_gpu_half_d_, feature_size * sizeof(half));
}
if (!mean_gpu_half_d_) {
cudaMalloc(&mean_gpu_half_d_, mean_shape_product * sizeof(half));
}
if (!variance_gpu_half_d_) {
cudaMalloc(&variance_gpu_half_d_, variance_shape_product * sizeof(half));
}
half *scale_cpu_half =
static_cast<half *>(malloc(feature_size * sizeof(half)));
half *bias_cpu_half =
static_cast<half *>(malloc(feature_size * sizeof(half)));
PADDLE_ENFORCE_EQ(
scale_cpu_half && bias_cpu_half, true,
platform::errors::Unavailable("Out of memory, malloc size %d.",
feature_size * sizeof(half)));
for (int i = 0; i < feature_size; i++) {
scale_cpu_half[i] = static_cast<half>(scale_[i]);
bias_cpu_half[i] = static_cast<half>(bias_[i]);
}
cudaMemcpyAsync(scale_gpu_half_d_, scale_cpu_half,
sizeof(half) * feature_size, cudaMemcpyHostToDevice,
stream);
cudaMemcpyAsync(bias_gpu_half_d_, bias_cpu_half,
sizeof(half) * feature_size, cudaMemcpyHostToDevice,
stream);
free(scale_cpu_half);
free(bias_cpu_half);
paddle::operators::LayerNormDirectCUDAFunctor<half> layer_norm;
layer_norm(stream, input, input_shape, bias_gpu_half_d_, scale_gpu_half_d_,
output, mean_gpu_half_d_, variance_gpu_half_d_, begin_norm_axis,
eps);
#else
PADDLE_THROW(platform::errors::Fatal(
"The layer_norm tensorRT plugin should be "
"complied with CUDA version >= 10.0 when running with fp16. "
"Please recomplie it or try to use fp32 by set "
"config.SetTRTDynamicShapeInfo(min_input_shape, "
"max_input_shape, opt_input_shape, true"));
#endif
} else {
PADDLE_THROW(platform::errors::Fatal(
"The LayerNorm TRT Plugin's input type should be float or half."));
}
return cudaGetLastError() != cudaSuccess;
}
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
...
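Both enqueue paths flatten the input to a [rows, feature_size] matrix, so feature_size is the product of the trailing dimensions starting at the normalization axis; the static-shape branch starts at begin_norm_axis - 1 because TensorRT's implicit-batch dims omit the batch dimension. A standalone sketch of the arithmetic (illustrative helper, not Paddle code):

#include <cstdint>
#include <vector>

// Product of dims[axis..end): the per-row element count that must match
// the scale/bias lengths enforced by PADDLE_ENFORCE_EQ above.
int64_t FeatureSize(const std::vector<int64_t>& dims, int axis) {
  int64_t n = 1;
  for (int i = axis; i < static_cast<int>(dims.size()); ++i) n *= dims[i];
  return n;
}
// e.g. dims = {8, 128, 768}, begin_norm_axis = 2 -> FeatureSize(dims, 2) == 768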
@@ -50,7 +50,7 @@ class LayerNormPlugin : public PluginTensorRT {
   // TRT will call this func when we need to serialize the configuration of
   // tensorrt.
   // It should not be called by users.
-  void serialize(void *buffer) override {
+  void serialize(void* buffer) override {
     SerializeValue(&buffer, getPluginType());
     serializeBase(buffer);
     SerializeValue(&buffer, bias_);
@@ -62,7 +62,7 @@ class LayerNormPlugin : public PluginTensorRT {
   }

  public:
-  LayerNormPlugin(const float *bias, const int bias_num, const float *scale,
+  LayerNormPlugin(const float* bias, const int bias_num, const float* scale,
                   const int scale_num, int begin_norm_axis, float eps,
                   std::vector<int64_t> mean_shape,
                   std::vector<int64_t> variance_shape)
@@ -78,7 +78,7 @@ class LayerNormPlugin : public PluginTensorRT {

   // It was used for tensorrt deserialization.
   // It should not be called by users.
-  LayerNormPlugin(void const *serialData, size_t serialLength) {
+  LayerNormPlugin(void const* serialData, size_t serialLength) {
     deserializeBase(serialData, serialLength);
     DeserializeValue(&serialData, &serialLength, &bias_);
     DeserializeValue(&serialData, &serialLength, &scale_);
@@ -90,20 +90,180 @@ class LayerNormPlugin : public PluginTensorRT {
   ~LayerNormPlugin() {}
   int initialize() override;

-  LayerNormPlugin *clone() const override {
+  LayerNormPlugin* clone() const override {
     return new LayerNormPlugin(bias_.data(), bias_.size(), scale_.data(),
                                scale_.size(), begin_norm_axis_, eps_,
                                mean_shape_, variance_shape_);
   }

-  const char *getPluginType() const override { return "layer_norm_plugin"; }
+  const char* getPluginType() const override { return "layer_norm_plugin"; }
   int getNbOutputs() const override { return 1; }
-  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
+  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
                                      int nbInputDims) override;
-  int enqueue(int batchSize, const void *const *inputs, void **outputs,
-              void *workspace, cudaStream_t stream) override;
+  int enqueue(int batchSize, const void* const* inputs, void** outputs,
+              void* workspace, cudaStream_t stream) override;
 };
class LayerNormPluginDynamic : public DynamicPluginTensorRT {
public:
LayerNormPluginDynamic(const float* bias, const int bias_num,
const float* scale, const int scale_num,
int begin_norm_axis, float eps,
std::vector<int64_t> mean_shape,
std::vector<int64_t> variance_shape)
: begin_norm_axis_(begin_norm_axis),
eps_(eps),
mean_shape_(mean_shape),
variance_shape_(variance_shape),
scale_gpu_half_d_(nullptr),
bias_gpu_half_d_(nullptr),
mean_gpu_half_d_(nullptr),
variance_gpu_half_d_(nullptr) {
bias_.resize(bias_num);
scale_.resize(scale_num);
std::copy(bias, bias + bias_num, bias_.data());
std::copy(scale, scale + scale_num, scale_.data());
}
LayerNormPluginDynamic(void const* serialData, size_t serialLength)
: scale_gpu_half_d_(nullptr),
bias_gpu_half_d_(nullptr),
mean_gpu_half_d_(nullptr),
variance_gpu_half_d_(nullptr) {
DeserializeValue(&serialData, &serialLength, &bias_);
DeserializeValue(&serialData, &serialLength, &scale_);
DeserializeValue(&serialData, &serialLength, &begin_norm_axis_);
DeserializeValue(&serialData, &serialLength, &eps_);
DeserializeValue(&serialData, &serialLength, &mean_shape_);
DeserializeValue(&serialData, &serialLength, &variance_shape_);
}
nvinfer1::IPluginV2DynamicExt* clone() const override {
return new LayerNormPluginDynamic(bias_.data(), bias_.size(), scale_.data(),
scale_.size(), begin_norm_axis_, eps_,
mean_shape_, variance_shape_);
}
const char* getPluginType() const override { return "layernorm_plugin"; }
int getNbOutputs() const override { return 1; }
int initialize() override { return 0; }
size_t getSerializationSize() const override {
return SerializedSize(bias_) + SerializedSize(scale_) +
SerializedSize(begin_norm_axis_) + SerializedSize(eps_) +
SerializedSize(mean_shape_) + SerializedSize(variance_shape_);
}
void serialize(void* buffer) const override {
SerializeValue(&buffer, bias_);
SerializeValue(&buffer, scale_);
SerializeValue(&buffer, begin_norm_axis_);
SerializeValue(&buffer, eps_);
SerializeValue(&buffer, mean_shape_);
SerializeValue(&buffer, variance_shape_);
}
nvinfer1::DimsExprs getOutputDimensions(
int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
nvinfer1::IExprBuilder& expr_builder) override;
bool supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc* inOut,
int nbInputs, int nbOutputs) override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* out,
int nbOutputs) override {}
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const override {
return 0;
}
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs, void* workspace,
cudaStream_t stream) override;
nvinfer1::DataType getOutputDataType(int index,
const nvinfer1::DataType* inputTypes,
int nbInputs) const override;
~LayerNormPluginDynamic() {
if (scale_gpu_half_d_) {
cudaFree(scale_gpu_half_d_);
}
if (bias_gpu_half_d_) {
cudaFree(bias_gpu_half_d_);
}
if (mean_gpu_half_d_) {
cudaFree(mean_gpu_half_d_);
}
if (variance_gpu_half_d_) {
cudaFree(variance_gpu_half_d_);
}
}
void destroy() override { delete this; }
private:
std::vector<float> bias_;
std::vector<float> scale_;
framework::Tensor scale_t;
framework::Tensor bias_t;
framework::Tensor mean_t;
framework::Tensor variance_t;
int begin_norm_axis_;
float eps_;
std::vector<int64_t> mean_shape_;
std::vector<int64_t> variance_shape_;
half* scale_gpu_half_d_;
half* bias_gpu_half_d_;
half* mean_gpu_half_d_;
half* variance_gpu_half_d_;
};
class LayerNormPluginDynamicCreator : public nvinfer1::IPluginCreator {
public:
LayerNormPluginDynamicCreator() {}
const char* getPluginName() const override { return "layernorm_plugin"; }
const char* getPluginVersion() const override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() override {
return &field_collection_;
}
nvinfer1::IPluginV2* createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) override {
return nullptr;
}
nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data,
size_t serial_length) override {
auto plugin = new LayerNormPluginDynamic(serial_data, serial_length);
return plugin;
}
void setPluginNamespace(const char* lib_namespace) override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const override {
return plugin_namespace_.c_str();
}
private:
std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(LayerNormPluginDynamicCreator);
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
...
@@ -325,11 +325,10 @@ inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_te
 # densebox
 set(DENSEBOX_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/densebox")
 download_data_without_verify(${DENSEBOX_INSTALL_DIR} "densebox.tar.gz")
-#inference_analysis_test(test_analyzer_detect SRCS analyzer_detect_tester.cc
-#        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-#        ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt
-#             --infer_shape=${DENSEBOX_INSTALL_DIR}/shape_50.txt)
-#set_property(TEST test_analyzer_detect PROPERTY ENVIRONMENT GLOG_vmodule=analysis_predictor=2)
+inference_analysis_test(test_analyzer_detect_functional_mkldnn SRCS analyzer_detect_functional_mkldnn_tester.cc
+        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+        ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt
+             --infer_shape=${DENSEBOX_INSTALL_DIR}/shape_50.txt)

 # mobilenet with transpose op
 set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet")
...
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/tester_helper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
DEFINE_string(infer_shape, "", "data shape file");
DEFINE_int32(sample, 20, "number of sample");
namespace paddle {
namespace inference {
namespace analysis {
struct Record {
std::vector<float> data;
std::vector<int32_t> shape;
};
Record ProcessALine(const std::string &line, const std::string &shape_line) {
VLOG(3) << "process a line";
Record record;
std::vector<std::string> data_strs;
split(line, ' ', &data_strs);
for (auto &d : data_strs) {
record.data.push_back(std::stof(d));
}
std::vector<std::string> shape_strs;
split(shape_line, ' ', &shape_strs);
for (auto &s : shape_strs) {
record.shape.push_back(std::stoi(s));
}
return record;
}
void SetConfig(AnalysisConfig *cfg) {
cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
cfg->DisableGpu();
// cfg->SwitchIrDebug(); // Enable to have graphs dumped
cfg->SwitchSpecifyInputNames(false);
cfg->SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads);
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
const std::string &line, const std::string &shape_line) {
auto record = ProcessALine(line, shape_line);
PaddleTensor input;
input.shape = record.shape;
input.dtype = PaddleDType::FLOAT32;
size_t input_size = record.data.size() * sizeof(float);
input.data.Resize(input_size);
memcpy(input.data.data(), record.data.data(), input_size);
std::vector<PaddleTensor> input_slots;
input_slots.assign({input});
(*inputs).emplace_back(input_slots);
}
#ifdef PADDLE_WITH_MKLDNN
int GetNumCachedObjects(void) {
auto &pool = platform::DeviceContextPool::Instance();
platform::CPUPlace place;
auto onednn_dev_ctx =
dynamic_cast<platform::MKLDNNDeviceContext *>(pool.Get(place));
return onednn_dev_ctx->GetCachedObjectsNumber();
}
void validate_cache_onednn(int cache_capacity = 1) {
AnalysisConfig cfg;
SetConfig(&cfg);
cfg.EnableMKLDNN();
cfg.SetMkldnnCacheCapacity(cache_capacity);
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
std::vector<std::vector<PaddleTensor>> ref_outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;
std::ifstream file(FLAGS_infer_data);
std::ifstream infer_file(FLAGS_infer_shape);
std::vector<std::string> lines;
std::vector<std::string> shape_lines;
// Let's work with 4 samples
auto num_samples = 4;
ref_outputs.resize(num_samples);
lines.resize(num_samples);
shape_lines.resize(num_samples);
// Let's remember number of cached objects before
// execution and after every single execution
std::vector<int> cache_filling;
cache_filling.push_back(GetNumCachedObjects());
// run predictions sequentially
for (int i = 0; i < num_samples; ++i) {
std::getline(file, lines[i]);
std::getline(infer_file, shape_lines[i]);
SetInput(&input_slots_all, lines[i], shape_lines[i]);
predictor->Run(input_slots_all[i], &ref_outputs[i], FLAGS_batch_size);
// record number of cached objects
cache_filling.push_back(GetNumCachedObjects());
}
file.close();
infer_file.close();
predictor.reset(nullptr);
cache_filling.push_back(GetNumCachedObjects());
// Compare results
// First and last value should be equal, i.e. before using cache (empty) and
// after releasing executor
PADDLE_ENFORCE_EQ(
cache_filling[0], cache_filling[cache_filling.size() - 1],
platform::errors::Fatal("Cache size before execution and after "
"releasing Executor do not match"));
// Iterate to check that the cache does not grow
// beyond the configured cache capacity
if (cache_capacity != 0) {
for (int i = cache_capacity + 1; i < num_samples + 1; ++i) {
PADDLE_ENFORCE_EQ(
cache_filling[cache_capacity], cache_filling[i],
platform::errors::Fatal("Cache capacity should not increase "
"after full capacity is used"));
}
}
}
TEST(Analyzer_detect, validate_cache_onednn) {
validate_cache_onednn(2 /*cache_capacity */);
}
#endif
} // namespace analysis
} // namespace inference
} // namespace paddle
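A minimal sketch of how the cache-capacity knob exercised by the test above is typically wired into a standalone inference program. The model/params paths and the capacity value are illustrative assumptions; only AnalysisConfig calls already used in the test are relied on.
#include <memory>
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
std::unique_ptr<paddle::PaddlePredictor> MakeOneDNNPredictor() {
  paddle::AnalysisConfig cfg;
  cfg.SetModel("/path/to/model", "/path/to/params");  // hypothetical paths
  cfg.DisableGpu();
  cfg.EnableMKLDNN();
  // Keep oneDNN primitives cached for at most 2 distinct input shapes;
  // older shapes are evicted, bounding memory for variable-shape workloads.
  cfg.SetMkldnnCacheCapacity(2);
  return paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(cfg);
}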
...@@ -167,7 +167,7 @@ def run_convert(): ...@@ -167,7 +167,7 @@ def run_convert():
os.path.getsize(output_file) == FULL_SIZE_BYTES): os.path.getsize(output_file) == FULL_SIZE_BYTES):
if os.path.exists(output_file): if os.path.exists(output_file):
sys.stderr.write( sys.stderr.write(
"\n\nThe existing binary file is broken. Start to generate new one...\n\n". "\n\nThe existing binary file[{}] is broken. Start to generate new one...\n\n".
format(output_file)) format(output_file))
os.remove(output_file) os.remove(output_file)
if retry < try_limit: if retry < try_limit:
......
...@@ -120,6 +120,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> { ...@@ -120,6 +120,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
: len; : len;
} }
} else if (context.Attr<bool>("set_constant")) { } else if (context.Attr<bool>("set_constant")) {
// TODO(Liu yuang) ADD NPU SET_CONSTANT FUNCTION.
math::SetConstant<DeviceContext, T> set_constant; math::SetConstant<DeviceContext, T> set_constant;
set_constant(dev_ctx, fused_tensor, set_constant(dev_ctx, fused_tensor,
static_cast<T>(context.Attr<float>("constant"))); static_cast<T>(context.Attr<float>("constant")));
...@@ -145,6 +146,14 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> { ...@@ -145,6 +146,14 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
offset = 0; offset = 0;
std::stringstream ss; std::stringstream ss;
ss << "alloc_space_for_vars: "; ss << "alloc_space_for_vars: ";
#if defined(PADDLE_WITH_ASCEND_CL)
auto stream =
context.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
platform::NPUMemsetAsync(
static_cast<void *>(fused_tensor->mutable_data<T>(dev_ctx.GetPlace())),
0, fused_tensor->numel() * sizeof(T), stream);
#endif
for (size_t i = 0; i < out_tensors.size(); ++i) { for (size_t i = 0; i < out_tensors.size(); ++i) {
size_t len = static_cast<size_t>(out_tensors[i]->numel()); size_t len = static_cast<size_t>(out_tensors[i]->numel());
auto dim = out_tensors[i]->dims(); auto dim = out_tensors[i]->dims();
...@@ -160,6 +169,12 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> { ...@@ -160,6 +169,12 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")" ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")"
<< " address: " << out_tensors[i]->data<void>() << ", "; << " address: " << out_tensors[i]->data<void>() << ", ";
} }
PADDLE_ENFORCE_EQ(
static_cast<int64_t>(offset), fused_tensor->numel(),
platform::errors::InvalidArgument(
"The alloc_space_for_vars's offset: %s is not equal to "
"fused_tensor's numel: %s.",
offset, fused_tensor->numel()));
VLOG(10) << ss.str(); VLOG(10) << ss.str();
} }
...@@ -191,13 +206,13 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> { ...@@ -191,13 +206,13 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims() ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
<< ") " << ") "
<< " addres:" << lod_tensors[i]->data<void>() << ", "; << " addres:" << lod_tensors[i]->data<void>() << ", ";
*numel += use_align *numel += use_align
? platform::Alignment( ? platform::Alignment(
static_cast<size_t>(size) * size_of_dtype, place) / static_cast<size_t>(size) * size_of_dtype, place) /
size_of_dtype size_of_dtype
: static_cast<size_t>(size); : static_cast<size_t>(size);
} }
VLOG(10) << ss.str(); VLOG(10) << ss.str();
} }
}; };
...@@ -309,6 +324,16 @@ REGISTER_OP_XPU_KERNEL( ...@@ -309,6 +324,16 @@ REGISTER_OP_XPU_KERNEL(
ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext, double>); ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext, double>);
#endif #endif
#if defined(PADDLE_WITH_ASCEND_CL)
REGISTER_OP_NPU_KERNEL(
coalesce_tensor,
ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext,
plat::float16>,
ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, double>);
#endif
REGISTER_OP_VERSION(coalesce_tensor) REGISTER_OP_VERSION(coalesce_tensor)
.AddCheckpoint( .AddCheckpoint(
R"ROC( R"ROC(
......
...@@ -43,12 +43,10 @@ class BarrierOpCUDAKernel : public framework::OpKernel<T> { ...@@ -43,12 +43,10 @@ class BarrierOpCUDAKernel : public framework::OpKernel<T> {
ncclRedOp_t nccl_red_type = ncclSum; ncclRedOp_t nccl_red_type = ncclSum;
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream)); sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream));
auto comm_stream =
platform::NCCLCommContext::Instance().Get(rid, place)->stream();
#ifdef PADDLE_WITH_RCCL #ifdef PADDLE_WITH_RCCL
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(comm_stream)); PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
#else #else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(comm_stream)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
#endif #endif
#else #else
PADDLE_THROW(platform::errors::Unavailable( PADDLE_THROW(platform::errors::Unavailable(
......
...@@ -31,13 +31,12 @@ class CEmbeddingOp : public framework::OperatorWithKernel { ...@@ -31,13 +31,12 @@ class CEmbeddingOp : public framework::OperatorWithKernel {
int ids_rank = ids_dims.size(); int ids_rank = ids_dims.size();
VLOG(5) << "ids rank is " << ids_rank << std::endl; VLOG(5) << "ids rank is " << ids_rank << std::endl;
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(table_dims.size(), 2,
table_dims.size(), 2, platform::errors::InvalidArgument(
platform::errors::InvalidArgument( "The dimensions of the 'c_embedding' must be 2. "
"ShapeError: The dimensions of the 'c_embedding' must be 2. " "But received c_embedding's dimensions = %d, "
"But received c_embedding's dimensions = %d, " "c_embedding's shape = [%s].",
"c_embedding's shape = [%s].", table_dims.size(), table_dims));
table_dims.size(), table_dims));
auto output_dims = framework::vectorize(ids_dims); auto output_dims = framework::vectorize(ids_dims);
output_dims.push_back(table_dims[1]); output_dims.push_back(table_dims[1]);
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h"
namespace paddle {
namespace operators {
class CSoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Logits"), "Input", "Logits",
"CSoftmaxWithCrossEntropyOp");
OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label",
"CSoftmaxWithCrossEntropyOp");
OP_INOUT_CHECK(ctx->HasOutput("Softmax"), "Output", "Softmax",
"CSoftmaxWithCrossEntropyOp");
OP_INOUT_CHECK(ctx->HasOutput("Loss"), "Output", "Loss",
"CSoftmaxWithCrossEntropyOp");
auto logits_dims = ctx->GetInputDim("Logits");
auto labels_dims = ctx->GetInputDim("Label");
auto logits_rank = logits_dims.size();
auto axis = logits_rank - 1;
for (int i = 0; i < logits_rank; i++) {
if (i != axis) {
if (ctx->IsRuntime() || (logits_dims[i] > 0 && labels_dims[i] > 0)) {
PADDLE_ENFORCE_EQ(logits_dims[i], labels_dims[i],
platform::errors::InvalidArgument(
"Input(Logits) and Input(Label) should in "
"same shape in dimensions except axis."));
}
}
}
PADDLE_ENFORCE_EQ(
labels_dims[logits_rank - 1], 1UL,
platform::errors::InvalidArgument(
"The last dimension of Input(Label) should be 1. "
"But received: the last dimension of Input(Label) is [%d], "
"where the last dimension index is [%d].",
labels_dims[logits_rank - 1], logits_rank - 1));
ctx->SetOutputDim("Softmax", logits_dims);
logits_dims[axis] = 1;
ctx->SetOutputDim("Loss", logits_dims);
ctx->ShareLoD("Logits", /*->*/ "Softmax");
ctx->ShareLoD("Logits", /*->*/ "Loss");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "Logits"),
ctx.device_context());
}
};
class CSoftmaxWithCrossEntropyOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("Logits",
"(Tensor, default: Tensor<float>), The input tensor of unscaled "
"log probabilities, whose dimension :attr:`axis` should be scaled "
"by softmax.");
AddInput(
"Label",
"(Tensor) The input tensor of groud truth label. If :attr:`soft_label` "
"is set to false, Label is a Tensor<int64> in same shape with "
"Input(Logits) except the shape in dimension :attr:`axis` as 1. If "
"soft_label is set to true, Label is a Tensor<float/double> in same "
"shape with Input(Logits).");
AddOutput(
"Softmax",
"(Tensor, default: Tensor<float>), A tensor in same shape with "
"Input(Logits). "
"The outputs value of softmax activation by given the input batch, "
"which will be used in backward calculation.");
AddOutput("Loss",
"(Tensor, default: Tensor<float>), A tensor in same shape with "
"Input(Logits) "
"except the shape in dimension :attr:`axis` as 1. The cross "
"entropy loss.");
AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
.SetDefault(0);
AddAttr<int>("rank",
"(int default 0) rank id for CSoftmaxWithCrossEntropy.")
.SetDefault(0);
AddAttr<int>("nranks",
"(int default 1) nranks id for CSoftmaxWithCrossEntropy.")
.SetDefault(0);
AddComment(R"DOC(
CSoftmaxWithCrossEntropy Operator
)DOC");
}
};
class CSoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Loss")), true,
platform::errors::InvalidArgument(
"Input(Loss@Grad) should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Softmax"), true,
platform::errors::InvalidArgument(
"Input(Softmax) should be not null."));
PADDLE_ENFORCE_EQ(
ctx->HasInput("Label"), true,
platform::errors::InvalidArgument("Input(Label) should be not null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Logits")), true,
platform::errors::InvalidArgument(
"Output(Logits@Grad) should be not null."));
ctx->SetOutputDim(framework::GradVarName("Logits"),
ctx->GetInputDim("Softmax"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Loss")),
ctx.device_context());
}
};
template <typename T>
class CSoftmaxWithCrossEntropyOpGradMaker
: public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("c_softmax_with_cross_entropy_grad");
op->SetInput("Softmax", this->Output("Softmax"));
op->SetInput("Label", this->Input("Label"));
op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss"));
op->SetAttrMap(this->Attrs());
op->SetOutput(framework::GradVarName("Logits"), this->InputGrad("Logits"));
}
};
DECLARE_INPLACE_OP_INFERER(CSoftmaxWithCrossEntropyInplaceInferer,
{"Logits", "Softmax"});
DECLARE_INPLACE_OP_INFERER(CSoftmaxWithCrossEntropyGradInplaceInferer,
{"Softmax", framework::GradVarName("Logits")});
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OPERATOR(
c_softmax_with_cross_entropy, ops::CSoftmaxWithCrossEntropyOp,
ops::CSoftmaxWithCrossEntropyOpMaker,
ops::CSoftmaxWithCrossEntropyOpGradMaker<paddle::framework::OpDesc>,
ops::CSoftmaxWithCrossEntropyOpGradMaker<paddle::imperative::OpBase>,
ops::CSoftmaxWithCrossEntropyInplaceInferer);
REGISTER_OPERATOR(c_softmax_with_cross_entropy_grad,
ops::CSoftmaxWithCrossEntropyOpGrad,
ops::CSoftmaxWithCrossEntropyGradInplaceInferer);
REGISTER_OP_CPU_KERNEL(c_softmax_with_cross_entropy,
ops::CSoftmaxWithCrossEntropyOpCPUKernel<float>,
ops::CSoftmaxWithCrossEntropyOpCPUKernel<double>,
ops::CSoftmaxWithCrossEntropyOpCPUKernel<plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h"
#include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/operators/math/softmax_impl.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaximumNumBlocks = 4096;
static inline int NumBlocks(const int N) {
return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
kNumMaximumNumBlocks);
}
template <typename T, typename IndexT>
__global__ void MaskLabelByIndex(T* predicted_logits, const T* logit,
const IndexT* label, const int start_index,
const int end_index, const int64_t N,
const int64_t D, const int nranks) {
CUDA_KERNEL_LOOP(i, N) {
auto real_label = label[i];
PADDLE_ENFORCE((real_label < D * nranks) && (real_label >= 0),
"The index is out of bounds, "
"please check whether the value of label and "
"input meet the class number. It should "
"be less than [%d], but received [%d]",
D * nranks, real_label);
if (real_label >= start_index && real_label < end_index) {
predicted_logits[i] = logit[i * D + real_label - start_index];
}
}
}
template <typename T, typename IndexT>
__global__ void MaskLabelByIndexGrad(T* logits_grad, const T* loss_grad,
const IndexT* labels,
const int start_index, const int end_index,
const int64_t N, const int64_t D) {
CUDA_KERNEL_LOOP(i, N * D) {
auto row = i / D;
auto col = i % D;
if ((col + start_index) == labels[row]) {
logits_grad[i] = (logits_grad[i] - static_cast<T>(1.0)) * loss_grad[row];
} else {
logits_grad[i] *= loss_grad[row];
}
}
}
template <typename T>
class CSoftmaxWithCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const Tensor* logits = ctx.Input<Tensor>("Logits");
const Tensor* labels = ctx.Input<Tensor>("Label");
Tensor* softmax = ctx.Output<Tensor>("Softmax");
Tensor* loss = ctx.Output<Tensor>("Loss");
const int rid = ctx.Attr<int>("ring_id");
const int nranks = ctx.Attr<int>("nranks");
const int rank = ctx.Attr<int>("rank");
const auto& place = ctx.GetPlace();
const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place);
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
// use global calculate stream
const auto stream = static_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
// allocate memory on device.
softmax->mutable_data<T>(place);
loss->mutable_data<T>(place);
const auto& logits_dims = logits->dims();
const auto& labels_dims = labels->dims();
const int axis = logits_dims.size() - 1;
const int N = SizeToAxis(axis, logits_dims);
const int D = SizeFromAxis(axis, logits_dims);
Tensor logits_2d, softmax_2d, loss_2d;
logits_2d.ShareDataWith(*logits).Resize({N, D});
softmax_2d.ShareDataWith(*softmax).Resize({N, D});
loss_2d.ShareDataWith(*loss).Resize({N, 1});
auto eigen_logits = math::EigenMatrix<T>::From(logits_2d);
auto eigen_softmax = math::EigenMatrix<T>::From(softmax_2d);
// step 1, obtain logit_max
Tensor logits_max;
logits_max =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
void* logits_max_buff = logits_max.mutable_data<T>(place);
auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max);
Eigen::DSizes<int, 1> along_axis(1);
eigen_logits_max.device(*dev_ctx.eigen_device()) =
eigen_logits.maximum(along_axis);
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
logits_max_buff, logits_max_buff, logits_max.numel(),
platform::ToNCCLDataType(logits_max.type()), ncclMax, comm->comm(),
stream));
// step 2, obtain logit - logit_max
Eigen::DSizes<int, 2> batch_by_one(N, 1);
Eigen::DSizes<int, 2> one_by_class(1, D);
eigen_softmax.device(*dev_ctx.eigen_device()) =
(eigen_logits -
eigen_logits_max.reshape(batch_by_one).broadcast(one_by_class))
.unaryExpr(math::ValueClip<T>());
// step 3, obtain predict target
Tensor predicted_logits;
predicted_logits =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
predicted_logits.mutable_data<T>(place);
auto t = framework::EigenVector<T>::Flatten(predicted_logits);
t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
const int start_index = rank * D;
const int end_index = start_index + D;
int blocks = NumBlocks(N);
int threads = kNumCUDAThreads;
const auto& label_type = labels->type();
if (label_type == framework::proto::VarType::INT32) {
MaskLabelByIndex<T, int32_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
predicted_logits.data<T>(), softmax_2d.data<T>(),
labels->data<int32_t>(), start_index, end_index, N, D, nranks);
} else if (label_type == framework::proto::VarType::INT64) {
MaskLabelByIndex<T, int64_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
predicted_logits.data<T>(), softmax_2d.data<T>(),
labels->data<int64_t>(), start_index, end_index, N, D, nranks);
}
void* predict_logits_buff = predicted_logits.mutable_data<T>(place);
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
predict_logits_buff, predict_logits_buff, predicted_logits.numel(),
platform::ToNCCLDataType(predicted_logits.type()), ncclSum,
comm->comm(), stream));
// step 4, obtain exp(logit)
eigen_softmax.device(*dev_ctx.eigen_device()) = eigen_softmax.exp();
// step 5, obtain sum_exp_logits
Tensor sum_exp_logits;
sum_exp_logits =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits);
eigen_sum_exp_logits.device(*dev_ctx.eigen_device()) =
eigen_softmax.sum(along_axis);
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(),
platform::ToNCCLDataType(sum_exp_logits.type()), ncclSum, comm->comm(),
stream));
auto eigen_loss = math::EigenMatrix<T>::From(loss_2d);
auto eigen_predicted_logits = math::EigenMatrix<T>::From(predicted_logits);
eigen_loss.device(*dev_ctx.eigen_device()) =
(eigen_sum_exp_logits.log().unaryExpr(math::TolerableValue<T>()) -
eigen_predicted_logits)
.unaryExpr(math::TolerableValue<T>());
eigen_softmax.device(*dev_ctx.eigen_device()) =
(eigen_softmax *
eigen_sum_exp_logits.inverse().broadcast(one_by_class));
}
};
template <typename T>
class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* labels = context.Input<Tensor>("Label");
const Tensor* loss_grad =
context.Input<Tensor>(framework::GradVarName("Loss"));
Tensor* logit_grad =
context.Output<Tensor>(framework::GradVarName("Logits"));
const Tensor* softmax = context.Input<Tensor>("Softmax");
const int rank = context.Attr<int>("rank");
auto& dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
if (logit_grad != softmax) {
framework::TensorCopy(*softmax, context.GetPlace(),
context.device_context(), logit_grad);
}
const auto softmax_dims = softmax->dims();
const int axis = softmax_dims.size() - 1;
const int N = SizeToAxis(axis, softmax_dims);
const int D = SizeFromAxis(axis, softmax_dims);
Tensor logit_grad_2d;
logit_grad_2d.ShareDataWith(*logit_grad).Resize({N, D});
int blocks = NumBlocks(N * D);
int threads = kNumCUDAThreads;
const auto& label_type = labels->type();
const int start_index = rank * D;
const int end_index = start_index + D;
if (label_type == framework::proto::VarType::INT32) {
MaskLabelByIndexGrad<T,
int32_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
logit_grad_2d.data<T>(), loss_grad->data<T>(),
labels->data<int32_t>(), start_index, end_index, N, D);
} else if (label_type == framework::proto::VarType::INT64) {
MaskLabelByIndexGrad<T,
int64_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
logit_grad_2d.data<T>(), loss_grad->data<T>(),
labels->data<int64_t>(), start_index, end_index, N, D);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
c_softmax_with_cross_entropy,
ops::CSoftmaxWithCrossEntropyOpCUDAKernel<float>,
ops::CSoftmaxWithCrossEntropyOpCUDAKernel<double>,
ops::CSoftmaxWithCrossEntropyOpCUDAKernel<plat::float16>);
REGISTER_OP_CUDA_KERNEL(
c_softmax_with_cross_entropy_grad,
ops::CSoftmaxWithCrossEntropyGradCUDAKernel<float>,
ops::CSoftmaxWithCrossEntropyGradCUDAKernel<paddle::platform::float16>,
ops::CSoftmaxWithCrossEntropyGradCUDAKernel<double>);
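For reference, a compact restatement of what the forward kernel's numbered steps compute when the class dimension is sharded across nranks ranks (each rank r holding D of the D*nranks classes), plus the gradient the backward kernel applies. This only summarizes the code above; it adds no behavior.
m_n = \max_r \max_j z^{(r)}_{n,j} \quad \text{(ncclAllReduce with ncclMax)}
s_n = \sum_r \sum_j e^{z^{(r)}_{n,j} - m_n} \quad \text{(ncclAllReduce with ncclSum)}
\mathrm{loss}_n = \log s_n - (z_{n,y_n} - m_n), \qquad \mathrm{softmax}_{n,j} = e^{z_{n,j} - m_n} / s_n
\partial z_{n,j} = (\mathrm{softmax}_{n,j} - \mathbb{1}[\,j + \mathrm{start\_index} = y_n\,]) \cdot \partial \mathrm{loss}_n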
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/softmax_op.h"
namespace paddle {
namespace operators {
template <typename T>
class CSoftmaxWithCrossEntropyOpCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_THROW(platform::errors::Unavailable(
"Do not support c_embedding for cpu kernel now."));
}
};
} // namespace operators
} // namespace paddle
type: "affine_channel"
def {
inputs {
name: "X"
}
inputs {
name: "Scale"
}
inputs {
name: "Bias"
}
attrs {
name: "data_layout"
type: STRING
}
outputs {
name: "Out"
}
}
extra {
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "flatten2"
def {
inputs {
name: "X"
}
outputs {
name: "Out"
}
outputs {
name: "XShape"
}
attrs {
name: "axis"
type: INT
}
}
extra {
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "gru"
def {
inputs {
name: "Input"
}
inputs {
name: "H0"
}
inputs {
name: "Weight"
}
inputs {
name: "Bias"
}
outputs {
name: "BatchGate"
}
outputs {
name: "BatchResetHiddenPrev"
}
outputs {
name: "BatchHidden"
}
outputs {
name: "Hidden"
}
attrs {
name: "activation"
type: STRING
}
attrs {
name: "gate_activation"
type: STRING
}
attrs {
name: "is_reverse"
type: BOOLEAN
}
attrs {
name: "origin_mode"
type: BOOLEAN
}
}
extra {
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "layer_norm"
def {
inputs {
name: "X"
}
inputs {
name: "Scale"
}
inputs {
name: "Bias"
}
outputs {
name: "Y"
}
outputs {
name: "Mean"
}
outputs {
name: "Variance"
}
attrs {
name: "epsilon"
type: FLOAT
}
attrs {
name: "begin_norm_axis"
type: INT
}
}
extra {
attrs {
name: "use_mkldnn"
type: BOOLEAN
}
attrs {
name: "mkldnn_data_type"
type: STRING
}
attrs {
name: "is_test"
type: BOOLEAN
}
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "lstm"
def {
inputs {
name: "Input"
}
inputs {
name: "H0"
}
inputs {
name: "C0"
}
inputs {
name: "Weight"
}
inputs {
name: "Bias"
}
outputs {
name: "Hidden"
}
outputs {
name: "Cell"
}
outputs {
name: "BatchGate"
}
outputs {
name: "BatchCellPreAct"
}
attrs {
name: "use_peepholes"
type: BOOLEAN
}
attrs {
name: "is_reverse"
type: BOOLEAN
}
attrs {
name: "gate_activation"
type: STRING
}
attrs {
name: "cell_activation"
type: STRING
}
attrs {
name: "candidate_activation"
type: STRING
}
}
extra {
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "matmul"
def {
inputs {
name: "X"
}
inputs {
name: "Y"
}
outputs {
name: "Out"
}
attrs {
name: "alpha"
type: FLOAT
}
attrs {
name: "transpose_X"
type: BOOLEAN
}
attrs {
name: "transpose_Y"
type: BOOLEAN
}
}
extra {
attrs {
name: "Scale_out"
type: FLOAT
}
attrs {
name: "Scale_x"
type: FLOAT
}
attrs {
name: "Scale_y"
type: FLOAT
}
attrs {
name: "use_mkldnn"
type: BOOLEAN
}
attrs {
name: "mkldnn_data_type"
type: STRING
}
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
attrs {
name: "use_quantizer"
type: BOOLEAN
}
attrs {
name: "force_fp32_output"
type: BOOLEAN
}
attrs {
name: "fused_reshape_Out"
type: INTS
}
attrs {
name: "fused_reshape_X"
type: INTS
}
attrs {
name: "fused_reshape_Y"
type: INTS
}
attrs {
name: "fused_transpose_Out"
type: INTS
}
attrs {
name: "fused_transpose_X"
type: INTS
}
attrs {
name: "fused_transpose_Y"
type: INTS
}
}
type: "matmul_v2"
def {
inputs {
name: "X"
}
inputs {
name: "Y"
}
outputs {
name: "Out"
}
attrs {
name: "trans_x"
type: BOOLEAN
}
attrs {
name: "trans_y"
type: BOOLEAN
}
}
extra {
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "pool2d"
def {
inputs {
name: "X"
}
outputs {
name: "Out"
}
attrs {
name: "pooling_type"
type: STRING
}
attrs {
name: "ksize"
type: INTS
}
attrs {
name: "global_pooling"
type: BOOLEAN
}
attrs {
name: "strides"
type: INTS
}
attrs {
name: "paddings"
type: INTS
}
attrs {
name: "exclusive"
type: BOOLEAN
}
attrs {
name: "adaptive"
type: BOOLEAN
}
attrs {
name: "ceil_mode"
type: BOOLEAN
}
attrs {
name: "data_format"
type: STRING
}
attrs {
name: "padding_algorithm"
type: STRING
}
}
extra {
attrs {
name: "is_test"
type: BOOLEAN
}
attrs {
name: "use_cudnn"
type: BOOLEAN
}
attrs {
name: "use_mkldnn"
type: BOOLEAN
}
attrs {
name: "use_quantizer"
type: BOOLEAN
}
attrs {
name: "mkldnn_data_type"
type: STRING
}
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "reshape2"
def {
inputs {
name: "X"
}
outputs {
name: "Out"
}
attrs {
name: "shape"
type: INTS
}
}
extra {
inputs {
name: "Shape"
}
inputs {
name: "ShapeTensor"
}
outputs {
name: "XShape"
}
attrs {
name: "use_quantizer"
type: BOOLEAN
}
attrs {
name: "mkldnn_data_type"
type: STRING
}
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "scale"
def {
inputs {
name: "X"
}
outputs {
name: "Out"
}
attrs {
name: "bias"
type: FLOAT
}
attrs {
name: "scale"
type: FLOAT
}
attrs {
name: "bias_after_scale"
type: BOOLEAN
}
}
extra {
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "softmax"
def {
inputs {
name: "X"
}
outputs {
name: "Out"
}
attrs {
name: "axis"
type: INT
}
attrs {
name: "data_format"
type: STRING
}
}
extra {
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
attrs {
name: "is_test"
type: BOOLEAN
}
attrs {
name: "mkldnn_data_type"
type: STRING
}
attrs {
name: "use_cudnn"
type: BOOLEAN
}
attrs {
name: "use_mkldnn"
type: BOOLEAN
}
}
type: "squeeze2"
def {
inputs {
name: "X"
}
outputs {
name: "Out"
}
outputs {
name: "XShape"
}
attrs {
name: "axes"
type: INTS
}
}
extra {
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "transpose"
def {
inputs {
name: "X"
}
outputs {
name: "Out"
}
attrs {
name: "axis"
type: INTS
}
attrs {
name: "data_format"
type: STRING
}
}
extra {
attrs {
name: "use_mkldnn"
type: BOOLEAN
}
attrs {
name: "use_quantizer"
type: BOOLEAN
}
attrs {
name: "mkldnn_data_type"
type: STRING
}
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "transpose"
def {
inputs {
name: "X"
}
outputs {
name: "Out"
}
attrs {
name: "axis"
type: INTS
}
attrs {
name: "data_format"
type: STRING
}
}
extra {
outputs {
name: "XShape"
}
attrs {
name: "use_mkldnn"
type: BOOLEAN
}
attrs {
name: "use_quantizer"
type: BOOLEAN
}
attrs {
name: "mkldnn_data_type"
type: STRING
}
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "unsqueeze2"
def {
inputs {
name: "X"
}
inputs {
name: "AxesTensor"
}
inputs {
name: "AxesTensorList"
}
outputs {
name: "Out"
}
outputs {
name: "XShape"
}
attrs {
name: "axes"
type: INTS
}
}
extra {
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
#include "paddle/fluid/operators/detection/yolo_box_op.h" #include "paddle/fluid/operators/detection/yolo_box_op.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -31,19 +32,44 @@ class YoloBoxOp : public framework::OperatorWithKernel { ...@@ -31,19 +32,44 @@ class YoloBoxOp : public framework::OperatorWithKernel {
auto anchors = ctx->Attrs().Get<std::vector<int>>("anchors"); auto anchors = ctx->Attrs().Get<std::vector<int>>("anchors");
int anchor_num = anchors.size() / 2; int anchor_num = anchors.size() / 2;
auto class_num = ctx->Attrs().Get<int>("class_num"); auto class_num = ctx->Attrs().Get<int>("class_num");
auto iou_aware = ctx->Attrs().Get<bool>("iou_aware");
auto iou_aware_factor = ctx->Attrs().Get<float>("iou_aware_factor");
PADDLE_ENFORCE_EQ(dim_x.size(), 4, platform::errors::InvalidArgument( PADDLE_ENFORCE_EQ(dim_x.size(), 4, platform::errors::InvalidArgument(
"Input(X) should be a 4-D tensor." "Input(X) should be a 4-D tensor."
"But received X dimension(%s)", "But received X dimension(%s)",
dim_x.size())); dim_x.size()));
PADDLE_ENFORCE_EQ( if (iou_aware) {
dim_x[1], anchor_num * (5 + class_num), PADDLE_ENFORCE_EQ(
platform::errors::InvalidArgument( dim_x[1], anchor_num * (6 + class_num),
"Input(X) dim[1] should be equal to (anchor_mask_number * (5 " platform::errors::InvalidArgument(
"+ class_num))." "Input(X) dim[1] should be equal to (anchor_mask_number * (6 "
"But received dim[1](%s) != (anchor_mask_number * " "+ class_num)) while iou_aware is true."
"(5+class_num)(%s).", "But received dim[1](%s) != (anchor_mask_number * "
dim_x[1], anchor_num * (5 + class_num))); "(6+class_num)(%s).",
dim_x[1], anchor_num * (6 + class_num)));
PADDLE_ENFORCE_GE(
iou_aware_factor, 0,
platform::errors::InvalidArgument(
"Attr(iou_aware_factor) should be greater than or equal to 0. "
"But received iou_aware_factor (%s)",
iou_aware_factor));
PADDLE_ENFORCE_LE(
iou_aware_factor, 1,
platform::errors::InvalidArgument(
"Attr(iou_aware_factor) should be less than or equal to 1. "
"But received iou_aware_factor (%s)",
iou_aware_factor));
} else {
PADDLE_ENFORCE_EQ(
dim_x[1], anchor_num * (5 + class_num),
platform::errors::InvalidArgument(
"Input(X) dim[1] should be equal to (anchor_mask_number * (5 "
"+ class_num))."
"But received dim[1](%s) != (anchor_mask_number * "
"(5+class_num)(%s).",
dim_x[1], anchor_num * (5 + class_num)));
}
PADDLE_ENFORCE_EQ(dim_imgsize.size(), 2, PADDLE_ENFORCE_EQ(dim_imgsize.size(), 2,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Input(ImgSize) should be a 2-D tensor." "Input(ImgSize) should be a 2-D tensor."
...@@ -140,6 +166,10 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -140,6 +166,10 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker {
"Scale the center point of decoded bounding " "Scale the center point of decoded bounding "
"box. Default 1.0") "box. Default 1.0")
.SetDefault(1.); .SetDefault(1.);
AddAttr<bool>("iou_aware", "Whether use iou aware. Default false.")
.SetDefault(false);
AddAttr<float>("iou_aware_factor", "iou aware factor. Default 0.5.")
.SetDefault(0.5);
AddComment(R"DOC( AddComment(R"DOC(
This operator generates YOLO detection boxes from output of YOLOv3 network. This operator generates YOLO detection boxes from output of YOLOv3 network.
...@@ -147,7 +177,8 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -147,7 +177,8 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker {
should be the same, H and W specify the grid size, each grid point predict should be the same, H and W specify the grid size, each grid point predict
given number boxes, this given number, which following will be represented as S, given number boxes, this given number, which following will be represented as S,
is specified by the number of anchors. In the second dimension(the channel is specified by the number of anchors. In the second dimension(the channel
dimension), C should be equal to S * (5 + class_num), class_num is the object dimension), C should be equal to S * (5 + class_num) if :attr:`iou_aware` is false,
otherwise C should be equal to S * (6 + class_num). class_num is the object
category number of source dataset(such as 80 in coco dataset), so the category number of source dataset(such as 80 in coco dataset), so the
second(channel) dimension, apart from 4 box location coordinates x, y, w, h, second(channel) dimension, apart from 4 box location coordinates x, y, w, h,
also includes confidence score of the box and class one-hot key of each anchor also includes confidence score of the box and class one-hot key of each anchor
...@@ -183,6 +214,15 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -183,6 +214,15 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker {
score_{pred} = score_{conf} * score_{class} score_{pred} = score_{conf} * score_{class}
$$ $$
where the confidence scores follow the formula below
.. math::
score_{conf} = \begin{cases}
obj, & \text{if } iou\_aware == false \\
obj^{1 - iou\_aware\_factor} \cdot iou^{iou\_aware\_factor}, & \text{otherwise}
\end{cases}
)DOC"); )DOC");
} }
}; };
...@@ -197,3 +237,12 @@ REGISTER_OPERATOR( ...@@ -197,3 +237,12 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>); paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(yolo_box, ops::YoloBoxKernel<float>, REGISTER_OP_CPU_KERNEL(yolo_box, ops::YoloBoxKernel<float>,
ops::YoloBoxKernel<double>); ops::YoloBoxKernel<double>);
REGISTER_OP_VERSION(yolo_box)
.AddCheckpoint(
R"ROC(
Upgrade yolo box to add new attribute [iou_aware, iou_aware_factor].
)ROC",
paddle::framework::compatible::OpVersionDesc()
.NewAttr("iou_aware", "Whether use iou aware", false)
.NewAttr("iou_aware_factor", "iou aware factor", 0.5f));
...@@ -28,7 +28,8 @@ __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, ...@@ -28,7 +28,8 @@ __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes,
const int w, const int an_num, const int class_num, const int w, const int an_num, const int class_num,
const int box_num, int input_size_h, const int box_num, int input_size_h,
int input_size_w, bool clip_bbox, const float scale, int input_size_w, bool clip_bbox, const float scale,
const float bias) { const float bias, bool iou_aware,
const float iou_aware_factor) {
int tid = blockIdx.x * blockDim.x + threadIdx.x; int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x; int stride = blockDim.x * gridDim.x;
T box[4]; T box[4];
...@@ -43,23 +44,29 @@ __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, ...@@ -43,23 +44,29 @@ __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes,
int img_height = imgsize[2 * i]; int img_height = imgsize[2 * i];
int img_width = imgsize[2 * i + 1]; int img_width = imgsize[2 * i + 1];
int obj_idx = int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4,
GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4); iou_aware);
T conf = sigmoid<T>(input[obj_idx]); T conf = sigmoid<T>(input[obj_idx]);
if (iou_aware) {
int iou_idx = GetIoUIndex(i, j, k * w + l, an_num, an_stride, grid_num);
T iou = sigmoid<T>(input[iou_idx]);
conf = pow(conf, static_cast<T>(1. - iou_aware_factor)) *
pow(iou, static_cast<T>(iou_aware_factor));
}
if (conf < conf_thresh) { if (conf < conf_thresh) {
continue; continue;
} }
int box_idx = int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0,
GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0); iou_aware);
GetYoloBox<T>(box, input, anchors, l, k, j, h, w, input_size_h, GetYoloBox<T>(box, input, anchors, l, k, j, h, w, input_size_h,
input_size_w, box_idx, grid_num, img_height, img_width, scale, input_size_w, box_idx, grid_num, img_height, img_width, scale,
bias); bias);
box_idx = (i * box_num + j * grid_num + k * w + l) * 4; box_idx = (i * box_num + j * grid_num + k * w + l) * 4;
CalcDetectionBox<T>(boxes, box, box_idx, img_height, img_width, clip_bbox); CalcDetectionBox<T>(boxes, box, box_idx, img_height, img_width, clip_bbox);
int label_idx = int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num,
GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 5); 5, iou_aware);
int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num;
CalcLabelScore<T>(scores, input, label_idx, score_idx, class_num, conf, CalcLabelScore<T>(scores, input, label_idx, score_idx, class_num, conf,
grid_num); grid_num);
...@@ -80,6 +87,8 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel<T> { ...@@ -80,6 +87,8 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel<T> {
float conf_thresh = ctx.Attr<float>("conf_thresh"); float conf_thresh = ctx.Attr<float>("conf_thresh");
int downsample_ratio = ctx.Attr<int>("downsample_ratio"); int downsample_ratio = ctx.Attr<int>("downsample_ratio");
bool clip_bbox = ctx.Attr<bool>("clip_bbox"); bool clip_bbox = ctx.Attr<bool>("clip_bbox");
bool iou_aware = ctx.Attr<bool>("iou_aware");
float iou_aware_factor = ctx.Attr<float>("iou_aware_factor");
float scale = ctx.Attr<float>("scale_x_y"); float scale = ctx.Attr<float>("scale_x_y");
float bias = -0.5 * (scale - 1.); float bias = -0.5 * (scale - 1.);
...@@ -111,11 +120,18 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel<T> { ...@@ -111,11 +120,18 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel<T> {
platform::GpuLaunchConfig config = platform::GpuLaunchConfig config =
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), n * box_num); platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), n * box_num);
KeYoloBoxFw<T><<<config.block_per_grid, config.thread_per_block, 0, dim3 thread_num = config.thread_per_block;
#ifdef WITH_NV_JETSON
if (config.compute_capability == 53 || config.compute_capability == 62) {
thread_num = 512;
}
#endif
KeYoloBoxFw<T><<<config.block_per_grid, thread_num, 0,
ctx.cuda_device_context().stream()>>>( ctx.cuda_device_context().stream()>>>(
input_data, imgsize_data, boxes_data, scores_data, conf_thresh, input_data, imgsize_data, boxes_data, scores_data, conf_thresh,
anchors_data, n, h, w, an_num, class_num, box_num, input_size_h, anchors_data, n, h, w, an_num, class_num, box_num, input_size_h,
input_size_w, clip_bbox, scale, bias); input_size_w, clip_bbox, scale, bias, iou_aware, iou_aware_factor);
} }
}; };
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/platform/hostdevice.h"
namespace paddle { namespace paddle {
...@@ -43,8 +44,19 @@ HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i, ...@@ -43,8 +44,19 @@ HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i,
HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx, HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx,
int an_num, int an_stride, int stride, int an_num, int an_stride, int stride,
int entry) { int entry, bool iou_aware) {
return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; if (iou_aware) {
return (batch * an_num + an_idx) * an_stride +
(batch * an_num + an_num + entry) * stride + hw_idx;
} else {
return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx;
}
}
HOSTDEVICE inline int GetIoUIndex(int batch, int an_idx, int hw_idx, int an_num,
int an_stride, int stride) {
return batch * an_num * an_stride + (batch * an_num + an_idx) * stride +
hw_idx;
} }
template <typename T> template <typename T>
...@@ -92,6 +104,8 @@ class YoloBoxKernel : public framework::OpKernel<T> { ...@@ -92,6 +104,8 @@ class YoloBoxKernel : public framework::OpKernel<T> {
float conf_thresh = ctx.Attr<float>("conf_thresh"); float conf_thresh = ctx.Attr<float>("conf_thresh");
int downsample_ratio = ctx.Attr<int>("downsample_ratio"); int downsample_ratio = ctx.Attr<int>("downsample_ratio");
bool clip_bbox = ctx.Attr<bool>("clip_bbox"); bool clip_bbox = ctx.Attr<bool>("clip_bbox");
bool iou_aware = ctx.Attr<bool>("iou_aware");
float iou_aware_factor = ctx.Attr<float>("iou_aware_factor");
float scale = ctx.Attr<float>("scale_x_y"); float scale = ctx.Attr<float>("scale_x_y");
float bias = -0.5 * (scale - 1.); float bias = -0.5 * (scale - 1.);
...@@ -127,15 +141,22 @@ class YoloBoxKernel : public framework::OpKernel<T> { ...@@ -127,15 +141,22 @@ class YoloBoxKernel : public framework::OpKernel<T> {
for (int j = 0; j < an_num; j++) { for (int j = 0; j < an_num; j++) {
for (int k = 0; k < h; k++) { for (int k = 0; k < h; k++) {
for (int l = 0; l < w; l++) { for (int l = 0; l < w; l++) {
int obj_idx = int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride,
GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 4); stride, 4, iou_aware);
T conf = sigmoid<T>(input_data[obj_idx]); T conf = sigmoid<T>(input_data[obj_idx]);
if (iou_aware) {
int iou_idx =
GetIoUIndex(i, j, k * w + l, an_num, an_stride, stride);
T iou = sigmoid<T>(input_data[iou_idx]);
conf = pow(conf, static_cast<T>(1. - iou_aware_factor)) *
pow(iou, static_cast<T>(iou_aware_factor));
}
if (conf < conf_thresh) { if (conf < conf_thresh) {
continue; continue;
} }
int box_idx = int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride,
GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 0); stride, 0, iou_aware);
GetYoloBox<T>(box, input_data, anchors_data, l, k, j, h, w, GetYoloBox<T>(box, input_data, anchors_data, l, k, j, h, w,
input_size_h, input_size_w, box_idx, stride, input_size_h, input_size_w, box_idx, stride,
img_height, img_width, scale, bias); img_height, img_width, scale, bias);
...@@ -143,8 +164,8 @@ class YoloBoxKernel : public framework::OpKernel<T> { ...@@ -143,8 +164,8 @@ class YoloBoxKernel : public framework::OpKernel<T> {
CalcDetectionBox<T>(boxes_data, box, box_idx, img_height, img_width, CalcDetectionBox<T>(boxes_data, box, box_idx, img_height, img_width,
clip_bbox); clip_bbox);
int label_idx = int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride,
GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 5); stride, 5, iou_aware);
int score_idx = (i * box_num + j * stride + k * w + l) * class_num; int score_idx = (i * box_num + j * stride + k * w + l) * class_num;
CalcLabelScore<T>(scores_data, input_data, label_idx, score_idx, CalcLabelScore<T>(scores_data, input_data, label_idx, score_idx,
class_num, conf, stride); class_num, conf, stride);
......
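The two index helpers changed above encode the channel layout for iou-aware input: per batch, the an_num IoU maps come first, followed by an_num blocks of (5 + class_num) entry maps. A self-contained sketch with illustrative sizes (the toy config is an assumption; the index formulas are copied verbatim from GetEntryIndex and GetIoUIndex):
#include <cassert>
int main() {
  const int an_num = 3, class_num = 80, stride = 19 * 19;  // assumed sizes
  const int an_stride = (5 + class_num) * stride;
  // Copied from GetEntryIndex (iou_aware branch) and GetIoUIndex above.
  auto entry_idx = [&](int batch, int an_idx, int hw, int entry) {
    return (batch * an_num + an_idx) * an_stride +
           (batch * an_num + an_num + entry) * stride + hw;
  };
  auto iou_idx = [&](int batch, int an_idx, int hw) {
    return batch * an_num * an_stride + (batch * an_num + an_idx) * stride +
           hw;
  };
  // Batch 0: anchor 0's objectness map (entry == 4) starts right after the
  // an_num leading IoU maps plus 4 entry maps.
  assert(entry_idx(0, 0, 0, 4) == (an_num + 4) * stride);
  // Batch 0: anchor 1's IoU map is the second of the leading IoU maps.
  assert(iou_idx(0, 1, 0) == 1 * stride);
  return 0;
}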
...@@ -209,6 +209,73 @@ __global__ void LayerNormForward(const T *x, const U *scale, const U *bias, ...@@ -209,6 +209,73 @@ __global__ void LayerNormForward(const T *x, const U *scale, const U *bias,
} }
} }
template <typename T, typename U, int BlockDim>
__global__ void LayerNormForwardFP16(const T *x, const U *scale, const U *bias,
T *y, U *mean, U *var, float epsilon,
int feature_size) {
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
using BlockReduce = cub::BlockReduce<PairForLayerNorm<U>, BlockDim>;
__shared__ typename BlockReduce::TempStorage temp_storage;
__shared__ U mean_share;
__shared__ U var_share;
int beg_idx = blockIdx.x * feature_size + threadIdx.x;
int end_idx = (blockIdx.x + 1) * feature_size;
// Step 1: Reduce to calculate mean and var
U mean_val = 0;
U var_val = 0;
for (int i = beg_idx; i < end_idx; i += BlockDim) {
U tmp = static_cast<U>(x[i]);
mean_val += tmp;
var_val += (tmp * tmp);
}
auto pair = BlockReduce(temp_storage)
.Reduce(PairForLayerNorm<U>(mean_val, var_val),
PairForLayerNormAddFunctor<U>());
if (threadIdx.x == 0) {
auto tmp = pair.first_ / static_cast<U>(feature_size);
mean[blockIdx.x] = mean_share = static_cast<U>(tmp);
var[blockIdx.x] = var_share =
static_cast<U>(pair.second_ / static_cast<U>(feature_size) - tmp * tmp);
}
__syncthreads();
mean_val = mean_share;
U invvar = rsqrt_<U>(var_share + static_cast<U>(epsilon));
// Step 2: Calculate y
if (scale != nullptr) {
if (bias != nullptr) {
for (int i = beg_idx, j = threadIdx.x; i < end_idx;
i += BlockDim, j += BlockDim) {
y[i] = static_cast<T>(
scale[j] * (static_cast<U>(x[i]) - mean_val) * invvar + bias[j]);
}
} else {
for (int i = beg_idx, j = threadIdx.x; i < end_idx;
i += BlockDim, j += BlockDim) {
y[i] = static_cast<T>(scale[j] * (static_cast<U>(x[i]) - mean_val) *
invvar);
}
}
} else { // scale == nullptr
if (bias != nullptr) {
for (int i = beg_idx, j = threadIdx.x; i < end_idx;
i += BlockDim, j += BlockDim) {
y[i] = static_cast<T>((static_cast<U>(x[i]) - mean_val) * invvar +
bias[j]);
}
} else {
for (int i = beg_idx, j = threadIdx.x; i < end_idx;
i += BlockDim, j += BlockDim) {
y[i] = static_cast<T>((static_cast<U>(x[i]) - mean_val) * invvar);
}
}
}
#endif
}
template <typename T, typename U, int VPT> template <typename T, typename U, int VPT>
__inline__ __device__ void cuLoadAddStridedInputs( __inline__ __device__ void cuLoadAddStridedInputs(
const int i1_block, const int thr_load_row_off, const int thr_load_col_off, const int i1_block, const int thr_load_row_off, const int thr_load_col_off,
...@@ -872,6 +939,28 @@ void LayerNormDirectCUDAFunctor<T>::operator()(gpuStream_t stream, ...@@ -872,6 +939,28 @@ void LayerNormDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
} }
} }
template <>
void LayerNormDirectCUDAFunctor<half>::operator()(
gpuStream_t stream, const half *input, std::vector<int> input_shape,
const half *bias, const half *scale, half *output, half *mean,
half *variance, int begin_norm_axis, float eps) {
const auto x_dims = framework::make_ddim(input_shape);
auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
int batch_size = static_cast<int>(matrix_dim[0]);
int feature_size = static_cast<int>(matrix_dim[1]);
switch (GetDesiredBlockDim(feature_size)) {
FIXED_BLOCK_DIM_CASE(
LayerNormForwardFP16<half, half,
kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
input, scale, bias, output, mean, variance, eps, feature_size));
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Product from begin_norm_axis to end in layer_norm must be larger "
"than 1"));
break;
}
}
template <typename T> template <typename T>
class LayerNormKernel<platform::CUDADeviceContext, T> class LayerNormKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> { : public framework::OpKernel<T> {
...@@ -961,6 +1050,9 @@ class LayerNormGradKernel<platform::CUDADeviceContext, T> ...@@ -961,6 +1050,9 @@ class LayerNormGradKernel<platform::CUDADeviceContext, T>
}; };
template class LayerNormDirectCUDAFunctor<float>; template class LayerNormDirectCUDAFunctor<float>;
#ifdef TRT_PLUGIN_FP16_AVALIABLE
template class LayerNormDirectCUDAFunctor<half>;
#endif
#undef FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE_BASE #undef FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE_BASE
#undef FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE #undef FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE
......
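For reference, the per-row statistics the FP16 forward kernel above computes over the flattened [batch_size, feature_size] matrix; this restates the code, using the single-pass E[x^2] - E[x]^2 form of the variance that the block reduction produces:
\mu = \frac{1}{F}\sum_{i=1}^{F} x_i, \qquad
\sigma^2 = \frac{1}{F}\sum_{i=1}^{F} x_i^2 - \mu^2, \qquad
y_i = \gamma_i \cdot \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta_i
with gamma = scale and beta = bias, each of which may be null and is then dropped from the formula, exactly as in the four branches of step 2.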
...@@ -62,13 +62,22 @@ void RunPyObject(py::object *py_object, ...@@ -62,13 +62,22 @@ void RunPyObject(py::object *py_object,
for (size_t i = 0; i < result_tuple.size(); i++) { for (size_t i = 0; i < result_tuple.size(); i++) {
if ((*outs)[i] != nullptr) { if ((*outs)[i] != nullptr) {
if (Py_None != result_tuple[i].ptr()) { if (Py_None != result_tuple[i].ptr()) {
try { if (py::isinstance<imperative::VarBase>(result_tuple[i])) {
auto result_var = try {
result_tuple[i].cast<std::shared_ptr<imperative::VarBase>>(); auto result_var =
*(*outs)[i] = result_var->Var(); result_tuple[i].cast<std::shared_ptr<imperative::VarBase>>();
} catch (py::cast_error &) { *(*outs)[i] = result_var->Var();
} catch (py::cast_error &) {
PADDLE_THROW(platform::errors::InvalidArgument(
"The `PyLayer.backward` function returns invalid argument, "
"the `%s` type argument can not be cast into `Tensor`.",
result_tuple[i].ptr()->ob_type->tp_name));
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
"The output of `PyLayer.backward` should be `Tensor`.")); "The output of `PyLayer.backward` should be `Tensor`, but "
"received `%s`.",
result_tuple[i].ptr()->ob_type->tp_name));
} }
} else { } else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
...@@ -94,13 +103,22 @@ void RunPyObject(py::object *py_object, ...@@ -94,13 +103,22 @@ void RunPyObject(py::object *py_object,
} }
if ((*outs)[0] != nullptr) { if ((*outs)[0] != nullptr) {
if (Py_None != py_result.ptr()) { if (Py_None != py_result.ptr()) {
try { if (py::isinstance<imperative::VarBase>(py_result)) {
auto result_var = try {
py_result.cast<std::shared_ptr<imperative::VarBase>>(); auto result_var =
*((*outs)[0]) = result_var->Var(); py_result.cast<std::shared_ptr<imperative::VarBase>>();
} catch (py::cast_error &) { *((*outs)[0]) = result_var->Var();
} catch (py::cast_error &) {
PADDLE_THROW(platform::errors::InvalidArgument(
"The `PyLayer.backward` function returns invalid argument, the "
"`%s` type argument can not be cast into `Tensor`.",
py_result.ptr()->ob_type->tp_name));
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
"The output of `PyLayer.backward` should be `Tensor`.")); "The output of `PyLayer.backward` should be `Tensor`, but "
"received `%s`",
py_result.ptr()->ob_type->tp_name));
} }
} else { } else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
......
...@@ -68,7 +68,6 @@ BufferedReader::BufferedReader( ...@@ -68,7 +68,6 @@ BufferedReader::BufferedReader(
stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx); stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx);
} }
#endif #endif
is_same_place_ = false;
cpu_buffer_.resize(buffer_size); cpu_buffer_.resize(buffer_size);
cuda_buffer_.resize(buffer_size); cuda_buffer_.resize(buffer_size);
npu_buffer_.resize(buffer_size); npu_buffer_.resize(buffer_size);
...@@ -116,7 +115,7 @@ void BufferedReader::ReadAsync(size_t i) { ...@@ -116,7 +115,7 @@ void BufferedReader::ReadAsync(size_t i) {
std::vector<void *> cuda_pinned_ptrs; std::vector<void *> cuda_pinned_ptrs;
cuda_pinned_ptrs.reserve(cpu.size()); cuda_pinned_ptrs.resize(cpu.size());
platform::RecordEvent record_event("BufferedReader:MemoryCopy"); platform::RecordEvent record_event("BufferedReader:MemoryCopy");
// NODE(chenwehiang): When we use CUDAPinned Memory, we need call // NOTE(chenweihang): When we use CUDAPinned Memory, we need to call
// cudaHostAlloc, that is a CUDA API, calling CUDA API need load // cudaHostAlloc, that is a CUDA API, calling CUDA API need load
// cuda lib into device, it will cost hundreds of MB of GPU memory. // cuda lib into device, it will cost hundreds of MB of GPU memory.
// If we don't set Device here, which will use CUDAPlace(0) default. // If we don't set Device here, which will use CUDAPlace(0) default.
...@@ -126,18 +125,21 @@ void BufferedReader::ReadAsync(size_t i) { ...@@ -126,18 +125,21 @@ void BufferedReader::ReadAsync(size_t i) {
if (platform::is_cpu_place(cpu[i].place())) { if (platform::is_cpu_place(cpu[i].place())) {
cuda[i].Resize(cpu[i].dims()); cuda[i].Resize(cpu[i].dims());
cuda[i].set_layout(cpu[i].layout()); cuda[i].set_layout(cpu[i].layout());
cuda_pinned_ptrs.emplace_back( cuda_pinned_ptrs[i] =
cuda[i].mutable_data(cuda_pinned_place, cpu[i].type())); cuda[i].mutable_data(cuda_pinned_place, cpu[i].type());
auto size = auto size =
cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type());
memory::Copy(cuda_pinned_place, cuda_pinned_ptrs[i], memory::Copy(cuda_pinned_place, cuda_pinned_ptrs[i],
BOOST_GET_CONST(platform::CPUPlace, cpu[i].place()), BOOST_GET_CONST(platform::CPUPlace, cpu[i].place()),
cpu[i].data<void>(), size); cpu[i].data<void>(), size);
cuda[i].set_lod(cpu[i].lod()); cuda[i].set_lod(cpu[i].lod());
} else { } else {
// we set same place flag & use cpu[i] directly // Here cpu[i]'s place may be CUDAPlace, CUDAPinnedPlace, or
is_same_place_ = true; // another place; we don't copy its memory to CUDAPinnedPlace,
// but share the tensor data with cuda[i] instead
cuda[i].ShareDataWith(cpu[i]);
} }
} }
} else { } else {
...@@ -296,9 +298,9 @@ void BufferedReader::ReadNextImpl(std::vector<framework::LoDTensor> *out) { ...@@ -296,9 +298,9 @@ void BufferedReader::ReadNextImpl(std::vector<framework::LoDTensor> *out) {
return; return;
} }
if (platform::is_gpu_place(place_) && !is_same_place_) { if (platform::is_gpu_place(place_)) {
*out = std::move(cuda_buffer_[i]); *out = std::move(cuda_buffer_[i]);
} else if (platform::is_npu_place(place_) && !is_same_place_) { } else if (platform::is_npu_place(place_)) {
*out = std::move(npu_buffer_[i]); *out = std::move(npu_buffer_[i]);
} else { } else {
*out = std::move(cpu_buffer_[i]); *out = std::move(cpu_buffer_[i]);
......
...@@ -67,7 +67,6 @@ class BufferedReader : public framework::DecoratedReader { ...@@ -67,7 +67,6 @@ class BufferedReader : public framework::DecoratedReader {
// buffer, just read async and create futures as buffer size. However, to // buffer, just read async and create futures as buffer size. However, to
// malloc tensors every time is extremely slow. Here we store all data in // malloc tensors every time is extremely slow. Here we store all data in
// buffers and prevent alloc every time. // buffers and prevent alloc every time.
bool is_same_place_;
std::vector<TensorVec> cpu_buffer_; std::vector<TensorVec> cpu_buffer_;
std::vector<TensorVec> cuda_buffer_; std::vector<TensorVec> cuda_buffer_;
std::vector<TensorVec> npu_buffer_; std::vector<TensorVec> npu_buffer_;
......
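The diff above removes the is_same_place_ bookkeeping: plain CPU inputs are copied into pinned memory, while inputs already on a device place are aliased into cuda_buffer_ via ShareDataWith, so ReadNextImpl can always hand out cuda_buffer_[i] for GPU places. A minimal C++ sketch of that copy-versus-share decision (Place and Buffer are toy stand-ins for the Paddle types, not the real API):

#include <cassert>
#include <memory>
#include <vector>

enum class Place { kCPU, kCUDAPinned, kCUDA };  // toy stand-in for platform::Place

struct Buffer {
  Place place;
  std::shared_ptr<std::vector<float>> data;
};

// Copy plain CPU data into pinned memory; otherwise alias the storage,
// mirroring cuda[i].ShareDataWith(cpu[i]) -- no is_same_place_ flag needed.
Buffer ToPinned(const Buffer& src) {
  Buffer dst;
  if (src.place == Place::kCPU) {
    dst.place = Place::kCUDAPinned;
    dst.data = std::make_shared<std::vector<float>>(*src.data);  // deep copy
  } else {
    dst = src;  // share, do not copy
  }
  return dst;
}

int main() {
  Buffer cpu{Place::kCPU, std::make_shared<std::vector<float>>(4, 1.f)};
  assert(ToPinned(cpu).data != cpu.data);  // copied
  Buffer gpu{Place::kCUDA, cpu.data};
  assert(ToPinned(gpu).data == gpu.data);  // shared
}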
...@@ -124,8 +124,10 @@ __global__ void GPUROIAlignForward( ...@@ -124,8 +124,10 @@ __global__ void GPUROIAlignForward(
T roi_width = roi_xmax - roi_xmin; T roi_width = roi_xmax - roi_xmin;
T roi_height = roi_ymax - roi_ymin; T roi_height = roi_ymax - roi_ymin;
roi_width = max(roi_width, static_cast<T>(1.)); if (!continuous_coordinate) {
roi_height = max(roi_height, static_cast<T>(1.)); roi_width = max(roi_width, static_cast<T>(1.));
roi_height = max(roi_height, static_cast<T>(1.));
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height); T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width); T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
...@@ -138,7 +140,7 @@ __global__ void GPUROIAlignForward( ...@@ -138,7 +140,7 @@ __global__ void GPUROIAlignForward(
: ceil(roi_height / pooled_height); : ceil(roi_height / pooled_height);
int roi_bin_grid_w = int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
const T count = roi_bin_grid_h * roi_bin_grid_w; const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);
T output_val = 0; T output_val = 0;
for (int iy = 0; iy < roi_bin_grid_h; iy++) { for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_ymin + ph * bin_size_h + const T y = roi_ymin + ph * bin_size_h +
...@@ -180,9 +182,10 @@ __global__ void GPUROIAlignBackward( ...@@ -180,9 +182,10 @@ __global__ void GPUROIAlignBackward(
T roi_width = roi_xmax - roi_xmin; T roi_width = roi_xmax - roi_xmin;
T roi_height = roi_ymax - roi_ymin; T roi_height = roi_ymax - roi_ymin;
roi_width = max(roi_width, static_cast<T>(1.)); if (!continuous_coordinate) {
roi_height = max(roi_height, static_cast<T>(1.)); roi_width = max(roi_width, static_cast<T>(1.));
roi_height = max(roi_height, static_cast<T>(1.));
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height); T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width); T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
......
...@@ -226,8 +226,10 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> { ...@@ -226,8 +226,10 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> {
T roi_width = roi_xmax - roi_xmin; T roi_width = roi_xmax - roi_xmin;
T roi_height = roi_ymax - roi_ymin; T roi_height = roi_ymax - roi_ymin;
roi_width = std::max(roi_width, static_cast<T>(1.)); if (!aligned) {
roi_height = std::max(roi_height, static_cast<T>(1.)); roi_width = std::max(roi_width, static_cast<T>(1.));
roi_height = std::max(roi_height, static_cast<T>(1.));
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height); T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width); T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
...@@ -239,7 +241,7 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> { ...@@ -239,7 +241,7 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> {
int roi_bin_grid_w = (sampling_ratio > 0) int roi_bin_grid_w = (sampling_ratio > 0)
? sampling_ratio ? sampling_ratio
: ceil(roi_width / pooled_width); : ceil(roi_width / pooled_width);
const T count = roi_bin_grid_h * roi_bin_grid_w; const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1);
Tensor pre_pos; Tensor pre_pos;
Tensor pre_w; Tensor pre_w;
int pre_size = count * out_stride[1]; int pre_size = count * out_stride[1];
...@@ -362,6 +364,10 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel<T> { ...@@ -362,6 +364,10 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel<T> {
T roi_height = roi_ymax - roi_ymin; T roi_height = roi_ymax - roi_ymin;
roi_width = std::max(roi_width, static_cast<T>(1.)); roi_width = std::max(roi_width, static_cast<T>(1.));
roi_height = std::max(roi_height, static_cast<T>(1.)); roi_height = std::max(roi_height, static_cast<T>(1.));
if (!aligned) {
roi_width = std::max(roi_width, static_cast<T>(1.));
roi_height = std::max(roi_height, static_cast<T>(1.));
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height); T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width); T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
......
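With the new aligned/continuous_coordinate convention, ROI width and height are no longer clamped to 1, so a degenerate ROI (xmax == xmin) can yield a 0-by-0 sampling grid; clamping count to at least 1 keeps the later averaging output_val / count well defined. A standalone sketch of the arithmetic (the ROI values are assumed inputs, not taken from the kernels above):

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  // A degenerate ROI (xmax == xmin) is legal once aligned=true skips the
  // clamp-to-1 of roi_width/roi_height.
  const float roi_width = 0.f, roi_height = 0.f;
  const int pooled_height = 2, pooled_width = 2, sampling_ratio = 0;
  const int roi_bin_grid_h =
      (sampling_ratio > 0)
          ? sampling_ratio
          : static_cast<int>(std::ceil(roi_height / pooled_height));  // 0
  const int roi_bin_grid_w =
      (sampling_ratio > 0)
          ? sampling_ratio
          : static_cast<int>(std::ceil(roi_width / pooled_width));  // 0
  // Without the max(), count would be 0 and output_val / count would
  // divide by zero.
  const int count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1);
  std::printf("grid = %d x %d, count = %d\n", roi_bin_grid_h, roi_bin_grid_w,
              count);
}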
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/assign_value_op.h" #include "paddle/fluid/operators/assign_value_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/slice_utils.h"
#include "paddle/fluid/operators/utils.h" #include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -59,106 +60,6 @@ inline std::string GetValueName(framework::proto::VarType::Type data_type) { ...@@ -59,106 +60,6 @@ inline std::string GetValueName(framework::proto::VarType::Type data_type) {
return value_name; return value_name;
} }
inline void CheckAndUpdateSlice(const framework::DDim in_dims,
const std::vector<int64_t> axes,
std::vector<int64_t>* starts,
std::vector<int64_t>* ends,
std::vector<int64_t>* steps) {
for (size_t i = 0; i < axes.size(); ++i) {
int64_t axis = axes[i];
int64_t dim_value = in_dims[axis];
int64_t start =
(*starts)[i] < 0 ? ((*starts)[i] + dim_value) : (*starts)[i];
int64_t end = (*ends)[i] < 0 ? ((*ends)[i] + dim_value) : (*ends)[i];
start = std::max(start, static_cast<int64_t>(0));
end = std::min(end, dim_value);
int64_t step = (*steps)[i];
PADDLE_ENFORCE_NE(
step, 0, platform::errors::InvalidArgument(
"Step should not be 0, but received step = %d.", step));
if (step > 0) {
start = std::min(start, dim_value);
end = std::max(end, static_cast<int64_t>(0));
PADDLE_ENFORCE_GT(
end, start,
platform::errors::InvalidArgument(
"When step > 0, end should be greater than start, but "
"received end = %d, start = %d.",
end, start));
} else {
// NOTE(liym27): When step < 0, start should less and equal to dim_value-1
// "end is -1" means contain the 0-th element of this axis.
start = std::min(start, dim_value - 1);
end = std::max(end, static_cast<int64_t>(-1));
PADDLE_ENFORCE_GT(
start, end,
platform::errors::InvalidArgument(
"When step < 0, start should be greater than end, but "
"received start = %d, end = %d.",
start, end));
}
(*starts)[i] = start;
(*ends)[i] = end;
}
}
inline framework::DDim GetSliceDims(const framework::DDim in_dims,
const std::vector<int64_t>& axes,
const std::vector<int64_t>& starts,
const std::vector<int64_t>& ends,
const std::vector<int64_t>& steps) {
framework::DDim slice_dims(in_dims);
for (size_t i = 0; i < axes.size(); ++i) {
int64_t axis = axes[i];
int64_t start = starts[i];
int64_t end = ends[i];
int64_t step = steps[i];
if (step > 0) {
slice_dims[axis] = (end - start + step - 1) / step;
} else {
slice_dims[axis] = (end - start + step + 1) / step;
}
}
return slice_dims;
}
inline framework::DDim GetDecreasedDims(
const framework::DDim slice_dims,
const std::vector<int64_t>& decrease_axes) {
// Get dims after decreasing axes.
framework::DDim decreased_dims(slice_dims);
if (decrease_axes.size() > 0) {
for (size_t i = 0; i < decrease_axes.size(); ++i) {
int64_t axis = decrease_axes[i];
PADDLE_ENFORCE_EQ(
decreased_dims[axis], 1,
platform::errors::InvalidArgument("decrease dim should be 1"));
decreased_dims[axis] = 0;
}
std::vector<int64_t> new_shape;
for (int i = 0; i < decreased_dims.size(); ++i) {
if (decreased_dims[i] != 0) {
new_shape.push_back(decreased_dims[i]);
}
}
// NOTE(liym27): Paddle does not support that the rank of Tensor is 0, and
// uses [1] instead.
if (new_shape.size() == 0) {
new_shape.push_back(1);
}
decreased_dims = framework::make_ddim(new_shape);
}
return decreased_dims;
}
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class SetValueKernel : public framework::OpKernel<T> { class SetValueKernel : public framework::OpKernel<T> {
public: public:
...@@ -225,8 +126,8 @@ class SetValueKernel : public framework::OpKernel<T> { ...@@ -225,8 +126,8 @@ class SetValueKernel : public framework::OpKernel<T> {
} }
auto in_dims = in->dims(); auto in_dims = in->dims();
CheckAndUpdateSlice(in_dims, axes, &starts, &ends, &steps); CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps);
auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, steps); auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, &steps);
auto decrease_slice_dims = GetDecreasedDims(slice_dims, decrease_axes); auto decrease_slice_dims = GetDecreasedDims(slice_dims, decrease_axes);
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
......
...@@ -28,13 +28,10 @@ class SliceOp : public framework::OperatorWithKernel { ...@@ -28,13 +28,10 @@ class SliceOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "slice");
platform::errors::InvalidArgument( OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "slice");
"Input (Input) of slice op should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, // Case 1: Special treatment when input is a tensor array.
platform::errors::InvalidArgument(
"Output (Out) of slice op should not be null."));
auto x_var_type = ctx->GetInputsVarType("Input")[0]; auto x_var_type = ctx->GetInputsVarType("Input")[0];
auto axes = ctx->Attrs().Get<std::vector<int>>("axes"); auto axes = ctx->Attrs().Get<std::vector<int>>("axes");
if (x_var_type == framework::proto::VarType::LOD_TENSOR_ARRAY) { if (x_var_type == framework::proto::VarType::LOD_TENSOR_ARRAY) {
...@@ -57,6 +54,8 @@ class SliceOp : public framework::OperatorWithKernel { ...@@ -57,6 +54,8 @@ class SliceOp : public framework::OperatorWithKernel {
return; return;
} }
} }
// Case 2: input is a tensor.
auto in_dims = ctx->GetInputDim("Input"); auto in_dims = ctx->GetInputDim("Input");
PADDLE_ENFORCE_LT(in_dims.size(), 7, PADDLE_ENFORCE_LT(in_dims.size(), 7,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
...@@ -65,101 +64,54 @@ class SliceOp : public framework::OperatorWithKernel { ...@@ -65,101 +64,54 @@ class SliceOp : public framework::OperatorWithKernel {
auto starts = ctx->Attrs().Get<std::vector<int>>("starts"); auto starts = ctx->Attrs().Get<std::vector<int>>("starts");
auto ends = ctx->Attrs().Get<std::vector<int>>("ends"); auto ends = ctx->Attrs().Get<std::vector<int>>("ends");
auto infer_flags = ctx->Attrs().Get<std::vector<int>>("infer_flags");
auto decrease_axis = ctx->Attrs().Get<std::vector<int>>("decrease_axis"); auto decrease_axis = ctx->Attrs().Get<std::vector<int>>("decrease_axis");
auto infer_flags = ctx->Attrs().Get<std::vector<int>>("infer_flags");
auto starts_size = starts.size();
auto ends_size = ends.size();
if (infer_flags.empty()) { if (infer_flags.empty()) {
// Initialize infer_flags with 1. // Initialize infer_flags with 1.
// To be compatible with other op tests in which infer_flags is not set. // To be compatible with other op tests in which infer_flags is not set.
infer_flags = std::vector<int>(axes.size(), 1); infer_flags = std::vector<int>(axes.size(), 1);
} }
// 2.1 Check attrs.
auto starts_size = starts.size();
auto ends_size = ends.size();
if (ctx->HasInputs("StartsTensorList")) { if (ctx->HasInputs("StartsTensorList")) {
auto StartsTensorList = ctx->Inputs("StartsTensorList"); starts_size = ctx->Inputs("StartsTensorList").size();
PADDLE_ENFORCE_GT(StartsTensorList.size(), 0, PADDLE_ENFORCE_GT(starts_size, 0,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"StartsTensorList size can't be zero")); "StartsTensorList size can't be zero"));
starts_size = StartsTensorList.size();
} }
if (ctx->HasInputs("EndsTensorList")) { if (ctx->HasInputs("EndsTensorList")) {
auto EndsTensorList = ctx->Inputs("EndsTensorList"); ends_size = ctx->Inputs("EndsTensorList").size();
PADDLE_ENFORCE_GT(EndsTensorList.size(), 0, PADDLE_ENFORCE_GT(ends_size, 0, platform::errors::InvalidArgument(
platform::errors::InvalidArgument( "EndsTensorList size can't be zero"));
"EndsTensorList size can't be zero"));
ends_size = EndsTensorList.size();
} }
if (ctx->HasInput("StartsTensor") == false) { if (!ctx->HasInput("StartsTensor")) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
starts_size, axes.size(), starts_size, axes.size(),
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The size of starts must be equal to the size of axes.")); "The size of starts must be equal to the size of axes."));
} }
if (ctx->HasInput("EndsTensor") == false) { if (!ctx->HasInput("EndsTensor")) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
ends_size, axes.size(), ends_size, axes.size(),
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The size of ends must be equal to the size of axes.")); "The size of ends must be equal to the size of axes."));
} }
int dim_value, start, end; CheckAndUpdateSliceAttrs<int>(in_dims, axes, &starts, &ends, nullptr,
for (size_t i = 0; i < axes.size(); ++i) { &infer_flags);
PADDLE_ENFORCE_LT(static_cast<int>(axes[i]), in_dims.size(),
platform::errors::InvalidArgument(
"The index of dimension in axes must be less "
"than the size of input shape."));
if (infer_flags[i] == -1) {
out_dims[axes[i]] = -1;
} else {
// infer out_dim shape
dim_value = out_dims[axes[i]];
if (dim_value > 0) {
start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i];
end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i];
start = std::max(start, 0);
end = std::max(end, 0);
end = std::min(end, dim_value);
PADDLE_ENFORCE_LE(start, dim_value,
platform::errors::InvalidArgument(
"start should be less than or equal to the "
"dimension value, but received "
"start = %d, shape[%d] = %d.",
starts[i], axes[i], out_dims[axes[i]]));
PADDLE_ENFORCE_GT(end, start,
platform::errors::InvalidArgument(
"end should greater than start, but received "
"end = %d, start = %d.",
ends[i], starts[i]));
out_dims[axes[i]] = end - start;
}
}
}
// generate new shape
if (decrease_axis.size() > 0) {
std::vector<int> new_out_shape;
for (size_t i = 0; i < decrease_axis.size(); ++i) {
if (ctx->IsRuntime() && infer_flags[i] != -1) {
PADDLE_ENFORCE_EQ(
out_dims[decrease_axis[i]], 1,
platform::errors::InvalidArgument("decrease dim should be 1"));
}
out_dims[decrease_axis[i]] = 0;
}
for (int i = 0; i < out_dims.size(); ++i) { auto slice_dims =
if (out_dims[i] != 0) { GetSliceDims<int>(in_dims, axes, starts, ends, nullptr, &infer_flags);
new_out_shape.push_back(out_dims[i]); if (ctx->IsRuntime()) {
} out_dims = GetDecreasedDims<int>(slice_dims, decrease_axis, &infer_flags);
} } else {
if (new_out_shape.size() == 0) { out_dims = GetDecreasedDims<int>(slice_dims, decrease_axis, nullptr);
new_out_shape.push_back(1);
}
out_dims = framework::make_ddim(new_out_shape);
} }
ctx->SetOutputDim("Out", out_dims); ctx->SetOutputDim("Out", out_dims);
if (axes[0] != 0) { if (axes[0] != 0) {
ctx->ShareLoD("Input", /*->*/ "Out"); ctx->ShareLoD("Input", /*->*/ "Out");
...@@ -185,6 +137,7 @@ class SliceOp : public framework::OperatorWithKernel { ...@@ -185,6 +137,7 @@ class SliceOp : public framework::OperatorWithKernel {
return framework::OpKernelType( return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "Input"), ctx.GetPlace()); OperatorWithKernel::IndicateVarDataType(ctx, "Input"), ctx.GetPlace());
} }
framework::OpKernelType GetKernelTypeForVar( framework::OpKernelType GetKernelTypeForVar(
const std::string &var_name, const Tensor &tensor, const std::string &var_name, const Tensor &tensor,
const framework::OpKernelType &expected_kernel_type) const override { const framework::OpKernelType &expected_kernel_type) const override {
......
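The rewritten InferShape defers the per-axis bookkeeping to the slice helpers and leans on the infer_flags convention: a flag of -1 marks an axis whose start/end is only known at runtime (e.g. fed through StartsTensor), so the output dim is left as -1 rather than computed. A small sketch of just that convention (InferSliceDims and its precomputed lens are illustrative simplifications, not the Paddle helpers):

#include <cassert>
#include <vector>

// -1 in infer_flags means "unknown until runtime": leave the dim as -1.
std::vector<long long> InferSliceDims(std::vector<long long> dims,
                                      const std::vector<int>& axes,
                                      const std::vector<long long>& lens,
                                      const std::vector<int>& infer_flags) {
  for (size_t i = 0; i < axes.size(); ++i)
    dims[axes[i]] = infer_flags[i] == -1 ? -1 : lens[i];
  return dims;
}

int main() {
  auto d = InferSliceDims({8, 16}, {0, 1}, {4, 5}, {1, -1});
  assert(d[0] == 4 && d[1] == -1);
}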
(This diff is collapsed and not shown.)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <paddle/fluid/framework/operator.h>
#include <string>
#include <vector>
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T = int64_t>
inline void CheckAndUpdateSliceAttrs(const framework::DDim in_dims,
const std::vector<T>& axes,
std::vector<T>* starts,
std::vector<T>* ends,
std::vector<int64_t>* steps = nullptr,
std::vector<T>* infer_flags = nullptr) {
for (size_t i = 0; i < axes.size(); ++i) {
T axis = axes[i];
T dim_value = in_dims[axis];
if (dim_value > 0) {
if (infer_flags != nullptr && (*infer_flags)[i] == -1) {
continue;
}
T start = (*starts)[i] < 0 ? ((*starts)[i] + dim_value) : (*starts)[i];
start = std::max(start, static_cast<T>(0));
T end = (*ends)[i] < 0 ? ((*ends)[i] + dim_value) : (*ends)[i];
end = std::min(end, dim_value);
T step = steps == nullptr ? 1 : (*steps)[i];
PADDLE_ENFORCE_NE(
step, 0, platform::errors::InvalidArgument(
"Step should not be 0, but received step = %d.", step));
if (step > 0) {
start = std::min(start, dim_value);
end = std::max(end, static_cast<T>(0));
PADDLE_ENFORCE_GT(
end, start,
platform::errors::InvalidArgument(
"When step > 0, end should be greater than start, but "
"received end = %d, start = %d.",
end, start));
} else {
// NOTE(liym27): When step < 0, start should be less than or equal to
// dim_value - 1.
// "end is -1" means the 0-th element of this axis is included.
start = std::min(start, dim_value - 1);
end = std::max(end, static_cast<T>(-1));
PADDLE_ENFORCE_GT(
start, end,
platform::errors::InvalidArgument(
"When step < 0, start should be greater than end, but "
"received start = %d, end = %d.",
start, end));
}
(*starts)[i] = start;
(*ends)[i] = end;
}
}
}
template <typename T = int64_t>
inline framework::DDim GetSliceDims(const framework::DDim in_dims,
const std::vector<T>& axes,
const std::vector<T>& starts,
const std::vector<T>& ends,
std::vector<T>* steps = nullptr,
std::vector<T>* infer_flags = nullptr) {
framework::DDim slice_dims(in_dims);
for (size_t i = 0; i < axes.size(); ++i) {
T axis = axes[i];
if (infer_flags != nullptr && (*infer_flags)[i] == -1) {
slice_dims[axis] = -1;
continue;
}
T start = starts[i];
T end = ends[i];
T step = steps == nullptr ? 1 : (*steps)[i];
if (step > 0) {
slice_dims[axis] = (end - start + step - 1) / step;
} else {
slice_dims[axis] = (end - start + step + 1) / step;
}
}
return slice_dims;
}
template <typename T = int64_t>
inline framework::DDim GetDecreasedDims(const framework::DDim slice_dims,
const std::vector<T>& decrease_axes,
std::vector<T>* infer_flags = nullptr) {
framework::DDim decreased_dims(slice_dims);
if (decrease_axes.size() > 0) {
for (size_t i = 0; i < decrease_axes.size(); ++i) {
T axis = decrease_axes[i];
if (infer_flags && (*infer_flags)[i] != -1) {
PADDLE_ENFORCE_EQ(
decreased_dims[axis], 1,
platform::errors::InvalidArgument("decrease dim should be 1"));
}
decreased_dims[axis] = 0;
}
std::vector<T> new_shape;
for (int i = 0; i < decreased_dims.size(); ++i) {
if (decreased_dims[i] != 0) {
new_shape.push_back(decreased_dims[i]);
}
}
// NOTE(liym27): Paddle does not support tensors of rank 0, so
// [1] is used instead.
if (new_shape.size() == 0) {
new_shape.push_back(1);
}
decreased_dims = framework::make_ddim(new_shape);
}
return decreased_dims;
}
} // namespace operators
} // namespace paddle
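The length formulas in GetSliceDims are the usual ceiling divisions toward the step's direction; a standalone restatement of the two sign cases, assuming starts/ends have already been normalized by CheckAndUpdateSliceAttrs:

#include <cassert>

// One-axis restatement of GetSliceDims' arithmetic.
long long SliceLen(long long start, long long end, long long step) {
  return step > 0 ? (end - start + step - 1) / step
                  : (end - start + step + 1) / step;
}

int main() {
  // x[1:7:2] on an axis of size 10 -> elements 1, 3, 5 -> length 3
  assert(SliceLen(1, 7, 2) == 3);
  // x[8:-1:-3] after normalization ("end == -1" keeps element 0 reachable)
  // -> elements 8, 5, 2 -> length 3
  assert(SliceLen(8, -1, -3) == 3);
}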
...@@ -324,6 +324,7 @@ REGISTER_OPERATOR(strided_slice_grad, ops::StridedSliceOpGrad, ...@@ -324,6 +324,7 @@ REGISTER_OPERATOR(strided_slice_grad, ops::StridedSliceOpGrad,
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
strided_slice, strided_slice,
ops::StridedSliceKernel<paddle::platform::CPUDeviceContext, bool>,
ops::StridedSliceKernel<paddle::platform::CPUDeviceContext, int>, ops::StridedSliceKernel<paddle::platform::CPUDeviceContext, int>,
ops::StridedSliceKernel<paddle::platform::CPUDeviceContext, int64_t>, ops::StridedSliceKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::StridedSliceKernel<paddle::platform::CPUDeviceContext, float>, ops::StridedSliceKernel<paddle::platform::CPUDeviceContext, float>,
...@@ -335,6 +336,7 @@ REGISTER_OP_CPU_KERNEL( ...@@ -335,6 +336,7 @@ REGISTER_OP_CPU_KERNEL(
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
strided_slice_grad, strided_slice_grad,
ops::StridedSliceGradKernel<paddle::platform::CPUDeviceContext, bool>,
ops::StridedSliceGradKernel<paddle::platform::CPUDeviceContext, int>, ops::StridedSliceGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::StridedSliceGradKernel<paddle::platform::CPUDeviceContext, int64_t>, ops::StridedSliceGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::StridedSliceGradKernel<paddle::platform::CPUDeviceContext, float>, ops::StridedSliceGradKernel<paddle::platform::CPUDeviceContext, float>,
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
strided_slice, strided_slice,
ops::StridedSliceKernel<paddle::platform::CUDADeviceContext, bool>,
ops::StridedSliceKernel<paddle::platform::CUDADeviceContext, int>, ops::StridedSliceKernel<paddle::platform::CUDADeviceContext, int>,
ops::StridedSliceKernel<paddle::platform::CUDADeviceContext, int64_t>, ops::StridedSliceKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::StridedSliceKernel<paddle::platform::CUDADeviceContext, float>, ops::StridedSliceKernel<paddle::platform::CUDADeviceContext, float>,
...@@ -29,7 +30,8 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -29,7 +30,8 @@ REGISTER_OP_CUDA_KERNEL(
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
strided_slice_grad, strided_slice_grad,
ops::StridedSliceGradKernel<paddle::platform::CPUDeviceContext, int>, ops::StridedSliceGradKernel<paddle::platform::CUDADeviceContext, bool>,
ops::StridedSliceGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::StridedSliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>, ops::StridedSliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::StridedSliceGradKernel<paddle::platform::CUDADeviceContext, float>, ops::StridedSliceGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::StridedSliceGradKernel<paddle::platform::CUDADeviceContext, double>, ops::StridedSliceGradKernel<paddle::platform::CUDADeviceContext, double>,
......
...@@ -105,13 +105,15 @@ REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker, ...@@ -105,13 +105,15 @@ REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker,
ops::TrilTriuGradOpMaker<paddle::imperative::OpBase>); ops::TrilTriuGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp); REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
tril_triu, ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, float>, tril_triu, ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, bool>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, double>, ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int>, ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int64_t>, ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, plat::float16>); ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, plat::float16>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
tril_triu_grad, tril_triu_grad,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, bool>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, float>, ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, double>, ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, int>, ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, int>,
......
...@@ -18,7 +18,7 @@ namespace ops = paddle::operators; ...@@ -18,7 +18,7 @@ namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
tril_triu, tril_triu, ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, bool>,
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, float>, ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, double>, ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, int>, ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, int>,
...@@ -26,6 +26,7 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -26,6 +26,7 @@ REGISTER_OP_CUDA_KERNEL(
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, plat::float16>); ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
tril_triu_grad, tril_triu_grad,
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, bool>,
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, float>, ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, double>, ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, int>, ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, int>,
......
...@@ -563,7 +563,7 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; } ...@@ -563,7 +563,7 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; }
MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
: CPUDeviceContext(place), p_blobmap_() { : CPUDeviceContext(place), p_blobmap_() {
p_blobmap_.reset(new BlobMap()); p_blobmap_.reset(new BlobMap());
p_exec_items_.reset(new ExecMap()); p_exec_items_.reset(new ExecShape());
p_mutex_.reset(new std::mutex()); p_mutex_.reset(new std::mutex());
} }
...@@ -644,10 +644,15 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { ...@@ -644,10 +644,15 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) {
if (ptr == nullptr) { if (ptr == nullptr) {
p_blobmap_->clear(); p_blobmap_->clear();
} else { } else {
for (auto& v : (*p_exec_items_)[ptr]) { // Iterate through all shapes and, for
(v.first)->erase(v.second); // each shape and active executor, release
// all entries of this executor
for (auto& s : *p_exec_items_) {
for (auto& v : (*s.second)[ptr]) {
(v.first)->erase(v.second);
}
s.second->erase(ptr);
} }
p_exec_items_->erase(ptr);
} }
} else { } else {
VLOG(3) << "Prevented Clearing DNNL cache."; VLOG(3) << "Prevented Clearing DNNL cache.";
...@@ -655,11 +660,24 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { ...@@ -655,11 +660,24 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) {
} }
} }
void MKLDNNDeviceContext::RemoveShapeEntriesWithExecutor(void) const {
p_exec_items_->erase(p_exec_items_->begin());
}
void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t<KeyBlob> pblob, void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t<KeyBlob> pblob,
KeyBlob::iterator it) const { KeyBlob::iterator it) const {
// Take current input shape from TLS
// Take current executor address from TLS // Take current executor address from TLS
// and for this executor's items add the one defined with arguments // and for this executor's items add the one defined with arguments
(*p_exec_items_)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it)); auto key_it = p_exec_items_
->insert(std::make_pair(tls().cur_input_shape_str,
std::make_shared<ExecMap>()))
.first;
(*key_it->second)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it));
VLOG(3) << "LinkEntryWithExecutor, shapes: " << p_exec_items_->size()
<< " curr exec size: "
<< (*key_it->second)[tls().get_curr_exec()].size() << "\n";
} }
void MKLDNNDeviceContext::BlockNextCacheClearing() { void MKLDNNDeviceContext::BlockNextCacheClearing() {
...@@ -716,6 +734,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name, ...@@ -716,6 +734,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
VLOG(2) << "sid=" << sid VLOG(2) << "sid=" << sid
<< ", remove all blobs of shape: " << sBlob->begin()->first; << ", remove all blobs of shape: " << sBlob->begin()->first;
sBlob->erase(sBlob->begin()->first); sBlob->erase(sBlob->begin()->first);
RemoveShapeEntriesWithExecutor();
} }
pBlob = std::make_shared<KeyBlob>(); pBlob = std::make_shared<KeyBlob>();
(*sBlob)[tls().cur_input_shape_str] = pBlob; (*sBlob)[tls().cur_input_shape_str] = pBlob;
...@@ -739,7 +758,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name, ...@@ -739,7 +758,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
return; return;
} }
unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) { unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) const {
unsigned int num_entries = 0; unsigned int num_entries = 0;
for (auto const& l3 : *p_blobmap_) { for (auto const& l3 : *p_blobmap_) {
for (auto const& l2 : *(l3.second)) { for (auto const& l2 : *(l3.second)) {
......
...@@ -749,8 +749,14 @@ class MKLDNNDeviceContext : public CPUDeviceContext { ...@@ -749,8 +749,14 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
using ShapeBlob = umap_key_string_t<KeyBlob>; using ShapeBlob = umap_key_string_t<KeyBlob>;
using BlobMap = umap_value_smart_t<int, ShapeBlob>; using BlobMap = umap_value_smart_t<int, ShapeBlob>;
using ExecMap = std::unordered_map< // Auxiliary two-level structure (shape, executor) to more easily control
void*, std::vector<std::pair<BlobPtr_t<KeyBlob>, KeyBlob::iterator>>>; // clearing of cache objects related to a specific executor
using ExecKey = void*;
using ExecMapCacheIterPair = std::pair<BlobPtr_t<KeyBlob>, KeyBlob::iterator>;
using ExecMap =
std::unordered_map<ExecKey, std::vector<ExecMapCacheIterPair>>;
using ExecShape = std::unordered_map<std::string, std::shared_ptr<ExecMap>>;
explicit MKLDNNDeviceContext(CPUPlace place); explicit MKLDNNDeviceContext(CPUPlace place);
...@@ -759,6 +765,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext { ...@@ -759,6 +765,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
// Register object to currently used executor's map // Register object to currently used executor's map
void LinkEntryWithExecutor(BlobPtr_t<KeyBlob>, KeyBlob::iterator) const; void LinkEntryWithExecutor(BlobPtr_t<KeyBlob>, KeyBlob::iterator) const;
void RemoveShapeEntriesWithExecutor(void) const;
// Remove all entries from the blob map // Remove all entries from the blob map
void ResetBlobMap(void* ptr); void ResetBlobMap(void* ptr);
...@@ -773,7 +780,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext { ...@@ -773,7 +780,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
void SetBlob(const std::string& name, std::shared_ptr<void> data) const; void SetBlob(const std::string& name, std::shared_ptr<void> data) const;
// Calculate number of oneDNN objects cached // Calculate number of oneDNN objects cached
unsigned int GetCachedObjectsNumber(void); unsigned int GetCachedObjectsNumber(void) const;
// Find a saved blob. Return nullptr if not found // Find a saved blob. Return nullptr if not found
std::shared_ptr<void> GetBlob(const std::string& name) const; std::shared_ptr<void> GetBlob(const std::string& name) const;
...@@ -786,7 +793,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext { ...@@ -786,7 +793,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
std::shared_ptr<BlobMap> p_blobmap_; std::shared_ptr<BlobMap> p_blobmap_;
// Map key is pointer of executor and value is a data(iterator in map) needed // Map key is pointer of executor and value is a data(iterator in map) needed
// to erase // to erase
std::shared_ptr<ExecMap> p_exec_items_; std::shared_ptr<ExecShape> p_exec_items_;
std::shared_ptr<std::mutex> p_mutex_; std::shared_ptr<std::mutex> p_mutex_;
bool block_next_cache_clearing_ = false; bool block_next_cache_clearing_ = false;
}; };
......
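The new ExecShape layout nests one map inside another: input-shape string on the outside, executor address on the inside, so ResetBlobMap can walk every shape and drop just one executor's entries, while RemoveShapeEntriesWithExecutor drops a whole shape. A toy mirror of that structure (the cached values are plain ints here instead of (blob, iterator) pairs):

#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

using ExecKey = const void*;
using ExecMap = std::unordered_map<ExecKey, std::vector<int>>;
using ExecShape = std::unordered_map<std::string, std::shared_ptr<ExecMap>>;

int main() {
  ExecShape items;
  int exec_a = 0, exec_b = 0;  // stand-ins for executor addresses
  (*items.insert({"2x3", std::make_shared<ExecMap>()}).first->second)[&exec_a]
      .push_back(1);
  (*items.insert({"4x4", std::make_shared<ExecMap>()}).first->second)[&exec_a]
      .push_back(2);
  (*items["4x4"])[&exec_b].push_back(3);

  // ResetBlobMap(exec_a): walk every shape and drop exec_a's entries only.
  for (auto& s : items) s.second->erase(&exec_a);
  std::cout << items["2x3"]->size() << " " << items["4x4"]->size() << "\n";  // 0 1
}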
...@@ -26,9 +26,11 @@ size_t Alignment(size_t size, const platform::Place &place) { ...@@ -26,9 +26,11 @@ size_t Alignment(size_t size, const platform::Place &place) {
#elif defined(PADDLE_WITH_XPU) #elif defined(PADDLE_WITH_XPU)
// TODO(wangxi): add XpuMinChunkSize // TODO(wangxi): add XpuMinChunkSize
alignment = alignment; alignment = alignment;
#elif defined(PADDLE_WITH_ASCEND_CL)
alignment = NPUMinChunkSize();
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
"Fluid is not compiled with CUDA.")); "Fluid is not compiled with CUDA or NPU."));
#endif #endif
} }
size_t remaining = size % alignment; size_t remaining = size % alignment;
......
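The tail of Alignment() rounds the requested size up to a multiple of the platform's minimum chunk size. A two-line restatement of that round-up, assuming the lines elided after `size % alignment` add `alignment - remaining` when the remainder is nonzero:

#include <cassert>
#include <cstddef>

size_t AlignUp(size_t size, size_t alignment) {
  size_t remaining = size % alignment;
  return remaining == 0 ? size : size + alignment - remaining;
}

int main() {
  assert(AlignUp(100, 64) == 128);
  assert(AlignUp(128, 64) == 128);
}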
...@@ -19,6 +19,8 @@ limitations under the License. */ ...@@ -19,6 +19,8 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#elif defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/npu_info.h"
#endif #endif
namespace paddle { namespace paddle {
......
...@@ -775,13 +775,13 @@ inline std::string GetExternalErrorMsg(T status) { ...@@ -775,13 +775,13 @@ inline std::string GetExternalErrorMsg(T status) {
} }
} }
#else #else
char buf[100]; char buf[512];
MEMORY_BASIC_INFORMATION mbi; MEMORY_BASIC_INFORMATION mbi;
HMODULE h_module = HMODULE h_module =
(::VirtualQuery(GetCurrentTraceBackString, &mbi, sizeof(mbi)) != 0) (::VirtualQuery(GetCurrentTraceBackString, &mbi, sizeof(mbi)) != 0)
? (HMODULE)mbi.AllocationBase ? (HMODULE)mbi.AllocationBase
: NULL; : NULL;
GetModuleFileName(h_module, buf, 100); GetModuleFileName(h_module, buf, 512);
std::string strModule(buf); std::string strModule(buf);
const size_t last_slash_idx = strModule.find_last_of("\\"); const size_t last_slash_idx = strModule.find_last_of("\\");
std::string compare_path = strModule.substr(strModule.length() - 7); std::string compare_path = strModule.substr(strModule.length() - 7);
......
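The Windows fix enlarges the module-path buffer from 100 to 512 bytes because GetModuleFileName silently truncates when the buffer is too small. A hedged sketch of detecting the truncation instead of assuming it away (on Vista and later the API sets ERROR_INSUFFICIENT_BUFFER on truncation; long-path-aware code would retry with a growing buffer):

#ifdef _WIN32
#include <windows.h>

#include <string>

std::string CurrentModulePath(HMODULE h_module) {
  char buf[512];
  DWORD len = GetModuleFileNameA(h_module, buf, sizeof(buf));
  if (len == 0) return "";  // hard failure
  if (len == sizeof(buf) && GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
    return "";  // truncated: treat the path as unknown
  }
  return std::string(buf, len);
}

int main() {
  // nullptr asks for the path of the current executable.
  return CurrentModulePath(nullptr).empty() ? 1 : 0;
}
#endif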
...@@ -417,7 +417,7 @@ TEST(enforce, cuda_success) { ...@@ -417,7 +417,7 @@ TEST(enforce, cuda_success) {
"An unsupported value or parameter was passed to the function (a " "An unsupported value or parameter was passed to the function (a "
"negative vector size, for example).To correct: ensure that all the " "negative vector size, for example).To correct: ensure that all the "
"parameters being passed have valid values")); "parameters being passed have valid values"));
/*
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess));
EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "NCCL error")); EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "NCCL error"));
...@@ -430,7 +430,6 @@ TEST(enforce, cuda_success) { ...@@ -430,7 +430,6 @@ TEST(enforce, cuda_success) {
"The call to NCCL is incorrect. This is " "The call to NCCL is incorrect. This is "
"usually reflecting a programming error")); "usually reflecting a programming error"));
#endif #endif
*/
} }
#endif #endif
#endif #endif
......
...@@ -37,6 +37,7 @@ struct GpuLaunchConfig { ...@@ -37,6 +37,7 @@ struct GpuLaunchConfig {
dim3 theory_thread_count = dim3(1, 1, 1); dim3 theory_thread_count = dim3(1, 1, 1);
dim3 thread_per_block = dim3(1, 1, 1); dim3 thread_per_block = dim3(1, 1, 1);
dim3 block_per_grid = dim3(1, 1, 1); dim3 block_per_grid = dim3(1, 1, 1);
int compute_capability = 0;
}; };
inline GpuLaunchConfig GetGpuLaunchConfig1D( inline GpuLaunchConfig GetGpuLaunchConfig1D(
...@@ -67,11 +68,14 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D( ...@@ -67,11 +68,14 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(
std::min(max_threads, context.GetMaxThreadsPerBlock()); std::min(max_threads, context.GetMaxThreadsPerBlock());
const int block_count = const int block_count =
std::min(DivUp(physical_thread_count, thread_per_block), sm); std::min(DivUp(physical_thread_count, thread_per_block), sm);
// Get compute_capability
const int capability = context.GetComputeCapability();
GpuLaunchConfig config; GpuLaunchConfig config;
config.theory_thread_count.x = theory_thread_count; config.theory_thread_count.x = theory_thread_count;
config.thread_per_block.x = thread_per_block; config.thread_per_block.x = thread_per_block;
config.block_per_grid.x = block_count; config.block_per_grid.x = block_count;
config.compute_capability = capability;
return config; return config;
} }
......
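GetGpuLaunchConfig1D caps threads per block at the device limit and spreads blocks across SMs via DivUp; the only change above is recording the device's compute capability in the config as well. A host-side sketch of the arithmetic with assumed device limits (the real code queries them from the CUDA device context):

#include <algorithm>
#include <cstdio>

inline int DivUp(int a, int b) { return (a + b - 1) / b; }

int main() {
  const int element_count = 1 << 20;
  const int sm = 80, max_threads_per_block = 1024, max_threads = 65536;
  const int thread_per_block = std::min(max_threads, max_threads_per_block);
  const int physical_thread_count = std::min(element_count, max_threads);
  const int block_count =
      std::min(DivUp(physical_thread_count, thread_per_block), sm);
  std::printf("launch <<<%d, %d>>>\n", block_count, thread_per_block);  // <<<64, 1024>>>
}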
...@@ -56,6 +56,7 @@ set(PYBIND_SRCS ...@@ -56,6 +56,7 @@ set(PYBIND_SRCS
ir.cc ir.cc
inference_api.cc inference_api.cc
compatible.cc compatible.cc
io.cc
generator_py.cc) generator_py.cc)
if(WITH_ASCEND) if(WITH_ASCEND)
......
...@@ -511,6 +511,7 @@ void BindAnalysisConfig(py::module *m) { ...@@ -511,6 +511,7 @@ void BindAnalysisConfig(py::module *m) {
py::arg("disable_trt_plugin_fp16") = false) py::arg("disable_trt_plugin_fp16") = false)
.def("enable_tensorrt_oss", &AnalysisConfig::EnableTensorRtOSS) .def("enable_tensorrt_oss", &AnalysisConfig::EnableTensorRtOSS)
.def("tensorrt_oss_enabled", &AnalysisConfig::tensorrt_oss_enabled) .def("tensorrt_oss_enabled", &AnalysisConfig::tensorrt_oss_enabled)
.def("exp_disable_tensorrt_ops", &AnalysisConfig::Exp_DisableTensorRtOPs)
.def("enable_tensorrt_dla", &AnalysisConfig::EnableTensorRtDLA, .def("enable_tensorrt_dla", &AnalysisConfig::EnableTensorRtDLA,
py::arg("dla_core") = 0) py::arg("dla_core") = 0)
.def("tensorrt_dla_enabled", &AnalysisConfig::tensorrt_dla_enabled) .def("tensorrt_dla_enabled", &AnalysisConfig::tensorrt_dla_enabled)
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/pybind/io.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/pybind/pybind_boost_headers.h"
namespace py = pybind11;
namespace paddle {
namespace pybind {
void BindIO(pybind11::module *m) {
m->def("save_lod_tensor", [](const paddle::framework::LoDTensor &tensor,
const std::string &str_file_name) {
std::ofstream fout(str_file_name, std::ios::binary);
PADDLE_ENFORCE_EQ(static_cast<bool>(fout), true,
platform::errors::Unavailable(
"Cannot open %s to save variables.", str_file_name));
paddle::framework::SerializeToStream(fout, tensor);
int64_t tellp = fout.tellp();
fout.close();
return tellp;
});
m->def("load_lod_tensor", [](paddle::framework::LoDTensor &tensor,
const std::string &str_file_name) {
std::ifstream fin(str_file_name, std::ios::binary);
PADDLE_ENFORCE_EQ(static_cast<bool>(fin), true,
platform::errors::Unavailable(
"Cannot open %s to load variables.", str_file_name));
paddle::framework::DeserializeFromStream(fin, &tensor);
int64_t tellg = fin.tellg();
fin.close();
return tellg;
});
m->def("save_selected_rows",
[](const paddle::framework::SelectedRows &selected_rows,
const std::string &str_file_name) {
std::ofstream fout(str_file_name, std::ios::binary);
PADDLE_ENFORCE_EQ(
static_cast<bool>(fout), true,
platform::errors::Unavailable(
"Cannot open %s to save SelectedRows.", str_file_name));
paddle::framework::SerializeToStream(fout, selected_rows);
int64_t tellp = fout.tellp();
fout.close();
return tellp;
});
m->def("load_selected_rows",
[](paddle::framework::SelectedRows &selected_rows,
const std::string &str_file_name) {
std::ifstream fin(str_file_name, std::ios::binary);
PADDLE_ENFORCE_EQ(
static_cast<bool>(fin), true,
platform::errors::Unavailable(
"Cannot open %s to load SelectedRows.", str_file_name));
paddle::framework::DeserializeFromStream(fin, &selected_rows);
int64_t tellg = fin.tellg();
fin.close();
return tellg;
});
m->def("save_lod_tensor_to_memory",
[](const paddle::framework::LoDTensor &tensor) -> py::bytes {
std::ostringstream ss;
paddle::framework::SerializeToStream(ss, tensor);
return ss.str();
});
m->def("load_lod_tensor_from_memory", [](paddle::framework::LoDTensor &tensor,
const std::string &tensor_bytes) {
std::istringstream fin(tensor_bytes, std::ios::in | std::ios::binary);
paddle::framework::DeserializeFromStream(fin, &tensor);
});
m->def("save_selected_rows_to_memory",
[](const paddle::framework::SelectedRows &selected_rows) -> py::bytes {
std::ostringstream ss;
paddle::framework::SerializeToStream(ss, selected_rows);
return ss.str();
});
m->def("load_selected_rows_from_memory",
[](paddle::framework::SelectedRows &selected_rows,
const std::string &selected_rows_bytes) {
std::istringstream fin(selected_rows_bytes,
std::ios::in | std::ios::binary);
paddle::framework::DeserializeFromStream(fin, &selected_rows);
});
}
} // namespace pybind
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <Python.h>
#include "paddle/fluid/pybind/pybind_boost_headers.h"
namespace paddle {
namespace pybind {
void BindIO(pybind11::module* m);
} // namespace pybind
} // namespace paddle
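BindIO exposes file- and memory-based (de)serialization; the file variants return the final stream position, i.e. the number of bytes written or consumed. A self-contained C++ round trip showing that tellp-reporting pattern with a trivial wire format (not paddle::framework::SerializeToStream's actual format):

#include <cstdint>
#include <iostream>
#include <sstream>
#include <vector>

int64_t Save(std::ostream& os, const std::vector<float>& v) {
  uint64_t n = v.size();
  os.write(reinterpret_cast<const char*>(&n), sizeof(n));
  os.write(reinterpret_cast<const char*>(v.data()), n * sizeof(float));
  return static_cast<int64_t>(os.tellp());  // bytes written, like save_lod_tensor
}

std::vector<float> Load(std::istream& is) {
  uint64_t n = 0;
  is.read(reinterpret_cast<char*>(&n), sizeof(n));
  std::vector<float> v(n);
  is.read(reinterpret_cast<char*>(v.data()), n * sizeof(float));
  return v;
}

int main() {
  std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
  std::cout << "bytes written: " << Save(ss, {1.f, 2.f, 3.f}) << "\n";  // 20
  std::cout << "restored size: " << Load(ss).size() << "\n";            // 3
}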
...@@ -65,6 +65,7 @@ std::map<std::string, std::set<std::string>> op_ins_map = { ...@@ -65,6 +65,7 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
{"box_coder", {"PriorBox", "PriorBoxVar", "TargetBox"}}, {"box_coder", {"PriorBox", "PriorBoxVar", "TargetBox"}},
{"momentum", {"Param", "Grad", "Velocity", "LearningRate"}}, {"momentum", {"Param", "Grad", "Velocity", "LearningRate"}},
{"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}}, {"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}},
{"run_program", {"X", "Params"}},
}; };
// NOTE(zhiqiu): Like op_ins_map. // NOTE(zhiqiu): Like op_ins_map.
...@@ -98,6 +99,7 @@ std::map<std::string, std::set<std::string>> op_outs_map = { ...@@ -98,6 +99,7 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
{"rnn", {"DropoutState", "Reserve", "Out", "State"}}, {"rnn", {"DropoutState", "Reserve", "Out", "State"}},
{"lamb", {"lamb",
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}},
{"run_program", {"DOut"}},
}; };
// NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are
...@@ -148,6 +150,7 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = { ...@@ -148,6 +150,7 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
{"lamb", {"lamb",
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}},
{"rnn", {"DropoutState"}}, {"rnn", {"DropoutState"}},
{"run_program", {"Out", "DOut", "OutScope"}},
}; };
// NOTE(pangyoki): Tensor View Strategy. // NOTE(pangyoki): Tensor View Strategy.
...@@ -173,7 +176,7 @@ std::set<std::string> inplace_op_duplicable_ins_set = { ...@@ -173,7 +176,7 @@ std::set<std::string> inplace_op_duplicable_ins_set = {
// clang-format off // clang-format off
const char* OUT_INITIALIZER_TEMPLATE = const char* OUT_INITIALIZER_TEMPLATE =
R"({"%s", {std::shared_ptr<imperative::VarBase>(new imperative::VarBase(tracer->GenerateUniqueName()))}})"; R"({"%s", {std::shared_ptr<imperative::VarBase>(new imperative::VarBase("auto_"+std::to_string(VarBaseUniqueNameID++)+"_"))}})";
const char* OUT_DUPLICABLE_INITIALIZER_TEMPLATE = R"({"%s", ConstructDuplicableOutput(%s)})"; const char* OUT_DUPLICABLE_INITIALIZER_TEMPLATE = R"({"%s", ConstructDuplicableOutput(%s)})";
const char* INPUT_INITIALIZER_TEMPLATE = R"({"%s", {%s}})"; const char* INPUT_INITIALIZER_TEMPLATE = R"({"%s", {%s}})";
...@@ -255,12 +258,11 @@ R"( ...@@ -255,12 +258,11 @@ R"(
ConstructAttrMapFromPyArgs("%s", %d, &attrs, args); ConstructAttrMapFromPyArgs("%s", %d, &attrs, args);
{ {
py::gil_scoped_release release; py::gil_scoped_release release;
auto tracer = imperative::GetCurrentTracer();
%s %s
imperative::NameVarBaseMap outs = %s; imperative::NameVarBaseMap outs = %s;
imperative::NameVarBaseMap ins = %s; imperative::NameVarBaseMap ins = %s;
%s %s
tracer->TraceOp("%s", ins, outs, attrs, {%s}); imperative::GetCurrentTracer()->TraceOp("%s", ins, outs, attrs, {%s});
return %s; return %s;
} }
})"; })";
...@@ -585,7 +587,8 @@ int main(int argc, char* argv[]) { ...@@ -585,7 +587,8 @@ int main(int argc, char* argv[]) {
out << "namespace py = pybind11;" out << "namespace py = pybind11;"
<< "\n"; << "\n";
out << "namespace paddle {\n" out << "namespace paddle {\n"
<< "namespace pybind {\n"; << "namespace pybind {\n\n";
out << "std::atomic<int> VarBaseUniqueNameID{0};\n";
out << paddle::string::join_strings(std::get<0>(op_funcs), '\n'); out << paddle::string::join_strings(std::get<0>(op_funcs), '\n');
out << "\n\n"; out << "\n\n";
......
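The generator change mints output VarBase names from a process-wide atomic counter instead of asking the tracer, so names can be produced before the tracer is fetched and without locking. The scheme in isolation:

#include <atomic>
#include <iostream>
#include <string>

// Mirrors the generated "std::atomic<int> VarBaseUniqueNameID{0}".
std::atomic<int> VarBaseUniqueNameID{0};

std::string GenerateUniqueName() {
  return "auto_" + std::to_string(VarBaseUniqueNameID++) + "_";
}

int main() {
  std::cout << GenerateUniqueName() << "\n";  // auto_0_
  std::cout << GenerateUniqueName() << "\n";  // auto_1_
}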
...@@ -69,6 +69,7 @@ limitations under the License. */ ...@@ -69,6 +69,7 @@ limitations under the License. */
#include "paddle/fluid/platform/monitor.h" #include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/pybind/io.h"
#ifdef PADDLE_WITH_ASCEND #ifdef PADDLE_WITH_ASCEND
#include "paddle/fluid/pybind/ascend_wrapper_py.h" #include "paddle/fluid/pybind/ascend_wrapper_py.h"
#endif #endif
...@@ -498,70 +499,6 @@ PYBIND11_MODULE(core_noavx, m) { ...@@ -498,70 +499,6 @@ PYBIND11_MODULE(core_noavx, m) {
#endif #endif
return tensor; return tensor;
}); });
m.def("_save_lod_tensor", [](const LoDTensor &tensor,
const std::string &str_file_name) {
std::ofstream fout(str_file_name, std::ios::binary);
PADDLE_ENFORCE_EQ(static_cast<bool>(fout), true,
platform::errors::Unavailable(
"Cannot open %s to save variables.", str_file_name));
SerializeToStream(fout, tensor);
int64_t tellp = fout.tellp();
fout.close();
return tellp;
});
m.def("_load_lod_tensor", [](LoDTensor &tensor,
const std::string &str_file_name) {
std::ifstream fin(str_file_name, std::ios::binary);
PADDLE_ENFORCE_EQ(static_cast<bool>(fin), true,
platform::errors::Unavailable(
"Cannot open %s to load variables.", str_file_name));
DeserializeFromStream(fin, &tensor);
int64_t tellg = fin.tellg();
fin.close();
return tellg;
});
m.def("_save_selected_rows", [](const SelectedRows &selected_rows,
const std::string &str_file_name) {
std::ofstream fout(str_file_name, std::ios::binary);
PADDLE_ENFORCE_EQ(
static_cast<bool>(fout), true,
platform::errors::Unavailable("Cannot open %s to save SelectedRows.",
str_file_name));
SerializeToStream(fout, selected_rows);
int64_t tellp = fout.tellp();
fout.close();
return tellp;
});
m.def("_load_selected_rows",
[](SelectedRows &selected_rows, const std::string &str_file_name) {
std::ifstream fin(str_file_name, std::ios::binary);
PADDLE_ENFORCE_EQ(
static_cast<bool>(fin), true,
platform::errors::Unavailable(
"Cannot open %s to load SelectedRows.", str_file_name));
DeserializeFromStream(fin, &selected_rows);
int64_t tellg = fin.tellg();
fin.close();
return tellg;
});
m.def("_save_static_dict",
[](const std::string &str_file_name, const py::handle &vec_var_list,
const Scope &scope) {
std::vector<std::string> vec_name_list = GetNameList(vec_var_list);
SaveStaticNameListToDisk(str_file_name, vec_name_list, scope);
});
m.def("_load_static_dict",
[](const std::string &str_file_name, const py::handle &vec_var_list,
const Scope &scope, const Executor *executor) {
std::vector<std::string> vec_name_list = GetNameList(vec_var_list);
CreateVariableIfNotExit(vec_var_list, scope, executor);
LoadStaticNameListFromDisk(str_file_name, vec_name_list, scope);
});
m.def("_create_loaded_parameter", m.def("_create_loaded_parameter",
[](const py::handle &vec_var_list, const Scope &scope, [](const py::handle &vec_var_list, const Scope &scope,
...@@ -569,26 +506,6 @@ PYBIND11_MODULE(core_noavx, m) { ...@@ -569,26 +506,6 @@ PYBIND11_MODULE(core_noavx, m) {
CreateVariableIfNotExit(vec_var_list, scope, executor); CreateVariableIfNotExit(vec_var_list, scope, executor);
}); });
m.def("_save_dygraph_dict", [](const std::string &str_file_name,
const PyNameVarBaseMap &state_dict) {
auto vec_var_base_list = GetVarBaseList(state_dict);
SaveDygraphVarBaseListToDisk(str_file_name, vec_var_base_list);
});
m.def("_load_dygraph_dict", [](const std::string &str_file_name) {
auto load_tensor = LoadDygraphVarBaseListFromDisk(str_file_name);
std::unordered_map<std::string, std::shared_ptr<imperative::VarBase>>
map_output;
for (size_t i = 0; i < load_tensor.size(); ++i) {
map_output.emplace(load_tensor[i]->Name(), load_tensor[i]);
}
return map_output;
});
m.def("save_op_version_info", [](framework::ProgramDesc &desc) { m.def("save_op_version_info", [](framework::ProgramDesc &desc) {
framework::compatible::pb::OpVersionMap pb_vmap{desc.OpVersionMap()}; framework::compatible::pb::OpVersionMap pb_vmap{desc.OpVersionMap()};
framework::compatible::SaveOpVersions( framework::compatible::SaveOpVersions(
...@@ -3154,6 +3071,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -3154,6 +3071,7 @@ All parameter, weight, gradient are variables in Paddle.
.def("device_count", &ParallelExecutor::DeviceCount); .def("device_count", &ParallelExecutor::DeviceCount);
BindFleetWrapper(&m); BindFleetWrapper(&m);
BindIO(&m);
#ifdef PADDLE_WITH_PSLIB #ifdef PADDLE_WITH_PSLIB
BindHeterWrapper(&m); BindHeterWrapper(&m);
......
...@@ -22,11 +22,12 @@ setlocal enabledelayedexpansion ...@@ -22,11 +22,12 @@ setlocal enabledelayedexpansion
rem -------clean up environment----------- rem -------clean up environment-----------
set work_dir=%cd% set work_dir=%cd%
set cache_dir=%work_dir:Paddle=cache% if not defined cache_dir set cache_dir=%work_dir:Paddle=cache%
if not exist %cache_dir%\tools ( if not exist %cache_dir%\tools (
git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools
) )
taskkill /f /im cmake.exe 2>NUL taskkill /f /im cmake.exe 2>NUL
taskkill /f /im ninja.exe 2>NUL
taskkill /f /im MSBuild.exe 2>NUL taskkill /f /im MSBuild.exe 2>NUL
taskkill /f /im cl.exe 2>NUL taskkill /f /im cl.exe 2>NUL
taskkill /f /im lib.exe 2>NUL taskkill /f /im lib.exe 2>NUL
...@@ -217,7 +218,8 @@ set CUDA_ARCH_NAME=All ...@@ -217,7 +218,8 @@ set CUDA_ARCH_NAME=All
call :cmake || goto cmake_error call :cmake || goto cmake_error
call :build || goto build_error call :build || goto build_error
call :zip_file || goto zip_file_error call :zip_cc_file || goto zip_cc_file_error
call :zip_c_file || goto zip_c_file_error
goto:success goto:success
rem "Other configurations are added here" rem "Other configurations are added here"
...@@ -689,7 +691,7 @@ goto:eof ...@@ -689,7 +691,7 @@ goto:eof
exit /b 1 exit /b 1
rem --------------------------------------------------------------------------------------------- rem ---------------------------------------------------------------------------------------------
:zip_file :zip_cc_file
tree /F %cd%\paddle_inference_install_dir\paddle tree /F %cd%\paddle_inference_install_dir\paddle
if exist paddle_inference.zip del paddle_inference.zip if exist paddle_inference.zip del paddle_inference.zip
python -c "import shutil;shutil.make_archive('paddle_inference', 'zip', root_dir='paddle_inference_install_dir')" python -c "import shutil;shutil.make_archive('paddle_inference', 'zip', root_dir='paddle_inference_install_dir')"
...@@ -701,10 +703,27 @@ for /F %%i in ("%libsize%") do ( ...@@ -701,10 +703,27 @@ for /F %%i in ("%libsize%") do (
) )
goto:eof goto:eof
:zip_file_error :zip_cc_file_error
echo Zip inference library failed! echo Zip inference library failed!
exit /b 1 exit /b 1
rem ---------------------------------------------------------------------------------------------
:zip_c_file
tree /F %cd%\paddle_inference_c_install_dir\paddle
if exist paddle_inference_c.zip del paddle_inference_c.zip
python -c "import shutil;shutil.make_archive('paddle_inference_c', 'zip', root_dir='paddle_inference_c_install_dir')"
%cache_dir%\tools\busybox64.exe du -h -k paddle_inference_c.zip > lib_size.txt
set /p libsize=< lib_size.txt
for /F %%i in ("%libsize%") do (
set /a libsize_m=%%i/1024
echo "Windows Paddle_Inference CAPI ZIP Size: !libsize_m!M"
)
goto:eof
:zip_c_file_error
echo Zip inference capi library failed!
exit /b 1
:timestamp :timestamp
setlocal enabledelayedexpansion setlocal enabledelayedexpansion
@ECHO OFF @ECHO OFF
...@@ -763,6 +782,7 @@ echo ======================================== ...@@ -763,6 +782,7 @@ echo ========================================
echo Clean up environment at the end ... echo Clean up environment at the end ...
echo ======================================== echo ========================================
taskkill /f /im cmake.exe 2>NUL taskkill /f /im cmake.exe 2>NUL
taskkill /f /im ninja.exe 2>NUL
taskkill /f /im MSBuild.exe 2>NUL taskkill /f /im MSBuild.exe 2>NUL
taskkill /f /im git.exe 2>NUL taskkill /f /im git.exe 2>NUL
taskkill /f /im cl.exe 2>NUL taskkill /f /im cl.exe 2>NUL
......
@@ -426,6 +426,13 @@ EOF
        buildSize=$(du -h --max-depth=0 ${PADDLE_ROOT}/build/paddle_inference.tgz |awk '{print $1}')
        echo "Paddle_Inference Size: $buildSize"
        echo "ipipe_log_param_Paddle_Inference_Size: $buildSize" >> ${PADDLE_ROOT}/build/build_summary.txt
+    elif [ "$1" == "paddle_inference_c" ]; then
+        cd ${PADDLE_ROOT}/build
+        cp -r paddle_inference_c_install_dir paddle_inference_c
+        tar -czf paddle_inference_c.tgz paddle_inference_c
+        buildSize=$(du -h --max-depth=0 ${PADDLE_ROOT}/build/paddle_inference_c.tgz |awk '{print $1}')
+        echo "Paddle_Inference Capi Size: $buildSize"
+        echo "ipipe_log_param_Paddle_Inference_capi_Size: $buildSize" >> ${PADDLE_ROOT}/build/build_summary.txt
    else
        SYSTEM=`uname -s`
        if [ "$SYSTEM" == "Darwin" ]; then
@@ -1941,6 +1948,7 @@ EOF
    echo "ipipe_log_param_Build_Time: $[ $endTime_s - $startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
    build_size "paddle_inference"
+   build_size "paddle_inference_c"
}
function tar_fluid_lib() {
@@ -2001,12 +2009,16 @@ function build_document_preview() {
    sh /paddle/tools/document_preview.sh ${PORT}
}
+# origin name: example
-function example() {
+function exec_samplecode_test() {
    pip install ${PADDLE_ROOT}/build/python/dist/*.whl
    paddle version
    cd ${PADDLE_ROOT}/tools
-   python sampcd_processor.py cpu;example_error=$?
+   if [ "$1" = "cpu" ] ; then
+       python sampcd_processor.py cpu; example_error=$?
+   elif [ "$1" = "gpu" ] ; then
+       python sampcd_processor.py --threads=16 --full-test gpu; example_error=$?
+   fi
    if [ "$example_error" != "0" ];then
        echo "Code instance execution failed" >&2
        exit 5
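The renamed helper now dispatches on a device argument: "cpu" keeps the old single-process run, while "gpu" adds the `--threads=16 --full-test` flags shown in the diff. A minimal Python sketch of the same dispatch logic (the wrapper function is hypothetical; only the sampcd_processor.py arguments come from the diff above):

# Hypothetical wrapper mirroring exec_samplecode_test's cpu/gpu dispatch.
import subprocess
import sys

def run_samplecode_test(device):
    if device == 'cpu':
        cmd = [sys.executable, 'sampcd_processor.py', 'cpu']
    elif device == 'gpu':
        cmd = [sys.executable, 'sampcd_processor.py',
               '--threads=16', '--full-test', 'gpu']
    else:
        raise ValueError('device must be "cpu" or "gpu"')
    if subprocess.call(cmd) != 0:
        print('Code instance execution failed', file=sys.stderr)
        sys.exit(5)  # same exit code as the shell function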
@@ -2119,7 +2131,7 @@ function main() {
        check_sequence_op_unittest
        generate_api_spec ${PYTHON_ABI:-""} "PR"
        set +e
-       example_info=$(example)
+       example_info=$(exec_samplecode_test cpu)
        example_code=$?
        summary_check_problems $check_style_code $example_code "$check_style_info" "$example_info"
        assert_api_spec_approvals
@@ -2278,7 +2290,11 @@ function main() {
        build_document_preview
        ;;
    api_example)
-       example
+       example_info=$(exec_samplecode_test cpu)
+       example_code=$?
+       check_style_code=0
+       check_style_info=
+       summary_check_problems $check_style_code $example_code "$check_style_info" "$example_info"
        ;;
    test_op_benchmark)
        test_op_benchmark
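Note that the `api_example` branch reuses `exec_samplecode_test cpu` and feeds `summary_check_problems` stubbed style-check inputs (`check_style_code=0` and an empty `check_style_info`), so this CI entry reports only sample-code failures. Assuming the usual entry-script name (an assumption, not confirmed by this diff), it would be triggered as `paddle_build.sh api_example`.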
......
(This file's diff is collapsed.)
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from .auto_cast import auto_cast
-from .grad_scaler import GradScaler
+from .auto_cast import auto_cast  # noqa: F401
+from .grad_scaler import GradScaler  # noqa: F401

__all__ = ['auto_cast', 'GradScaler']
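For context: `# noqa: F401` tells flake8 to skip its unused-import check (F401) on that line; the imports exist only so the package `__init__` can re-export the names listed in `__all__`. A minimal sketch of the pattern, with made-up module names:

# mypkg/__init__.py -- illustrative package, not part of Paddle.
from .core import Engine  # noqa: F401 -- imported only for re-export

__all__ = ['Engine']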
@@ -14,7 +14,7 @@
from paddle.fluid.dygraph.amp import amp_guard

-__all__ = ['auto_cast']
+__all__ = []

def auto_cast(enable=True, custom_white_list=None, custom_black_list=None):
......
@@ -14,7 +14,7 @@
from paddle.fluid.dygraph.amp import AmpScaler

-__all__ = ['GradScaler']
+__all__ = []

class GradScaler(AmpScaler):
......
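Emptying `__all__` in auto_cast.py and grad_scaler.py makes a star-import from those submodules export nothing, steering users to the package-level names kept in the `__init__` above, while explicit imports continue to work. A tiny demonstration of the `__all__` semantics this relies on (module name is made up):

# demo_mod.py -- illustrative module.
__all__ = []       # `from demo_mod import *` binds no names

def helper():      # still reachable via `from demo_mod import helper`
    return 42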
(The remaining 63 file diffs are collapsed.)