Commit 1d755225 authored by jim19930609

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into eager_dygraph_final_state_auto_codegen
@@ -4,7 +4,8 @@ paddle/fluid/API_DEV.spec
 paddle/fluid/API_PR.spec
 paddle/fluid/op_use_default_grad_maker_DEV.spec
 paddle/fluid/op_use_default_grad_maker_PR.spec
-paddle/pten/api/*/api*
+paddle/pten/api/*/api.*
+paddle/pten/api/*/backward*
 paddle/pten/include/*
 paddle/pten/extension.h
......
@@ -36,7 +36,7 @@ ENDIF()
 if(NOT DEFINED XPU_BASE_URL)
   SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220104")
+  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220116")
 else()
   SET(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
......
@@ -27,7 +27,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/port.h"

 namespace butil {
 class IOBuf;
@@ -78,11 +78,11 @@ void DeserializeFromMultiVarMsgAndIOBuf(const MultiVarMsg& multi_msg,
                                         const framework::Scope* scope);

 void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg,
-                          butil::IOBufBytesIterator& iobuf,
+                          butil::IOBufBytesIterator& iobuf,  // NOLINT
                           const platform::DeviceContext& ctx);

 void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg,
-                             butil::IOBufBytesIterator& iobuf,
+                             butil::IOBufBytesIterator& iobuf,  // NOLINT
                              const platform::DeviceContext& ctx);

 std::string GetIntTypeEndpoint(const std::string& ip, const uint32_t& port);
......
@@ -40,9 +40,9 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/fluid/string/string_helper.h"
+#include "paddle/pten/backends/dynload/port.h"

 namespace paddle {
 namespace distributed {
@@ -202,7 +202,7 @@ class ValueBlock {
       // value = _alloc.acquire(value_length_);
       table[id] = value;
     } else {
-      value = (VALUE *)(void *)(res->second);
+      value = (VALUE *)(void *)(res->second);  // NOLINT
     }
     return value;
   }
@@ -282,8 +282,8 @@ class ValueBlock {
       value->unseen_days_++;
       if (value->unseen_days_ >= threshold) {
         butil::return_object(iter->second);
-        //_alloc.release(iter->second);
-        //_alloc.release(value);
+        // _alloc.release(iter->second);
+        // _alloc.release(value);
         iter = table.erase(iter);
       } else {
         ++iter;
......
@@ -216,8 +216,9 @@ void TensorAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) {
 #define PADDLE_TENSOR_ADD(cpp_type)                                          \
   if (data_type == paddle::framework::DataTypeTrait<cpp_type>::DataType()) { \
-    TensorAddFunctor<cpp_type> func(numel, src_tensor->data<cpp_type>(),     \
-                                    dst_tensor->mutable_data<cpp_type>());   \
+    TensorAddFunctor<cpp_type> func(                                         \
+        numel, src_tensor->data<cpp_type>(),                                 \
+        dst_tensor->mutable_data<cpp_type>(place));                          \
     paddle::platform::VisitPlace(place, func);                               \
     return;                                                                  \
   }
......
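For readers unfamiliar with the pattern, the PADDLE_TENSOR_ADD macro above converts a runtime dtype tag into a statically typed functor call. Below is a minimal, self-contained sketch of that dispatch idiom; every name in it (DataType, DataTypeTrait, AddInPlace, TENSOR_ADD_FOR) is an illustrative stand-in, not Paddle's real API:

```cpp
#include <cstdint>
#include <cstdio>

// Simplified runtime dtype tag (hypothetical, for illustration).
enum class DataType { kFloat32, kFloat64, kInt32 };

template <typename T> struct DataTypeTrait;
template <> struct DataTypeTrait<float>  { static constexpr DataType value = DataType::kFloat32; };
template <> struct DataTypeTrait<double> { static constexpr DataType value = DataType::kFloat64; };
template <> struct DataTypeTrait<int>    { static constexpr DataType value = DataType::kInt32; };

// Element-wise add over raw buffers; the real functor also carries a place
// so VisitPlace can route it to the right device.
template <typename T>
void AddInPlace(int64_t numel, const T* src, T* dst) {
  for (int64_t i = 0; i < numel; ++i) dst[i] += src[i];
}

// Each expansion turns one runtime dtype check into a statically typed call,
// mirroring the structure of PADDLE_TENSOR_ADD.
#define TENSOR_ADD_FOR(cpp_type)                            \
  if (dtype == DataTypeTrait<cpp_type>::value) {            \
    AddInPlace<cpp_type>(numel,                             \
                         static_cast<const cpp_type*>(src), \
                         static_cast<cpp_type*>(dst));      \
    return;                                                 \
  }

void TensorAdd(DataType dtype, int64_t numel, const void* src, void* dst) {
  TENSOR_ADD_FOR(float)
  TENSOR_ADD_FOR(double)
  TENSOR_ADD_FOR(int)
  std::fprintf(stderr, "unsupported dtype\n");  // fall-through: no match
}
#undef TENSOR_ADD_FOR

int main() {
  float a[3] = {1, 2, 3}, b[3] = {10, 20, 30};
  TensorAdd(DataType::kFloat32, 3, a, b);
  std::printf("%g %g %g\n", b[0], b[1], b[2]);  // 11 22 33
  return 0;
}
```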
-add_subdirectory(final_state_generator)
+#add_subdirectory(final_state_generator)
 set(EAGER_GENERETOR_DEPS ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag)
......
@@ -75,6 +75,10 @@ def GetAutoGradMetaName(string):
     return f"{string}_autograd_meta"


+def GetAutoGradMetaVectorName(string):
+    return f"{string}_autograd_meta_vec"
+
+
 ######################
 ###  File Readers  ###
 ######################
@@ -219,10 +223,6 @@ def ParseYamlBackward(args_str, returns_str):
 def ForwardsValidationCheck(forward_inputs_list, forward_attrs_list,
                             forward_returns_list, orig_forward_inputs_list,
                             orig_forward_attrs_list, orig_forward_returns_list):
-    # inputs_list = [ [input_name, input_type, orig_position], ...]
-    # attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]
-    # forward_returns_list = [ [ret_name, ret_type, orig_position] ...]
-    # orig_returns_list = [ [ret_type, orig_position], ...]
     for i in range(len(forward_inputs_list)):
         forward_input_name = forward_inputs_list[i][0]
         forward_input_type = forward_inputs_list[i][1]
@@ -270,9 +270,6 @@ def ForwardsValidationCheck(forward_inputs_list, forward_attrs_list,
 def BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map,
                             backward_attrs_list):
-    # backward_fwd_input_map = { "name" : [type, is_fwd_input, orig_position] ...}
-    # backward_grad_input_map = { "name" : [type, fwd_position, orig_position] ...}
-    # backward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]

     # Check Order: TensorWrappers, GradTensors, Attributes
     max_fwd_input_position = -1
@@ -291,10 +288,6 @@ def BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map,
 def DetermineForwardPositionMap(forward_inputs_list, forward_returns_list):
-    # inputs_list = [ [input_name, input_type, orig_position], ...]
-    # forward_returns_list = [ [ret_name, ret_type, orig_position] ...]
-    # forward_position_map = { "name" : [type, fwd_position] ...}
     forward_inputs_position_map = {}
     forward_outputs_position_map = {}
     for i in range(len(forward_inputs_list)):
@@ -319,15 +312,6 @@ def DetermineForwardPositionMap(forward_inputs_list, forward_returns_list):
 def SlotNameMatching(backward_inputs_list, backward_returns_list,
                      forward_inputs_position_map, forward_outputs_position_map):
-    # backward_inputs_list = [ [input_name, input_type, orig_position], ...]
-    # backward_returns_list = [ [ret_name, ret_type, orig_position], ...]
-    # forward_inputs_position_map = { "name" : [type, fwd_position] }
-    # forward_outputs_position_map = { "name" : [type, fwd_position] }
-    # backward_fwd_input_map = { "name" : [type, is_fwd_input, orig_position] ...}
-    # backward_grad_input_map = { "name" : [type, fwd_position, orig_position] ...}
-    # backward_grad_output_map = { "name" : [type, fwd_position, orig_position] ...}
     backward_fwd_input_map = {}
     backward_grad_input_map = {}
     backward_grad_output_map = {}
@@ -580,7 +564,14 @@ def GenerateNodeCreationCodes(fwd_api_name, bwd_api_name,
     compute_require_grad_args_list = ["trace_backward"]
     for name, (ttype, pos) in forward_inputs_position_map.items():
         input_autograd_meta_name = GetAutoGradMetaName(name)
-        input_autograd_meta = f"  auto* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({name});"
+        if IsPlainTensorType(ttype):
+            input_autograd_meta = f"  egr::EagerTensor* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({name});"
+        else:
+            assert IsVectorTensorType(ttype)
+            input_autograd_meta_vec_name = GetAutoGradMetaVectorName(name)
+            input_autograd_meta = f"  std::vector<egr::EagerTensor*> {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({name});\n"
+            input_autograd_meta += f"  std::vector<egr::EagerTensor*>* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};"
+
         inputs_autograd_meta_list.append(input_autograd_meta)
         compute_require_grad_args_list.append(input_autograd_meta_name)
     inputs_autograd_meta_str = "\n".join(inputs_autograd_meta_list)
@@ -592,11 +583,23 @@ def GenerateNodeCreationCodes(fwd_api_name, bwd_api_name,
     num_fwd_outputs = len(forward_outputs_position_map.keys())
     for name, (rtype, pos) in forward_outputs_position_map.items():
         output_autograd_meta_name = GetAutoGradMetaName(name)
+        output_autograd_meta_vec_name = GetAutoGradMetaVectorName(name)
         if num_fwd_outputs == 1:
-            output_autograd_meta = f"  auto* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(outputs);"
+            if IsPlainTensorType(rtype):
+                output_autograd_meta = f"  egr::EagerTensor* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(outputs);"
+            else:
+                assert IsVectorTensorType(rtype)
+                output_autograd_meta = f"  std::vector<egr::EagerTensor*> {output_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({outputs});\n"
+                output_autograd_meta += f"  std::vector<egr::EagerTensor*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
         else:
             # Tuple api_result
-            outputs_autograd_meta = f"  auto* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(outputs[{pos}]);"
+            if IsPlainTensorType(rtype):
+                outputs_autograd_meta = f"  egr::EagerTensor* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(outputs[{pos}]);"
+            else:
+                assert IsVectorTensorType(rtype)
+                output_autograd_meta = f"  std::vector<egr::EagerTensor*> {output_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta(outputs[{pos}]);\n"
+                output_autograd_meta += f"  std::vector<egr::EagerTensor*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
+
         outputs_autograd_meta_list.append(output_autograd_meta)
         pass_stop_gradient_args_list.append(output_autograd_meta_name)
@@ -786,7 +789,6 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
   auto outputs = {};

-  // Node Creation
 {}

   // Returns
@@ -903,17 +905,10 @@ if __name__ == "__main__":
         # Collect Forward Inputs/Outputs
         forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward(
             bwd_forward_str)
-        print("Parsed Forward Inputs List: ", forward_inputs_list)
-        print("Prased Forward Attrs List: ", forward_attrs_list)
-        print("Parsed Forward Returns List: ", forward_returns_list)

         # Collect Original Forward Inputs/Outputs and then perform validation checks
         orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward(
             fwd_args_str, fwd_returns_str)
-        print("Parsed Original Forward Inputs List: ", orig_forward_inputs_list)
-        print("Prased Original Forward Attrs List: ", orig_forward_attrs_list)
-        print("Parsed Original Forward Returns List: ",
-              orig_forward_returns_list)

         # Forward Validation Checks
         ForwardsValidationCheck(forward_inputs_list, forward_attrs_list,
@@ -924,25 +919,15 @@ if __name__ == "__main__":
         # Parse Backward Inputs/Outputs
         backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward(
             bwd_args_str, bwd_returns_str)
-        print("Parsed Backward Inputs List: ", backward_inputs_list)
-        print("Prased Backward Attrs List: ", backward_attrs_list)
-        print("Parsed Backward Returns List: ", backward_returns_list)

         # Determine Forward Inputs/Outputs Position
         forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap(
             forward_inputs_list, forward_returns_list)
-        print("Generated Forward Input Position Map: ",
-              forward_inputs_position_map)
-        print("Generated Forward Output Position Map: ",
-              forward_outputs_position_map)

         # SlotName Matching
         backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching(
             backward_inputs_list, backward_returns_list,
             forward_inputs_position_map, forward_outputs_position_map)
-        print("Generated Backward Fwd Input Map: ", backward_fwd_input_map)
-        print("Generated Backward Grad Input Map: ", backward_grad_input_map)
-        print("Generated Backward Grad Output Map: ", backward_grad_output_map)

         # Backward Validation Check
         BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map,
@@ -951,13 +936,11 @@ if __name__ == "__main__":
         # Node Declaration Generation
         node_declaration_str += GenerateNodeDeclaration(
             fwd_api_name, backward_fwd_input_map, backward_attrs_list)
-        print("Generated Node Declaration: ", node_declaration_str)

         node_definition_str += GenerateNodeDefinition(
             fwd_api_name, bwd_api_name, backward_fwd_input_map,
             backward_grad_input_map, backward_grad_output_map,
             backward_attrs_list)
-        print("Generated Node Definition: ", node_definition_str)

         # Node Definition Generation
         definition_declaration_pair = GenerateForwardDefinition(
@@ -965,8 +948,6 @@ if __name__ == "__main__":
             forward_outputs_position_map, forward_attrs_list,
             backward_fwd_input_map, backward_grad_input_map,
             backward_grad_output_map, backward_attrs_list)
-        print("Generated Forward Definition: ", forward_definition_str)
-        print("Generated Forward Declaration: ", forward_declaration_str)
         forward_definition_str += definition_declaration_pair[0]
         forward_declaration_str += definition_declaration_pair[1]
......
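For vector-of-tensor slots, the generator above emits a named local vector plus a pointer to it, so that plain and vector inputs can flow through the same pointer-shaped argument list. A minimal sketch of why that shape works; Meta and AnyRequiresGrad are hypothetical stand-ins for the egr:: helpers, not the real API:

```cpp
#include <iostream>
#include <vector>

// Hypothetical stand-in for an autograd-meta object.
struct Meta { bool stop_gradient = false; };

// Overloads accepting either one meta or a whole slot of metas, mirroring
// the ComputeRequireGrad-style helpers the generated code calls into.
bool AnyRequiresGrad(Meta* m) { return m && !m->stop_gradient; }
bool AnyRequiresGrad(std::vector<Meta*>* ms) {
  for (Meta* m : *ms)
    if (AnyRequiresGrad(m)) return true;
  return false;
}

int main() {
  // Plain input: pass the pointer directly.
  Meta a;
  std::cout << AnyRequiresGrad(&a) << "\n";  // 1

  // Vector input: the generated code materializes a named local vector first
  // (x_autograd_meta_vec), then passes its address (x_autograd_meta), which
  // keeps the vector alive for the duration of the call.
  Meta b, c;
  std::vector<Meta*> x_autograd_meta_vec = {&b, &c};
  std::vector<Meta*>* x_autograd_meta = &x_autograd_meta_vec;
  std::cout << AnyRequiresGrad(x_autograd_meta) << "\n";  // 1
  return 0;
}
```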
@@ -36,7 +36,8 @@ TEST(AccumulationNode, EagerTensor) {
                              paddle::platform::CPUPlace())
                              .get(),
                          meta);
-  dt0->mutable_data<paddle::platform::float16>()[0] = 10.0;
+  dt0->mutable_data<paddle::platform::float16>(
+      paddle::platform::CPUPlace())[0] = 10.0;
   EagerTensor et0 = EagerTensor(dt0);

   std::shared_ptr<pten::DenseTensor> dt1 = std::make_shared<pten::DenseTensor>(
@@ -45,7 +46,8 @@ TEST(AccumulationNode, EagerTensor) {
                              .get(),
                          meta);
-  dt1->mutable_data<paddle::platform::float16>()[0] = 20.0;
+  dt1->mutable_data<paddle::platform::float16>(
+      paddle::platform::CPUPlace())[0] = 20.0;
   EagerTensor et1 = EagerTensor(dt1);

   std::shared_ptr<pten::DenseTensor> grad_dt =
......
@@ -46,7 +46,7 @@ TEST(AutogradMeta, MemberFunction) {
                              paddle::platform::CPUPlace())
                              .get(),
                          meta);
-  auto* dt_ptr = dt->mutable_data<float>();
+  auto* dt_ptr = dt->mutable_data<float>(paddle::platform::CPUPlace());
   dt_ptr[0] = 5.0f;
   dt_ptr[1] = 10.0f;
   grad_t->set_impl(dt);
......
@@ -40,7 +40,7 @@ TEST(EagerTensor, Constructor) {
                              paddle::platform::CPUPlace())
                              .get(),
                          meta);
-  auto* dt_ptr = dt->mutable_data<float>();
+  auto* dt_ptr = dt->mutable_data<float>(paddle::platform::CPUPlace());
   dt_ptr[0] = 5.0f;
   dt_ptr[1] = 10.0f;
   egr::EagerTensor et3 = egr::EagerTensor(dt);
@@ -70,7 +70,7 @@ TEST(EagerTensor, MemberFunction) {
                              paddle::platform::CPUPlace())
                              .get(),
                          meta);
-  auto* dt_ptr = dt->mutable_data<float>();
+  auto* dt_ptr = dt->mutable_data<float>(paddle::platform::CPUPlace());
   dt_ptr[0] = 5.0f;
   dt_ptr[1] = 10.0f;
   VLOG(6) << "Make Dense Tensor";
......
@@ -45,7 +45,7 @@ TEST(GradNodeInfo, GradNodeBase) {
                              paddle::platform::CPUPlace())
                              .get(),
                          meta);
-  auto* dt_ptr = dt->mutable_data<float>();
+  auto* dt_ptr = dt->mutable_data<float>(paddle::platform::CPUPlace());
   dt_ptr[0] = 5.0f;
   egr::EagerTensor et1(dt);
   grads = {{et1}};
@@ -102,7 +102,7 @@ TEST(GradNodeInfo, GradNodeBase) {
                              paddle::platform::CPUPlace())
                              .get(),
                          meta);
-  auto* dt_ptr = dt->mutable_data<float>();
+  auto* dt_ptr = dt->mutable_data<float>(paddle::platform::CPUPlace());
   dt_ptr[0] = 6.0f;
   auto* et_ptr =
       std::dynamic_pointer_cast<pten::DenseTensor>(et.impl())->data<float>();
@@ -121,8 +121,8 @@ TEST(GradNodeInfo, GradNodeBase) {
   VLOG(6) << "Test Reduce Hook";
   auto reduce_hook = [&](void) -> void {
-    auto* et_ptr = std::dynamic_pointer_cast<pten::DenseTensor>(et1.impl())
-                       ->mutable_data<float>();
+    auto* et_ptr =
+        std::dynamic_pointer_cast<pten::DenseTensor>(et1.impl())->data<float>();
     et_ptr[0] = 100.0;
     VLOG(6) << "Running Reduce Hook";
   };
......
@@ -41,7 +41,7 @@ class GradTestNode : public egr::GradNodeBase {
                              paddle::platform::CPUPlace())
                              .get(),
                          meta);
-    auto* dt_ptr = dt->mutable_data<float>();
+    auto* dt_ptr = dt->mutable_data<float>(paddle::platform::CPUPlace());
     dt_ptr[0] = 6.0f;
     egr::EagerTensor et1(dt);
     std::vector<std::vector<egr::EagerTensor>> res = {{et1}};
......
@@ -57,7 +57,7 @@ TEST(GradTensorHolder, Interfaces) {
                              paddle::platform::CPUPlace())
                              .get(),
                          meta);
-  dt0->mutable_data<float>()[0] = 10.0;
+  dt0->mutable_data<float>(paddle::platform::CPUPlace())[0] = 10.0;
   EagerTensor et0 = EagerTensor(dt0);

   std::shared_ptr<pten::DenseTensor> dt1 = std::make_shared<pten::DenseTensor>(
@@ -65,7 +65,7 @@ TEST(GradTensorHolder, Interfaces) {
                              paddle::platform::CPUPlace())
                              .get(),
                          meta);
-  dt1->mutable_data<float>()[0] = 20.0;
+  dt1->mutable_data<float>(paddle::platform::CPUPlace())[0] = 20.0;
   EagerTensor et1 = EagerTensor(dt1);

   // Constructor empty GradTensorHolder
......
@@ -29,7 +29,7 @@ TEST(TensorWrapper, Basic) {
                              paddle::platform::CPUPlace())
                              .get(),
                          meta);
-  auto* dt_ptr = dt->mutable_data<float>();
+  auto* dt_ptr = dt->mutable_data<float>(paddle::platform::CPUPlace());
   dt_ptr[0] = 5.0f;
   dt_ptr[1] = 10.0f;
   et1.set_impl(dt);
@@ -56,7 +56,7 @@ TEST(TensorWrapper, Basic) {
                              paddle::platform::CPUPlace())
                              .get(),
                          meta2);
-  auto* dt_ptr2 = dt->mutable_data<float>();
+  auto* dt_ptr2 = dt->mutable_data<float>(paddle::platform::CPUPlace());
   dt_ptr2[0] = 6.0f;
   dt_ptr2[1] = 11.0f;
   et2.set_impl(dt2);
......
@@ -35,7 +35,7 @@ TEST(EagerUtils, AutoGradMeta) {
                              paddle::platform::CPUPlace())
                              .get(),
                          meta);
-  dt0->mutable_data<float>()[0] = 10.0;
+  dt0->mutable_data<float>(paddle::platform::CPUPlace())[0] = 10.0;
   EagerTensor et0 = EagerTensor(dt0);

   std::shared_ptr<pten::DenseTensor> dt1 = std::make_shared<pten::DenseTensor>(
@@ -43,7 +43,7 @@ TEST(EagerUtils, AutoGradMeta) {
                              paddle::platform::CPUPlace())
                              .get(),
                          meta);
-  dt1->mutable_data<float>()[0] = 20.0;
+  dt1->mutable_data<float>(paddle::platform::CPUPlace())[0] = 20.0;
   EagerTensor et1 = EagerTensor(dt1);

   std::vector<EagerTensor> ets = {et0, et1};
@@ -112,7 +112,7 @@ egr::EagerTensor CreateTestCPUTensor(T val,
                              paddle::platform::CPUPlace())
                              .get(),
                          meta);
-  auto* dt_ptr = dt->mutable_data<T>();
+  auto* dt_ptr = dt->mutable_data<T>(paddle::platform::CPUPlace());
   for (int64_t i = 0; i < dt->numel(); i++) {
     dt_ptr[i] = val;
   }
......
@@ -44,8 +44,8 @@ egr::EagerTensor hook_function(const egr::EagerTensor& t) {
           paddle::memory::Alloc(place, bytes_size)),
       std::move(ret_meta));

-  float* t_ptr = t_dense->mutable_data<float>();
-  float* ret_ptr = ret_dense->mutable_data<float>();
+  float* t_ptr = t_dense->mutable_data<float>(place);
+  float* ret_ptr = ret_dense->mutable_data<float>(place);
   for (int i = 0; i < ret_dense->numel(); i++) {
     ret_ptr[i] = t_ptr[i] + 5.0;
   }
@@ -184,7 +184,7 @@ TEST(FwdBwdJoint, BranchedNodes) {
   // Examine Forward Output 2
   {
     auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out2.impl());
-    float* ptr = dense_out->mutable_data<float>();
+    float* ptr = dense_out->mutable_data<float>(paddle::platform::CPUPlace());
     for (int i = 0; i < 20; i++) {
       PADDLE_ENFORCE(ptr[i] == 150.0,
                      paddle::platform::errors::Fatal(
......
@@ -45,8 +45,8 @@ egr::EagerTensor hook_function(const egr::EagerTensor& t) {
           paddle::memory::Alloc(place, bytes_size)),
       std::move(ret_meta));

-  float* t_ptr = t_dense->mutable_data<float>();
-  float* ret_ptr = ret_dense->mutable_data<float>();
+  float* t_ptr = t_dense->mutable_data<float>(place);
+  float* ret_ptr = ret_dense->mutable_data<float>(place);
   for (int i = 0; i < ret_dense->numel(); i++) {
     ret_ptr[i] = t_ptr[i] + 3.0;
   }
......
@@ -34,7 +34,7 @@ bool CompareGradTensorWithValue(const egr::EagerTensor& target, T value) {
   egr::AutogradMeta* meta = egr::EagerUtils::unsafe_autograd_meta(target);
   auto grad_dense =
       std::dynamic_pointer_cast<pten::DenseTensor>(meta->Grad().impl());
-  T* ptr = grad_dense->mutable_data<T>();
+  T* ptr = grad_dense->data<T>();

   std::vector<T> host_data(grad_dense->numel());
   if (paddle::platform::is_gpu_place(grad_dense->place())) {
@@ -67,7 +67,7 @@ template <typename T>
 bool CompareTensorWithValue(const egr::EagerTensor& target, T value) {
   // TODO(jiabin): Support Selected Rows later
   auto dense_t = std::dynamic_pointer_cast<pten::DenseTensor>(target.impl());
-  T* ptr = dense_t->mutable_data<T>();
+  T* ptr = dense_t->data<T>();

   std::vector<T> host_data(dense_t->numel());
   if (paddle::platform::is_gpu_place(dense_t->place())) {
......
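These test changes track an API split in pten::DenseTensor: mutable_data<T>() now takes the target place because it may allocate, while plain data<T>() stays allocation-free for reads. A toy sketch of that distinction, assuming a simplified Tensor class (not the real pten one):

```cpp
#include <vector>

// Which device the memory lives on (stand-in for paddle::platform::Place).
struct Place {};

template <typename T>
class Tensor {
 public:
  explicit Tensor(long long numel) : numel_(numel) {}

  // mutable_data may allocate, so it must know the target place.
  T* mutable_data(const Place& /*place*/) {
    if (buf_.empty()) buf_.resize(static_cast<size_t>(numel_));
    return buf_.data();
  }

  // data is read-only access to memory that must already exist; it can
  // never allocate, so it needs no place argument.
  const T* data() const { return buf_.data(); }

 private:
  long long numel_;
  std::vector<T> buf_;
};

int main() {
  Place cpu;
  Tensor<float> t(4);
  float* p = t.mutable_data(cpu);  // allocates on first use
  for (int i = 0; i < 4; ++i) p[i] = 1.5f * i;
  const float* q = t.data();       // pure read, no allocation path
  return q[3] == 4.5f ? 0 : 1;
}
```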
@@ -20,9 +20,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/complex.h"
-#include "paddle/fluid/platform/eigen_ext.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/kernels/funcs/eigen/extensions.h"

 namespace paddle {
 namespace framework {
......
@@ -38,8 +38,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/operators/reader/blocking_queue.h"
 #include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/timer.h"
+#include "paddle/pten/backends/dynload/port.h"

 namespace paddle {
 namespace framework {
......
@@ -19,14 +19,18 @@
 #define _LINUX
 #endif

-#ifdef _LINUX
 #ifndef likely
-#define likely(x) __builtin_expect((x), 1)
+#ifdef _LINUX
+#define likely(expr) (__builtin_expect(!!(expr), 1))
+#else
+#define likely(expr) (expr)
 #endif
 #endif

-#ifdef _LINUX
 #ifndef unlikely
-#define unlikely(x) __builtin_expect((x), 0)
+#ifdef _LINUX
+#define unlikely(expr) (__builtin_expect(!!(expr), 0))
+#else
+#define unlikely(expr) (expr)
 #endif
 #endif
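The reordered guards above make likely/unlikely usable on non-Linux builds by falling back to a pass-through expansion, and the added !!(expr) normalizes any scalar or pointer expression to 0/1 before __builtin_expect compares it. A small self-contained usage sketch under those assumptions:

```cpp
#include <cstdio>

// Same pattern as the diff: builtin hint where available, no-op elsewhere.
#if defined(__GNUC__)
#define likely(expr) (__builtin_expect(!!(expr), 1))
#define unlikely(expr) (__builtin_expect(!!(expr), 0))
#else
#define likely(expr) (expr)
#define unlikely(expr) (expr)
#endif

int ParseRecord(const char* line) {
  // The error path is rare, so hint the compiler to lay out the hot path
  // (successful parse) as the fall-through case.
  if (unlikely(line == nullptr)) {
    std::fprintf(stderr, "bad record\n");
    return -1;
  }
  return 0;
}

int main() { return ParseRecord("ok"); }
```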
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
 #include "paddle/fluid/framework/io/shell.h"
 #include "paddle/fluid/platform/enforce.h"
......
@@ -34,8 +34,8 @@
 #include <utility>
 #include <vector>

-#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/string/string_helper.h"
+#include "paddle/pten/backends/dynload/port.h"

 #if defined(__arm__) || defined(__aarch64__) || defined(__ARM_NEON) || \
     defined(__ARM_NEON__)
......
@@ -34,7 +34,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/trainer_desc.pb.h"
 #include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/operators/reader/blocking_queue.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/port.h"

 namespace paddle {
 namespace framework {
......
@@ -19,7 +19,7 @@
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/port.h"

 namespace paddle {
 namespace inference {
......
@@ -28,7 +28,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/port.h"

 #ifdef _WIN32
 #include <direct.h>
......
@@ -20,7 +20,7 @@
 #include <vector>

 #include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/port.h"

 namespace paddle {
 namespace framework {
......
@@ -31,8 +31,8 @@
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/pten/backends/dynload/port.h"

 extern std::string paddle::framework::DataTypeToString(
     const framework::proto::VarType::Type type);
......
@@ -22,8 +22,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/errors.h"
-#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle/pten/backends/dynload/port.h"

 DECLARE_bool(use_mkldnn);
......
@@ -8,7 +8,6 @@ set(pybind_file_final ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h)
 file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operators/CMakeLists.txt. DO NOT EDIT!\n\n")

 add_subdirectory(math)
-add_subdirectory(eigen)
 add_subdirectory(controlflow)
 add_subdirectory(detection)
 add_subdirectory(elementwise)
......
@@ -23,7 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/common_infer_shape_functions.h"
 #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/port.h"

 DECLARE_bool(use_mkldnn);
......
@@ -22,49 +22,40 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-template <typename T>
-struct LessThanFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T a, const T b) const { return a < b; }
-};
-
-template <typename T>
-struct LessEqualFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T a, const T b) const { return a <= b; }
-};
-
-template <typename T>
-struct GreaterThanFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T a, const T b) const { return a > b; }
-};
-
-template <typename T>
-struct GreaterEqualFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T a, const T b) const { return a >= b; }
-};
+#define COMPARE_FUNCTOR(func_name, op)                           \
+  template <typename InT, typename OutT = bool>                  \
+  struct func_name {                                             \
+    using ELEM_TYPE = InT;                                       \
+    HOSTDEVICE OutT operator()(const InT a, const InT b) const { \
+      return static_cast<OutT>(a op b);                          \
+    }                                                            \
+  };
+
+COMPARE_FUNCTOR(LessThanFunctor, <)
+COMPARE_FUNCTOR(LessEqualFunctor, <=)
+COMPARE_FUNCTOR(GreaterThanFunctor, >)
+COMPARE_FUNCTOR(GreaterEqualFunctor, >=)
+#undef COMPARE_FUNCTOR

-template <typename T>
+template <typename InT, typename OutT = bool>
 struct EqualFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T a, const T b) const {
-    if (std::is_floating_point<T>::value) {
+  using ELEM_TYPE = InT;
+  HOSTDEVICE OutT operator()(const InT a, const InT b) const {
+    if (std::is_floating_point<InT>::value) {
       // This branch will be optimized while compiling if T is integer. It is
       // safe to cast a and b to double.
-      return fabs(static_cast<double>(a - b)) < 1e-8;
+      return static_cast<OutT>(fabs(static_cast<double>(a - b)) < 1e-8);
     } else {
-      return (a == b);
+      return static_cast<OutT>(a == b);
     }
   }
 };

-template <typename T>
+template <typename InT, typename OutT = bool>
 struct NotEqualFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T a, const T b) const {
-    return !EqualFunctor<T>()(a, b);
+  using ELEM_TYPE = InT;
+  HOSTDEVICE bool operator()(const InT a, const InT b) const {
+    return !EqualFunctor<InT, OutT>()(a, b);
   }
 };
......
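The COMPARE_FUNCTOR macro stamps out one functor per operator and adds an OutT parameter so comparisons can write into non-bool output tensors. A standalone reproduction showing the expansion and the widened output type:

```cpp
#include <cstdio>

#ifndef HOSTDEVICE
#define HOSTDEVICE  // expands to __host__ __device__ in CUDA builds
#endif

// Same macro shape as in the diff, reproduced standalone.
#define COMPARE_FUNCTOR(func_name, op)                           \
  template <typename InT, typename OutT = bool>                  \
  struct func_name {                                             \
    using ELEM_TYPE = InT;                                       \
    HOSTDEVICE OutT operator()(const InT a, const InT b) const { \
      return static_cast<OutT>(a op b);                          \
    }                                                            \
  };

COMPARE_FUNCTOR(LessThanFunctor, <)
#undef COMPARE_FUNCTOR

int main() {
  // OutT defaults to bool, but can be widened (e.g. to float) when the
  // comparison result feeds a non-bool output buffer.
  LessThanFunctor<int> lt;
  LessThanFunctor<int, float> ltf;
  std::printf("%d %g\n", lt(1, 2), ltf(3, 2));  // 1 0
  return 0;
}
```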
-file(GLOB EIGEN_CC_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
-file(GLOB EIGEN_CU_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cu")
-
-if(WITH_GPU)
-  nv_library(eigen_function SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES} DEPS eigen3)
-elseif(WITH_ROCM)
-  hip_library(eigen_function SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES} DEPS eigen3)
-else()
-  cc_library(eigen_function SRCS ${EIGEN_CC_SOURCES} DEPS eigen3)
-endif()
@@ -18,243 +18,71 @@ limitations under the License. */
 #ifndef NOMINMAX
 #define NOMINMAX
 #endif

-#include "unsupported/Eigen/CXX11/Tensor"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"

 namespace paddle {
 namespace operators {

 template <typename EigenDevice, typename T, int Rank>
-struct EigenBroadcast {
-  using Array = Eigen::DSizes<Eigen::DenseIndex, Rank>;
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using InType32BitIndex =
-      Eigen::TensorMap<Eigen::Tensor<const T, Rank, Eigen::RowMajor, int>,
-                       Eigen::Aligned>;
-  using OutType = Eigen::TensorMap<
-      Eigen::Tensor<T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType32BitIndex =
-      Eigen::TensorMap<Eigen::Tensor<T, Rank, Eigen::RowMajor, int>,
-                       Eigen::Aligned>;
-  static void Eval(const EigenDevice& dev, OutType out, InType in,
-                   const Array& bcast);
-  static void Eval(const EigenDevice& dev, OutType32BitIndex out,
-                   InType32BitIndex in, const Array& bcast);
-};
+using EigenBroadcast = pten::funcs::EigenBroadcast<EigenDevice, T, Rank>;

 template <typename EigenDevice, typename T, int Rank>
-struct EigenBroadcastGrad {
-  using Array = Eigen::DSizes<Eigen::DenseIndex, Rank>;
-  using Array2 = Eigen::DSizes<Eigen::DenseIndex, Rank * 2>;
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType out, InType in,
-                   const Array& reduce_dims, const Array2& reshape_dims);
-};
+using EigenBroadcastGrad =
+    pten::funcs::EigenBroadcastGrad<EigenDevice, T, Rank>;

 template <typename EigenDevice, typename T, int Rank>
-struct EigenConstant {
-  using Type = Eigen::TensorMap<
-      Eigen::Tensor<T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, Type out, const T value);
-};
+using EigenConstant = pten::funcs::EigenConstant<EigenDevice, T, Rank>;

 template <typename EigenDevice, typename T>
-struct EigenSign {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType out, const InType& in);
-};
+using EigenSign = pten::funcs::EigenSign<EigenDevice, T>;

 template <typename EigenDevice, typename T, int Rank>
-struct EigenReverse {
-  using Array = Eigen::DSizes<bool, Rank>;
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType = Eigen::TensorMap<
-      Eigen::Tensor<T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType out, const InType& in,
-                   const Array& reverse);
-};
+using EigenReverse = pten::funcs::EigenReverse<EigenDevice, T, Rank>;

 template <typename EigenDevice, typename T>
-struct EigenAdd {
-  using InType = Eigen::TensorMap<Eigen::TensorFixedSize<
-      const T, Eigen::Sizes<>, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType = Eigen::TensorMap<Eigen::TensorFixedSize<
-      T, Eigen::Sizes<>, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType out, const InType& in,
-                   const T value);
-};
+using EigenAdd = pten::funcs::EigenAdd<EigenDevice, T>;

 template <typename EigenDevice, typename T>
-struct EigenSub {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType out, const InType& left,
-                   const InType& right);
-};
+using EigenSub = pten::funcs::EigenSub<EigenDevice, T>;

 template <typename EigenDevice, typename T, int Rank>
-struct EigenSlice {
-  using Array = Eigen::DSizes<Eigen::DenseIndex, Rank>;
-  using Array32Bit = Eigen::DSizes<int, Rank>;
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using InType32BitIndex =
-      Eigen::TensorMap<Eigen::Tensor<const T, Rank, Eigen::RowMajor, int>,
-                       Eigen::Aligned>;
-  using OutType = Eigen::TensorMap<
-      Eigen::Tensor<T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType32BitIndex =
-      Eigen::TensorMap<Eigen::Tensor<T, Rank, Eigen::RowMajor, int>,
-                       Eigen::Aligned>;
-  static void Eval(const EigenDevice& dev, OutType out, const InType& in,
-                   const Array& offsets, const Array& extents);
-  static void Eval(const EigenDevice& dev, OutType32BitIndex out,
-                   const InType32BitIndex& in, const Array32Bit& offsets,
-                   const Array32Bit& extents);
-};
+using EigenSlice = pten::funcs::EigenSlice<EigenDevice, T, Rank>;

 template <typename EigenDevice, typename T, int Rank>
-struct EigenPad {
-  using Array = std::array<std::pair<int64_t, int64_t>, Rank>;
-  using Array32Bit = std::array<std::pair<int, int>, Rank>;
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using InType32BitIndex =
-      Eigen::TensorMap<Eigen::Tensor<const T, Rank, Eigen::RowMajor, int>,
-                       Eigen::Aligned>;
-  using OutType = Eigen::TensorMap<
-      Eigen::Tensor<T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType32BitIndex =
-      Eigen::TensorMap<Eigen::Tensor<T, Rank, Eigen::RowMajor, int>,
-                       Eigen::Aligned>;
-  static void Eval(const EigenDevice& dev, OutType out, const InType& in,
-                   const Array& padding, const T value);
-  static void Eval(const EigenDevice& dev, OutType32BitIndex out,
-                   const InType32BitIndex& in, const Array32Bit& padding,
-                   const T value);
-};
+using EigenPad = pten::funcs::EigenPad<EigenDevice, T, Rank>;

 template <typename EigenDevice, typename T>
-struct EigenScale {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType out, const InType& in,
-                   const T scale, const T bias, const bool bias_after_scale);
-};
+using EigenScale = pten::funcs::EigenScale<EigenDevice, T>;

 template <typename EigenDevice, typename T>
-struct EigenErf {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType out, const InType& in);
-};
+using EigenErf = pten::funcs::EigenErf<EigenDevice, T>;

 template <typename EigenDevice, typename T>
-struct EigenErfGrad {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType din, const InType& in,
-                   const InType& dout);
-};
+using EigenErfGrad = pten::funcs::EigenErfGrad<EigenDevice, T>;

 template <typename EigenDevice, typename T>
-struct EigenRankLoss {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType out, const InType& label,
-                   const InType& left, const InType& right);
-};
+using EigenRankLoss = pten::funcs::EigenRankLoss<EigenDevice, T>;

 template <typename EigenDevice, typename T>
-struct EigenRankLossGrad {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void EvalLeft(const EigenDevice& dev, OutType dleft,
-                       const InType& dout, const InType& label,
-                       const InType& left, const InType& right);
-  static void EvalRight(const EigenDevice& dev, OutType dright,
-                        const InType& dout, const InType& label,
-                        const InType& left, const InType& right);
-};
+using EigenRankLossGrad = pten::funcs::EigenRankLossGrad<EigenDevice, T>;

 template <typename EigenDevice, typename T>
-struct EigenLogLoss {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType out, const InType& pred,
-                   const InType& label, const T& epsilon);
-};
+using EigenLogLoss = pten::funcs::EigenLogLoss<EigenDevice, T>;

 template <typename EigenDevice, typename T>
-struct EigenLogLossGrad {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType dpred, const InType& dloss,
-                   const InType& pred, const InType& label, const T& epsilon);
-};
+using EigenLogLossGrad = pten::funcs::EigenLogLossGrad<EigenDevice, T>;

 template <typename EigenDevice, typename T>
-struct EigenHingeLoss {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType loss, const InType& pred,
-                   const InType& label);
-};
+using EigenHingeLoss = pten::funcs::EigenHingeLoss<EigenDevice, T>;

 template <typename EigenDevice, typename T>
-struct EigenHingeLossGrad {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType dpred, const InType& dloss,
-                   const InType& pred, const InType& label);
-};
+using EigenHingeLossGrad = pten::funcs::EigenHingeLossGrad<EigenDevice, T>;

 template <typename EigenDevice, typename T>
-struct EigenL1Norm {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType = Eigen::TensorMap<Eigen::TensorFixedSize<
-      T, Eigen::Sizes<>, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType out, const InType& in);
-};
+using EigenL1Norm = pten::funcs::EigenL1Norm<EigenDevice, T>;

 template <typename EigenDevice, typename T>
-struct EigenL1NormGrad {
-  using Array = Eigen::DSizes<Eigen::DenseIndex, 1>;
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType din, const InType& dout,
-                   const InType& in, const Array& bcast);
-};
+using EigenL1NormGrad = pten::funcs::EigenL1NormGrad<EigenDevice, T>;

 }  // namespace operators
 }  // namespace paddle
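This header keeps every paddle::operators::EigenXxx name compiling while the implementations move to pten::funcs, by replacing each struct with an alias template. A minimal sketch of that source-compatibility pattern; oldns/newns/Scale are illustrative names, not Paddle's:

```cpp
#include <iostream>

// The implementation lives in the new namespace.
namespace newns {
template <typename T>
struct Scale {
  static T Eval(T x, T s) { return x * s; }
};
}  // namespace newns

// The old header forwards via an alias template, so existing call sites
// written against oldns::Scale<T> keep compiling unchanged.
namespace oldns {
template <typename T>
using Scale = newns::Scale<T>;
}  // namespace oldns

int main() {
  std::cout << oldns::Scale<double>::Eval(2.0, 1.5) << "\n";  // 3
  return 0;
}
```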
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "paddle/fluid/framework/expect.h"
 #include "paddle/fluid/operators/fused/fusion_gru_op.h"
 #include "paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h"
@@ -41,7 +42,7 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
                 ctx.InputName("X") + ctx.InputName("WeightH")) {
     const bool is_INT8 = std::is_same<T, uint8_t>::value;

-    if (!this->isCached()) {
+    if (unlikely(!this->isCached())) {
       // oneDNN kernel has hardcoded activation functions
       PADDLE_ENFORCE_EQ(
           ctx.Attr<std::string>("gate_activation"), "sigmoid",
......
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "paddle/fluid/framework/expect.h"
 #include "paddle/fluid/operators/fused/fusion_lstm_op.h"
 #include "paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h"
@@ -40,7 +41,7 @@ class LSTMMKLDNNHandler
           ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, weight_h, h0,
           is_reverse, N, Ti, IC, OC, 4,
           ctx.InputName("X") + ctx.InputName("WeightH")) {
-    if (!this->isCached()) {
+    if (unlikely(!this->isCached())) {
       const bool is_INT8 = std::is_same<T, uint8_t>::value;
       const bool use_peepholes = ctx.Attr<bool>("use_peepholes");
       // oneDNN kernel has hardcoded activation functions
......
@@ -15,9 +15,9 @@
 #pragma once

 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
-#include "paddle/fluid/platform/eigen_ext.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/kernels/funcs/eigen/extensions.h"

 namespace paddle {
 namespace operators {
......
@@ -477,6 +477,155 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
   }
 };

+#ifdef PADDLE_WITH_XPU
+template <typename T>
+struct MergeAdd<platform::XPUDeviceContext, T> {
+  framework::SelectedRows operator()(const platform::XPUDeviceContext& context,
+                                     const framework::SelectedRows& input,
+                                     const bool sorted_result = false) {
+    framework::SelectedRows out;
+    (*this)(context, input, &out, sorted_result);
+    return out;
+  }
+
+  void operator()(const platform::XPUDeviceContext& context,
+                  const framework::SelectedRows& input,
+                  framework::SelectedRows* output,
+                  const bool sorted_result = false) {
+    framework::Vector<int64_t> input_rows(input.rows());
+    if (input_rows.size() == 0) {
+      return;
+    }
+
+    framework::SelectedRows& out = *output;
+    std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
+    std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
+    auto input_width = input.value().dims()[1];
+
+    out.set_rows(merge_rows);
+    out.set_height(input.height());
+    out.mutable_value()->mutable_data<T>(
+        framework::make_ddim(
+            {static_cast<int64_t>(merge_rows.size()), input_width}),
+        context.GetPlace());
+    int r =
+        xpu::constant<T>(context.x_context(), out.mutable_value()->data<T>(),
+                         merge_rows.size() * input_width, static_cast<T>(0.f));
+    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                      platform::errors::External("XPU constant op return"
+                                                 " wrong value[%d %s].",
+                                                 r, XPUAPIErrorMsg[r]));
+
+    std::unordered_map<int64_t, size_t> rows_to_id;
+    for (size_t i = 0; i < merge_rows.size(); ++i) {
+      rows_to_id[merge_rows[i]] = i;
+    }
+
+    auto* out_data = out.mutable_value()->data<T>();
+    auto* input_data = input.value().data<T>();
+    int n = input_width;
+    for (size_t i = 0; i < input_rows.size(); i++) {
+      size_t out_i = rows_to_id[input_rows[i]];
+      auto r = xpu::add(context.x_context(), &input_data[i * input_width],
+                        &out_data[out_i * input_width],
+                        &out_data[out_i * input_width], n);
+      PADDLE_ENFORCE_EQ(
+          r, XPU_SUCCESS,
+          platform::errors::External("XPU API return wrong value[%d %s], ", r,
+                                     XPUAPIErrorMsg[r]));
+    }
+  }
+
+  void operator()(const platform::XPUDeviceContext& context,
+                  const std::vector<const framework::SelectedRows*>& inputs,
+                  framework::SelectedRows* output,
+                  const bool sorted_result = false) {
+    if (inputs.size() == 0) {
+      VLOG(3) << "no input! return";
+      return;
+    }
+    const framework::SelectedRows* has_value_input = nullptr;
+    for (auto* in : inputs) {
+      if (in->rows().size() > 0) {
+        has_value_input = in;
+        break;
+      }
+    }
+    if (has_value_input == nullptr) {
+      VLOG(3) << "no input has value! just return" << std::endl;
+      return;
+    }
+    auto input_width = has_value_input->value().dims()[1];
+    auto input_height = has_value_input->height();
+    framework::SelectedRows& out = *output;
+    std::set<int64_t> merged_row_set;
+    size_t row_num = 0;
+    for (auto* input : inputs) {
+      if (input->rows().size() == 0) {
+        continue;
+      }
+      PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
+                        platform::errors::InvalidArgument(
+                            "All inputs should have same "
+                            "dimension except for the first one."));
+      PADDLE_ENFORCE_EQ(input_height, input->height(),
+                        platform::errors::InvalidArgument(
+                            "All inputs should have same height."));
+      row_num += input->rows().size();
+      merged_row_set.insert(input->rows().begin(), input->rows().end());
+    }
+
+    std::vector<int64_t> merge_rows(merged_row_set.begin(),
+                                    merged_row_set.end());
+
+    if (sorted_result) {
+      std::sort(merge_rows.begin(), merge_rows.end());
+    }
+
+    out.set_rows(merge_rows);
+    out.set_height(input_height);
+    out.mutable_value()->mutable_data<T>(
+        framework::make_ddim(
+            {static_cast<int64_t>(merged_row_set.size()), input_width}),
+        context.GetPlace());
+
+    int r =
+        xpu::constant<T>(context.x_context(), out.mutable_value()->data<T>(),
+                         merge_rows.size() * input_width, static_cast<T>(0.f));
+    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                      platform::errors::External("XPU constant op return"
+                                                 " wrong value[%d %s].",
+                                                 r, XPUAPIErrorMsg[r]));
+
+    float* out_data = reinterpret_cast<float*>(out.mutable_value()->data<T>());
+
+    std::unordered_map<int64_t, size_t> rows_to_id;
+    for (size_t i = 0; i < merge_rows.size(); ++i) {
+      rows_to_id[merge_rows[i]] = i;
+    }
+
+    for (auto* input : inputs) {
+      if (input->rows().size() == 0) {
+        continue;
+      }
+      auto& input_rows = input->rows();
+      int n = input_width;
+      for (size_t i = 0; i < input_rows.size(); i++) {
+        size_t out_i = rows_to_id[input_rows[i]];
+        auto r = xpu::add(
+            context.x_context(), input->value().data<T>() + i * input_width,
+            &out_data[out_i * input_width], &out_data[out_i * input_width], n);
+        PADDLE_ENFORCE_EQ(
+            r, XPU_SUCCESS,
+            platform::errors::External("XPU API return wrong value[%d %s], ",
+                                       r, XPUAPIErrorMsg[r]));
+      }
+    }
+  }
+};
+#endif
+
 template <typename T>
 struct MergeAverage<platform::CPUDeviceContext, T> {
   framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
@@ -589,6 +738,10 @@ template struct MergeAdd<platform::CPUDeviceContext,
 template struct MergeAdd<platform::CPUDeviceContext,
                          paddle::platform::bfloat16>;
+#ifdef PADDLE_WITH_XPU
+template struct MergeAdd<platform::XPUDeviceContext, float>;
#endif
template struct MergeAverage<platform::CPUDeviceContext, int>; template struct MergeAverage<platform::CPUDeviceContext, int>;
template struct MergeAverage<platform::CPUDeviceContext, int64_t>; template struct MergeAverage<platform::CPUDeviceContext, int64_t>;
template struct MergeAverage<platform::CPUDeviceContext, float>; template struct MergeAverage<platform::CPUDeviceContext, float>;
......
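The new XPU MergeAdd specialization above follows the same recipe as the CPU one: deduplicate the row ids, allocate a zeroed output of unique_rows x width, map each row id to its output slot, and accumulate every input slice into that slot (xpu::constant and xpu::add on device). A device-free sketch of that row merge on plain host vectors; MergeAddRows is a hypothetical name for illustration only:

#include <cstddef>
#include <cstdint>
#include <set>
#include <unordered_map>
#include <vector>

// Hypothetical host-side analogue of MergeAdd: rows may contain duplicates;
// value is rows.size() x width, row-major.
void MergeAddRows(const std::vector<int64_t>& rows,
                  const std::vector<float>& value, int64_t width,
                  std::vector<int64_t>* out_rows,
                  std::vector<float>* out_value) {
  // 1) Deduplicate row ids (std::set also yields them sorted).
  std::set<int64_t> row_set(rows.begin(), rows.end());
  out_rows->assign(row_set.begin(), row_set.end());
  // 2) Zero-fill the merged buffer, one slice per unique row
  //    (xpu::constant in the kernel).
  out_value->assign(out_rows->size() * width, 0.0f);
  // 3) Map each unique row id to its slot in the output.
  std::unordered_map<int64_t, std::size_t> rows_to_id;
  for (std::size_t i = 0; i < out_rows->size(); ++i)
    rows_to_id[(*out_rows)[i]] = i;
  // 4) Accumulate every input slice into its slot (xpu::add in the kernel).
  for (std::size_t i = 0; i < rows.size(); ++i) {
    std::size_t out_i = rows_to_id[rows[i]];
    for (int64_t j = 0; j < width; ++j)
      (*out_value)[out_i * width + j] += value[i * width + j];
  }
}

int main() {
  std::vector<int64_t> rows = {4, 2, 4};
  std::vector<float> value = {1, 1, 2, 2, 3, 3};  // width 2
  std::vector<int64_t> out_rows;
  std::vector<float> out_value;
  MergeAddRows(rows, value, 2, &out_rows, &out_value);
  // out_rows == {2, 4}; out_value == {2, 2, 4, 4}: both width-2 slices of
  // row 4 were summed into one slot.
  return 0;
}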
...@@ -219,18 +219,20 @@ class MatrixRankCPUKernel : public framework::OpKernel<T> { ...@@ -219,18 +219,20 @@ class MatrixRankCPUKernel : public framework::OpKernel<T> {
tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1)); tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1));
Tensor compare_result; Tensor compare_result;
compare_result.mutable_data<int>(detail::NewAxisDim(dim_out, k), compare_result.mutable_data<int64_t>(detail::NewAxisDim(dim_out, k),
context.GetPlace()); context.GetPlace());
int axis = -1; int axis = -1;
if (eigenvalue_tensor.dims().size() >= tol_tensor.dims().size()) { if (eigenvalue_tensor.dims().size() >= tol_tensor.dims().size()) {
ElementwiseComputeEx<GreaterThanFunctor<T>, platform::CPUDeviceContext, T, ElementwiseComputeEx<GreaterThanFunctor<T, int64_t>,
int>(context, &eigenvalue_tensor, &tol_tensor, axis, platform::CPUDeviceContext, T, int>(
GreaterThanFunctor<T>(), &compare_result); context, &eigenvalue_tensor, &tol_tensor, axis,
GreaterThanFunctor<T, int64_t>(), &compare_result);
} else { } else {
ElementwiseComputeEx<LessThanFunctor<T>, platform::CPUDeviceContext, T, ElementwiseComputeEx<LessThanFunctor<T, int64_t>,
int>(context, &eigenvalue_tensor, &tol_tensor, axis, platform::CPUDeviceContext, T, int>(
LessThanFunctor<T>(), &compare_result); context, &eigenvalue_tensor, &tol_tensor, axis,
LessThanFunctor<T, int64_t>(), &compare_result);
} }
auto dito_int = auto dito_int =
math::DeviceIndependenceTensorOperations<platform::CPUDeviceContext, math::DeviceIndependenceTensorOperations<platform::CPUDeviceContext,
......
...@@ -129,10 +129,10 @@ class MatrixRankGPUKernel : public framework::OpKernel<T> { ...@@ -129,10 +129,10 @@ class MatrixRankGPUKernel : public framework::OpKernel<T> {
compare_result.mutable_data<int64_t>(detail::NewAxisDim(dim_out, k), compare_result.mutable_data<int64_t>(detail::NewAxisDim(dim_out, k),
context.GetPlace()); context.GetPlace());
int axis = -1; int axis = -1;
ElementwiseComputeEx<GreaterThanFunctor<T>, platform::CUDADeviceContext, T, ElementwiseComputeEx<GreaterThanFunctor<T, int64_t>,
int64_t>(context, &eigenvalue_tensor, &tol_tensor, platform::CUDADeviceContext, T, int64_t>(
axis, GreaterThanFunctor<T>(), context, &eigenvalue_tensor, &tol_tensor, axis,
&compare_result); GreaterThanFunctor<T, int64_t>(), &compare_result);
auto dito_int = auto dito_int =
math::DeviceIndependenceTensorOperations<platform::CUDADeviceContext, math::DeviceIndependenceTensorOperations<platform::CUDADeviceContext,
int64_t>(context); int64_t>(context);
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <vector> #include <vector>
#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/controlflow/compare_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -46,16 +47,6 @@ static DDim RemoveLastDim(const DDim& dim) { ...@@ -46,16 +47,6 @@ static DDim RemoveLastDim(const DDim& dim) {
} }
} // namespace detail } // namespace detail
template <typename T>
struct GreaterThanFunctor {
HOSTDEVICE int operator()(const T a, const T b) const { return a > b; }
};
template <typename T>
struct LessThanFunctor {
HOSTDEVICE int operator()(const T a, const T b) const { return a < b; }
};
template <typename T> template <typename T>
struct GreaterElementFunctor { struct GreaterElementFunctor {
HOSTDEVICE T operator()(const T a, const T b) const { HOSTDEVICE T operator()(const T a, const T b) const {
......
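The matrix_rank hunks above replace the file-local single-type functors with the two-type versions from compare_op.h, so the comparison's input type T and the mask's output type (int64_t here) can differ. A minimal sketch of that functor shape; the OutT = bool default is an assumption of the sketch, not necessarily the compare_op.h declaration:

#include <cstdint>

// No-op off-device; a CUDA build would expand this to __host__ __device__.
#ifndef HOSTDEVICE
#define HOSTDEVICE
#endif

template <typename InT, typename OutT = bool>
struct GreaterThanFunctor {
  HOSTDEVICE OutT operator()(const InT a, const InT b) const { return a > b; }
};

template <typename InT, typename OutT = bool>
struct LessThanFunctor {
  HOSTDEVICE OutT operator()(const InT a, const InT b) const { return a < b; }
};

int main() {
  // Compare floats but materialize the result as int64_t, matching the
  // compare_result tensor the kernels above now allocate.
  GreaterThanFunctor<float, int64_t> gt;
  return static_cast<int>(gt(2.0f, 1.0f)) == 1 ? 0 : 1;  // exits 0
}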
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include <tuple> #include <tuple>
#include "paddle/fluid/framework/expect.h"
#include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
...@@ -79,7 +80,7 @@ class ConvMKLDNNHandlerT ...@@ -79,7 +80,7 @@ class ConvMKLDNNHandlerT
dev_ctx, mkldnn_engine, cpu_place, dev_ctx, mkldnn_engine, cpu_place,
platform::CreateKey(dev_ctx, framework::vectorize(input->dims()), platform::CreateKey(dev_ctx, framework::vectorize(input->dims()),
unique_name)) { unique_name)) {
if (!this->isCached()) { if (unlikely(!this->isCached())) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
input->layout(), framework::DataLayout::kMKLDNN, input->layout(), framework::DataLayout::kMKLDNN,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
...@@ -264,7 +265,7 @@ class ConvMKLDNNHandlerT ...@@ -264,7 +265,7 @@ class ConvMKLDNNHandlerT
dev_ctx, dev_ctx.GetEngine(), cpu_place, dev_ctx, dev_ctx.GetEngine(), cpu_place,
platform::CreateKey(dev_ctx, framework::vectorize(in->dims()), platform::CreateKey(dev_ctx, framework::vectorize(in->dims()),
unique_name)) { unique_name)) {
if (!this->isBwdCached()) { if (unlikely(!this->isBwdCached())) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in->layout(), framework::DataLayout::kMKLDNN, in->layout(), framework::DataLayout::kMKLDNN,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
......
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/expect.h"
#include "paddle/fluid/platform/mkldnn_reuse.h" #include "paddle/fluid/platform/mkldnn_reuse.h"
namespace paddle { namespace paddle {
...@@ -39,7 +40,7 @@ class PReluMKLDNNHandler ...@@ -39,7 +40,7 @@ class PReluMKLDNNHandler
dev_ctx, engine, cpu_place, dev_ctx, engine, cpu_place,
platform::CreateKey(dev_ctx, framework::vectorize(x->dims()), platform::CreateKey(dev_ctx, framework::vectorize(x->dims()),
uniq_name)) { uniq_name)) {
if (!this->isCached()) { if (unlikely(!this->isCached())) {
auto x_md = memory::desc(framework::vectorize(x->dims()), auto x_md = memory::desc(framework::vectorize(x->dims()),
MKLDNNGetDataType<T>(), x->format()); MKLDNNGetDataType<T>(), x->format());
......
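The LSTM, conv, and PReLU handlers in the hunks above all follow one caching pattern: a string key is derived from the input dims plus a unique name (platform::CreateKey), and the expensive descriptor setup only runs when isCached() misses. A stripped-down sketch of such a keyed cache; BlobCache, MakeKey, and GetOrCreate are hypothetical names standing in for the real MKLDNNHandlerT machinery:

#include <cstdint>
#include <memory>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical stand-in for the per-device blob map the handlers query.
using BlobCache = std::unordered_map<std::string, std::shared_ptr<void>>;

// Rough analogue of platform::CreateKey: dims plus a unique op name.
std::string MakeKey(const std::vector<int64_t>& dims,
                    const std::string& unique_name) {
  std::ostringstream key;
  for (auto d : dims) key << d << 'x';
  key << unique_name;
  return key.str();
}

template <typename Primitive, typename Builder>
std::shared_ptr<Primitive> GetOrCreate(BlobCache* cache,
                                       const std::string& key, Builder build) {
  auto it = cache->find(key);
  if (it != cache->end())  // the isCached() hit: skip the expensive setup
    return std::static_pointer_cast<Primitive>(it->second);
  auto prim = std::make_shared<Primitive>(build());  // cold path, once per key
  (*cache)[key] = prim;
  return prim;
}

int main() {
  BlobCache cache;
  auto key = MakeKey({8, 64, 64}, "conv2d_fwd@example");
  auto p = GetOrCreate<int>(&cache, key, [] { return 42; });  // builds once
  auto q = GetOrCreate<int>(&cache, key, [] { return -1; });  // cache hit
  return (*p == *q) ? 0 : 1;  // exits 0: the second builder never ran
}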
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/optimizers/adam_op.h" #include "paddle/fluid/operators/optimizers/adam_op.h"
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -155,6 +156,11 @@ class AdamOpXPUKernel : public framework::OpKernel<T> { ...@@ -155,6 +156,11 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
mom2_out.template mutable_data<float>(ctx.GetPlace()), mom2_out.template mutable_data<float>(ctx.GetPlace()),
param_out.template mutable_data<float>(ctx.GetPlace()), param_out.template mutable_data<float>(ctx.GetPlace()),
beta1, beta2, epsilon, param.numel()); beta1, beta2, epsilon, param.numel());
xpu_wait(dev_ctx.x_context()->xpu_stream);
PADDLE_ENFORCE_EQ(
r == xpu::Error_t::SUCCESS, true,
platform::errors::External("XPU API return wrong value[%d],", r));
if (!use_global_beta_pow) { if (!use_global_beta_pow) {
// update in cpu and then copy to xpu // update in cpu and then copy to xpu
if (beta1_pow.place() == platform::CPUPlace() && if (beta1_pow.place() == platform::CPUPlace() &&
...@@ -165,7 +171,6 @@ class AdamOpXPUKernel : public framework::OpKernel<T> { ...@@ -165,7 +171,6 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
const float* beta2_pow_p = beta2_pow.template data<float>(); const float* beta2_pow_p = beta2_pow.template data<float>();
beta2_pow_out->mutable_data<float>(platform::CPUPlace())[0] = beta2_pow_out->mutable_data<float>(platform::CPUPlace())[0] =
beta2 * beta2_pow_p[0]; beta2 * beta2_pow_p[0];
xpu_wait(dev_ctx.x_context()->xpu_stream);
} else { } else {
float* beta1_pow_out_p = float* beta1_pow_out_p =
beta1_pow_out->mutable_data<float>(ctx.GetPlace()); beta1_pow_out->mutable_data<float>(ctx.GetPlace());
...@@ -177,23 +182,129 @@ class AdamOpXPUKernel : public framework::OpKernel<T> { ...@@ -177,23 +182,129 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
r, xpu::SUCCESS, r, xpu::SUCCESS,
platform::errors::External( platform::errors::External(
"XPU kernel scale occur error in adamw error code ", r, "XPU kernel scale occur error in adam error code ", r,
XPUAPIErrorMsg[r])); XPUAPIErrorMsg[r]));
r = xpu::scale(dev_ctx.x_context(), beta2_pow_ptr, beta2_pow_out_p, r = xpu::scale(dev_ctx.x_context(), beta2_pow_ptr, beta2_pow_out_p,
beta2_pow.numel(), false, beta2, 0.0f); beta2_pow.numel(), false, beta2, 0.0f);
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
r, xpu::SUCCESS, r, xpu::SUCCESS,
platform::errors::External( platform::errors::External(
"XPU kernel scale occur error in adamw error code ", r, "XPU kernel scale occur error in adam error code ", r,
XPUAPIErrorMsg[r])); XPUAPIErrorMsg[r]));
xpu_wait(dev_ctx.x_context()->xpu_stream);
}
}
} else if (grad_var->IsType<framework::SelectedRows>()) {
auto* grad = ctx.Input<framework::SelectedRows>("Grad");
auto& dev_ctx = ctx.template device_context<DeviceContext>();
if (grad->rows().size() == 0) {
VLOG(3) << "grad row size is 0!!";
return;
}
std::vector<int64_t> cpu_rows(grad->rows().begin(), grad->rows().end());
bool is_strict_sorted = true;
for (size_t i = 1; i < cpu_rows.size(); ++i) {
if (cpu_rows[i - 1] >= cpu_rows[i]) {
is_strict_sorted = false;
break;
} }
}
framework::SelectedRows tmp_grad_merge;
const framework::SelectedRows* grad_merge_ptr;
if (is_strict_sorted) {
grad_merge_ptr = grad;
} else {
scatter::MergeAdd<platform::XPUDeviceContext, T> merge_func;
merge_func(ctx.template device_context<platform::XPUDeviceContext>(),
*grad, &tmp_grad_merge, true);
xpu_wait(dev_ctx.x_context()->xpu_stream);
grad_merge_ptr = &tmp_grad_merge;
}
const T* beta1_pow_ptr = beta1_pow.template data<T>();
const T* beta2_pow_ptr = beta2_pow.template data<T>();
Tensor xpu_beta1_pow;
Tensor xpu_beta2_pow;
if (beta1_pow.place() == platform::CPUPlace() &&
beta2_pow.place() == platform::CPUPlace()) {
paddle::framework::TensorCopy(beta1_pow, ctx.GetPlace(), dev_ctx,
&xpu_beta1_pow);
paddle::framework::TensorCopy(beta2_pow, ctx.GetPlace(), dev_ctx,
&xpu_beta2_pow);
dev_ctx.Wait();
beta1_pow_ptr = xpu_beta1_pow.template data<T>();
beta2_pow_ptr = xpu_beta2_pow.template data<T>();
}
auto& grad_merge = *grad_merge_ptr;
auto& grad_tensor = grad_merge.value();
const T* grad_data = grad_tensor.template data<T>();
int row_count = grad_merge.rows().size();
std::vector<int> rows(row_count);
xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
int* xpu_rows = RAII_GUARD.alloc_l3_or_gm<int>(row_count);
std::vector<int64_t> merge_rows(grad_merge.rows().begin(),
grad_merge.rows().end());
for (size_t i = 0; i < grad_merge.rows().size(); ++i) {
rows[i] = static_cast<int>(merge_rows[i]);
}
xpu_wait(dev_ctx.x_context()->xpu_stream);
memory::Copy(ctx.GetPlace(), xpu_rows, platform::CPUPlace(), rows.data(),
row_count * sizeof(int));
auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
auto ori_rows = param.numel() / row_numel;
PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true, int lazy_mode = static_cast<int>(ctx.Attr<bool>("lazy_mode"));
platform::errors::External( int r = xpu::sparse_adam(
"XPU API return wrong value[%d], please check " dev_ctx.x_context(), grad_data, mom1.template data<T>(),
"where Baidu Kunlun Card is properly installed.", mom2.template data<T>(), param.template data<T>(), beta1_pow_ptr,
r)); beta2_pow_ptr, lr.template data<T>(),
mom1_out.template mutable_data<T>(ctx.GetPlace()),
mom2_out.template mutable_data<T>(ctx.GetPlace()),
param_out.template mutable_data<T>(ctx.GetPlace()), beta1, beta2,
epsilon, ori_rows, xpu_rows, row_numel, grad_merge.rows().size(),
lazy_mode);
PADDLE_ENFORCE_EQ(
r == xpu::Error_t::SUCCESS, true,
platform::errors::External("XPU API return wrong value[%d],", r));
if (!use_global_beta_pow) {
// update in cpu and then copy to xpu
if (beta1_pow.place() == platform::CPUPlace() &&
beta2_pow.place() == platform::CPUPlace()) {
const float* beta1_pow_p = beta1_pow.template data<float>();
beta1_pow_out->mutable_data<float>(platform::CPUPlace())[0] =
beta1 * beta1_pow_p[0];
const float* beta2_pow_p = beta2_pow.template data<float>();
beta2_pow_out->mutable_data<float>(platform::CPUPlace())[0] =
beta2 * beta2_pow_p[0];
} else {
float* beta1_pow_out_p =
beta1_pow_out->mutable_data<float>(ctx.GetPlace());
float* beta2_pow_out_p =
beta2_pow_out->mutable_data<float>(ctx.GetPlace());
int r =
xpu::scale(dev_ctx.x_context(), beta1_pow_ptr, beta1_pow_out_p,
beta1_pow.numel(), false, beta1, 0.0f);
PADDLE_ENFORCE_EQ(
r, xpu::SUCCESS,
platform::errors::External(
"XPU kernel scale occur error in adam error code ", r,
XPUAPIErrorMsg[r]));
r = xpu::scale(dev_ctx.x_context(), beta2_pow_ptr, beta2_pow_out_p,
beta2_pow.numel(), false, beta2, 0.0f);
PADDLE_ENFORCE_EQ(
r, xpu::SUCCESS,
platform::errors::External(
"XPU kernel scale occur error in adam error code ", r,
XPUAPIErrorMsg[r]));
}
} }
xpu_wait(dev_ctx.x_context()->xpu_stream);
} else { } else {
PADDLE_ENFORCE_EQ(1, 2, platform::errors::InvalidArgument( PADDLE_ENFORCE_EQ(1, 2, platform::errors::InvalidArgument(
"Variable type not supported by adam_op")); "Variable type not supported by adam_op"));
......
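The new SelectedRows branch of AdamOpXPUKernel above first checks whether the gradient's row list is strictly increasing; only when it is not does it pay for a scatter::MergeAdd pass, and it then narrows the int64 row ids to the int32 the device kernel consumes. A host-side sketch of that dispatch, with SparseAdamUpdate as a hypothetical stub in place of xpu::sparse_adam:

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for xpu::sparse_adam; the real call also takes the
// moments, parameters, learning rate, betas, and row geometry.
void SparseAdamUpdate(const std::vector<int>& rows) { (void)rows; }

void DispatchSparseGrad(const std::vector<int64_t>& grad_rows) {
  // Strictly increasing rows imply no duplicates, so the merge can be skipped.
  bool is_strict_sorted = true;
  for (std::size_t i = 1; i < grad_rows.size(); ++i) {
    if (grad_rows[i - 1] >= grad_rows[i]) {
      is_strict_sorted = false;
      break;
    }
  }
  std::vector<int64_t> merged = grad_rows;
  if (!is_strict_sorted) {
    // The kernel runs scatter::MergeAdd here to deduplicate rows and sum
    // their gradient slices before the update.
  }
  // The XPU kernel consumes int32 row ids, so narrow before the device copy.
  std::vector<int> rows(merged.size());
  for (std::size_t i = 0; i < merged.size(); ++i)
    rows[i] = static_cast<int>(merged[i]);
  SparseAdamUpdate(rows);
}

int main() {
  DispatchSparseGrad({1, 3, 3, 7});  // duplicate row 3 forces the merge path
  return 0;
}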
...@@ -22,8 +22,8 @@ limitations under the License. */ ...@@ -22,8 +22,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/algorithm.h" #include "paddle/fluid/operators/math/algorithm.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/operators/math/squared_l2_norm.h" #include "paddle/fluid/operators/math/squared_l2_norm.h"
#include "paddle/fluid/platform/eigen_ext.h"
#include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/for_range.h"
#include "paddle/pten/kernels/funcs/eigen/extensions.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -27,7 +27,7 @@ limitations under the License. */ ...@@ -27,7 +27,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/port.h" #include "paddle/pten/backends/dynload/port.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -72,7 +72,8 @@ struct BinaryOperation<platform::CUDADeviceContext, BinaryFunctor, T> { ...@@ -72,7 +72,8 @@ struct BinaryOperation<platform::CUDADeviceContext, BinaryFunctor, T> {
} }
}; };
template <template <typename T> typename CompareFunctor, typename T> template <template <typename InT, typename OutT> typename CompareFunctor,
typename T>
struct GetMask<platform::CUDADeviceContext, CompareFunctor, T> { struct GetMask<platform::CUDADeviceContext, CompareFunctor, T> {
void operator()(const framework::ExecutionContext& ctx, const Tensor& lhs, void operator()(const framework::ExecutionContext& ctx, const Tensor& lhs,
const Tensor& rhs, Tensor* mask) { const Tensor& rhs, Tensor* mask) {
...@@ -81,7 +82,7 @@ struct GetMask<platform::CUDADeviceContext, CompareFunctor, T> { ...@@ -81,7 +82,7 @@ struct GetMask<platform::CUDADeviceContext, CompareFunctor, T> {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
paddle::operators::LaunchSameDimsElementwiseCudaKernel< paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kBinary, int64_t, T>(dev_ctx, ins, &outs, ElementwiseType::kBinary, int64_t, T>(dev_ctx, ins, &outs,
CompareFunctor<int64_t>()); CompareFunctor<int64_t, T>());
} }
}; };
......
...@@ -112,12 +112,13 @@ void SameDimsBinaryOP(const Tensor& lhs, const Tensor& rhs, Tensor* out) { ...@@ -112,12 +112,13 @@ void SameDimsBinaryOP(const Tensor& lhs, const Tensor& rhs, Tensor* out) {
} }
} }
template <typename DeviceContext, template <typename T> typename CompareFunctor, template <typename DeviceContext,
template <typename InT, typename OutT> typename CompareFunctor,
typename T> typename T>
struct GetMask { struct GetMask {
void operator()(const framework::ExecutionContext& ctx, const Tensor& lhs, void operator()(const framework::ExecutionContext& ctx, const Tensor& lhs,
const Tensor& rhs, Tensor* mask) { const Tensor& rhs, Tensor* mask) {
SameDimsBinaryOP<int64_t, CompareFunctor<int64_t>, T>(lhs, rhs, mask); SameDimsBinaryOP<int64_t, CompareFunctor<int64_t, T>, T>(lhs, rhs, mask);
} }
}; };
......
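GetMask's signature changes in both hunks above because CompareFunctor now takes two type parameters, and a template-template parameter must declare the matching arity. A tiny sketch of the mechanic, with a made-up Greater functor (C++17 for the typename spelling, as in the diff itself):

#include <cstdint>
#include <cstdio>

template <typename InT, typename OutT>
struct Greater {
  OutT operator()(InT a, InT b) const { return a > b; }
};

// The template-template parameter mirrors the functor's new two-type arity,
// exactly as GetMask's declaration changes above.
template <template <typename InT, typename OutT> typename CompareFunctor,
          typename T>
int64_t Mask(T lhs, T rhs) {
  return CompareFunctor<int64_t, T>()(static_cast<int64_t>(lhs),
                                      static_cast<int64_t>(rhs));
}

int main() {
  std::printf("%d\n", static_cast<int>(Mask<Greater>(3, 2)));  // prints 1
  return 0;
}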
...@@ -10,7 +10,7 @@ See the License for the specific language governing permissions and ...@@ -10,7 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/eigen_ext.h" #include "paddle/pten/kernels/funcs/eigen/extensions.h"
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include "gtest/gtest.h" #include "gtest/gtest.h"
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/complex.h"
#include <complex> #include <complex>
#include "paddle/fluid/platform/eigen_ext.h" #include "paddle/pten/kernels/funcs/eigen/extensions.h"
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include "gtest/gtest.h" #include "gtest/gtest.h"
...@@ -267,56 +267,56 @@ TEST(complex, print) { ...@@ -267,56 +267,56 @@ TEST(complex, print) {
TEST(complex, isinf) { TEST(complex, isinf) {
// *********** complex<float> ************* // *********** complex<float> *************
complex<float> a; complex<float> a;
a.real = float(INFINITY); a.real = static_cast<float>(INFINITY);
EXPECT_EQ(std::isinf(a), true); EXPECT_EQ(std::isinf(a), true);
a.imag = float(INFINITY); a.imag = static_cast<float>(INFINITY);
EXPECT_EQ(std::isinf(a), true); EXPECT_EQ(std::isinf(a), true);
complex<float> b = float(INFINITY); complex<float> b = static_cast<float>(INFINITY);
EXPECT_EQ(std::isinf(b), true); EXPECT_EQ(std::isinf(b), true);
complex<float> c(float(INFINITY), 0); complex<float> c(static_cast<float>(INFINITY), 0);
EXPECT_EQ(std::isinf(c), true); EXPECT_EQ(std::isinf(c), true);
// *********** complex<double> ************* // *********** complex<double> *************
complex<double> a1; complex<double> a1;
a1.real = double(INFINITY); a1.real = static_cast<double>(INFINITY);
EXPECT_EQ(std::isinf(a1), true); EXPECT_EQ(std::isinf(a1), true);
a1.imag = double(INFINITY); a1.imag = static_cast<double>(INFINITY);
EXPECT_EQ(std::isinf(a1), true); EXPECT_EQ(std::isinf(a1), true);
complex<double> b1 = double(INFINITY); complex<double> b1 = static_cast<double>(INFINITY);
EXPECT_EQ(std::isinf(b1), true); EXPECT_EQ(std::isinf(b1), true);
complex<double> c1(double(INFINITY), 0); complex<double> c1(static_cast<double>(INFINITY), 0);
EXPECT_EQ(std::isinf(c1), true); EXPECT_EQ(std::isinf(c1), true);
} }
TEST(complex, isnan) { TEST(complex, isnan) {
// *********** complex<float> ************* // *********** complex<float> *************
complex<float> a; complex<float> a;
a.real = float(NAN); a.real = static_cast<float>(NAN);
EXPECT_EQ(std::isnan(a), true); EXPECT_EQ(std::isnan(a), true);
a.imag = float(NAN); a.imag = static_cast<float>(NAN);
EXPECT_EQ(std::isnan(a), true); EXPECT_EQ(std::isnan(a), true);
complex<float> b = float(NAN); complex<float> b = static_cast<float>(NAN);
EXPECT_EQ(std::isnan(b), true); EXPECT_EQ(std::isnan(b), true);
complex<float> c(float(NAN), 0); complex<float> c(static_cast<float>(NAN), 0);
EXPECT_EQ(std::isnan(c), true); EXPECT_EQ(std::isnan(c), true);
// *********** complex<double> ************* // *********** complex<double> *************
complex<double> a1; complex<double> a1;
a1.real = double(NAN); a1.real = static_cast<double>(NAN);
EXPECT_EQ(std::isnan(a1), true); EXPECT_EQ(std::isnan(a1), true);
a1.imag = double(NAN); a1.imag = static_cast<double>(NAN);
EXPECT_EQ(std::isnan(a1), true); EXPECT_EQ(std::isnan(a1), true);
complex<double> b1 = double(NAN); complex<double> b1 = static_cast<double>(NAN);
EXPECT_EQ(std::isnan(b1), true); EXPECT_EQ(std::isnan(b1), true);
complex<double> c1(double(NAN), 0); complex<double> c1(static_cast<double>(NAN), 0);
EXPECT_EQ(std::isnan(c1), true); EXPECT_EQ(std::isnan(c1), true);
} }
......
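The test churn above is mechanical: functional-style casts such as float(INFINITY) become static_cast<float>(INFINITY), the spelling Paddle's lint prefers and the one that cannot silently turn into a reinterpreting cast if the types change. Both forms perform the same conversion, as this snippet illustrates:

#include <cassert>
#include <cmath>

int main() {
  float a = float(INFINITY);               // old spelling in the tests
  float b = static_cast<float>(INFINITY);  // new spelling
  assert(std::isinf(a) && std::isinf(b));  // identical conversion result
  return 0;
}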
...@@ -23,8 +23,8 @@ ...@@ -23,8 +23,8 @@
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/eigen_ext.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/kernels/funcs/eigen/extensions.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
namespace paddle { namespace paddle {
...@@ -303,59 +303,59 @@ TEST(complex, print) { ...@@ -303,59 +303,59 @@ TEST(complex, print) {
TEST(complex, isinf) { TEST(complex, isinf) {
// *********** complex<float> ************* // *********** complex<float> *************
complex<float> a; complex<float> a;
a.real = float(INFINITY); a.real = static_cast<float>(INFINITY);
EXPECT_EQ(std::isinf(a), true); EXPECT_EQ(std::isinf(a), true);
a.imag = float(INFINITY); a.imag = static_cast<float>(INFINITY);
EXPECT_EQ(std::isinf(a), true); EXPECT_EQ(std::isinf(a), true);
complex<float> b = float(INFINITY); complex<float> b = static_cast<float>(INFINITY);
EXPECT_EQ(std::isinf(b), true); EXPECT_EQ(std::isinf(b), true);
complex<float> c(float(INFINITY), 0); complex<float> c(static_cast<float>(INFINITY), 0);
EXPECT_EQ(std::isinf(c), true); EXPECT_EQ(std::isinf(c), true);
// *********** complex<double> ************* // *********** complex<double> *************
complex<double> a1; complex<double> a1;
a1.real = double(INFINITY); a1.real = static_cast<double>(INFINITY);
EXPECT_EQ(std::isinf(a1), true); EXPECT_EQ(std::isinf(a1), true);
a1.imag = double(INFINITY); a1.imag = static_cast<double>(INFINITY);
EXPECT_EQ(std::isinf(a1), true); EXPECT_EQ(std::isinf(a1), true);
complex<double> b1 = double(INFINITY); complex<double> b1 = static_cast<double>(INFINITY);
EXPECT_EQ(std::isinf(b1), true); EXPECT_EQ(std::isinf(b1), true);
complex<double> c1(double(INFINITY), 0); complex<double> c1(static_cast<double>(INFINITY), 0);
EXPECT_EQ(std::isinf(c1), true); EXPECT_EQ(std::isinf(c1), true);
} }
TEST(complex, isnan) { TEST(complex, isnan) {
// *********** complex<float> ************* // *********** complex<float> *************
complex<float> a; complex<float> a;
a.real = float(NAN); a.real = static_cast<float>(NAN);
EXPECT_EQ(std::isnan(a), true); EXPECT_EQ(std::isnan(a), true);
a.imag = float(NAN); a.imag = static_cast<float>(NAN);
EXPECT_EQ(std::isnan(a), true); EXPECT_EQ(std::isnan(a), true);
complex<float> b = float(NAN); complex<float> b = static_cast<float>(NAN);
EXPECT_EQ(std::isnan(b), true); EXPECT_EQ(std::isnan(b), true);
complex<float> c(float(NAN), 0); complex<float> c(static_cast<float>(NAN), 0);
EXPECT_EQ(std::isnan(c), true); EXPECT_EQ(std::isnan(c), true);
// *********** complex<double> ************* // *********** complex<double> *************
complex<double> a1; complex<double> a1;
a1.real = double(NAN); a1.real = static_cast<double>(NAN);
EXPECT_EQ(std::isnan(a1), true); EXPECT_EQ(std::isnan(a1), true);
a1.imag = double(NAN); a1.imag = static_cast<double>(NAN);
EXPECT_EQ(std::isnan(a1), true); EXPECT_EQ(std::isnan(a1), true);
complex<double> b1 = double(NAN); complex<double> b1 = static_cast<double>(NAN);
EXPECT_EQ(std::isnan(b1), true); EXPECT_EQ(std::isnan(b1), true);
complex<double> c1(double(NAN), 0); complex<double> c1(static_cast<double>(NAN), 0);
EXPECT_EQ(std::isnan(c1), true); EXPECT_EQ(std::isnan(c1), true);
} }
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
#endif #endif
\ No newline at end of file
...@@ -20,7 +20,7 @@ limitations under the License. */ ...@@ -20,7 +20,7 @@ limitations under the License. */
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h" #include "paddle/pten/backends/dynload/port.h"
#define HCOM_GROUP_PREFIX "HCOM_GROUP_" #define HCOM_GROUP_PREFIX "HCOM_GROUP_"
......
...@@ -25,6 +25,7 @@ limitations under the License. */ ...@@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/platform/ipu/ipu_backend.h" #include "paddle/fluid/platform/ipu/ipu_backend.h"
#endif #endif
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/framework/expect.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
...@@ -841,15 +842,6 @@ unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) const { ...@@ -841,15 +842,6 @@ unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) const {
return num_entries; return num_entries;
} }
// TODO(jczaja): Replace with C++20 equivalents when applicable
#ifdef _WIN32
#define likely(expr) (expr)
#define unlikely(expr) (expr)
#else
#define likely(expr) (__builtin_expect(!!(expr), 1))
#define unlikely(expr) (__builtin_expect(!!(expr), 0))
#endif
MKLDNNDeviceContext::BlobPtr_t<void> MKLDNNDeviceContext::GetBlob( MKLDNNDeviceContext::BlobPtr_t<void> MKLDNNDeviceContext::GetBlob(
const std::string& name) const { const std::string& name) const {
BlobMap* pMap = p_blobmap_.get(); BlobMap* pMap = p_blobmap_.get();
......
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce pten_dynamic_loader)
list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc) list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc)
...@@ -34,24 +34,24 @@ if (CUPTI_FOUND) ...@@ -34,24 +34,24 @@ if (CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc) list(APPEND CUDA_SRCS cupti.cc)
endif(CUPTI_FOUND) endif(CUPTI_FOUND)
if(WITH_ROCM) if(WITH_ROCM)
hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader) hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader pten_dynload_cuda)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc pten_dynload_warpctc)
elseif (WITH_ASCEND_CL) elseif (WITH_ASCEND_CL)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc npu_hccl) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc npu_hccl pten_dynload_warpctc)
else() else()
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader pten_dynload_cuda)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc pten_dynload_warpctc)
endif() endif()
if (WITH_MKLML) if (WITH_MKLML)
cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml pten_dynload_mklml)
endif() endif()
cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader) cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader pten_dynload_lapack)
add_dependencies(dynload_lapack extern_lapack) add_dependencies(dynload_lapack extern_lapack)
# TODO(TJ): add iomp, mkldnn? # TODO(TJ): add iomp, mkldnn?
if (MKL_FOUND AND WITH_ONEMKL) if (MKL_FOUND AND WITH_ONEMKL)
message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}") message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}")
cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader) cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader pten_dynload_mklrt)
target_include_directories(dynload_mklrt PRIVATE ${MKL_INCLUDE}) target_include_directories(dynload_mklrt PRIVATE ${MKL_INCLUDE})
endif() endif()
...@@ -17,8 +17,6 @@ limitations under the License. */ ...@@ -17,8 +17,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag cublas_dso_flag;
void *cublas_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
......
...@@ -20,16 +20,12 @@ limitations under the License. */ ...@@ -20,16 +20,12 @@ limitations under the License. */
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <type_traits> #include <type_traits>
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/cublas.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag cublas_dso_flag;
extern void *cublas_dso_handle;
/** /**
* The following macro definition can generate structs * The following macro definition can generate structs
* (for each function) to dynamic load cublas routine * (for each function) to dynamic load cublas routine
...@@ -37,19 +33,8 @@ extern void *cublas_dso_handle; ...@@ -37,19 +33,8 @@ extern void *cublas_dso_handle;
* *
* note: default dynamic linked libs * note: default dynamic linked libs
*/ */
#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ #define PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
struct DynLoad__##__name { \ using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
template <typename... Args> \
inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cublas_func = \
decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
std::call_once(cublas_dso_flag, []() { \
cublas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
}); \
static void *p_##__name = dlsym(cublas_dso_handle, #__name); \
return reinterpret_cast<cublas_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
...@@ -99,7 +84,7 @@ extern void *cublas_dso_handle; ...@@ -99,7 +84,7 @@ extern void *cublas_dso_handle;
__macro(cublasSgetrsBatched); \ __macro(cublasSgetrsBatched); \
__macro(cublasDgetrsBatched); __macro(cublasDgetrsBatched);
CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) CUBLAS_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
// APIs available after CUDA 8.0 // APIs available after CUDA 8.0
#if CUDA_VERSION >= 8000 #if CUDA_VERSION >= 8000
...@@ -111,7 +96,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) ...@@ -111,7 +96,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
__macro(cublasZgemmStridedBatched); \ __macro(cublasZgemmStridedBatched); \
__macro(cublasHgemmStridedBatched); __macro(cublasHgemmStridedBatched);
CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) CUBLAS_BLAS_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif #endif
// APIs available after CUDA 9.0 // APIs available after CUDA 9.0
...@@ -120,7 +105,7 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) ...@@ -120,7 +105,7 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
__macro(cublasSetMathMode); \ __macro(cublasSetMathMode); \
__macro(cublasGetMathMode); __macro(cublasGetMathMode);
CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) CUBLAS_BLAS_ROUTINE_EACH_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif #endif
// APIs available after CUDA 9.1 // APIs available after CUDA 9.1
...@@ -129,10 +114,10 @@ CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) ...@@ -129,10 +114,10 @@ CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
__macro(cublasGemmBatchedEx); \ __macro(cublasGemmBatchedEx); \
__macro(cublasGemmStridedBatchedEx); __macro(cublasGemmStridedBatchedEx);
CUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) CUBLAS_BLAS_ROUTINE_EACH_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif #endif
#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP #undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
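The deleted macro above is the core of the old loader: per API name it generated a struct whose operator() dlopen()s the cublas shared object once, dlsym()s the symbol once, and forwards the call; that machinery now lives in pten::dynload and fluid merely aliases it. A self-contained sketch of the pattern, hand-expanded for cos() from libm instead of a cublas routine, assuming a glibc/POSIX dlfcn environment (link with -ldl on older glibc); error handling is omitted:

#include <dlfcn.h>
#include <cstdio>
#include <mutex>

static std::once_flag dso_flag;
static void* dso_handle = nullptr;

struct DynLoad__cos {
  template <typename... Args>
  double operator()(Args... args) {
    using func_t = double (*)(double);
    // dlopen exactly once across threads, then cache the symbol address.
    std::call_once(dso_flag,
                   [] { dso_handle = dlopen("libm.so.6", RTLD_LAZY); });
    static void* p_cos = dlsym(dso_handle, "cos");
    return reinterpret_cast<func_t>(p_cos)(args...);
  }
};
DynLoad__cos dyn_cos;  // what DEFINE_WRAP(cos) would produce

int main() {
  std::printf("%f\n", dyn_cos(0.0));  // prints 1.000000
  return 0;
}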
...@@ -17,8 +17,6 @@ limitations under the License. */ ...@@ -17,8 +17,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag cublasLt_dso_flag;
void *cublasLt_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
......
...@@ -19,16 +19,12 @@ limitations under the License. */ ...@@ -19,16 +19,12 @@ limitations under the License. */
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <type_traits> #include <type_traits>
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/cublasLt.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag cublasLt_dso_flag;
extern void *cublasLt_dso_handle;
/** /**
* The following macro definition can generate structs * The following macro definition can generate structs
* (for each function) to dynamic load cublasLt routine * (for each function) to dynamic load cublasLt routine
...@@ -36,20 +32,8 @@ extern void *cublasLt_dso_handle; ...@@ -36,20 +32,8 @@ extern void *cublasLt_dso_handle;
* *
* note: default dynamic linked libs * note: default dynamic linked libs
*/ */
#define DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \ #define PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \
struct DynLoad__##__name { \ using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
template <typename... Args> \
inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cublasLt_func = \
decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
std::call_once(cublasLt_dso_flag, []() { \
cublasLt_dso_handle = \
paddle::platform::dynload::GetCublasLtDsoHandle(); \
}); \
static void *p_##__name = dlsym(cublasLt_dso_handle, #__name); \
return reinterpret_cast<cublasLt_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
// APIs available after CUDA 10.1 // APIs available after CUDA 10.1
...@@ -69,10 +53,10 @@ extern void *cublasLt_dso_handle; ...@@ -69,10 +53,10 @@ extern void *cublasLt_dso_handle;
__macro(cublasLtMatrixTransformDescDestroy); \ __macro(cublasLtMatrixTransformDescDestroy); \
__macro(cublasLtMatrixTransformDescSetAttribute); __macro(cublasLtMatrixTransformDescSetAttribute);
CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP) CUBLASLT_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
// #endif // #endif
#undef DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP #undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -13,14 +13,12 @@ See the License for the specific language governing permissions and ...@@ -13,14 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/dynload/cuda_driver.h" #include "paddle/fluid/platform/dynload/cuda_driver.h"
#include "paddle/pten/backends/dynload/cuda_driver.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag cuda_dso_flag;
void* cuda_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
#if CUDA_VERSION >= 10020 #if CUDA_VERSION >= 10020
...@@ -28,10 +26,7 @@ CUDA_ROUTINE_EACH_VVM(DEFINE_WRAP); ...@@ -28,10 +26,7 @@ CUDA_ROUTINE_EACH_VVM(DEFINE_WRAP);
#endif #endif
CUDA_ROUTINE_EACH(DEFINE_WRAP); CUDA_ROUTINE_EACH(DEFINE_WRAP);
bool HasCUDADriver() { bool HasCUDADriver() { return pten::dynload::HasCUDADriver(); }
std::call_once(cuda_dso_flag, []() { cuda_dso_handle = GetCUDADsoHandle(); });
return cuda_dso_handle != nullptr;
}
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
...@@ -17,30 +17,17 @@ limitations under the License. */ ...@@ -17,30 +17,17 @@ limitations under the License. */
#include <cuda.h> #include <cuda.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/cuda_driver.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag cuda_dso_flag;
extern void* cuda_dso_handle;
extern bool HasCUDADriver(); extern bool HasCUDADriver();
#define DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name) \ #define PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name) \
struct DynLoad__##__name { \ using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
template <typename... Args> \ extern DynLoad__##__name __name
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cuda_func = decltype(&::__name); \
std::call_once(cuda_dso_flag, []() { \
cuda_dso_handle = paddle::platform::dynload::GetCUDADsoHandle(); \
}); \
static void* p_##__name = dlsym(cuda_dso_handle, #__name); \
return reinterpret_cast<cuda_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
/** /**
* include all needed cuda driver functions * include all needed cuda driver functions
...@@ -72,12 +59,12 @@ extern bool HasCUDADriver(); ...@@ -72,12 +59,12 @@ extern bool HasCUDADriver();
__macro(cuMemRelease); \ __macro(cuMemRelease); \
__macro(cuMemAddressFree) __macro(cuMemAddressFree)
CUDA_ROUTINE_EACH_VVM(DECLARE_DYNAMIC_LOAD_CUDA_WRAP); CUDA_ROUTINE_EACH_VVM(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
#endif #endif
CUDA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDA_WRAP); CUDA_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
#undef DECLARE_DYNAMIC_LOAD_CUDA_WRAP #undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
...@@ -13,13 +13,11 @@ See the License for the specific language governing permissions and ...@@ -13,13 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/pten/backends/dynload/cudnn.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag cudnn_dso_flag;
void* cudnn_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
...@@ -45,19 +43,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP); ...@@ -45,19 +43,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP);
#endif #endif
bool HasCUDNN() { bool HasCUDNN() { return pten::dynload::HasCUDNN(); }
std::call_once(cudnn_dso_flag,
[]() { cudnn_dso_handle = GetCUDNNDsoHandle(); });
return cudnn_dso_handle != nullptr;
}
void EnforceCUDNNLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
cudnn_dso_handle,
platform::errors::PreconditionNotMet(
"Cannot load cudnn shared library. Cannot invoke method %s.",
fn_name));
}
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
...@@ -18,32 +18,17 @@ limitations under the License. */ ...@@ -18,32 +18,17 @@ limitations under the License. */
#include <glog/logging.h> #include <glog/logging.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/cudnn.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag cudnn_dso_flag;
extern void* cudnn_dso_handle;
extern bool HasCUDNN(); extern bool HasCUDNN();
extern void EnforceCUDNNLoaded(const char* fn_name); #define PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \
#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
struct DynLoad__##__name { \ extern DynLoad__##__name __name
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cudnn_func = decltype(&::__name); \
std::call_once(cudnn_dso_flag, []() { \
cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
}); \
EnforceCUDNNLoaded(#__name); \
static void* p_##__name = dlsym(cudnn_dso_handle, #__name); \
return reinterpret_cast<cudnn_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
/** /**
* include all needed cudnn functions in HPPL * include all needed cudnn functions in HPPL
...@@ -127,7 +112,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); ...@@ -127,7 +112,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnGetActivationDescriptor); \ __macro(cudnnGetActivationDescriptor); \
__macro(cudnnDestroyActivationDescriptor); \ __macro(cudnnDestroyActivationDescriptor); \
__macro(cudnnSetRNNDescriptor_v6); __macro(cudnnSetRNNDescriptor_v6);
CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000 #if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(__macro) \ #define CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(__macro) \
...@@ -135,7 +120,8 @@ CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -135,7 +120,8 @@ CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnGetConvolutionForwardAlgorithm); \ __macro(cudnnGetConvolutionForwardAlgorithm); \
__macro(cudnnGetConvolutionBackwardDataAlgorithm); \ __macro(cudnnGetConvolutionBackwardDataAlgorithm); \
__macro(cudnnSetRNNDescriptor); __macro(cudnnSetRNNDescriptor);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(
PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
#if CUDNN_VERSION >= 7001 #if CUDNN_VERSION >= 7001
...@@ -153,7 +139,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -153,7 +139,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \ __macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \
__macro(cudnnGetConvolutionForwardAlgorithm_v7); \ __macro(cudnnGetConvolutionForwardAlgorithm_v7); \
__macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount); __macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount);
CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
#if CUDNN_VERSION >= 7201 #if CUDNN_VERSION >= 7201
...@@ -166,7 +152,7 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -166,7 +152,7 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnRNNBackwardDataEx); \ __macro(cudnnRNNBackwardDataEx); \
__macro(cudnnRNNBackwardWeightsEx); \ __macro(cudnnRNNBackwardWeightsEx); \
__macro(cudnnRNNForwardInferenceEx); __macro(cudnnRNNForwardInferenceEx);
CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
#if CUDNN_VERSION >= 7401 #if CUDNN_VERSION >= 7401
...@@ -176,7 +162,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -176,7 +162,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \ __macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \
__macro(cudnnBatchNormalizationBackwardEx); \ __macro(cudnnBatchNormalizationBackwardEx); \
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize); __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_AFTER_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
#if CUDNN_VERSION >= 8000 #if CUDNN_VERSION >= 8000
...@@ -192,7 +178,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -192,7 +178,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnSetFusedOpsConstParamPackAttribute); \ __macro(cudnnSetFusedOpsConstParamPackAttribute); \
__macro(cudnnSetFusedOpsVariantParamPackAttribute); \ __macro(cudnnSetFusedOpsVariantParamPackAttribute); \
__macro(cudnnMakeFusedOpsPlan); __macro(cudnnMakeFusedOpsPlan);
CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R8(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
} // namespace dynload } // namespace dynload
......
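After the refactor each fluid header keeps its public DynLoad__ names, but the macro now expands to a type alias into pten::dynload plus the same extern object declaration as before, so call sites compile unchanged while the wrapper logic has a single definition. A toy sketch of that forwarding-by-alias shape, with made-up namespaces new_impl and old_api:

#include <cstdio>

namespace new_impl {  // plays the role of pten::dynload
struct DynLoad__foo {
  int operator()(int x) const { return x + 1; }  // the real one dlsym-forwards
};
}  // namespace new_impl

namespace old_api {  // plays the role of paddle::platform::dynload
// What PLATFORM_DECLARE_DYNAMIC_LOAD_*_WRAP(foo) now expands to in a header:
using DynLoad__foo = new_impl::DynLoad__foo;
extern DynLoad__foo foo;
// ...and what DEFINE_WRAP(foo) still expands to in the matching .cc:
DynLoad__foo foo;
}  // namespace old_api

int main() {
  // Old call sites compile unchanged but run the new_impl logic.
  std::printf("%d\n", old_api::foo(41));  // prints 42
  return 0;
}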
...@@ -13,31 +13,17 @@ See the License for the specific language governing permissions and ...@@ -13,31 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/dynload/cufft.h" #include "paddle/fluid/platform/dynload/cufft.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/pten/backends/dynload/cufft.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag cufft_dso_flag;
void* cufft_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
CUFFT_FFT_ROUTINE_EACH(DEFINE_WRAP); CUFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);
bool HasCUFFT() { bool HasCUFFT() { return pten::dynload::HasCUFFT(); }
std::call_once(cufft_dso_flag,
[]() { cufft_dso_handle = GetCUFFTDsoHandle(); });
return cufft_dso_handle != nullptr;
}
void EnforceCUFFTLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
cufft_dso_handle,
platform::errors::PreconditionNotMet(
"Cannot load cufft shared library. Cannot invoke method %s.",
fn_name));
}
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
...@@ -19,32 +19,17 @@ limitations under the License. */ ...@@ -19,32 +19,17 @@ limitations under the License. */
#include <glog/logging.h> #include <glog/logging.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/cufft.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag cufft_dso_flag;
extern void* cufft_dso_handle;
extern bool HasCUFFT(); extern bool HasCUFFT();
extern void EnforceCUFFTLoaded(const char* fn_name); #define PLATFORM_DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name) \
#define DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name) \ using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
struct DynLoad__##__name { \ extern DynLoad__##__name __name
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cufft_func = decltype(&::__name); \
std::call_once(cufft_dso_flag, []() { \
cufft_dso_handle = paddle::platform::dynload::GetCUFFTDsoHandle(); \
}); \
EnforceCUFFTLoaded(#__name); \
static void* p_##__name = dlsym(cufft_dso_handle, #__name); \
return reinterpret_cast<cufft_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
/** /**
* include all needed cufft functions in HPPL * include all needed cufft functions in HPPL
...@@ -104,7 +89,7 @@ extern void EnforceCUFFTLoaded(const char* fn_name); ...@@ -104,7 +89,7 @@ extern void EnforceCUFFTLoaded(const char* fn_name);
__macro(cufftXtExecDescriptor); \ __macro(cufftXtExecDescriptor); \
__macro(cufftXtSetWorkAreaPolicy); __macro(cufftXtSetWorkAreaPolicy);
CUFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUFFT_WRAP) CUFFT_FFT_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUFFT_WRAP)
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
...@@ -20,9 +20,6 @@ namespace paddle { ...@@ -20,9 +20,6 @@ namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag cupti_dso_flag;
void *cupti_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
CUPTI_ROUTINE_EACH(DEFINE_WRAP); CUPTI_ROUTINE_EACH(DEFINE_WRAP);
......
...@@ -19,16 +19,12 @@ limitations under the License. */ ...@@ -19,16 +19,12 @@ limitations under the License. */
#include <cupti.h> #include <cupti.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/cupti.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag cupti_dso_flag;
extern void *cupti_dso_handle;
/** /**
* The following macro definition can generate structs * The following macro definition can generate structs
* (for each function) to dynamic load cupti routine * (for each function) to dynamic load cupti routine
...@@ -36,18 +32,8 @@ extern void *cupti_dso_handle; ...@@ -36,18 +32,8 @@ extern void *cupti_dso_handle;
* *
* note: default dynamic linked libs * note: default dynamic linked libs
*/ */
#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \ #define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \
struct DynLoad__##__name { \ using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
template <typename... Args> \
inline CUptiResult CUPTIAPI operator()(Args... args) { \
using cuptiFunc = decltype(&::__name); \
std::call_once(cupti_dso_flag, []() { \
cupti_dso_handle = paddle::platform::dynload::GetCUPTIDsoHandle(); \
}); \
static void *p_##__name = dlsym(cupti_dso_handle, #__name); \
return reinterpret_cast<cuptiFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#define CUPTI_ROUTINE_EACH(__macro) \ #define CUPTI_ROUTINE_EACH(__macro) \
......
...@@ -18,9 +18,6 @@ namespace paddle { ...@@ -18,9 +18,6 @@ namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag curand_dso_flag;
void *curand_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
@@ -16,27 +16,14 @@ limitations under the License. */
 #include <curand.h>
 
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/curand.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
-extern std::once_flag curand_dso_flag;
-extern void *curand_dso_handle;
 
-#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name)                              \
-  struct DynLoad__##__name {                                                  \
-    template <typename... Args>                                               \
-    curandStatus_t operator()(Args... args) {                                 \
-      using curandFunc = decltype(&::__name);                                 \
-      std::call_once(curand_dso_flag, []() {                                  \
-        curand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle();  \
-      });                                                                     \
-      static void *p_##__name = dlsym(curand_dso_handle, #__name);            \
-      return reinterpret_cast<curandFunc>(p_##__name)(args...);               \
-    }                                                                         \
-  };                                                                          \
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name)     \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #define CURAND_RAND_ROUTINE_EACH(__macro) \
@@ -48,7 +35,7 @@ extern void *curand_dso_handle;
   __macro(curandGenerateNormal);          \
   __macro(curandDestroyGenerator);
 
-CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
+CURAND_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
 
 }  // namespace dynload
 }  // namespace platform
......
@@ -18,9 +18,6 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag cusolver_dso_flag;
-void *cusolver_dso_handle;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 CUSOLVER_ROUTINE_EACH(DEFINE_WRAP);
......
@@ -17,28 +17,14 @@ limitations under the License. */
 #include <cusolverDn.h>
 
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/cusolver.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
-extern std::once_flag cusolver_dso_flag;
-extern void *cusolver_dso_handle;
 
-#define DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP(__name)                   \
-  struct DynLoad__##__name {                                         \
-    template <typename... Args>                                      \
-    cusolverStatus_t operator()(Args... args) {                      \
-      using cusolverFunc = decltype(&::__name);                      \
-      std::call_once(cusolver_dso_flag, []() {                       \
-        cusolver_dso_handle =                                        \
-            paddle::platform::dynload::GetCusolverDsoHandle();       \
-      });                                                            \
-      static void *p_##__name = dlsym(cusolver_dso_handle, #__name); \
-      return reinterpret_cast<cusolverFunc>(p_##__name)(args...);    \
-    }                                                                \
-  };                                                                 \
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP(__name)   \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #define CUSOLVER_ROUTINE_EACH(__macro) \
@@ -62,7 +48,7 @@ extern void *cusolver_dso_handle;
   __macro(cusolverDnCheevd);           \
   __macro(cusolverDnZheevd);
 
-CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
+CUSOLVER_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
 
 #if CUDA_VERSION >= 9020
 #define CUSOLVER_ROUTINE_EACH_R1(__macro) \
@@ -105,7 +91,7 @@ CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
   __macro(cusolverDnCungqr);              \
   __macro(cusolverDnZungqr);
 
-CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
+CUSOLVER_ROUTINE_EACH_R1(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
 #endif
 
 #if CUDA_VERSION >= 9020
@@ -117,10 +103,10 @@ CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
   __macro(cusolverDnDsyevj);              \
   __macro(cusolverDnDestroySyevjInfo);
 
-CUSOLVER_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
+CUSOLVER_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
 #endif
 
-#undef DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP
+#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
@@ -18,9 +18,6 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag cusparse_dso_flag;
-void *cusparse_dso_handle;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 #ifdef CUSPARSE_ROUTINE_EACH
......
@@ -17,28 +17,14 @@ limitations under the License. */
 #include <cusparse.h>
 
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/cusparse.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
-extern std::once_flag cusparse_dso_flag;
-extern void *cusparse_dso_handle;
 
-#define DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name)                   \
-  struct DynLoad__##__name {                                         \
-    template <typename... Args>                                      \
-    cusparseStatus_t operator()(Args... args) {                      \
-      using cusparseFunc = decltype(&::__name);                      \
-      std::call_once(cusparse_dso_flag, []() {                       \
-        cusparse_dso_handle =                                        \
-            paddle::platform::dynload::GetCusparseDsoHandle();       \
-      });                                                            \
-      static void *p_##__name = dlsym(cusparse_dso_handle, #__name); \
-      return reinterpret_cast<cusparseFunc>(p_##__name)(args...);    \
-    }                                                                \
-  };                                                                 \
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name)   \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #if defined(PADDLE_WITH_CUDA)
@@ -54,7 +40,7 @@ extern void *cusparse_dso_handle;
   __macro(cusparseSetMatType);            \
   __macro(cusparseSetMatIndexBase);
 
-CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
+CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
 
 // APIs available after CUDA 11.2
 #if CUDA_VERSION >= 11020
@@ -74,7 +60,7 @@ CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
   __macro(cusparseSparseToDense_bufferSize); \
   __macro(cusparseSparseToDense);
 
-CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
+CUSPARSE_ROUTINE_EACH_11020(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
 
 // APIs available after CUDA 11.3
 #if CUDA_VERSION >= 11030
@@ -83,13 +69,13 @@ CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
   __macro(cusparseSDDMM_preprocess);         \
   __macro(cusparseSDDMM);
 
-CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
+CUSPARSE_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
 #endif
 #endif
 #endif
 #endif
 
-#undef DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
+#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
@@ -18,9 +18,6 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag hipfft_dso_flag;
-void *hipfft_dso_handle;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 HIPFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);
......
@@ -17,8 +17,7 @@ limitations under the License. */
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/hipfft.h"
 
 namespace paddle {
 namespace platform {
@@ -26,18 +25,8 @@ namespace dynload {
 extern std::once_flag hipfft_dso_flag;
 extern void *hipfft_dso_handle;
 
-#define DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name)                             \
-  struct DynLoad__##__name {                                                 \
-    template <typename... Args>                                              \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {         \
-      using hipfftFunc = decltype(&::__name);                                \
-      std::call_once(hipfft_dso_flag, []() {                                 \
-        hipfft_dso_handle = paddle::platform::dynload::GetROCFFTDsoHandle(); \
-      });                                                                    \
-      static void *p_##__name = dlsym(hipfft_dso_handle, #__name);           \
-      return reinterpret_cast<hipfftFunc>(p_##__name)(args...);              \
-    }                                                                        \
-  };                                                                         \
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name)     \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #define HIPFFT_FFT_ROUTINE_EACH(__macro) \
@@ -70,53 +59,8 @@ extern void *hipfft_dso_handle;
   __macro(hipfftGetVersion);             \
   __macro(hipfftGetProperty);
 
-HIPFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP);
-
-inline const char *hipfftGetErrorString(hipfftResult_t status) {
-  switch (status) {
-    case HIPFFT_SUCCESS:
-      return "'HIPFFT_SUCCESS'. The hipFFT operation was successful.";
-    case HIPFFT_INVALID_PLAN:
-      return "'HIPFFT_INVALID_PLAN'. hipFFT was passed an invalid plan handle.";
-    case HIPFFT_ALLOC_FAILED:
-      return "'HIPFFT_ALLOC_FAILED'. hipFFT failed to allocate GPU or CPU "
-             "memory.";
-    case HIPFFT_INVALID_TYPE:
-      return "'HIPFFT_INVALID_TYPE'. No longer used.";
-    case HIPFFT_INVALID_VALUE:
-      return "'HIPFFT_INVALID_VALUE'. User specified an invalid pointer or "
-             "parameter.";
-    case HIPFFT_INTERNAL_ERROR:
-      return "'HIPFFT_INTERNAL_ERROR'. Driver or internal hipFFT library "
-             "error.";
-    case HIPFFT_EXEC_FAILED:
-      return "'HIPFFT_EXEC_FAILED'. Failed to execute an FFT on the GPU.";
-    case HIPFFT_SETUP_FAILED:
-      return "'HIPFFT_SETUP_FAILED'. The hipFFT library failed to initialize.";
-    case HIPFFT_INVALID_SIZE:
-      return "'HIPFFT_INVALID_SIZE'. User specified an invalid transform size.";
-    case HIPFFT_UNALIGNED_DATA:
-      return "'HIPFFT_UNALIGNED_DATA'. No longer used.";
-    case HIPFFT_INCOMPLETE_PARAMETER_LIST:
-      return "'HIPFFT_INCOMPLETE_PARAMETER_LIST'. Missing parameters in call.";
-    case HIPFFT_INVALID_DEVICE:
-      return "'HIPFFT_INVALID_DEVICE'. Execution of a plan was on different "
-             "GPU than plan creation.";
-    case HIPFFT_PARSE_ERROR:
-      return "'HIPFFT_PARSE_ERROR'. Internal plan database error.";
-    case HIPFFT_NO_WORKSPACE:
-      return "'HIPFFT_NO_WORKSPACE'. No workspace has been provided prior to "
-             "plan execution.";
-    case HIPFFT_NOT_IMPLEMENTED:
-      return "'HIPFFT_NOT_IMPLEMENTED'. Function does not implement "
-             "functionality for parameters given.";
-    case HIPFFT_NOT_SUPPORTED:
-      return "'HIPFFT_NOT_SUPPORTED'. Operation is not supported for "
-             "parameters given.";
-    default:
-      return "HIPFFT_STATUS_UNKNOWN_ERROR";
-  }
-}
+HIPFFT_FFT_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP);
 
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
......
@@ -18,9 +18,6 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag hiprand_dso_flag;
-void *hiprand_dso_handle;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 HIPRAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
@@ -16,28 +16,15 @@ limitations under the License. */
 #include <hiprand.h>
 
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/port.h"
-
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/hiprand.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
-extern std::once_flag hiprand_dso_flag;
-extern void *hiprand_dso_handle;
 
-#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name)                               \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    hiprandStatus_t operator()(Args... args) {                                 \
-      using hiprandFunc = decltype(&::__name);                                 \
-      std::call_once(hiprand_dso_flag, []() {                                  \
-        hiprand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle();  \
-      });                                                                      \
-      static void *p_##__name = dlsym(hiprand_dso_handle, #__name);            \
-      return reinterpret_cast<hiprandFunc>(p_##__name)(args...);               \
-    }                                                                          \
-  };                                                                           \
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name)     \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #define HIPRAND_RAND_ROUTINE_EACH(__macro) \
@@ -49,7 +36,7 @@ extern void *hiprand_dso_handle;
   __macro(hiprandGenerateNormal);          \
   __macro(hiprandDestroyGenerator);
 
-HIPRAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
+HIPRAND_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
 
 }  // namespace dynload
 }  // namespace platform
......
@@ -13,23 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/hiprtc.h"
+#include "paddle/pten/backends/dynload/hiprtc.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag hiprtc_dso_flag;
-void* hiprtc_dso_handle = nullptr;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 HIPRTC_ROUTINE_EACH(DEFINE_WRAP);
 
-bool HasNVRTC() {
-  std::call_once(hiprtc_dso_flag,
-                 []() { hiprtc_dso_handle = GetNVRTCDsoHandle(); });
-  return hiprtc_dso_handle != nullptr;
-}
+bool HasNVRTC() { return pten::dynload::HasNVRTC(); }
 
 }  // namespace dynload
 }  // namespace platform
......
@@ -16,30 +16,17 @@ limitations under the License. */
 #include <hip/hiprtc.h>
 
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/hiprtc.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag hiprtc_dso_flag;
-extern void* hiprtc_dso_handle;
 extern bool HasNVRTC();
 
-#define DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name)                             \
-  struct DynLoad__##__name {                                                 \
-    template <typename... Args>                                              \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {         \
-      using hiprtc_func = decltype(&::__name);                               \
-      std::call_once(hiprtc_dso_flag, []() {                                 \
-        hiprtc_dso_handle = paddle::platform::dynload::GetNVRTCDsoHandle();  \
-      });                                                                    \
-      static void* p_##__name = dlsym(hiprtc_dso_handle, #__name);           \
-      return reinterpret_cast<hiprtc_func>(p_##__name)(args...);             \
-    }                                                                        \
-  };                                                                         \
-  extern struct DynLoad__##__name __name
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name)     \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
+  extern DynLoad__##__name __name
 
 /**
  * include all needed hiprtc functions
@@ -55,9 +42,9 @@ extern bool HasNVRTC();
   __macro(hiprtcGetProgramLog);     \
   __macro(hiprtcGetProgramLogSize)
 
-HIPRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP);
+HIPRTC_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP);
 
-#undef DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP
+#undef PLATFORM_DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP
 
 }  // namespace dynload
 }  // namespace platform
......
@@ -13,15 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/lapack.h"
-#include <mutex>
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag lapack_dso_flag;
-void* lapack_dso_handle = nullptr;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 LAPACK_ROUTINE_EACH(DEFINE_WRAP);
......
@@ -16,122 +16,20 @@ limitations under the License. */
 #include <complex>
 #include <mutex>
 
-#include "paddle/fluid/platform/complex.h"
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/lapack.h"
+#include "paddle/pten/common/complex.h"
 
-// Note(zhouwei): because lapack doesn't provide appropriate header file.
-// should expose API statement yourself.
-
-// getrf_(For example)
-extern "C" void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv,
-                        int *info);
-extern "C" void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv,
-                        int *info);
-
-// evd
-extern "C" void zheevd_(char *jobz, char *uplo, int *n, std::complex<double> *a,
-                        int *lda, double *w, std::complex<double> *work,
-                        int *lwork, double *rwork, int *lrwork, int *iwork,
-                        int *liwork, int *info);
-extern "C" void cheevd_(char *jobz, char *uplo, int *n, std::complex<float> *a,
-                        int *lda, float *w, std::complex<float> *work,
-                        int *lwork, float *rwork, int *lrwork, int *iwork,
-                        int *liwork, int *info);
-extern "C" void dsyevd_(char *jobz, char *uplo, int *n, double *a, int *lda,
-                        double *w, double *work, int *lwork, int *iwork,
-                        int *liwork, int *info);
-extern "C" void ssyevd_(char *jobz, char *uplo, int *n, float *a, int *lda,
-                        float *w, float *work, int *lwork, int *iwork,
-                        int *liwork, int *info);
-
-// geev
-extern "C" void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda,
-                       double *wr, double *wi, double *vl, int *ldvl,
-                       double *vr, int *ldvr, double *work, int *lwork,
-                       int *info);
-extern "C" void sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda,
-                       float *wr, float *wi, float *vl, int *ldvl, float *vr,
-                       int *ldvr, float *work, int *lwork, int *info);
-extern "C" void zgeev_(char *jobvl, char *jobvr, int *n,
-                       std::complex<double> *a, int *lda,
-                       std::complex<double> *w, std::complex<double> *vl,
-                       int *ldvl, std::complex<double> *vr, int *ldvr,
-                       std::complex<double> *work, int *lwork, double *rwork,
-                       int *info);
-extern "C" void cgeev_(char *jobvl, char *jobvr, int *n, std::complex<float> *a,
-                       int *lda, std::complex<float> *w,
-                       std::complex<float> *vl, int *ldvl,
-                       std::complex<float> *vr, int *ldvr,
-                       std::complex<float> *work, int *lwork, float *rwork,
-                       int *info);
-
-// gels
-extern "C" void dgels_(char *trans, int *m, int *n, int *nrhs, double *a,
-                       int *lda, double *b, int *ldb, double *work, int *lwork,
-                       int *info);
-extern "C" void sgels_(char *trans, int *m, int *n, int *nrhs, float *a,
-                       int *lda, float *b, int *ldb, float *work, int *lwork,
-                       int *info);
-
-// gelsd
-extern "C" void dgelsd_(int *m, int *n, int *nrhs, double *a, int *lda,
-                        double *b, int *ldb, double *s, double *rcond,
-                        int *rank, double *work, int *lwork, int *iwork,
-                        int *info);
-extern "C" void sgelsd_(int *m, int *n, int *nrhs, float *a, int *lda, float *b,
-                        int *ldb, float *s, float *rcond, int *rank,
-                        float *work, int *lwork, int *iwork, int *info);
-
-// gelsy
-extern "C" void dgelsy_(int *m, int *n, int *nrhs, double *a, int *lda,
-                        double *b, int *ldb, int *jpvt, double *rcond,
-                        int *rank, double *work, int *lwork, int *info);
-extern "C" void sgelsy_(int *m, int *n, int *nrhs, float *a, int *lda, float *b,
-                        int *ldb, int *jpvt, float *rcond, int *rank,
-                        float *work, int *lwork, int *info);
-
-// gelss
-extern "C" void dgelss_(int *m, int *n, int *nrhs, double *a, int *lda,
-                        double *b, int *ldb, double *s, double *rcond,
-                        int *rank, double *work, int *lwork, int *info);
-extern "C" void sgelss_(int *m, int *n, int *nrhs, float *a, int *lda, float *b,
-                        int *ldb, float *s, float *rcond, int *rank,
-                        float *work, int *lwork, int *info);
-
-extern "C" void zpotrs_(char *uplo, int *n, int *nrhs, std::complex<double> *a,
-                        int *lda, std::complex<double> *b, int *ldb, int *info);
-extern "C" void cpotrs_(char *uplo, int *n, int *nrhs, std::complex<float> *a,
-                        int *lda, std::complex<float> *b, int *ldb, int *info);
-extern "C" void dpotrs_(char *uplo, int *n, int *nrhs, double *a, int *lda,
-                        double *b, int *ldb, int *info);
-extern "C" void spotrs_(char *uplo, int *n, int *nrhs, float *a, int *lda,
-                        float *b, int *ldb, int *info);
-
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag lapack_dso_flag;
-extern void *lapack_dso_handle;
-
 /**
  * The following macro definition can generate structs
  * (for each function) to dynamic load lapack routine
  * via operator overloading.
  */
 #define DYNAMIC_LOAD_LAPACK_WRAP(__name)                                     \
-  struct DynLoad__##__name {                                                 \
-    template <typename... Args>                                              \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {         \
-      using lapackFunc = decltype(&::__name);                                \
-      std::call_once(lapack_dso_flag, []() {                                 \
-        lapack_dso_handle = paddle::platform::dynload::GetLAPACKDsoHandle(); \
-      });                                                                    \
-      static void *p_##_name = dlsym(lapack_dso_handle, #__name);            \
-      return reinterpret_cast<lapackFunc>(p_##_name)(args...);               \
-    }                                                                        \
-  };                                                                         \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name;                \
   extern DynLoad__##__name __name
 
 #define DECLARE_DYNAMIC_LOAD_LAPACK_WRAP(__name) \
......
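The deleted extern "C" block exists because, as the removed note says, LAPACK ships no appropriate C header, so each Fortran symbol had to be declared by hand; the wrapper then used decltype(&::__name) to give the dlsym result the correct function-pointer type. A small sketch of that mechanism against a system LAPACK follows; it assumes liblapack.so.3 is installed, keeps error handling minimal, and is not Paddle code (build with `g++ demo.cc -ldl`).

```cpp
#include <dlfcn.h>
#include <cstdio>

// Hand-written prototype, exactly as in the removed block above.
extern "C" void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv,
                        int *info);

int main() {
  void *h = dlopen("liblapack.so.3", RTLD_LAZY);
  if (!h) return 1;
  // decltype(&::dgetrf_) recovers the exact function-pointer type, so the
  // untyped dlsym result can be called safely.
  auto p_dgetrf = reinterpret_cast<decltype(&::dgetrf_)>(dlsym(h, "dgetrf_"));
  if (!p_dgetrf) return 1;

  // LU-factorize a 2x2 matrix stored column-major, Fortran style.
  int m = 2, n = 2, lda = 2, ipiv[2], info = 0;
  double a[4] = {4.0, 2.0, 3.0, 1.0};
  p_dgetrf(&m, &n, a, &lda, ipiv, &info);
  std::printf("info = %d, U(1,1) = %f\n", info, a[0]);
  return 0;
}
```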
@@ -13,13 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/miopen.h"
-#include "paddle/fluid/platform/enforce.h"
+#include "paddle/pten/backends/dynload/cudnn.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag miopen_dso_flag;
-void* miopen_dso_handle = nullptr;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
@@ -50,19 +48,7 @@ MIOPEN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
 MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
 #endif
 
-bool HasCUDNN() {
-  std::call_once(miopen_dso_flag,
-                 []() { miopen_dso_handle = GetCUDNNDsoHandle(); });
-  return miopen_dso_handle != nullptr;
-}
-
-void EnforceCUDNNLoaded(const char* fn_name) {
-  PADDLE_ENFORCE_NOT_NULL(
-      miopen_dso_handle,
-      platform::errors::PreconditionNotMet(
-          "Cannot load miopen shared library. Cannot invoke method %s.",
-          fn_name));
-}
+bool HasCUDNN() { return pten::dynload::HasCUDNN(); }
 
 }  // namespace dynload
 }  // namespace platform
......
@@ -18,66 +18,17 @@ limitations under the License. */
 #include <miopen/miopen.h>
 #include <miopen/version.h>
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/miopen.h"
 
-#define MIOPEN_VERSION                                       \
-  (MIOPEN_VERSION_MAJOR * 1000 + MIOPEN_VERSION_MINOR * 10 + \
-   MIOPEN_VERSION_PATCH)  // NOLINT
-
-// MIOPEN only support NCHW, just for compatibility with CUDNN API
-typedef enum {
-  MIOPEN_TENSOR_NCHW = 0,
-  MIOPEN_TENSOR_NHWC = 1,
-} miopenTensorFormat_t;
-
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag miopen_dso_flag;
-extern void* miopen_dso_handle;
 extern bool HasCUDNN();
 
-inline const char* miopenGetErrorString(miopenStatus_t status) {
-  switch (status) {
-    case miopenStatusSuccess:
-      return "MIOPEN_STATUS_SUCCESS";
-    case miopenStatusNotInitialized:
-      return "MIOPEN_STATUS_NOT_INITIALIZED";
-    case miopenStatusInvalidValue:
-      return "MIOPEN_STATUS_INVALID_VALUE";
-    case miopenStatusBadParm:
-      return "MIOPEN_STATUS_BAD_PARAM";
-    case miopenStatusAllocFailed:
-      return "MIOPEN_STATUS_ALLOC_FAILED";
-    case miopenStatusInternalError:
-      return "MIOPEN_STATUS_INTERNAL_ERROR";
-    case miopenStatusNotImplemented:
-      return "MIOPEN_STATUS_NOT_IMPLEMENTED";
-    case miopenStatusUnsupportedOp:
-      return "MIOPEN_STATUS_UNSUPPORTED_OP";
-    case miopenStatusUnknownError:
-    default:
-      return "MIOPEN_STATUS_UNKNOWN_ERROR";
-  }
-}
-
-extern void EnforceCUDNNLoaded(const char* fn_name);
-
-#define DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP(__name)                             \
-  struct DynLoad__##__name {                                                 \
-    template <typename... Args>                                              \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {         \
-      using miopen_func = decltype(&::__name);                               \
-      std::call_once(miopen_dso_flag, []() {                                 \
-        miopen_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle();  \
-      });                                                                    \
-      EnforceCUDNNLoaded(#__name);                                           \
-      static void* p_##__name = dlsym(miopen_dso_handle, #__name);           \
-      return reinterpret_cast<miopen_func>(p_##__name)(args...);             \
-    }                                                                        \
-  };                                                                         \
-  extern struct DynLoad__##__name __name
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP(__name)     \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
+  extern DynLoad__##__name __name
 
 /**
  * include all needed miopen functions in HPPL
@@ -145,23 +96,23 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
   __macro(miopenRNNForwardInference); \
   __macro(miopenGetTensorNumBytes);
 
-MIOPEN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+MIOPEN_DNN_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
 
 #define MIOPEN_DNN_ROUTINE_EACH_R2(__macro) \
   __macro(miopenConvolutionBackwardData);
-MIOPEN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+MIOPEN_DNN_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
 
 // APIs available after R3:
 #define MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \
   __macro(miopenConvolutionBackwardWeightsGetWorkSpaceSize);
-MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
 
 // APIs available after R4:
 #define MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(__macro)    \
   __macro(miopenBatchNormalizationForwardTraining);  \
   __macro(miopenBatchNormalizationForwardInference); \
   __macro(miopenBatchNormalizationBackward);
-MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
 
 // APIs in R5
 #define MIOPEN_DNN_ROUTINE_EACH_R5(__macro) \
@@ -169,12 +120,12 @@ MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
   __macro(miopenSetActivationDescriptor);   \
   __macro(miopenGetActivationDescriptor);   \
   __macro(miopenDestroyActivationDescriptor);
-MIOPEN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+MIOPEN_DNN_ROUTINE_EACH_R5(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
 
 // APIs in R6
 #define MIOPEN_DNN_ROUTINE_EACH_R6(__macro) \
   /*__macro(miopenSetRNNDescriptor_v6);*/
-MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+MIOPEN_DNN_ROUTINE_EACH_R6(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
 
 #define MIOPEN_DNN_ROUTINE_EACH_R7(__macro) \
   __macro(miopenSetConvolutionGroupCount);  \
@@ -184,7 +135,7 @@ MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
   __macro(miopenSetCTCLossDescriptor);      \
   __macro(miopenGetCTCLossWorkspaceSize);   \
   __macro(miopenCTCLoss);
-MIOPEN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+MIOPEN_DNN_ROUTINE_EACH_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
 
 #define MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(__macro)                     \
 /*__macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize);  \
@@ -192,7 +143,7 @@ __macro(cudnnBatchNormalizationForwardTrainingEx);                    \
 __macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize);           \
 __macro(cudnnBatchNormalizationBackwardEx);                           \
 __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);*/
-MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
 
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
@@ -18,9 +18,6 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag mklml_dso_flag;
-void* mklml_dso_handle = nullptr;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 MKLML_ROUTINE_EACH(DEFINE_WRAP);
......
@@ -17,36 +17,23 @@ limitations under the License. */
 #include <mkl.h>
 
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/mklml.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag mklml_dso_flag;
-extern void *mklml_dso_handle;
-
 /**
  * The following macro definition can generate structs
  * (for each function) to dynamic load mklml routine
 * via operator overloading.
 */
 #define DYNAMIC_LOAD_MKLML_WRAP(__name)                                      \
-  struct DynLoad__##__name {                                                 \
-    template <typename... Args>                                              \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {         \
-      using mklmlFunc = decltype(&::__name);                                 \
-      std::call_once(mklml_dso_flag, []() {                                  \
-        mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle();   \
-      });                                                                    \
-      static void *p_##_name = dlsym(mklml_dso_handle, #__name);             \
-      return reinterpret_cast<mklmlFunc>(p_##_name)(args...);                \
-    }                                                                        \
-  };                                                                         \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name;                \
   extern DynLoad__##__name __name
 
-#define DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) DYNAMIC_LOAD_MKLML_WRAP(__name)
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) \
+  DYNAMIC_LOAD_MKLML_WRAP(__name)
 
 #define MKLML_ROUTINE_EACH(__macro) \
   __macro(cblas_sgemm);             \
@@ -111,7 +98,7 @@ extern void *mklml_dso_handle;
   __macro(MKL_Set_Num_Threads);     \
   __macro(MKL_Get_Max_Threads);
 
-MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
+MKLML_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
 
 #if !defined(_WIN32)
 DYNAMIC_LOAD_MKLML_WRAP(mkl_scsrmm);
......
@@ -18,7 +18,7 @@ limitations under the License. */
 #include <mutex>  // NOLINT
 
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/port.h"
 
 namespace paddle {
 namespace platform {
@@ -32,18 +32,8 @@ extern void* mklrt_dso_handle;
 * (for each function) to dynamic load mkldfti routine
 * via operator overloading.
 */
 #define DYNAMIC_LOAD_MKLRT_WRAP(__name)                                     \
-  struct DynLoad__##__name {                                                \
-    template <typename... Args>                                             \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {        \
-      using mklrtFunc = decltype(&::__name);                                \
-      std::call_once(mklrt_dso_flag, []() {                                 \
-        mklrt_dso_handle = paddle::platform::dynload::GetMKLRTDsoHandle();  \
-      });                                                                   \
-      static void* p_##__name = dlsym(mklrt_dso_handle, #__name);           \
-      return reinterpret_cast<mklrtFunc>(p_##__name)(args...);              \
-    }                                                                       \
-  };                                                                        \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name;               \
   extern DynLoad__##__name __name
 
 // mkl_dfti.h has a macro that shadows the function with the same name
......
@@ -18,9 +18,6 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag nccl_dso_flag;
-void *nccl_dso_handle;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
@@ -16,28 +16,14 @@ limitations under the License. */
 #include <nccl.h>
 
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/nccl.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag nccl_dso_flag;
-extern void* nccl_dso_handle;
-
-#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                            \
-  struct DynLoad__##__name {                                              \
-    template <typename... Args>                                           \
-    auto operator()(Args... args) -> decltype(__name(args...)) {          \
-      using nccl_func = decltype(&::__name);                              \
-      std::call_once(nccl_dso_flag, []() {                                \
-        nccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle();  \
-      });                                                                 \
-      static void* p_##__name = dlsym(nccl_dso_handle, #__name);          \
-      return reinterpret_cast<nccl_func>(p_##__name)(args...);            \
-    }                                                                     \
-  };                                                                      \
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)       \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #define NCCL_RAND_ROUTINE_EACH(__macro) \
@@ -57,30 +43,30 @@ extern void* nccl_dso_handle;
   __macro(ncclReduceScatter);           \
   __macro(ncclGetErrorString);
 
-NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
+NCCL_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
 
 #if NCCL_VERSION_CODE >= 2212
 #define NCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast);
-NCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
+NCCL_RAND_ROUTINE_EACH_AFTER_2212(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
 #endif
 
 #if NCCL_VERSION_CODE >= 2304
 #define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion);
-NCCL_RAND_ROUTINE_EACH_AFTER_2304(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
+NCCL_RAND_ROUTINE_EACH_AFTER_2304(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
 #endif
 
 #if NCCL_VERSION_CODE >= 2703
 #define NCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
   __macro(ncclSend);                               \
   __macro(ncclRecv);
-NCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
+NCCL_RAND_ROUTINE_EACH_AFTER_2703(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
 #endif
 
 #if NCCL_VERSION_CODE >= 21100
 #define NCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \
   __macro(ncclRedOpCreatePreMulSum);                \
   __macro(ncclRedOpDestroy);
-NCCL_RAND_ROUTINE_EACH_AFTER_21100(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
+NCCL_RAND_ROUTINE_EACH_AFTER_21100(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
 #endif
 
 }  // namespace dynload
......
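The right-hand side of these hunks keeps the platform-level names alive as thin aliases: PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP only re-exports the wrapper type now defined in pten::dynload, while DEFINE_WRAP in the matching .cc still instantiates a platform-side global of that type, so existing call sites compile unchanged. A runnable sketch of that layering follows; the stubbed operator() and its return value are invented for illustration, standing in for the call_once/dlsym dispatch that actually lives in pten.

```cpp
#include <cstdio>

namespace pten {
namespace dynload {
// Stand-in for the real wrapper, which would dlsym("ncclGetVersion").
struct DynLoad__ncclGetVersion {
  template <typename... Args>
  int operator()(Args...) { return 2708; }  // stub value, not a real query
};
}  // namespace dynload
}  // namespace pten

namespace paddle {
namespace platform {
namespace dynload {
// What PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP(ncclGetVersion) declares ...
using DynLoad__ncclGetVersion = pten::dynload::DynLoad__ncclGetVersion;
// ... and what DEFINE_WRAP(ncclGetVersion) defines in the matching .cc:
DynLoad__ncclGetVersion ncclGetVersion;
}  // namespace dynload
}  // namespace platform
}  // namespace paddle

int main() {
  // Existing call sites are unaffected by the move to pten::dynload.
  std::printf("stub nccl version: %d\n",
              paddle::platform::dynload::ncclGetVersion());
  return 0;
}
```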
@@ -15,9 +15,6 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag nvjpeg_dso_flag;
-void *nvjpeg_dso_handle;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 NVJPEG_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
@@ -14,27 +14,14 @@ limitations under the License. */
 #include <nvjpeg.h>
 
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/nvjpeg.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
-extern std::once_flag nvjpeg_dso_flag;
-extern void *nvjpeg_dso_handle;
 
-#define DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP(__name)                              \
-  struct DynLoad__##__name {                                                  \
-    template <typename... Args>                                               \
-    nvjpegStatus_t operator()(Args... args) {                                 \
-      using nvjpegFunc = decltype(&::__name);                                 \
-      std::call_once(nvjpeg_dso_flag, []() {                                  \
-        nvjpeg_dso_handle = paddle::platform::dynload::GetNvjpegDsoHandle();  \
-      });                                                                     \
-      static void *p_##__name = dlsym(nvjpeg_dso_handle, #__name);            \
-      return reinterpret_cast<nvjpegFunc>(p_##__name)(args...);               \
-    }                                                                         \
-  };                                                                          \
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP(__name)     \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #define NVJPEG_RAND_ROUTINE_EACH(__macro) \
@@ -44,7 +31,7 @@ extern void *nvjpeg_dso_handle;
   __macro(nvjpegJpegStateDestroy);        \
   __macro(nvjpegDecode);
 
-NVJPEG_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP);
+NVJPEG_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP);
 
 }  // namespace dynload
 }  // namespace platform
......
@@ -13,23 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/nvrtc.h"
+#include "paddle/pten/backends/dynload/nvrtc.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag nvrtc_dso_flag;
-void* nvrtc_dso_handle = nullptr;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 NVRTC_ROUTINE_EACH(DEFINE_WRAP);
 
-bool HasNVRTC() {
-  std::call_once(nvrtc_dso_flag,
-                 []() { nvrtc_dso_handle = GetNVRTCDsoHandle(); });
-  return nvrtc_dso_handle != nullptr;
-}
+bool HasNVRTC() { return pten::dynload::HasNVRTC(); }
 
 }  // namespace dynload
 }  // namespace platform
......
@@ -17,30 +17,17 @@ limitations under the License. */
 #include <nvrtc.h>
 
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/nvrtc.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag nvrtc_dso_flag;
-extern void* nvrtc_dso_handle;
 extern bool HasNVRTC();
 
-#define DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name)                             \
-  struct DynLoad__##__name {                                                \
-    template <typename... Args>                                             \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {        \
-      using nvrtc_func = decltype(&::__name);                               \
-      std::call_once(nvrtc_dso_flag, []() {                                 \
-        nvrtc_dso_handle = paddle::platform::dynload::GetNVRTCDsoHandle();  \
-      });                                                                   \
-      static void* p_##__name = dlsym(nvrtc_dso_handle, #__name);           \
-      return reinterpret_cast<nvrtc_func>(p_##__name)(args...);             \
-    }                                                                       \
-  };                                                                        \
-  extern struct DynLoad__##__name __name
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name)      \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
+  extern DynLoad__##__name __name
 
 /**
 * include all needed nvrtc functions
@@ -56,9 +43,9 @@ extern bool HasNVRTC();
   __macro(nvrtcGetProgramLog);   \
   __macro(nvrtcGetProgramLogSize)
 
-NVRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVRTC_WRAP);
+NVRTC_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP);
 
-#undef DECLARE_DYNAMIC_LOAD_NVRTC_WRAP
+#undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP
 
 }  // namespace dynload
 }  // namespace platform
......
@@ -18,9 +18,6 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag nvtx_dso_flag;
-void *nvtx_dso_handle;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 NVTX_ROUTINE_EACH(DEFINE_WRAP);
......
@@ -17,36 +17,23 @@ limitations under the License. */
 #include <nvToolsExt.h>
 
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/nvtx.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
-extern std::once_flag nvtx_dso_flag;
-extern void *nvtx_dso_handle;
 
-#define DECLARE_DYNAMIC_LOAD_NVTX_WRAP(__name)                            \
-  struct DynLoad__##__name {                                              \
-    template <typename... Args>                                           \
-    int operator()(Args... args) {                                        \
-      using nvtxFunc = decltype(&::__name);                               \
-      std::call_once(nvtx_dso_flag, []() {                                \
-        nvtx_dso_handle = paddle::platform::dynload::GetNvtxDsoHandle();  \
-      });                                                                 \
-      static void *p_##__name = dlsym(nvtx_dso_handle, #__name);          \
-      return reinterpret_cast<nvtxFunc>(p_##__name)(args...);             \
-    }                                                                     \
-  };                                                                      \
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP(__name)       \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #define NVTX_ROUTINE_EACH(__macro) \
   __macro(nvtxRangePushA);         \
   __macro(nvtxRangePop);
 
-NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
+NVTX_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
 
-#undef DECLARE_DYNAMIC_LOAD_NVTX_WRAP
+#undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP
 
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
......
@@ -18,9 +18,6 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag rccl_dso_flag;
-void *rccl_dso_handle;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
@@ -16,28 +16,14 @@ limitations under the License. */
 #include <rccl.h>
 
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/rccl.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag rccl_dso_flag;
-extern void* rccl_dso_handle;
-
-#define DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name)                            \
-  struct DynLoad__##__name {                                              \
-    template <typename... Args>                                           \
-    auto operator()(Args... args) -> decltype(__name(args...)) {          \
-      using nccl_func = decltype(&::__name);                              \
-      std::call_once(rccl_dso_flag, []() {                                \
-        rccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle();  \
-      });                                                                 \
-      static void* p_##__name = dlsym(rccl_dso_handle, #__name);          \
-      return reinterpret_cast<nccl_func>(p_##__name)(args...);            \
-    }                                                                     \
-  };                                                                      \
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name)       \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #define RCCL_RAND_ROUTINE_EACH(__macro) \
@@ -57,18 +43,18 @@ extern void* rccl_dso_handle;
   __macro(ncclReduceScatter);           \
   __macro(ncclGetErrorString);
 
-RCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
+RCCL_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
 
 #if NCCL_VERSION_CODE >= 2212
 #define RCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast);
-RCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
+RCCL_RAND_ROUTINE_EACH_AFTER_2212(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
 #endif
 
 #if NCCL_VERSION_CODE >= 2703
 #define RCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
   __macro(ncclSend);                               \
   __macro(ncclRecv);
-RCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
+RCCL_RAND_ROUTINE_EACH_AFTER_2703(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
 #endif
 
 }  // namespace dynload
......
@@ -17,8 +17,6 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 namespace dynload {
-std::once_flag rocblas_dso_flag;
-void *rocblas_dso_handle = nullptr;
 
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
......
@@ -19,16 +19,12 @@ limitations under the License. */
 #include <mutex>  // NOLINT
 #include <type_traits>
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/rocblas.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag rocblas_dso_flag;
-extern void *rocblas_dso_handle;
-
 /**
  * The following macro definition can generate structs
  * (for each function) to dynamic load cublas routine
@@ -36,18 +32,8 @@ extern void *rocblas_dso_handle;
  *
 * note: default dynamic linked libs
 */
-#define DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP(__name)                             \
-  struct DynLoad__##__name {                                                  \
-    template <typename... Args>                                               \
-    rocblas_status operator()(Args... args) {                                 \
-      using rocblas_func = decltype(&::__name);                               \
-      std::call_once(rocblas_dso_flag, []() {                                 \
-        rocblas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
-      });                                                                     \
-      static void *p_##__name = dlsym(rocblas_dso_handle, #__name);           \
-      return reinterpret_cast<rocblas_func>(p_##__name)(args...);             \
-    }                                                                         \
-  };                                                                          \
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP(__name)    \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #define ROCBLAS_BLAS_ROUTINE_EACH(__macro) \
@@ -83,7 +69,7 @@ extern void *rocblas_dso_handle;
   __macro(rocblas_set_pointer_mode);       \
   __macro(rocblas_get_pointer_mode);
 
-ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
+ROCBLAS_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
 
 // APIs available after CUDA 8.0
 #define ROCBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
@@ -94,21 +80,21 @@ ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
   __macro(rocblas_zgemm_strided_batched);     \
   __macro(rocblas_hgemm_strided_batched);
 
-ROCBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
+ROCBLAS_BLAS_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
 
 // HIP not supported in ROCM3.5
 // #define ROCBLAS_BLAS_ROUTINE_EACH_R3(__macro)
 //   __macro(cublasSetMathMode);
 //   __macro(cublasGetMathMode);
-// ROCBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
+// ROCBLAS_BLAS_ROUTINE_EACH_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
 
 #define ROCBLAS_BLAS_ROUTINE_EACH_R4(__macro) \
   __macro(rocblas_gemm_batched_ex);           \
   __macro(rocblas_gemm_strided_batched_ex);
 
-ROCBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
+ROCBLAS_BLAS_ROUTINE_EACH_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
 
-#undef DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP
+#undef PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
@@ -13,22 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/rocm_driver.h"
+#include "paddle/pten/backends/dynload/rocm_driver.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag rocm_dso_flag;
-void* rocm_dso_handle = nullptr;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 ROCM_ROUTINE_EACH(DEFINE_WRAP);
 
-bool HasCUDADriver() {
-  std::call_once(rocm_dso_flag, []() { rocm_dso_handle = GetCUDADsoHandle(); });
-  return rocm_dso_handle != nullptr;
-}
+bool HasCUDADriver() { return pten::dynload::HasCUDADriver(); }
 
 }  // namespace dynload
 }  // namespace platform
......
@@ -17,30 +17,17 @@ limitations under the License. */
 #include <hip/hip_runtime.h>
 
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/rocm_driver.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag rocm_dso_flag;
-extern void* rocm_dso_handle;
 extern bool HasCUDADriver();
 
-#define DECLARE_DYNAMIC_LOAD_ROCM_WRAP(__name)                            \
-  struct DynLoad__##__name {                                              \
-    template <typename... Args>                                           \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {      \
-      using rocm_func = decltype(&::__name);                              \
-      std::call_once(rocm_dso_flag, []() {                                \
-        rocm_dso_handle = paddle::platform::dynload::GetCUDADsoHandle();  \
-      });                                                                 \
-      static void* p_##__name = dlsym(rocm_dso_handle, #__name);          \
-      return reinterpret_cast<rocm_func>(p_##__name)(args...);            \
-    }                                                                     \
-  };                                                                      \
-  extern struct DynLoad__##__name __name
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP(__name)       \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
+  extern DynLoad__##__name __name
 
 /**
 * include all needed cuda driver functions
@@ -59,9 +46,9 @@ extern bool HasCUDADriver();
   __macro(hipGetDeviceCount);  \
   __macro(hipDevicePrimaryCtxGetState)
 
-ROCM_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCM_WRAP);
+ROCM_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP);
 
-#undef DECLARE_DYNAMIC_LOAD_ROCM_WRAP
+#undef PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP
 
 }  // namespace dynload
 }  // namespace platform
......
(The diffs of 86 more changed files are collapsed in this view and not shown.)