Commit 1d755225 authored by J jim19930609

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into eager_dygraph_final_state_auto_codegen
......@@ -4,7 +4,8 @@ paddle/fluid/API_DEV.spec
paddle/fluid/API_PR.spec
paddle/fluid/op_use_default_grad_maker_DEV.spec
paddle/fluid/op_use_default_grad_maker_PR.spec
paddle/pten/api/*/api*
paddle/pten/api/*/api.*
paddle/pten/api/*/backward*
paddle/pten/include/*
paddle/pten/extension.h
......
......@@ -36,7 +36,7 @@ ENDIF()
if(NOT DEFINED XPU_BASE_URL)
SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220104")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220116")
else()
SET(XPU_BASE_URL "${XPU_BASE_URL}")
endif()
......
......@@ -27,7 +27,7 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows_utils.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
namespace butil {
class IOBuf;
......@@ -78,11 +78,11 @@ void DeserializeFromMultiVarMsgAndIOBuf(const MultiVarMsg& multi_msg,
const framework::Scope* scope);
void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg,
butil::IOBufBytesIterator& iobuf,
butil::IOBufBytesIterator& iobuf, // NOLINT
const platform::DeviceContext& ctx);
void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg,
butil::IOBufBytesIterator& iobuf,
butil::IOBufBytesIterator& iobuf, // NOLINT
const platform::DeviceContext& ctx);
std::string GetIntTypeEndpoint(const std::string& ip, const uint32_t& port);
......
......@@ -40,9 +40,9 @@
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle {
namespace distributed {
......@@ -202,7 +202,7 @@ class ValueBlock {
// value = _alloc.acquire(value_length_);
table[id] = value;
} else {
value = (VALUE *)(void *)(res->second);
value = (VALUE *)(void *)(res->second); // NOLINT
}
return value;
}
......@@ -282,8 +282,8 @@ class ValueBlock {
value->unseen_days_++;
if (value->unseen_days_ >= threshold) {
butil::return_object(iter->second);
//_alloc.release(iter->second);
//_alloc.release(value);
// _alloc.release(iter->second);
// _alloc.release(value);
iter = table.erase(iter);
} else {
++iter;
......
......@@ -216,8 +216,9 @@ void TensorAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) {
#define PADDLE_TENSOR_ADD(cpp_type) \
if (data_type == paddle::framework::DataTypeTrait<cpp_type>::DataType()) { \
TensorAddFunctor<cpp_type> func(numel, src_tensor->data<cpp_type>(), \
dst_tensor->mutable_data<cpp_type>()); \
TensorAddFunctor<cpp_type> func( \
numel, src_tensor->data<cpp_type>(), \
dst_tensor->mutable_data<cpp_type>(place)); \
paddle::platform::VisitPlace(place, func); \
return; \
}
......
add_subdirectory(final_state_generator)
#add_subdirectory(final_state_generator)
set(EAGER_GENERETOR_DEPS ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag)
......
......@@ -75,6 +75,10 @@ def GetAutoGradMetaName(string):
return f"{string}_autograd_meta"
def GetAutoGradMetaVectorName(string):
return f"{string}_autograd_meta_vec"
######################
### File Readers ###
######################
......@@ -219,10 +223,6 @@ def ParseYamlBackward(args_str, returns_str):
def ForwardsValidationCheck(forward_inputs_list, forward_attrs_list,
forward_returns_list, orig_forward_inputs_list,
orig_forward_attrs_list, orig_forward_returns_list):
# inputs_list = [ [input_name, input_type, orig_position], ...]
# attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]
# forward_returns_list = [ [ret_name, ret_type, orig_position] ...]
# orig_returns_list = [ [ret_type, orig_position], ...]
for i in range(len(forward_inputs_list)):
forward_input_name = forward_inputs_list[i][0]
forward_input_type = forward_inputs_list[i][1]
......@@ -270,9 +270,6 @@ def ForwardsValidationCheck(forward_inputs_list, forward_attrs_list,
def BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map,
backward_attrs_list):
# backward_fwd_input_map = { "name" : [type, is_fwd_input, orig_position] ...}
# backward_grad_input_map = { "name" : [type, fwd_position, orig_position] ...}
# backward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]
# Check Order: TensorWrappers, GradTensors, Attributes
max_fwd_input_position = -1
......@@ -291,10 +288,6 @@ def BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map,
def DetermineForwardPositionMap(forward_inputs_list, forward_returns_list):
# inputs_list = [ [input_name, input_type, orig_position], ...]
# forward_returns_list = [ [ret_name, ret_type, orig_position] ...]
# forward_position_map = { "name" : [type, fwd_position] ...}
forward_inputs_position_map = {}
forward_outputs_position_map = {}
for i in range(len(forward_inputs_list)):
......@@ -319,15 +312,6 @@ def DetermineForwardPositionMap(forward_inputs_list, forward_returns_list):
def SlotNameMatching(backward_inputs_list, backward_returns_list,
forward_inputs_position_map, forward_outputs_position_map):
# backward_inputs_list = [ [input_name, input_type, orig_position], ...]
# backward_returns_list = [ [ret_name, ret_type, orig_position], ...]
# forward_inputs_position_map = { "name" : [type, fwd_position] }
# forward_outputs_position_map = { "name" : [type, fwd_position] }
# backward_fwd_input_map = { "name" : [type, is_fwd_input, orig_position] ...}
# backward_grad_input_map = { "name" : [type, fwd_position, orig_position] ...}
# backward_grad_output_map = { "name" : [type, fwd_position, orig_position] ...}
backward_fwd_input_map = {}
backward_grad_input_map = {}
backward_grad_output_map = {}
......@@ -580,7 +564,14 @@ def GenerateNodeCreationCodes(fwd_api_name, bwd_api_name,
compute_require_grad_args_list = ["trace_backward"]
for name, (ttype, pos) in forward_inputs_position_map.items():
input_autograd_meta_name = GetAutoGradMetaName(name)
input_autograd_meta = f" auto* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({name});"
if IsPlainTensorType(ttype):
input_autograd_meta = f" egr::EagerTensor* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({name});"
else:
assert IsVectorTensorType(ttype)
input_autograd_meta_vec_name = GetAutoGradMetaVectorName(name)
input_autograd_meta = f" std::vector<egr::EagerTensor*> {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({name});\n"
input_autograd_meta += f" std::vector<egr::EagerTensor*>* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};"
inputs_autograd_meta_list.append(input_autograd_meta)
compute_require_grad_args_list.append(input_autograd_meta_name)
inputs_autograd_meta_str = "\n".join(inputs_autograd_meta_list)
......@@ -592,11 +583,23 @@ def GenerateNodeCreationCodes(fwd_api_name, bwd_api_name,
num_fwd_outputs = len(forward_outputs_position_map.keys())
for name, (rtype, pos) in forward_outputs_position_map.items():
output_autograd_meta_name = GetAutoGradMetaName(name)
output_autograd_meta_vec_name = GetAutoGradMetaVectorName(name)
if num_fwd_outputs == 1:
output_autograd_meta = f" auto* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(outputs);"
if IsPlainTensorType(rtype):
output_autograd_meta = f" egr::EagerTensor* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(outputs);"
else:
assert IsVectorTensorType(rtype)
output_autograd_meta = f" std::vector<egr::EagerTensor*> {output_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({outputs});\n"
output_autograd_meta += f" std::vector<egr::EagerTensor*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
else:
# Tuple api_result
outputs_autograd_meta = f" auto* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(outputs[{pos}]);"
if IsPlainTensorType(rtype):
outputs_autograd_meta = f" egr::EagerTensor* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(outputs[{pos}]);"
else:
assert IsVectorTensorType(rtype)
output_autograd_meta = f" std::vector<egr::EagerTensor*> {output_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta(outputs[{pos}]);\n"
output_autograd_meta += f" std::vector<egr::EagerTensor*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
outputs_autograd_meta_list.append(output_autograd_meta)
pass_stop_gradient_args_list.append(output_autograd_meta_name)
......@@ -786,7 +789,6 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
auto outputs = {};
// Node Creation
{}
// Returns
......@@ -903,17 +905,10 @@ if __name__ == "__main__":
# Collect Forward Inputs/Outputs
forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward(
bwd_forward_str)
print("Parsed Forward Inputs List: ", forward_inputs_list)
print("Prased Forward Attrs List: ", forward_attrs_list)
print("Parsed Forward Returns List: ", forward_returns_list)
# Collect Original Forward Inputs/Outputs and then perform validation checks
orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward(
fwd_args_str, fwd_returns_str)
print("Parsed Original Forward Inputs List: ", orig_forward_inputs_list)
print("Prased Original Forward Attrs List: ", orig_forward_attrs_list)
print("Parsed Original Forward Returns List: ",
orig_forward_returns_list)
# Forward Validation Checks
ForwardsValidationCheck(forward_inputs_list, forward_attrs_list,
......@@ -924,25 +919,15 @@ if __name__ == "__main__":
# Parse Backward Inputs/Outputs
backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward(
bwd_args_str, bwd_returns_str)
print("Parsed Backward Inputs List: ", backward_inputs_list)
print("Prased Backward Attrs List: ", backward_attrs_list)
print("Parsed Backward Returns List: ", backward_returns_list)
# Determine Forward Inputs/Outputs Position
forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap(
forward_inputs_list, forward_returns_list)
print("Generated Forward Input Position Map: ",
forward_inputs_position_map)
print("Generated Forward Output Position Map: ",
forward_outputs_position_map)
# SlotName Matching
backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching(
backward_inputs_list, backward_returns_list,
forward_inputs_position_map, forward_outputs_position_map)
print("Generated Backward Fwd Input Map: ", backward_fwd_input_map)
print("Generated Backward Grad Input Map: ", backward_grad_input_map)
print("Generated Backward Grad Output Map: ", backward_grad_output_map)
# Backward Validation Check
BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map,
......@@ -951,13 +936,11 @@ if __name__ == "__main__":
# Node Declaration Generation
node_declaration_str += GenerateNodeDeclaration(
fwd_api_name, backward_fwd_input_map, backward_attrs_list)
print("Generated Node Declaration: ", node_declaration_str)
node_definition_str += GenerateNodeDefinition(
fwd_api_name, bwd_api_name, backward_fwd_input_map,
backward_grad_input_map, backward_grad_output_map,
backward_attrs_list)
print("Generated Node Definition: ", node_definition_str)
# Node Definition Generation
definition_declaration_pair = GenerateForwardDefinition(
......@@ -965,8 +948,6 @@ if __name__ == "__main__":
forward_outputs_position_map, forward_attrs_list,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list)
print("Generated Forward Definition: ", forward_definition_str)
print("Generated Forward Declaration: ", forward_declaration_str)
forward_definition_str += definition_declaration_pair[0]
forward_declaration_str += definition_declaration_pair[1]
......
......@@ -36,7 +36,8 @@ TEST(AccumulationNode, EagerTensor) {
paddle::platform::CPUPlace())
.get(),
meta);
dt0->mutable_data<paddle::platform::float16>()[0] = 10.0;
dt0->mutable_data<paddle::platform::float16>(
paddle::platform::CPUPlace())[0] = 10.0;
EagerTensor et0 = EagerTensor(dt0);
std::shared_ptr<pten::DenseTensor> dt1 = std::make_shared<pten::DenseTensor>(
......@@ -45,7 +46,8 @@ TEST(AccumulationNode, EagerTensor) {
.get(),
meta);
dt1->mutable_data<paddle::platform::float16>()[0] = 20.0;
dt1->mutable_data<paddle::platform::float16>(
paddle::platform::CPUPlace())[0] = 20.0;
EagerTensor et1 = EagerTensor(dt1);
std::shared_ptr<pten::DenseTensor> grad_dt =
......
......@@ -46,7 +46,7 @@ TEST(AutogradMeta, MemberFunction) {
paddle::platform::CPUPlace())
.get(),
meta);
auto* dt_ptr = dt->mutable_data<float>();
auto* dt_ptr = dt->mutable_data<float>(paddle::platform::CPUPlace());
dt_ptr[0] = 5.0f;
dt_ptr[1] = 10.0f;
grad_t->set_impl(dt);
......
......@@ -40,7 +40,7 @@ TEST(EagerTensor, Constructor) {
paddle::platform::CPUPlace())
.get(),
meta);
auto* dt_ptr = dt->mutable_data<float>();
auto* dt_ptr = dt->mutable_data<float>(paddle::platform::CPUPlace());
dt_ptr[0] = 5.0f;
dt_ptr[1] = 10.0f;
egr::EagerTensor et3 = egr::EagerTensor(dt);
......@@ -70,7 +70,7 @@ TEST(EagerTensor, MemberFunction) {
paddle::platform::CPUPlace())
.get(),
meta);
auto* dt_ptr = dt->mutable_data<float>();
auto* dt_ptr = dt->mutable_data<float>(paddle::platform::CPUPlace());
dt_ptr[0] = 5.0f;
dt_ptr[1] = 10.0f;
VLOG(6) << "Make Dense Tensor";
......
......@@ -45,7 +45,7 @@ TEST(GradNodeInfo, GradNodeBase) {
paddle::platform::CPUPlace())
.get(),
meta);
auto* dt_ptr = dt->mutable_data<float>();
auto* dt_ptr = dt->mutable_data<float>(paddle::platform::CPUPlace());
dt_ptr[0] = 5.0f;
egr::EagerTensor et1(dt);
grads = {{et1}};
......@@ -102,7 +102,7 @@ TEST(GradNodeInfo, GradNodeBase) {
paddle::platform::CPUPlace())
.get(),
meta);
auto* dt_ptr = dt->mutable_data<float>();
auto* dt_ptr = dt->mutable_data<float>(paddle::platform::CPUPlace());
dt_ptr[0] = 6.0f;
auto* et_ptr =
std::dynamic_pointer_cast<pten::DenseTensor>(et.impl())->data<float>();
......@@ -121,8 +121,8 @@ TEST(GradNodeInfo, GradNodeBase) {
VLOG(6) << "Test Reduce Hook";
auto reduce_hook = [&](void) -> void {
auto* et_ptr = std::dynamic_pointer_cast<pten::DenseTensor>(et1.impl())
->mutable_data<float>();
auto* et_ptr =
std::dynamic_pointer_cast<pten::DenseTensor>(et1.impl())->data<float>();
et_ptr[0] = 100.0;
VLOG(6) << "Running Reduce Hook";
};
......
......@@ -41,7 +41,7 @@ class GradTestNode : public egr::GradNodeBase {
paddle::platform::CPUPlace())
.get(),
meta);
auto* dt_ptr = dt->mutable_data<float>();
auto* dt_ptr = dt->mutable_data<float>(paddle::platform::CPUPlace());
dt_ptr[0] = 6.0f;
egr::EagerTensor et1(dt);
std::vector<std::vector<egr::EagerTensor>> res = {{et1}};
......
......@@ -57,7 +57,7 @@ TEST(GradTensorHolder, Interfaces) {
paddle::platform::CPUPlace())
.get(),
meta);
dt0->mutable_data<float>()[0] = 10.0;
dt0->mutable_data<float>(paddle::platform::CPUPlace())[0] = 10.0;
EagerTensor et0 = EagerTensor(dt0);
std::shared_ptr<pten::DenseTensor> dt1 = std::make_shared<pten::DenseTensor>(
......@@ -65,7 +65,7 @@ TEST(GradTensorHolder, Interfaces) {
paddle::platform::CPUPlace())
.get(),
meta);
dt1->mutable_data<float>()[0] = 20.0;
dt1->mutable_data<float>(paddle::platform::CPUPlace())[0] = 20.0;
EagerTensor et1 = EagerTensor(dt1);
// Constructor empty GradTensorHolder
......
......@@ -29,7 +29,7 @@ TEST(TensorWrapper, Basic) {
paddle::platform::CPUPlace())
.get(),
meta);
auto* dt_ptr = dt->mutable_data<float>();
auto* dt_ptr = dt->mutable_data<float>(paddle::platform::CPUPlace());
dt_ptr[0] = 5.0f;
dt_ptr[1] = 10.0f;
et1.set_impl(dt);
......@@ -56,7 +56,7 @@ TEST(TensorWrapper, Basic) {
paddle::platform::CPUPlace())
.get(),
meta2);
auto* dt_ptr2 = dt->mutable_data<float>();
auto* dt_ptr2 = dt->mutable_data<float>(paddle::platform::CPUPlace());
dt_ptr2[0] = 6.0f;
dt_ptr2[1] = 11.0f;
et2.set_impl(dt2);
......
......@@ -35,7 +35,7 @@ TEST(EagerUtils, AutoGradMeta) {
paddle::platform::CPUPlace())
.get(),
meta);
dt0->mutable_data<float>()[0] = 10.0;
dt0->mutable_data<float>(paddle::platform::CPUPlace())[0] = 10.0;
EagerTensor et0 = EagerTensor(dt0);
std::shared_ptr<pten::DenseTensor> dt1 = std::make_shared<pten::DenseTensor>(
......@@ -43,7 +43,7 @@ TEST(EagerUtils, AutoGradMeta) {
paddle::platform::CPUPlace())
.get(),
meta);
dt1->mutable_data<float>()[0] = 20.0;
dt1->mutable_data<float>(paddle::platform::CPUPlace())[0] = 20.0;
EagerTensor et1 = EagerTensor(dt1);
std::vector<EagerTensor> ets = {et0, et1};
......@@ -112,7 +112,7 @@ egr::EagerTensor CreateTestCPUTensor(T val,
paddle::platform::CPUPlace())
.get(),
meta);
auto* dt_ptr = dt->mutable_data<T>();
auto* dt_ptr = dt->mutable_data<T>(paddle::platform::CPUPlace());
for (int64_t i = 0; i < dt->numel(); i++) {
dt_ptr[i] = val;
}
......
......@@ -44,8 +44,8 @@ egr::EagerTensor hook_function(const egr::EagerTensor& t) {
paddle::memory::Alloc(place, bytes_size)),
std::move(ret_meta));
float* t_ptr = t_dense->mutable_data<float>();
float* ret_ptr = ret_dense->mutable_data<float>();
float* t_ptr = t_dense->mutable_data<float>(place);
float* ret_ptr = ret_dense->mutable_data<float>(place);
for (int i = 0; i < ret_dense->numel(); i++) {
ret_ptr[i] = t_ptr[i] + 5.0;
}
......@@ -184,7 +184,7 @@ TEST(FwdBwdJoint, BranchedNodes) {
// Examine Forward Output 2
{
auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out2.impl());
float* ptr = dense_out->mutable_data<float>();
float* ptr = dense_out->mutable_data<float>(paddle::platform::CPUPlace());
for (int i = 0; i < 20; i++) {
PADDLE_ENFORCE(ptr[i] == 150.0,
paddle::platform::errors::Fatal(
......
......@@ -45,8 +45,8 @@ egr::EagerTensor hook_function(const egr::EagerTensor& t) {
paddle::memory::Alloc(place, bytes_size)),
std::move(ret_meta));
float* t_ptr = t_dense->mutable_data<float>();
float* ret_ptr = ret_dense->mutable_data<float>();
float* t_ptr = t_dense->mutable_data<float>(place);
float* ret_ptr = ret_dense->mutable_data<float>(place);
for (int i = 0; i < ret_dense->numel(); i++) {
ret_ptr[i] = t_ptr[i] + 3.0;
}
......
......@@ -34,7 +34,7 @@ bool CompareGradTensorWithValue(const egr::EagerTensor& target, T value) {
egr::AutogradMeta* meta = egr::EagerUtils::unsafe_autograd_meta(target);
auto grad_dense =
std::dynamic_pointer_cast<pten::DenseTensor>(meta->Grad().impl());
T* ptr = grad_dense->mutable_data<T>();
T* ptr = grad_dense->data<T>();
std::vector<T> host_data(grad_dense->numel());
if (paddle::platform::is_gpu_place(grad_dense->place())) {
......@@ -67,7 +67,7 @@ template <typename T>
bool CompareTensorWithValue(const egr::EagerTensor& target, T value) {
// TODO(jiabin): Support Selected Rows later
auto dense_t = std::dynamic_pointer_cast<pten::DenseTensor>(target.impl());
T* ptr = dense_t->mutable_data<T>();
T* ptr = dense_t->data<T>();
std::vector<T> host_data(dense_t->numel());
if (paddle::platform::is_gpu_place(dense_t->place())) {
......
......@@ -20,9 +20,9 @@ limitations under the License. */
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/eigen_ext.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/kernels/funcs/eigen/extensions.h"
namespace paddle {
namespace framework {
......
......@@ -38,8 +38,8 @@ limitations under the License. */
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/reader/blocking_queue.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/timer.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle {
namespace framework {
......
......@@ -19,14 +19,18 @@
#define _LINUX
#endif
#ifdef _LINUX
#ifndef likely
#define likely(x) __builtin_expect((x), 1)
#ifdef _LINUX
#define likely(expr) (__builtin_expect(!!(expr), 1))
#else
#define likely(expr) (expr)
#endif
#endif
#ifdef _LINUX
#ifndef unlikely
#define unlikely(x) __builtin_expect((x), 0)
#ifdef _LINUX
#define unlikely(expr) (__builtin_expect(!!(expr), 0))
#else
#define unlikely(expr) (expr)
#endif
#endif
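A minimal standalone sketch of the reworked likely/unlikely macros (approximated with a GCC/Clang check in place of the _LINUX guard used in this header):

#include <cstdio>

#if defined(__GNUC__) || defined(__clang__)
// The !!(expr) folds any truthy value to 0/1 before handing it to
// __builtin_expect, which is what the new definitions guarantee.
#define likely(expr) (__builtin_expect(!!(expr), 1))
#define unlikely(expr) (__builtin_expect(!!(expr), 0))
#else
// Fallback when the builtin is unavailable, mirroring the new #else branch.
#define likely(expr) (expr)
#define unlikely(expr) (expr)
#endif

int main() {
  int hits = 0;
  for (int i = 0; i < 100; ++i) {
    if (likely(i % 16 != 0)) ++hits;  // hint: this branch is usually taken
  }
  std::printf("hits: %d\n", hits);
  return 0;
}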
......@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include "paddle/fluid/framework/io/shell.h"
#include "paddle/fluid/platform/enforce.h"
......
......@@ -34,8 +34,8 @@
#include <utility>
#include <vector>
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/pten/backends/dynload/port.h"
#if defined(__arm__) || defined(__aarch64__) || defined(__ARM_NEON) || \
defined(__ARM_NEON__)
......
......@@ -34,7 +34,7 @@ limitations under the License. */
#include "paddle/fluid/framework/trainer_desc.pb.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/reader/blocking_queue.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle {
namespace framework {
......
......@@ -19,7 +19,7 @@
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle {
namespace inference {
......
......@@ -28,7 +28,7 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
#ifdef _WIN32
#include <direct.h>
......
......@@ -20,7 +20,7 @@
#include <vector>
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle {
namespace framework {
......
......@@ -31,8 +31,8 @@
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/pten/backends/dynload/port.h"
extern std::string paddle::framework::DataTypeToString(
const framework::proto::VarType::Type type);
......
......@@ -22,8 +22,8 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/pten/backends/dynload/port.h"
DECLARE_bool(use_mkldnn);
......
......@@ -8,7 +8,6 @@ set(pybind_file_final ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h)
file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operators/CMakeLists.txt. DO NOT EDIT!\n\n")
add_subdirectory(math)
add_subdirectory(eigen)
add_subdirectory(controlflow)
add_subdirectory(detection)
add_subdirectory(elementwise)
......
......@@ -23,7 +23,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
DECLARE_bool(use_mkldnn);
......
......@@ -22,49 +22,40 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename T>
struct LessThanFunctor {
using ELEM_TYPE = T;
HOSTDEVICE bool operator()(const T a, const T b) const { return a < b; }
};
template <typename T>
struct LessEqualFunctor {
using ELEM_TYPE = T;
HOSTDEVICE bool operator()(const T a, const T b) const { return a <= b; }
};
template <typename T>
struct GreaterThanFunctor {
using ELEM_TYPE = T;
HOSTDEVICE bool operator()(const T a, const T b) const { return a > b; }
};
template <typename T>
struct GreaterEqualFunctor {
using ELEM_TYPE = T;
HOSTDEVICE bool operator()(const T a, const T b) const { return a >= b; }
};
template <typename T>
#define COMPARE_FUNCTOR(func_name, op) \
template <typename InT, typename OutT = bool> \
struct func_name { \
using ELEM_TYPE = InT; \
HOSTDEVICE OutT operator()(const InT a, const InT b) const { \
return static_cast<OutT>(a op b); \
} \
};
COMPARE_FUNCTOR(LessThanFunctor, <)
COMPARE_FUNCTOR(LessEqualFunctor, <=)
COMPARE_FUNCTOR(GreaterThanFunctor, >)
COMPARE_FUNCTOR(GreaterEqualFunctor, >=)
#undef COMPARE_FUNCTOR
template <typename InT, typename OutT = bool>
struct EqualFunctor {
using ELEM_TYPE = T;
HOSTDEVICE bool operator()(const T a, const T b) const {
if (std::is_floating_point<T>::value) {
using ELEM_TYPE = InT;
HOSTDEVICE OutT operator()(const InT a, const InT b) const {
if (std::is_floating_point<InT>::value) {
// This branch will be optimized while compiling if T is integer. It is
// safe to cast a and b to double.
return fabs(static_cast<double>(a - b)) < 1e-8;
return static_cast<OutT>(fabs(static_cast<double>(a - b)) < 1e-8);
} else {
return (a == b);
return static_cast<OutT>(a == b);
}
}
};
template <typename T>
template <typename InT, typename OutT = bool>
struct NotEqualFunctor {
using ELEM_TYPE = T;
HOSTDEVICE bool operator()(const T a, const T b) const {
return !EqualFunctor<T>()(a, b);
using ELEM_TYPE = InT;
HOSTDEVICE bool operator()(const InT a, const InT b) const {
return !EqualFunctor<InT, OutT>()(a, b);
}
};
......
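A standalone sketch of the consolidated comparison-functor pattern above, showing why the extra OutT template parameter matters (HOSTDEVICE is defined away here to keep the sketch host-only):

#include <cstdint>
#include <cstdio>

// Host-only stand-in for Paddle's CUDA-aware HOSTDEVICE macro.
#define HOSTDEVICE

#define COMPARE_FUNCTOR(func_name, op)                           \
  template <typename InT, typename OutT = bool>                  \
  struct func_name {                                             \
    using ELEM_TYPE = InT;                                        \
    HOSTDEVICE OutT operator()(const InT a, const InT b) const { \
      return static_cast<OutT>(a op b);                          \
    }                                                             \
  };

COMPARE_FUNCTOR(LessThanFunctor, <)
COMPARE_FUNCTOR(GreaterThanFunctor, >)
#undef COMPARE_FUNCTOR

int main() {
  GreaterThanFunctor<float> gt;             // default OutT = bool
  GreaterThanFunctor<float, int64_t> gt64;  // explicit OutT, as the matrix_rank kernels request for their mask
  std::printf("%d %lld\n", static_cast<int>(gt(2.0f, 1.0f)),
              static_cast<long long>(gt64(2.0f, 1.0f)));
  return 0;
}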
file(GLOB EIGEN_CC_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
file(GLOB EIGEN_CU_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cu")
if(WITH_GPU)
nv_library(eigen_function SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES} DEPS eigen3)
elseif(WITH_ROCM)
hip_library(eigen_function SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES} DEPS eigen3)
else()
cc_library(eigen_function SRCS ${EIGEN_CC_SOURCES} DEPS eigen3)
endif()
......@@ -18,243 +18,71 @@ limitations under the License. */
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include "unsupported/Eigen/CXX11/Tensor"
#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
namespace paddle {
namespace operators {
template <typename EigenDevice, typename T, int Rank>
struct EigenBroadcast {
using Array = Eigen::DSizes<Eigen::DenseIndex, Rank>;
using InType = Eigen::TensorMap<
Eigen::Tensor<const T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
using InType32BitIndex =
Eigen::TensorMap<Eigen::Tensor<const T, Rank, Eigen::RowMajor, int>,
Eigen::Aligned>;
using OutType = Eigen::TensorMap<
Eigen::Tensor<T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
using OutType32BitIndex =
Eigen::TensorMap<Eigen::Tensor<T, Rank, Eigen::RowMajor, int>,
Eigen::Aligned>;
static void Eval(const EigenDevice& dev, OutType out, InType in,
const Array& bcast);
static void Eval(const EigenDevice& dev, OutType32BitIndex out,
InType32BitIndex in, const Array& bcast);
};
using EigenBroadcast = pten::funcs::EigenBroadcast<EigenDevice, T, Rank>;
template <typename EigenDevice, typename T, int Rank>
struct EigenBroadcastGrad {
using Array = Eigen::DSizes<Eigen::DenseIndex, Rank>;
using Array2 = Eigen::DSizes<Eigen::DenseIndex, Rank * 2>;
using InType = Eigen::TensorMap<
Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
using OutType =
Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
static void Eval(const EigenDevice& dev, OutType out, InType in,
const Array& reduce_dims, const Array2& reshape_dims);
};
using EigenBroadcastGrad =
pten::funcs::EigenBroadcastGrad<EigenDevice, T, Rank>;
template <typename EigenDevice, typename T, int Rank>
struct EigenConstant {
using Type = Eigen::TensorMap<
Eigen::Tensor<T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
static void Eval(const EigenDevice& dev, Type out, const T value);
};
using EigenConstant = pten::funcs::EigenConstant<EigenDevice, T, Rank>;
template <typename EigenDevice, typename T>
struct EigenSign {
using InType = Eigen::TensorMap<
Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
using OutType =
Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
static void Eval(const EigenDevice& dev, OutType out, const InType& in);
};
using EigenSign = pten::funcs::EigenSign<EigenDevice, T>;
template <typename EigenDevice, typename T, int Rank>
struct EigenReverse {
using Array = Eigen::DSizes<bool, Rank>;
using InType = Eigen::TensorMap<
Eigen::Tensor<const T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
using OutType = Eigen::TensorMap<
Eigen::Tensor<T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
static void Eval(const EigenDevice& dev, OutType out, const InType& in,
const Array& reverse);
};
using EigenReverse = pten::funcs::EigenReverse<EigenDevice, T, Rank>;
template <typename EigenDevice, typename T>
struct EigenAdd {
using InType = Eigen::TensorMap<Eigen::TensorFixedSize<
const T, Eigen::Sizes<>, Eigen::RowMajor, Eigen::DenseIndex>>;
using OutType = Eigen::TensorMap<Eigen::TensorFixedSize<
T, Eigen::Sizes<>, Eigen::RowMajor, Eigen::DenseIndex>>;
static void Eval(const EigenDevice& dev, OutType out, const InType& in,
const T value);
};
using EigenAdd = pten::funcs::EigenAdd<EigenDevice, T>;
template <typename EigenDevice, typename T>
struct EigenSub {
using InType = Eigen::TensorMap<
Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
using OutType =
Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
static void Eval(const EigenDevice& dev, OutType out, const InType& left,
const InType& right);
};
using EigenSub = pten::funcs::EigenSub<EigenDevice, T>;
template <typename EigenDevice, typename T, int Rank>
struct EigenSlice {
using Array = Eigen::DSizes<Eigen::DenseIndex, Rank>;
using Array32Bit = Eigen::DSizes<int, Rank>;
using InType = Eigen::TensorMap<
Eigen::Tensor<const T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
using InType32BitIndex =
Eigen::TensorMap<Eigen::Tensor<const T, Rank, Eigen::RowMajor, int>,
Eigen::Aligned>;
using OutType = Eigen::TensorMap<
Eigen::Tensor<T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
using OutType32BitIndex =
Eigen::TensorMap<Eigen::Tensor<T, Rank, Eigen::RowMajor, int>,
Eigen::Aligned>;
static void Eval(const EigenDevice& dev, OutType out, const InType& in,
const Array& offsets, const Array& extents);
static void Eval(const EigenDevice& dev, OutType32BitIndex out,
const InType32BitIndex& in, const Array32Bit& offsets,
const Array32Bit& extents);
};
using EigenSlice = pten::funcs::EigenSlice<EigenDevice, T, Rank>;
template <typename EigenDevice, typename T, int Rank>
struct EigenPad {
using Array = std::array<std::pair<int64_t, int64_t>, Rank>;
using Array32Bit = std::array<std::pair<int, int>, Rank>;
using InType = Eigen::TensorMap<
Eigen::Tensor<const T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
using InType32BitIndex =
Eigen::TensorMap<Eigen::Tensor<const T, Rank, Eigen::RowMajor, int>,
Eigen::Aligned>;
using OutType = Eigen::TensorMap<
Eigen::Tensor<T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
using OutType32BitIndex =
Eigen::TensorMap<Eigen::Tensor<T, Rank, Eigen::RowMajor, int>,
Eigen::Aligned>;
static void Eval(const EigenDevice& dev, OutType out, const InType& in,
const Array& padding, const T value);
static void Eval(const EigenDevice& dev, OutType32BitIndex out,
const InType32BitIndex& in, const Array32Bit& padding,
const T value);
};
using EigenPad = pten::funcs::EigenPad<EigenDevice, T, Rank>;
template <typename EigenDevice, typename T>
struct EigenScale {
using InType = Eigen::TensorMap<
Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
using OutType =
Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
static void Eval(const EigenDevice& dev, OutType out, const InType& in,
const T scale, const T bias, const bool bias_after_scale);
};
using EigenScale = pten::funcs::EigenScale<EigenDevice, T>;
template <typename EigenDevice, typename T>
struct EigenErf {
using InType = Eigen::TensorMap<
Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
using OutType =
Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
static void Eval(const EigenDevice& dev, OutType out, const InType& in);
};
using EigenErf = pten::funcs::EigenErf<EigenDevice, T>;
template <typename EigenDevice, typename T>
struct EigenErfGrad {
using InType = Eigen::TensorMap<
Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
using OutType =
Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
static void Eval(const EigenDevice& dev, OutType din, const InType& in,
const InType& dout);
};
using EigenErfGrad = pten::funcs::EigenErfGrad<EigenDevice, T>;
template <typename EigenDevice, typename T>
struct EigenRankLoss {
using InType = Eigen::TensorMap<
Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
using OutType =
Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
static void Eval(const EigenDevice& dev, OutType out, const InType& label,
const InType& left, const InType& right);
};
using EigenRankLoss = pten::funcs::EigenRankLoss<EigenDevice, T>;
template <typename EigenDevice, typename T>
struct EigenRankLossGrad {
using InType = Eigen::TensorMap<
Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
using OutType =
Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
static void EvalLeft(const EigenDevice& dev, OutType dleft,
const InType& dout, const InType& label,
const InType& left, const InType& right);
static void EvalRight(const EigenDevice& dev, OutType dright,
const InType& dout, const InType& label,
const InType& left, const InType& right);
};
using EigenRankLossGrad = pten::funcs::EigenRankLossGrad<EigenDevice, T>;
template <typename EigenDevice, typename T>
struct EigenLogLoss {
using InType = Eigen::TensorMap<
Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
using OutType =
Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
static void Eval(const EigenDevice& dev, OutType out, const InType& pred,
const InType& label, const T& epsilon);
};
using EigenLogLoss = pten::funcs::EigenLogLoss<EigenDevice, T>;
template <typename EigenDevice, typename T>
struct EigenLogLossGrad {
using InType = Eigen::TensorMap<
Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
using OutType =
Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
static void Eval(const EigenDevice& dev, OutType dpred, const InType& dloss,
const InType& pred, const InType& label, const T& epsilon);
};
using EigenLogLossGrad = pten::funcs::EigenLogLossGrad<EigenDevice, T>;
template <typename EigenDevice, typename T>
struct EigenHingeLoss {
using InType = Eigen::TensorMap<
Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
using OutType =
Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
static void Eval(const EigenDevice& dev, OutType loss, const InType& pred,
const InType& label);
};
using EigenHingeLoss = pten::funcs::EigenHingeLoss<EigenDevice, T>;
template <typename EigenDevice, typename T>
struct EigenHingeLossGrad {
using InType = Eigen::TensorMap<
Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
using OutType =
Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
static void Eval(const EigenDevice& dev, OutType dpred, const InType& dloss,
const InType& pred, const InType& label);
};
using EigenHingeLossGrad = pten::funcs::EigenHingeLossGrad<EigenDevice, T>;
template <typename EigenDevice, typename T>
struct EigenL1Norm {
using InType = Eigen::TensorMap<
Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
using OutType = Eigen::TensorMap<Eigen::TensorFixedSize<
T, Eigen::Sizes<>, Eigen::RowMajor, Eigen::DenseIndex>>;
static void Eval(const EigenDevice& dev, OutType out, const InType& in);
};
using EigenL1Norm = pten::funcs::EigenL1Norm<EigenDevice, T>;
template <typename EigenDevice, typename T>
struct EigenL1NormGrad {
using Array = Eigen::DSizes<Eigen::DenseIndex, 1>;
using InType = Eigen::TensorMap<
Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
using OutType =
Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
static void Eval(const EigenDevice& dev, OutType din, const InType& dout,
const InType& in, const Array& bcast);
};
using EigenL1NormGrad = pten::funcs::EigenL1NormGrad<EigenDevice, T>;
} // namespace operators
} // namespace paddle
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/expect.h"
#include "paddle/fluid/operators/fused/fusion_gru_op.h"
#include "paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h"
......@@ -41,7 +42,7 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
ctx.InputName("X") + ctx.InputName("WeightH")) {
const bool is_INT8 = std::is_same<T, uint8_t>::value;
if (!this->isCached()) {
if (unlikely(!this->isCached())) {
// oneDNN kernel has hardcoded activation functions
PADDLE_ENFORCE_EQ(
ctx.Attr<std::string>("gate_activation"), "sigmoid",
......
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/expect.h"
#include "paddle/fluid/operators/fused/fusion_lstm_op.h"
#include "paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h"
......@@ -40,7 +41,7 @@ class LSTMMKLDNNHandler
ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, weight_h, h0,
is_reverse, N, Ti, IC, OC, 4,
ctx.InputName("X") + ctx.InputName("WeightH")) {
if (!this->isCached()) {
if (unlikely(!this->isCached())) {
const bool is_INT8 = std::is_same<T, uint8_t>::value;
const bool use_peepholes = ctx.Attr<bool>("use_peepholes");
// oneDNN kernel has hardcoded activation functions
......
......@@ -15,9 +15,9 @@
#pragma once
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/platform/eigen_ext.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/kernels/funcs/eigen/extensions.h"
namespace paddle {
namespace operators {
......
......@@ -477,6 +477,155 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
}
};
#ifdef PADDLE_WITH_XPU
template <typename T>
struct MergeAdd<platform::XPUDeviceContext, T> {
framework::SelectedRows operator()(const platform::XPUDeviceContext& context,
const framework::SelectedRows& input,
const bool sorted_result = false) {
framework::SelectedRows out;
(*this)(context, input, &out, sorted_result);
return out;
}
void operator()(const platform::XPUDeviceContext& context,
const framework::SelectedRows& input,
framework::SelectedRows* output,
const bool sorted_result = false) {
framework::Vector<int64_t> input_rows(input.rows());
if (input_rows.size() == 0) {
return;
}
framework::SelectedRows& out = *output;
std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
auto input_width = input.value().dims()[1];
out.set_rows(merge_rows);
out.set_height(input.height());
out.mutable_value()->mutable_data<T>(
framework::make_ddim(
{static_cast<int64_t>(merge_rows.size()), input_width}),
context.GetPlace());
int r =
xpu::constant<T>(context.x_context(), out.mutable_value()->data<T>(),
merge_rows.size() * input_width, static_cast<T>(0.f));
PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
platform::errors::External("XPU constant op return"
" wrong value[%d %s].",
r, XPUAPIErrorMsg[r]));
std::unordered_map<int64_t, size_t> rows_to_id;
for (size_t i = 0; i < merge_rows.size(); ++i) {
rows_to_id[merge_rows[i]] = i;
}
auto* out_data = out.mutable_value()->data<T>();
auto* input_data = input.value().data<T>();
int n = input_width;
for (size_t i = 0; i < input_rows.size(); i++) {
size_t out_i = rows_to_id[input_rows[i]];
auto r = xpu::add(context.x_context(), &input_data[i * input_width],
&out_data[out_i * input_width],
&out_data[out_i * input_width], n);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU API return wrong value[%d %s], ", r,
XPUAPIErrorMsg[r]));
}
}
void operator()(const platform::XPUDeviceContext& context,
const std::vector<const framework::SelectedRows*>& inputs,
framework::SelectedRows* output,
const bool sorted_result = false) {
if (inputs.size() == 0) {
VLOG(3) << "no input! return";
return;
}
const framework::SelectedRows* has_value_input = nullptr;
for (auto* in : inputs) {
if (in->rows().size() > 0) {
has_value_input = in;
break;
}
}
if (has_value_input == nullptr) {
VLOG(3) << "no input has value! just return" << std::endl;
return;
}
auto input_width = has_value_input->value().dims()[1];
auto input_height = has_value_input->height();
framework::SelectedRows& out = *output;
std::set<int64_t> merged_row_set;
size_t row_num = 0;
for (auto* input : inputs) {
if (input->rows().size() == 0) {
continue;
}
PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
platform::errors::InvalidArgument(
"All inputs should have same "
"dimension except for the first one."));
PADDLE_ENFORCE_EQ(input_height, input->height(),
platform::errors::InvalidArgument(
"All inputs should have same height."));
row_num += input->rows().size();
merged_row_set.insert(input->rows().begin(), input->rows().end());
}
std::vector<int64_t> merge_rows(merged_row_set.begin(),
merged_row_set.end());
if (sorted_result) {
std::sort(merge_rows.begin(), merge_rows.end());
}
out.set_rows(merge_rows);
out.set_height(input_height);
out.mutable_value()->mutable_data<T>(
framework::make_ddim(
{static_cast<int64_t>(merged_row_set.size()), input_width}),
context.GetPlace());
int r =
xpu::constant<T>(context.x_context(), out.mutable_value()->data<T>(),
merge_rows.size() * input_width, static_cast<T>(0.f));
PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
platform::errors::External("XPU constant op return"
" wrong value[%d %s].",
r, XPUAPIErrorMsg[r]));
float* out_data = reinterpret_cast<float*>(out.mutable_value()->data<T>());
std::unordered_map<int64_t, size_t> rows_to_id;
for (size_t i = 0; i < merge_rows.size(); ++i) {
rows_to_id[merge_rows[i]] = i;
}
for (auto* input : inputs) {
if (input->rows().size() == 0) {
continue;
}
auto& input_rows = input->rows();
int n = input_width;
for (size_t i = 0; i < input_rows.size(); i++) {
size_t out_i = rows_to_id[input_rows[i]];
auto r = xpu::add(
context.x_context(), input->value().data<T>() + i * input_width,
&out_data[out_i * input_width], &out_data[out_i * input_width], n);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU API return wrong value[%d %s], ", r,
XPUAPIErrorMsg[r]));
}
}
}
};
#endif
template <typename T>
struct MergeAverage<platform::CPUDeviceContext, T> {
framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
......@@ -589,6 +738,10 @@ template struct MergeAdd<platform::CPUDeviceContext,
template struct MergeAdd<platform::CPUDeviceContext,
paddle::platform::bfloat16>;
#ifdef PADDLE_WITH_XPU
template struct MergeAdd<platform::XPUDeviceContext, float>;
#endif
template struct MergeAverage<platform::CPUDeviceContext, int>;
template struct MergeAverage<platform::CPUDeviceContext, int64_t>;
template struct MergeAverage<platform::CPUDeviceContext, float>;
......
......@@ -219,18 +219,20 @@ class MatrixRankCPUKernel : public framework::OpKernel<T> {
tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1));
Tensor compare_result;
compare_result.mutable_data<int>(detail::NewAxisDim(dim_out, k),
context.GetPlace());
compare_result.mutable_data<int64_t>(detail::NewAxisDim(dim_out, k),
context.GetPlace());
int axis = -1;
if (eigenvalue_tensor.dims().size() >= tol_tensor.dims().size()) {
ElementwiseComputeEx<GreaterThanFunctor<T>, platform::CPUDeviceContext, T,
int>(context, &eigenvalue_tensor, &tol_tensor, axis,
GreaterThanFunctor<T>(), &compare_result);
ElementwiseComputeEx<GreaterThanFunctor<T, int64_t>,
platform::CPUDeviceContext, T, int>(
context, &eigenvalue_tensor, &tol_tensor, axis,
GreaterThanFunctor<T, int64_t>(), &compare_result);
} else {
ElementwiseComputeEx<LessThanFunctor<T>, platform::CPUDeviceContext, T,
int>(context, &eigenvalue_tensor, &tol_tensor, axis,
LessThanFunctor<T>(), &compare_result);
ElementwiseComputeEx<LessThanFunctor<T, int64_t>,
platform::CPUDeviceContext, T, int>(
context, &eigenvalue_tensor, &tol_tensor, axis,
LessThanFunctor<T, int64_t>(), &compare_result);
}
auto dito_int =
math::DeviceIndependenceTensorOperations<platform::CPUDeviceContext,
......
......@@ -129,10 +129,10 @@ class MatrixRankGPUKernel : public framework::OpKernel<T> {
compare_result.mutable_data<int64_t>(detail::NewAxisDim(dim_out, k),
context.GetPlace());
int axis = -1;
ElementwiseComputeEx<GreaterThanFunctor<T>, platform::CUDADeviceContext, T,
int64_t>(context, &eigenvalue_tensor, &tol_tensor,
axis, GreaterThanFunctor<T>(),
&compare_result);
ElementwiseComputeEx<GreaterThanFunctor<T, int64_t>,
platform::CUDADeviceContext, T, int64_t>(
context, &eigenvalue_tensor, &tol_tensor, axis,
GreaterThanFunctor<T, int64_t>(), &compare_result);
auto dito_int =
math::DeviceIndependenceTensorOperations<platform::CUDADeviceContext,
int64_t>(context);
......
......@@ -16,6 +16,7 @@
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/controlflow/compare_op.h"
namespace paddle {
namespace operators {
......@@ -46,16 +47,6 @@ static DDim RemoveLastDim(const DDim& dim) {
}
} // namespace detail
template <typename T>
struct GreaterThanFunctor {
HOSTDEVICE int operator()(const T a, const T b) const { return a > b; }
};
template <typename T>
struct LessThanFunctor {
HOSTDEVICE int operator()(const T a, const T b) const { return a < b; }
};
template <typename T>
struct GreaterElementFunctor {
HOSTDEVICE T operator()(const T a, const T b) const {
......
......@@ -14,6 +14,7 @@
#include <tuple>
#include "paddle/fluid/framework/expect.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
......@@ -79,7 +80,7 @@ class ConvMKLDNNHandlerT
dev_ctx, mkldnn_engine, cpu_place,
platform::CreateKey(dev_ctx, framework::vectorize(input->dims()),
unique_name)) {
if (!this->isCached()) {
if (unlikely(!this->isCached())) {
PADDLE_ENFORCE_EQ(
input->layout(), framework::DataLayout::kMKLDNN,
platform::errors::InvalidArgument(
......@@ -264,7 +265,7 @@ class ConvMKLDNNHandlerT
dev_ctx, dev_ctx.GetEngine(), cpu_place,
platform::CreateKey(dev_ctx, framework::vectorize(in->dims()),
unique_name)) {
if (!this->isBwdCached()) {
if (unlikely(!this->isBwdCached())) {
PADDLE_ENFORCE_EQ(
in->layout(), framework::DataLayout::kMKLDNN,
platform::errors::InvalidArgument(
......
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/expect.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace paddle {
......@@ -39,7 +40,7 @@ class PReluMKLDNNHandler
dev_ctx, engine, cpu_place,
platform::CreateKey(dev_ctx, framework::vectorize(x->dims()),
uniq_name)) {
if (!this->isCached()) {
if (unlikely(!this->isCached())) {
auto x_md = memory::desc(framework::vectorize(x->dims()),
MKLDNNGetDataType<T>(), x->format());
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/optimizers/adam_op.h"
#include "gflags/gflags.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
namespace paddle {
namespace operators {
......@@ -155,6 +156,11 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
mom2_out.template mutable_data<float>(ctx.GetPlace()),
param_out.template mutable_data<float>(ctx.GetPlace()),
beta1, beta2, epsilon, param.numel());
xpu_wait(dev_ctx.x_context()->xpu_stream);
PADDLE_ENFORCE_EQ(
r == xpu::Error_t::SUCCESS, true,
platform::errors::External("XPU API return wrong value[%d],", r));
if (!use_global_beta_pow) {
// update in cpu and then copy to xpu
if (beta1_pow.place() == platform::CPUPlace() &&
......@@ -165,7 +171,6 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
const float* beta2_pow_p = beta2_pow.template data<float>();
beta2_pow_out->mutable_data<float>(platform::CPUPlace())[0] =
beta2 * beta2_pow_p[0];
xpu_wait(dev_ctx.x_context()->xpu_stream);
} else {
float* beta1_pow_out_p =
beta1_pow_out->mutable_data<float>(ctx.GetPlace());
......@@ -177,23 +182,129 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(
r, xpu::SUCCESS,
platform::errors::External(
"XPU kernel scale occur error in adamw error code ", r,
"XPU kernel scale occur error in adam error code ", r,
XPUAPIErrorMsg[r]));
r = xpu::scale(dev_ctx.x_context(), beta2_pow_ptr, beta2_pow_out_p,
beta2_pow.numel(), false, beta2, 0.0f);
PADDLE_ENFORCE_EQ(
r, xpu::SUCCESS,
platform::errors::External(
"XPU kernel scale occur error in adamw error code ", r,
"XPU kernel scale occur error in adam error code ", r,
XPUAPIErrorMsg[r]));
xpu_wait(dev_ctx.x_context()->xpu_stream);
}
}
} else if (grad_var->IsType<framework::SelectedRows>()) {
auto* grad = ctx.Input<framework::SelectedRows>("Grad");
auto& dev_ctx = ctx.template device_context<DeviceContext>();
if (grad->rows().size() == 0) {
VLOG(3) << "grad row size is 0!!";
return;
}
std::vector<int64_t> cpu_rows(grad->rows().begin(), grad->rows().end());
bool is_strict_sorted = true;
for (size_t i = 1; i < cpu_rows.size(); ++i) {
if (cpu_rows[i - 1] >= cpu_rows[i]) {
is_strict_sorted = false;
break;
}
}
framework::SelectedRows tmp_grad_merge;
const framework::SelectedRows* grad_merge_ptr;
if (is_strict_sorted) {
grad_merge_ptr = grad;
} else {
scatter::MergeAdd<platform::XPUDeviceContext, T> merge_func;
merge_func(ctx.template device_context<platform::XPUDeviceContext>(),
*grad, &tmp_grad_merge, true);
xpu_wait(dev_ctx.x_context()->xpu_stream);
grad_merge_ptr = &tmp_grad_merge;
}
const T* beta1_pow_ptr = beta1_pow.template data<T>();
const T* beta2_pow_ptr = beta2_pow.template data<T>();
Tensor xpu_beta1_pow;
Tensor xpu_beta2_pow;
if (beta1_pow.place() == platform::CPUPlace() &&
beta2_pow.place() == platform::CPUPlace()) {
paddle::framework::TensorCopy(beta1_pow, ctx.GetPlace(), dev_ctx,
&xpu_beta1_pow);
paddle::framework::TensorCopy(beta2_pow, ctx.GetPlace(), dev_ctx,
&xpu_beta2_pow);
dev_ctx.Wait();
beta1_pow_ptr = xpu_beta1_pow.template data<T>();
beta2_pow_ptr = xpu_beta2_pow.template data<T>();
}
auto& grad_merge = *grad_merge_ptr;
auto& grad_tensor = grad_merge.value();
const T* grad_data = grad_tensor.template data<T>();
int row_count = grad_merge.rows().size();
std::vector<int> rows(row_count);
xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
int* xpu_rows = RAII_GUARD.alloc_l3_or_gm<int>(row_count);
std::vector<int64_t> merge_rows(grad_merge.rows().begin(),
grad_merge.rows().end());
for (size_t i = 0; i < grad_merge.rows().size(); ++i) {
rows[i] = static_cast<int>(merge_rows[i]);
}
xpu_wait(dev_ctx.x_context()->xpu_stream);
memory::Copy(ctx.GetPlace(), xpu_rows, platform::CPUPlace(), rows.data(),
row_count * sizeof(int));
auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
auto ori_rows = param.numel() / row_numel;
PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
platform::errors::External(
"XPU API return wrong value[%d], please check "
"where Baidu Kunlun Card is properly installed.",
r));
int lazy_mode = static_cast<int>(ctx.Attr<bool>("lazy_mode"));
int r = xpu::sparse_adam(
dev_ctx.x_context(), grad_data, mom1.template data<T>(),
mom2.template data<T>(), param.template data<T>(), beta1_pow_ptr,
beta2_pow_ptr, lr.template data<T>(),
mom1_out.template mutable_data<T>(ctx.GetPlace()),
mom2_out.template mutable_data<T>(ctx.GetPlace()),
param_out.template mutable_data<T>(ctx.GetPlace()), beta1, beta2,
epsilon, ori_rows, xpu_rows, row_numel, grad_merge.rows().size(),
lazy_mode);
PADDLE_ENFORCE_EQ(
r == xpu::Error_t::SUCCESS, true,
platform::errors::External("XPU API return wrong value[%d],", r));
if (!use_global_beta_pow) {
// update in cpu and then copy to xpu
if (beta1_pow.place() == platform::CPUPlace() &&
beta2_pow.place() == platform::CPUPlace()) {
const float* beta1_pow_p = beta1_pow.template data<float>();
beta1_pow_out->mutable_data<float>(platform::CPUPlace())[0] =
beta1 * beta1_pow_p[0];
const float* beta2_pow_p = beta2_pow.template data<float>();
beta2_pow_out->mutable_data<float>(platform::CPUPlace())[0] =
beta2 * beta2_pow_p[0];
} else {
float* beta1_pow_out_p =
beta1_pow_out->mutable_data<float>(ctx.GetPlace());
float* beta2_pow_out_p =
beta2_pow_out->mutable_data<float>(ctx.GetPlace());
int r =
xpu::scale(dev_ctx.x_context(), beta1_pow_ptr, beta1_pow_out_p,
beta1_pow.numel(), false, beta1, 0.0f);
PADDLE_ENFORCE_EQ(
r, xpu::SUCCESS,
platform::errors::External(
"XPU kernel scale occur error in adam error code ", r,
XPUAPIErrorMsg[r]));
r = xpu::scale(dev_ctx.x_context(), beta2_pow_ptr, beta2_pow_out_p,
beta2_pow.numel(), false, beta2, 0.0f);
PADDLE_ENFORCE_EQ(
r, xpu::SUCCESS,
platform::errors::External(
"XPU kernel scale occur error in adam error code ", r,
XPUAPIErrorMsg[r]));
}
}
xpu_wait(dev_ctx.x_context()->xpu_stream);
} else {
PADDLE_ENFORCE_EQ(1, 2, platform::errors::InvalidArgument(
"Variable type not supported by adam_op"));
......
......@@ -22,8 +22,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/algorithm.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/operators/math/squared_l2_norm.h"
#include "paddle/fluid/platform/eigen_ext.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/pten/kernels/funcs/eigen/extensions.h"
namespace paddle {
namespace operators {
......
......@@ -27,7 +27,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle {
namespace operators {
......
......@@ -72,7 +72,8 @@ struct BinaryOperation<platform::CUDADeviceContext, BinaryFunctor, T> {
}
};
template <template <typename T> typename CompareFunctor, typename T>
template <template <typename InT, typename OutT> typename CompareFunctor,
typename T>
struct GetMask<platform::CUDADeviceContext, CompareFunctor, T> {
void operator()(const framework::ExecutionContext& ctx, const Tensor& lhs,
const Tensor& rhs, Tensor* mask) {
......@@ -81,7 +82,7 @@ struct GetMask<platform::CUDADeviceContext, CompareFunctor, T> {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kBinary, int64_t, T>(dev_ctx, ins, &outs,
CompareFunctor<int64_t>());
CompareFunctor<int64_t, T>());
}
};
......
......@@ -112,12 +112,13 @@ void SameDimsBinaryOP(const Tensor& lhs, const Tensor& rhs, Tensor* out) {
}
}
template <typename DeviceContext, template <typename T> typename CompareFunctor,
template <typename DeviceContext,
template <typename InT, typename OutT> typename CompareFunctor,
typename T>
struct GetMask {
void operator()(const framework::ExecutionContext& ctx, const Tensor& lhs,
const Tensor& rhs, Tensor* mask) {
SameDimsBinaryOP<int64_t, CompareFunctor<int64_t>, T>(lhs, rhs, mask);
SameDimsBinaryOP<int64_t, CompareFunctor<int64_t, T>, T>(lhs, rhs, mask);
}
};
......
......@@ -10,7 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/eigen_ext.h"
#include "paddle/pten/kernels/funcs/eigen/extensions.h"
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include "gtest/gtest.h"
......
......@@ -14,7 +14,7 @@
#include "paddle/fluid/platform/complex.h"
#include <complex>
#include "paddle/fluid/platform/eigen_ext.h"
#include "paddle/pten/kernels/funcs/eigen/extensions.h"
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include "gtest/gtest.h"
......@@ -267,56 +267,56 @@ TEST(complex, print) {
TEST(complex, isinf) {
// *********** complex<float> *************
complex<float> a;
a.real = float(INFINITY);
a.real = static_cast<float>(INFINITY);
EXPECT_EQ(std::isinf(a), true);
a.imag = float(INFINITY);
a.imag = static_cast<float>(INFINITY);
EXPECT_EQ(std::isinf(a), true);
complex<float> b = float(INFINITY);
complex<float> b = static_cast<float>(INFINITY);
EXPECT_EQ(std::isinf(b), true);
complex<float> c(float(INFINITY), 0);
complex<float> c(static_cast<float>(INFINITY), 0);
EXPECT_EQ(std::isinf(c), true);
// *********** complex<double> *************
complex<double> a1;
a1.real = double(INFINITY);
a1.real = static_cast<double>(INFINITY);
EXPECT_EQ(std::isinf(a1), true);
a1.imag = double(INFINITY);
a1.imag = static_cast<double>(INFINITY);
EXPECT_EQ(std::isinf(a1), true);
complex<double> b1 = double(INFINITY);
complex<double> b1 = static_cast<double>(INFINITY);
EXPECT_EQ(std::isinf(b1), true);
complex<double> c1(double(INFINITY), 0);
complex<double> c1(static_cast<double>(INFINITY), 0);
EXPECT_EQ(std::isinf(c1), true);
}
TEST(complex, isnan) {
// *********** complex<float> *************
complex<float> a;
a.real = float(NAN);
a.real = static_cast<float>(NAN);
EXPECT_EQ(std::isnan(a), true);
a.imag = float(NAN);
a.imag = static_cast<float>(NAN);
EXPECT_EQ(std::isnan(a), true);
complex<float> b = float(NAN);
complex<float> b = static_cast<float>(NAN);
EXPECT_EQ(std::isnan(b), true);
complex<float> c(float(NAN), 0);
complex<float> c(static_cast<float>(NAN), 0);
EXPECT_EQ(std::isnan(c), true);
// *********** complex<double> *************
complex<double> a1;
a1.real = double(NAN);
a1.real = static_cast<double>(NAN);
EXPECT_EQ(std::isnan(a1), true);
a1.imag = double(NAN);
a1.imag = static_cast<double>(NAN);
EXPECT_EQ(std::isnan(a1), true);
complex<double> b1 = double(NAN);
complex<double> b1 = static_cast<double>(NAN);
EXPECT_EQ(std::isnan(b1), true);
complex<double> c1(double(NAN), 0);
complex<double> c1(static_cast<double>(NAN), 0);
EXPECT_EQ(std::isnan(c1), true);
}
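The only change in the tests above is replacing functional C-style casts such as float(INFINITY) with static_cast; the values are identical, and the new form satisfies lint rules that flag C-style casts. A standalone illustration (not part of this commit):

#include <cassert>
#include <cmath>

int main() {
  float a = float(INFINITY);               // functional C-style cast (old style)
  float b = static_cast<float>(INFINITY);  // same value, preferred cast style
  assert(std::isinf(a) && std::isinf(b));
  return 0;
}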
......
......@@ -23,8 +23,8 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/eigen_ext.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/kernels/funcs/eigen/extensions.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
namespace paddle {
......@@ -303,59 +303,59 @@ TEST(complex, print) {
TEST(complex, isinf) {
// *********** complex<float> *************
complex<float> a;
a.real = float(INFINITY);
a.real = static_cast<float>(INFINITY);
EXPECT_EQ(std::isinf(a), true);
a.imag = float(INFINITY);
a.imag = static_cast<float>(INFINITY);
EXPECT_EQ(std::isinf(a), true);
complex<float> b = float(INFINITY);
complex<float> b = static_cast<float>(INFINITY);
EXPECT_EQ(std::isinf(b), true);
complex<float> c(float(INFINITY), 0);
complex<float> c(static_cast<float>(INFINITY), 0);
EXPECT_EQ(std::isinf(c), true);
// *********** complex<double> *************
complex<double> a1;
a1.real = double(INFINITY);
a1.real = static_cast<double>(INFINITY);
EXPECT_EQ(std::isinf(a1), true);
a1.imag = double(INFINITY);
a1.imag = static_cast<double>(INFINITY);
EXPECT_EQ(std::isinf(a1), true);
complex<double> b1 = double(INFINITY);
complex<double> b1 = static_cast<double>(INFINITY);
EXPECT_EQ(std::isinf(b1), true);
complex<double> c1(double(INFINITY), 0);
complex<double> c1(static_cast<double>(INFINITY), 0);
EXPECT_EQ(std::isinf(c1), true);
}
TEST(complex, isnan) {
// *********** complex<float> *************
complex<float> a;
a.real = float(NAN);
a.real = static_cast<float>(NAN);
EXPECT_EQ(std::isnan(a), true);
a.imag = float(NAN);
a.imag = static_cast<float>(NAN);
EXPECT_EQ(std::isnan(a), true);
complex<float> b = float(NAN);
complex<float> b = static_cast<float>(NAN);
EXPECT_EQ(std::isnan(b), true);
complex<float> c(float(NAN), 0);
complex<float> c(static_cast<float>(NAN), 0);
EXPECT_EQ(std::isnan(c), true);
// *********** complex<double> *************
complex<double> a1;
a1.real = double(NAN);
a1.real = static_cast<double>(NAN);
EXPECT_EQ(std::isnan(a1), true);
a1.imag = double(NAN);
a1.imag = static_cast<double>(NAN);
EXPECT_EQ(std::isnan(a1), true);
complex<double> b1 = double(NAN);
complex<double> b1 = static_cast<double>(NAN);
EXPECT_EQ(std::isnan(b1), true);
complex<double> c1(double(NAN), 0);
complex<double> c1(static_cast<double>(NAN), 0);
EXPECT_EQ(std::isnan(c1), true);
}
} // namespace platform
} // namespace paddle
#endif
\ No newline at end of file
#endif
......@@ -20,7 +20,7 @@ limitations under the License. */
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
#define HCOM_GROUP_PREFIX "HCOM_GROUP_"
......
......@@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/platform/ipu/ipu_backend.h"
#endif
#include "glog/logging.h"
#include "paddle/fluid/framework/expect.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
......@@ -841,15 +842,6 @@ unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) const {
return num_entries;
}
// TODO(jczaja): Replace with C++20 equivalents when applicable
#ifdef _WIN32
#define likely(expr) (expr)
#define unlikely(expr) (expr)
#else
#define likely(expr) (__builtin_expect(!!(expr), 1))
#define unlikely(expr) (__builtin_expect(!!(expr), 0))
#endif
MKLDNNDeviceContext::BlobPtr_t<void> MKLDNNDeviceContext::GetBlob(
const std::string& name) const {
BlobMap* pMap = p_blobmap_.get();
......
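The hunk above also removes the file-local likely()/unlikely() macros built on __builtin_expect. The C++20 equivalents referenced in the TODO are the standard [[likely]]/[[unlikely]] attributes; a brief sketch of that replacement (illustrative, not part of this commit):

// Requires a C++20 compiler: branch weighting via standard attributes
// instead of __builtin_expect, so no _WIN32-specific fallback is needed.
bool CacheLookup(bool found_in_cache) {
  if (found_in_cache) [[likely]] {
    return true;   // expected fast path
  } else [[unlikely]] {
    return false;  // rare miss path
  }
}

int main() { return CacheLookup(true) ? 0 : 1; }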
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce pten_dynamic_loader)
list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc)
......@@ -34,24 +34,24 @@ if (CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc)
endif(CUPTI_FOUND)
if(WITH_ROCM)
hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader pten_dynload_cuda)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc pten_dynload_warpctc)
elseif (WITH_ASCEND_CL)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc npu_hccl)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc npu_hccl pten_dynload_warpctc)
else()
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader pten_dynload_cuda)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc pten_dynload_warpctc)
endif()
if (WITH_MKLML)
cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml pten_dynload_mklml)
endif()
cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader)
cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader pten_dynload_lapack)
add_dependencies(dynload_lapack extern_lapack)
# TODO(TJ): add iomp, mkldnn?
if (MKL_FOUND AND WITH_ONEMKL)
message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}")
cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader)
cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader pten_dynload_mklrt)
target_include_directories(dynload_mklrt PRIVATE ${MKL_INCLUDE})
endif()
......@@ -17,8 +17,6 @@ limitations under the License. */
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cublas_dso_flag;
void *cublas_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
......
......@@ -20,16 +20,12 @@ limitations under the License. */
#include <mutex> // NOLINT
#include <type_traits>
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cublas.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cublas_dso_flag;
extern void *cublas_dso_handle;
/**
* The following macro definition can generate structs
* (for each function) to dynamic load cublas routine
......@@ -37,19 +33,8 @@ extern void *cublas_dso_handle;
*
* note: default dynamic linked libs
*/
#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cublas_func = \
decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
std::call_once(cublas_dso_flag, []() { \
cublas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
}); \
static void *p_##__name = dlsym(cublas_dso_handle, #__name); \
return reinterpret_cast<cublas_func>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
......@@ -99,7 +84,7 @@ extern void *cublas_dso_handle;
__macro(cublasSgetrsBatched); \
__macro(cublasDgetrsBatched);
CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
CUBLAS_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
// APIs available after CUDA 8.0
#if CUDA_VERSION >= 8000
......@@ -111,7 +96,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
__macro(cublasZgemmStridedBatched); \
__macro(cublasHgemmStridedBatched);
CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
CUBLAS_BLAS_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif
// APIs available after CUDA 9.0
......@@ -120,7 +105,7 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
__macro(cublasSetMathMode); \
__macro(cublasGetMathMode);
CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
CUBLAS_BLAS_ROUTINE_EACH_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif
// APIs available after CUDA 9.1
......@@ -129,10 +114,10 @@ CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
__macro(cublasGemmBatchedEx); \
__macro(cublasGemmStridedBatchedEx);
CUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
CUBLAS_BLAS_ROUTINE_EACH_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif
#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
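The net effect of the cublas header change is that each wrapper is no longer a locally defined dlsym struct but an alias to the pten-owned functor, so the dso handle and once-flag live in a single place while existing call sites keep working. A self-contained analogue of that pattern (namespace and function names are stand-ins for illustration, not Paddle's real symbols):

#include <iostream>

// The functor doing the real work lives in one backend namespace; the platform
// namespace only re-exports it via a using-alias plus an object, mirroring
//   using DynLoad__foo = pten::dynload::DynLoad__foo;  extern DynLoad__foo foo;
namespace backend_dynload {            // stands in for pten::dynload
struct DynLoad__saxpy {
  float operator()(float a, float x, float y) const { return a * x + y; }
};
}  // namespace backend_dynload

namespace platform_dynload {           // stands in for paddle::platform::dynload
using DynLoad__saxpy = backend_dynload::DynLoad__saxpy;
DynLoad__saxpy saxpy;                  // single definition, shared by callers
}  // namespace platform_dynload

int main() {
  std::cout << platform_dynload::saxpy(2.0f, 3.0f, 1.0f) << std::endl;  // 7
  return 0;
}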
......@@ -17,8 +17,6 @@ limitations under the License. */
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cublasLt_dso_flag;
void *cublasLt_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
......
......@@ -19,16 +19,12 @@ limitations under the License. */
#include <mutex> // NOLINT
#include <type_traits>
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cublasLt.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cublasLt_dso_flag;
extern void *cublasLt_dso_handle;
/**
* The following macro definition can generate structs
* (for each function) to dynamic load cublasLt routine
......@@ -36,20 +32,8 @@ extern void *cublasLt_dso_handle;
*
* note: default dynamic linked libs
*/
#define DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cublasLt_func = \
decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
std::call_once(cublasLt_dso_flag, []() { \
cublasLt_dso_handle = \
paddle::platform::dynload::GetCublasLtDsoHandle(); \
}); \
static void *p_##__name = dlsym(cublasLt_dso_handle, #__name); \
return reinterpret_cast<cublasLt_func>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
// APIs available after CUDA 10.1
......@@ -69,10 +53,10 @@ extern void *cublasLt_dso_handle;
__macro(cublasLtMatrixTransformDescDestroy); \
__macro(cublasLtMatrixTransformDescSetAttribute);
CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
CUBLASLT_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
// #endif
#undef DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -13,14 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/cuda_driver.h"
#include "paddle/pten/backends/dynload/cuda_driver.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cuda_dso_flag;
void* cuda_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
#if CUDA_VERSION >= 10020
......@@ -28,10 +26,7 @@ CUDA_ROUTINE_EACH_VVM(DEFINE_WRAP);
#endif
CUDA_ROUTINE_EACH(DEFINE_WRAP);
bool HasCUDADriver() {
std::call_once(cuda_dso_flag, []() { cuda_dso_handle = GetCUDADsoHandle(); });
return cuda_dso_handle != nullptr;
}
bool HasCUDADriver() { return pten::dynload::HasCUDADriver(); }
} // namespace dynload
} // namespace platform
......
......@@ -17,30 +17,17 @@ limitations under the License. */
#include <cuda.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cuda_driver.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cuda_dso_flag;
extern void* cuda_dso_handle;
extern bool HasCUDADriver();
#define DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cuda_func = decltype(&::__name); \
std::call_once(cuda_dso_flag, []() { \
cuda_dso_handle = paddle::platform::dynload::GetCUDADsoHandle(); \
}); \
static void* p_##__name = dlsym(cuda_dso_handle, #__name); \
return reinterpret_cast<cuda_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
/**
* include all needed cuda driver functions
......@@ -72,12 +59,12 @@ extern bool HasCUDADriver();
__macro(cuMemRelease); \
__macro(cuMemAddressFree)
CUDA_ROUTINE_EACH_VVM(DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
CUDA_ROUTINE_EACH_VVM(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
#endif
CUDA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
CUDA_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
#undef DECLARE_DYNAMIC_LOAD_CUDA_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP
} // namespace dynload
} // namespace platform
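For reference, the wrappers removed throughout these headers all followed the same lazy-loading pattern: open the shared library once under std::call_once, cache the dlsym result in a function-local static, and forward the call through the cached pointer. A simplified standalone sketch of that pattern (library and symbol names are assumptions for illustration, error handling omitted, not Paddle's exact code):

#include <dlfcn.h>   // dlopen, dlsym (POSIX; link with -ldl on older glibc)
#include <cstdio>
#include <mutex>

static std::once_flag dso_flag;
static void* dso_handle = nullptr;

// Load the library once, resolve the symbol once, then call through the
// cached function pointer -- the shape of the removed DECLARE_*_WRAP macros.
template <typename FuncPtr, typename... Args>
auto CallDynamic(const char* lib, const char* sym, Args... args) {
  std::call_once(dso_flag, [&]() { dso_handle = dlopen(lib, RTLD_LAZY); });
  static void* p_func = dso_handle ? dlsym(dso_handle, sym) : nullptr;
  return reinterpret_cast<FuncPtr>(p_func)(args...);
}

int main() {
  using CosFn = double (*)(double);
  // Assumption: libm.so.6 is present and exports cos().
  std::printf("%f\n", CallDynamic<CosFn>("libm.so.6", "cos", 0.0));
  return 0;
}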
......
......@@ -13,13 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/backends/dynload/cudnn.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cudnn_dso_flag;
void* cudnn_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
......@@ -45,19 +43,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP);
#endif
bool HasCUDNN() {
std::call_once(cudnn_dso_flag,
[]() { cudnn_dso_handle = GetCUDNNDsoHandle(); });
return cudnn_dso_handle != nullptr;
}
void EnforceCUDNNLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
cudnn_dso_handle,
platform::errors::PreconditionNotMet(
"Cannot load cudnn shared library. Cannot invoke method %s.",
fn_name));
}
bool HasCUDNN() { return pten::dynload::HasCUDNN(); }
} // namespace dynload
} // namespace platform
......
......@@ -18,32 +18,17 @@ limitations under the License. */
#include <glog/logging.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cudnn.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cudnn_dso_flag;
extern void* cudnn_dso_handle;
extern bool HasCUDNN();
extern void EnforceCUDNNLoaded(const char* fn_name);
#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cudnn_func = decltype(&::__name); \
std::call_once(cudnn_dso_flag, []() { \
cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
}); \
EnforceCUDNNLoaded(#__name); \
static void* p_##__name = dlsym(cudnn_dso_handle, #__name); \
return reinterpret_cast<cudnn_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
/**
* include all needed cudnn functions in HPPL
......@@ -127,7 +112,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnGetActivationDescriptor); \
__macro(cudnnDestroyActivationDescriptor); \
__macro(cudnnSetRNNDescriptor_v6);
CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(__macro) \
......@@ -135,7 +120,8 @@ CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnGetConvolutionForwardAlgorithm); \
__macro(cudnnGetConvolutionBackwardDataAlgorithm); \
__macro(cudnnSetRNNDescriptor);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(
PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 7001
......@@ -153,7 +139,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \
__macro(cudnnGetConvolutionForwardAlgorithm_v7); \
__macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount);
CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 7201
......@@ -166,7 +152,7 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnRNNBackwardDataEx); \
__macro(cudnnRNNBackwardWeightsEx); \
__macro(cudnnRNNForwardInferenceEx);
CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 7401
......@@ -176,7 +162,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \
__macro(cudnnBatchNormalizationBackwardEx); \
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_AFTER_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 8000
......@@ -192,7 +178,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnSetFusedOpsConstParamPackAttribute); \
__macro(cudnnSetFusedOpsVariantParamPackAttribute); \
__macro(cudnnMakeFusedOpsPlan);
CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_R8(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
} // namespace dynload
......
......@@ -13,31 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/cufft.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/backends/dynload/cufft.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cufft_dso_flag;
void* cufft_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);
bool HasCUFFT() {
std::call_once(cufft_dso_flag,
[]() { cufft_dso_handle = GetCUFFTDsoHandle(); });
return cufft_dso_handle != nullptr;
}
void EnforceCUFFTLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
cufft_dso_handle,
platform::errors::PreconditionNotMet(
"Cannot load cufft shared library. Cannot invoke method %s.",
fn_name));
}
bool HasCUFFT() { return pten::dynload::HasCUFFT(); }
} // namespace dynload
} // namespace platform
......
......@@ -19,32 +19,17 @@ limitations under the License. */
#include <glog/logging.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cufft.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cufft_dso_flag;
extern void* cufft_dso_handle;
extern bool HasCUFFT();
extern void EnforceCUFFTLoaded(const char* fn_name);
#define DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cufft_func = decltype(&::__name); \
std::call_once(cufft_dso_flag, []() { \
cufft_dso_handle = paddle::platform::dynload::GetCUFFTDsoHandle(); \
}); \
EnforceCUFFTLoaded(#__name); \
static void* p_##__name = dlsym(cufft_dso_handle, #__name); \
return reinterpret_cast<cufft_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
/**
* include all needed cufft functions in HPPL
......@@ -104,7 +89,7 @@ extern void EnforceCUFFTLoaded(const char* fn_name);
__macro(cufftXtExecDescriptor); \
__macro(cufftXtSetWorkAreaPolicy);
CUFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUFFT_WRAP)
CUFFT_FFT_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUFFT_WRAP)
} // namespace dynload
} // namespace platform
......
......@@ -20,9 +20,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cupti_dso_flag;
void *cupti_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUPTI_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -19,16 +19,12 @@ limitations under the License. */
#include <cupti.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cupti.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cupti_dso_flag;
extern void *cupti_dso_handle;
/**
* The following macro definition can generate structs
* (for each function) to dynamic load cupti routine
......@@ -36,18 +32,8 @@ extern void *cupti_dso_handle;
*
* note: default dynamic linked libs
*/
#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline CUptiResult CUPTIAPI operator()(Args... args) { \
using cuptiFunc = decltype(&::__name); \
std::call_once(cupti_dso_flag, []() { \
cupti_dso_handle = paddle::platform::dynload::GetCUPTIDsoHandle(); \
}); \
static void *p_##__name = dlsym(cupti_dso_handle, #__name); \
return reinterpret_cast<cuptiFunc>(p_##__name)(args...); \
} \
}; \
#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define CUPTI_ROUTINE_EACH(__macro) \
......
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag curand_dso_flag;
void *curand_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -16,27 +16,14 @@ limitations under the License. */
#include <curand.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/curand.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag curand_dso_flag;
extern void *curand_dso_handle;
#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
curandStatus_t operator()(Args... args) { \
using curandFunc = decltype(&::__name); \
std::call_once(curand_dso_flag, []() { \
curand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \
}); \
static void *p_##__name = dlsym(curand_dso_handle, #__name); \
return reinterpret_cast<curandFunc>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define CURAND_RAND_ROUTINE_EACH(__macro) \
......@@ -48,7 +35,7 @@ extern void *curand_dso_handle;
__macro(curandGenerateNormal); \
__macro(curandDestroyGenerator);
CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
CURAND_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
} // namespace dynload
} // namespace platform
......
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cusolver_dso_flag;
void *cusolver_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUSOLVER_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -17,28 +17,14 @@ limitations under the License. */
#include <cusolverDn.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cusolver.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cusolver_dso_flag;
extern void *cusolver_dso_handle;
#define DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
cusolverStatus_t operator()(Args... args) { \
using cusolverFunc = decltype(&::__name); \
std::call_once(cusolver_dso_flag, []() { \
cusolver_dso_handle = \
paddle::platform::dynload::GetCusolverDsoHandle(); \
}); \
static void *p_##__name = dlsym(cusolver_dso_handle, #__name); \
return reinterpret_cast<cusolverFunc>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define CUSOLVER_ROUTINE_EACH(__macro) \
......@@ -62,7 +48,7 @@ extern void *cusolver_dso_handle;
__macro(cusolverDnCheevd); \
__macro(cusolverDnZheevd);
CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
CUSOLVER_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
#if CUDA_VERSION >= 9020
#define CUSOLVER_ROUTINE_EACH_R1(__macro) \
......@@ -105,7 +91,7 @@ CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
__macro(cusolverDnCungqr); \
__macro(cusolverDnZungqr);
CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
CUSOLVER_ROUTINE_EACH_R1(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
#endif
#if CUDA_VERSION >= 9020
......@@ -117,10 +103,10 @@ CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
__macro(cusolverDnDsyevj); \
__macro(cusolverDnDestroySyevjInfo);
CUSOLVER_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
CUSOLVER_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
#endif
#undef DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cusparse_dso_flag;
void *cusparse_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
#ifdef CUSPARSE_ROUTINE_EACH
......
......@@ -17,28 +17,14 @@ limitations under the License. */
#include <cusparse.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cusparse.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cusparse_dso_flag;
extern void *cusparse_dso_handle;
#define DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
cusparseStatus_t operator()(Args... args) { \
using cusparseFunc = decltype(&::__name); \
std::call_once(cusparse_dso_flag, []() { \
cusparse_dso_handle = \
paddle::platform::dynload::GetCusparseDsoHandle(); \
}); \
static void *p_##__name = dlsym(cusparse_dso_handle, #__name); \
return reinterpret_cast<cusparseFunc>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#if defined(PADDLE_WITH_CUDA)
......@@ -54,7 +40,7 @@ extern void *cusparse_dso_handle;
__macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase);
CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
// APIs available after CUDA 11.2
#if CUDA_VERSION >= 11020
......@@ -74,7 +60,7 @@ CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
__macro(cusparseSparseToDense_bufferSize); \
__macro(cusparseSparseToDense);
CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
CUSPARSE_ROUTINE_EACH_11020(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
// APIs available after CUDA 11.3
#if CUDA_VERSION >= 11030
......@@ -83,13 +69,13 @@ CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
__macro(cusparseSDDMM_preprocess); \
__macro(cusparseSDDMM);
CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
CUSPARSE_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif
#endif
#endif
#endif
#undef DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag hipfft_dso_flag;
void *hipfft_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
HIPFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -17,8 +17,7 @@ limitations under the License. */
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/hipfft.h"
namespace paddle {
namespace platform {
......@@ -26,18 +25,8 @@ namespace dynload {
extern std::once_flag hipfft_dso_flag;
extern void *hipfft_dso_handle;
#define DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using hipfftFunc = decltype(&::__name); \
std::call_once(hipfft_dso_flag, []() { \
hipfft_dso_handle = paddle::platform::dynload::GetROCFFTDsoHandle(); \
}); \
static void *p_##__name = dlsym(hipfft_dso_handle, #__name); \
return reinterpret_cast<hipfftFunc>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define HIPFFT_FFT_ROUTINE_EACH(__macro) \
......@@ -70,53 +59,8 @@ extern void *hipfft_dso_handle;
__macro(hipfftGetVersion); \
__macro(hipfftGetProperty);
HIPFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP);
HIPFFT_FFT_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP);
inline const char *hipfftGetErrorString(hipfftResult_t status) {
switch (status) {
case HIPFFT_SUCCESS:
return "'HIPFFT_SUCCESS'. The hipFFT operation was successful.";
case HIPFFT_INVALID_PLAN:
return "'HIPFFT_INVALID_PLAN'. hipFFT was passed an invalid plan handle.";
case HIPFFT_ALLOC_FAILED:
return "'HIPFFT_ALLOC_FAILED'. hipFFT failed to allocate GPU or CPU "
"memory.";
case HIPFFT_INVALID_TYPE:
return "'HIPFFT_INVALID_TYPE'. No longer used.";
case HIPFFT_INVALID_VALUE:
return "'HIPFFT_INVALID_VALUE'. User specified an invalid pointer or "
"parameter.";
case HIPFFT_INTERNAL_ERROR:
return "'HIPFFT_INTERNAL_ERROR'. Driver or internal hipFFT library "
"error.";
case HIPFFT_EXEC_FAILED:
return "'HIPFFT_EXEC_FAILED'. Failed to execute an FFT on the GPU.";
case HIPFFT_SETUP_FAILED:
return "'HIPFFT_SETUP_FAILED'. The hipFFT library failed to initialize.";
case HIPFFT_INVALID_SIZE:
return "'HIPFFT_INVALID_SIZE'. User specified an invalid transform size.";
case HIPFFT_UNALIGNED_DATA:
return "'HIPFFT_UNALIGNED_DATA'. No longer used.";
case HIPFFT_INCOMPLETE_PARAMETER_LIST:
return "'HIPFFT_INCOMPLETE_PARAMETER_LIST'. Missing parameters in call.";
case HIPFFT_INVALID_DEVICE:
return "'HIPFFT_INVALID_DEVICE'. Execution of a plan was on different "
"GPU than plan creation.";
case HIPFFT_PARSE_ERROR:
return "'HIPFFT_PARSE_ERROR'. Internal plan database error.";
case HIPFFT_NO_WORKSPACE:
return "'HIPFFT_NO_WORKSPACE'. No workspace has been provided prior to "
"plan execution.";
case HIPFFT_NOT_IMPLEMENTED:
return "'HIPFFT_NOT_IMPLEMENTED'. Function does not implement "
"functionality for parameters given.";
case HIPFFT_NOT_SUPPORTED:
return "'HIPFFT_NOT_SUPPORTED'. Operation is not supported for "
"parameters given.";
default:
return "HIPFFT_STATUS_UNKNOWN_ERROR";
}
}
} // namespace dynload
} // namespace platform
} // namespace paddle
......
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag hiprand_dso_flag;
void *hiprand_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
HIPRAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -16,28 +16,15 @@ limitations under the License. */
#include <hiprand.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/hiprand.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag hiprand_dso_flag;
extern void *hiprand_dso_handle;
#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
hiprandStatus_t operator()(Args... args) { \
using hiprandFunc = decltype(&::__name); \
std::call_once(hiprand_dso_flag, []() { \
hiprand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \
}); \
static void *p_##__name = dlsym(hiprand_dso_handle, #__name); \
return reinterpret_cast<hiprandFunc>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define HIPRAND_RAND_ROUTINE_EACH(__macro) \
......@@ -49,7 +36,7 @@ extern void *hiprand_dso_handle;
__macro(hiprandGenerateNormal); \
__macro(hiprandDestroyGenerator);
HIPRAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
HIPRAND_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
} // namespace dynload
} // namespace platform
......
......@@ -13,23 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/hiprtc.h"
#include "paddle/pten/backends/dynload/hiprtc.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag hiprtc_dso_flag;
void* hiprtc_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
HIPRTC_ROUTINE_EACH(DEFINE_WRAP);
bool HasNVRTC() {
std::call_once(hiprtc_dso_flag,
[]() { hiprtc_dso_handle = GetNVRTCDsoHandle(); });
return hiprtc_dso_handle != nullptr;
}
bool HasNVRTC() { return pten::dynload::HasNVRTC(); }
} // namespace dynload
} // namespace platform
......
......@@ -16,30 +16,17 @@ limitations under the License. */
#include <hip/hiprtc.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/hiprtc.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag hiprtc_dso_flag;
extern void* hiprtc_dso_handle;
extern bool HasNVRTC();
#define DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using hiprtc_func = decltype(&::__name); \
std::call_once(hiprtc_dso_flag, []() { \
hiprtc_dso_handle = paddle::platform::dynload::GetNVRTCDsoHandle(); \
}); \
static void* p_##__name = dlsym(hiprtc_dso_handle, #__name); \
return reinterpret_cast<hiprtc_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
#define PLATFORM_DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
/**
* include all needed hiprtc functions
......@@ -55,9 +42,9 @@ extern bool HasNVRTC();
__macro(hiprtcGetProgramLog); \
__macro(hiprtcGetProgramLogSize)
HIPRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP);
HIPRTC_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP);
#undef DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP
} // namespace dynload
} // namespace platform
......
......@@ -13,15 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/lapack.h"
#include <mutex>
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag lapack_dso_flag;
void* lapack_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
LAPACK_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -16,122 +16,20 @@ limitations under the License. */
#include <complex>
#include <mutex>
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
// Note(zhouwei): because lapack doesn't provide appropriate header file.
// should expose API statement yourself.
// getrf_(For example)
extern "C" void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv,
int *info);
extern "C" void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv,
int *info);
// evd
extern "C" void zheevd_(char *jobz, char *uplo, int *n, std::complex<double> *a,
int *lda, double *w, std::complex<double> *work,
int *lwork, double *rwork, int *lrwork, int *iwork,
int *liwork, int *info);
extern "C" void cheevd_(char *jobz, char *uplo, int *n, std::complex<float> *a,
int *lda, float *w, std::complex<float> *work,
int *lwork, float *rwork, int *lrwork, int *iwork,
int *liwork, int *info);
extern "C" void dsyevd_(char *jobz, char *uplo, int *n, double *a, int *lda,
double *w, double *work, int *lwork, int *iwork,
int *liwork, int *info);
extern "C" void ssyevd_(char *jobz, char *uplo, int *n, float *a, int *lda,
float *w, float *work, int *lwork, int *iwork,
int *liwork, int *info);
// geev
extern "C" void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda,
double *wr, double *wi, double *vl, int *ldvl,
double *vr, int *ldvr, double *work, int *lwork,
int *info);
extern "C" void sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda,
float *wr, float *wi, float *vl, int *ldvl, float *vr,
int *ldvr, float *work, int *lwork, int *info);
extern "C" void zgeev_(char *jobvl, char *jobvr, int *n,
std::complex<double> *a, int *lda,
std::complex<double> *w, std::complex<double> *vl,
int *ldvl, std::complex<double> *vr, int *ldvr,
std::complex<double> *work, int *lwork, double *rwork,
int *info);
extern "C" void cgeev_(char *jobvl, char *jobvr, int *n, std::complex<float> *a,
int *lda, std::complex<float> *w,
std::complex<float> *vl, int *ldvl,
std::complex<float> *vr, int *ldvr,
std::complex<float> *work, int *lwork, float *rwork,
int *info);
// gels
extern "C" void dgels_(char *trans, int *m, int *n, int *nrhs, double *a,
int *lda, double *b, int *ldb, double *work, int *lwork,
int *info);
extern "C" void sgels_(char *trans, int *m, int *n, int *nrhs, float *a,
int *lda, float *b, int *ldb, float *work, int *lwork,
int *info);
// gelsd
extern "C" void dgelsd_(int *m, int *n, int *nrhs, double *a, int *lda,
double *b, int *ldb, double *s, double *rcond,
int *rank, double *work, int *lwork, int *iwork,
int *info);
extern "C" void sgelsd_(int *m, int *n, int *nrhs, float *a, int *lda, float *b,
int *ldb, float *s, float *rcond, int *rank,
float *work, int *lwork, int *iwork, int *info);
// gelsy
extern "C" void dgelsy_(int *m, int *n, int *nrhs, double *a, int *lda,
double *b, int *ldb, int *jpvt, double *rcond,
int *rank, double *work, int *lwork, int *info);
extern "C" void sgelsy_(int *m, int *n, int *nrhs, float *a, int *lda, float *b,
int *ldb, int *jpvt, float *rcond, int *rank,
float *work, int *lwork, int *info);
// gelss
extern "C" void dgelss_(int *m, int *n, int *nrhs, double *a, int *lda,
double *b, int *ldb, double *s, double *rcond,
int *rank, double *work, int *lwork, int *info);
extern "C" void sgelss_(int *m, int *n, int *nrhs, float *a, int *lda, float *b,
int *ldb, float *s, float *rcond, int *rank,
float *work, int *lwork, int *info);
extern "C" void zpotrs_(char *uplo, int *n, int *nrhs, std::complex<double> *a,
int *lda, std::complex<double> *b, int *ldb, int *info);
extern "C" void cpotrs_(char *uplo, int *n, int *nrhs, std::complex<float> *a,
int *lda, std::complex<float> *b, int *ldb, int *info);
extern "C" void dpotrs_(char *uplo, int *n, int *nrhs, double *a, int *lda,
double *b, int *ldb, int *info);
extern "C" void spotrs_(char *uplo, int *n, int *nrhs, float *a, int *lda,
float *b, int *ldb, int *info);
#include "paddle/pten/backends/dynload/lapack.h"
#include "paddle/pten/common/complex.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag lapack_dso_flag;
extern void *lapack_dso_handle;
/**
* The following macro definition can generate structs
* (for each function) to dynamic load lapack routine
* via operator overloading.
*/
#define DYNAMIC_LOAD_LAPACK_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using lapackFunc = decltype(&::__name); \
std::call_once(lapack_dso_flag, []() { \
lapack_dso_handle = paddle::platform::dynload::GetLAPACKDsoHandle(); \
}); \
static void *p_##_name = dlsym(lapack_dso_handle, #__name); \
return reinterpret_cast<lapackFunc>(p_##_name)(args...); \
} \
}; \
#define DYNAMIC_LOAD_LAPACK_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_LAPACK_WRAP(__name) \
......
......@@ -13,13 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/miopen.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/backends/dynload/cudnn.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag miopen_dso_flag;
void* miopen_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
......@@ -50,19 +48,7 @@ MIOPEN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
#endif
bool HasCUDNN() {
std::call_once(miopen_dso_flag,
[]() { miopen_dso_handle = GetCUDNNDsoHandle(); });
return miopen_dso_handle != nullptr;
}
void EnforceCUDNNLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
miopen_dso_handle,
platform::errors::PreconditionNotMet(
"Cannot load miopen shared library. Cannot invoke method %s.",
fn_name));
}
bool HasCUDNN() { return pten::dynload::HasCUDNN(); }
} // namespace dynload
} // namespace platform
......
......@@ -18,66 +18,17 @@ limitations under the License. */
#include <miopen/miopen.h>
#include <miopen/version.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#define MIOPEN_VERSION \
(MIOPEN_VERSION_MAJOR * 1000 + MIOPEN_VERSION_MINOR * 10 + \
MIOPEN_VERSION_PATCH) // NOLINT
// MIOPEN only support NCHW, just for compatibility with CUDNN API
typedef enum {
MIOPEN_TENSOR_NCHW = 0,
MIOPEN_TENSOR_NHWC = 1,
} miopenTensorFormat_t;
#include "paddle/pten/backends/dynload/miopen.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag miopen_dso_flag;
extern void* miopen_dso_handle;
extern bool HasCUDNN();
inline const char* miopenGetErrorString(miopenStatus_t status) {
switch (status) {
case miopenStatusSuccess:
return "MIOPEN_STATUS_SUCCESS";
case miopenStatusNotInitialized:
return "MIOPEN_STATUS_NOT_INITIALIZED";
case miopenStatusInvalidValue:
return "MIOPEN_STATUS_INVALID_VALUE";
case miopenStatusBadParm:
return "MIOPEN_STATUS_BAD_PARAM";
case miopenStatusAllocFailed:
return "MIOPEN_STATUS_ALLOC_FAILED";
case miopenStatusInternalError:
return "MIOPEN_STATUS_INTERNAL_ERROR";
case miopenStatusNotImplemented:
return "MIOPEN_STATUS_NOT_IMPLEMENTED";
case miopenStatusUnsupportedOp:
return "MIOPEN_STATUS_UNSUPPORTED_OP";
case miopenStatusUnknownError:
default:
return "MIOPEN_STATUS_UNKNOWN_ERROR";
}
}
extern void EnforceCUDNNLoaded(const char* fn_name);
#define DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using miopen_func = decltype(&::__name); \
std::call_once(miopen_dso_flag, []() { \
miopen_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
}); \
EnforceCUDNNLoaded(#__name); \
static void* p_##__name = dlsym(miopen_dso_handle, #__name); \
return reinterpret_cast<miopen_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
#define PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
/**
* include all needed miopen functions in HPPL
......@@ -145,23 +96,23 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(miopenRNNForwardInference); \
__macro(miopenGetTensorNumBytes);
MIOPEN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
#define MIOPEN_DNN_ROUTINE_EACH_R2(__macro) \
__macro(miopenConvolutionBackwardData);
MIOPEN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs available after R3:
#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \
__macro(miopenConvolutionBackwardWeightsGetWorkSpaceSize);
MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs available after R4:
#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \
__macro(miopenBatchNormalizationForwardTraining); \
__macro(miopenBatchNormalizationForwardInference); \
__macro(miopenBatchNormalizationBackward);
MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs in R5
#define MIOPEN_DNN_ROUTINE_EACH_R5(__macro) \
......@@ -169,12 +120,12 @@ MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
__macro(miopenSetActivationDescriptor); \
__macro(miopenGetActivationDescriptor); \
__macro(miopenDestroyActivationDescriptor);
MIOPEN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH_R5(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs in R6
#define MIOPEN_DNN_ROUTINE_EACH_R6(__macro) \
/*__macro(miopenSetRNNDescriptor_v6);*/
MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH_R6(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
#define MIOPEN_DNN_ROUTINE_EACH_R7(__macro) \
__macro(miopenSetConvolutionGroupCount); \
......@@ -184,7 +135,7 @@ MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
__macro(miopenSetCTCLossDescriptor); \
__macro(miopenGetCTCLossWorkspaceSize); \
__macro(miopenCTCLoss);
MIOPEN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \
/*__macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \
......@@ -192,7 +143,7 @@ __macro(cudnnBatchNormalizationForwardTrainingEx); \
__macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \
__macro(cudnnBatchNormalizationBackwardEx); \
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);*/
MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag mklml_dso_flag;
void* mklml_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
MKLML_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -17,36 +17,23 @@ limitations under the License. */
#include <mkl.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/mklml.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag mklml_dso_flag;
extern void *mklml_dso_handle;
/**
* The following macro definition can generate structs
* (for each function) to dynamic load mklml routine
* via operator overloading.
*/
#define DYNAMIC_LOAD_MKLML_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using mklmlFunc = decltype(&::__name); \
std::call_once(mklml_dso_flag, []() { \
mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \
}); \
static void *p_##_name = dlsym(mklml_dso_handle, #__name); \
return reinterpret_cast<mklmlFunc>(p_##_name)(args...); \
} \
}; \
#define DYNAMIC_LOAD_MKLML_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) DYNAMIC_LOAD_MKLML_WRAP(__name)
#define PLATFORM_DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) \
DYNAMIC_LOAD_MKLML_WRAP(__name)
#define MKLML_ROUTINE_EACH(__macro) \
__macro(cblas_sgemm); \
......@@ -111,7 +98,7 @@ extern void *mklml_dso_handle;
__macro(MKL_Set_Num_Threads); \
__macro(MKL_Get_Max_Threads);
MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
MKLML_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
#if !defined(_WIN32)
DYNAMIC_LOAD_MKLML_WRAP(mkl_scsrmm);
......
......@@ -18,7 +18,7 @@ limitations under the License. */
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle {
namespace platform {
......@@ -32,18 +32,8 @@ extern void* mklrt_dso_handle;
* (for each function) to dynamic load mkldfti routine
* via operator overloading.
*/
#define DYNAMIC_LOAD_MKLRT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using mklrtFunc = decltype(&::__name); \
std::call_once(mklrt_dso_flag, []() { \
mklrt_dso_handle = paddle::platform::dynload::GetMKLRTDsoHandle(); \
}); \
static void* p_##__name = dlsym(mklrt_dso_handle, #__name); \
return reinterpret_cast<mklrtFunc>(p_##__name)(args...); \
} \
}; \
#define DYNAMIC_LOAD_MKLRT_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
// mkl_dfti.h has a macro that shadows the function with the same name
......
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag nccl_dso_flag;
void *nccl_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -16,28 +16,14 @@ limitations under the License. */
#include <nccl.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/nccl.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag nccl_dso_flag;
extern void* nccl_dso_handle;
#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using nccl_func = decltype(&::__name); \
std::call_once(nccl_dso_flag, []() { \
nccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \
}); \
static void* p_##__name = dlsym(nccl_dso_handle, #__name); \
return reinterpret_cast<nccl_func>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define NCCL_RAND_ROUTINE_EACH(__macro) \
......@@ -57,30 +43,30 @@ extern void* nccl_dso_handle;
__macro(ncclReduceScatter); \
__macro(ncclGetErrorString);
NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
NCCL_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#if NCCL_VERSION_CODE >= 2212
#define NCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast);
NCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
NCCL_RAND_ROUTINE_EACH_AFTER_2212(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2304
#define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion);
NCCL_RAND_ROUTINE_EACH_AFTER_2304(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
NCCL_RAND_ROUTINE_EACH_AFTER_2304(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2703
#define NCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
__macro(ncclSend); \
__macro(ncclRecv);
NCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
NCCL_RAND_ROUTINE_EACH_AFTER_2703(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif
#if NCCL_VERSION_CODE >= 21100
#define NCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \
__macro(ncclRedOpCreatePreMulSum); \
__macro(ncclRedOpDestroy);
NCCL_RAND_ROUTINE_EACH_AFTER_21100(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
NCCL_RAND_ROUTINE_EACH_AFTER_21100(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif
} // namespace dynload
......
......@@ -15,9 +15,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag nvjpeg_dso_flag;
void *nvjpeg_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
NVJPEG_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -14,27 +14,14 @@ limitations under the License. */
#include <nvjpeg.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/nvjpeg.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag nvjpeg_dso_flag;
extern void *nvjpeg_dso_handle;
#define DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
nvjpegStatus_t operator()(Args... args) { \
using nvjpegFunc = decltype(&::__name); \
std::call_once(nvjpeg_dso_flag, []() { \
nvjpeg_dso_handle = paddle::platform::dynload::GetNvjpegDsoHandle(); \
}); \
static void *p_##__name = dlsym(nvjpeg_dso_handle, #__name); \
return reinterpret_cast<nvjpegFunc>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define NVJPEG_RAND_ROUTINE_EACH(__macro) \
......@@ -44,7 +31,7 @@ extern void *nvjpeg_dso_handle;
__macro(nvjpegJpegStateDestroy); \
__macro(nvjpegDecode);
NVJPEG_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP);
NVJPEG_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP);
} // namespace dynload
} // namespace platform
......
......@@ -13,23 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/nvrtc.h"
#include "paddle/pten/backends/dynload/nvrtc.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag nvrtc_dso_flag;
void* nvrtc_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
NVRTC_ROUTINE_EACH(DEFINE_WRAP);
bool HasNVRTC() {
std::call_once(nvrtc_dso_flag,
[]() { nvrtc_dso_handle = GetNVRTCDsoHandle(); });
return nvrtc_dso_handle != nullptr;
}
bool HasNVRTC() { return pten::dynload::HasNVRTC(); }
} // namespace dynload
} // namespace platform
......
......@@ -17,30 +17,17 @@ limitations under the License. */
#include <nvrtc.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/nvrtc.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag nvrtc_dso_flag;
extern void* nvrtc_dso_handle;
extern bool HasNVRTC();
#define DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using nvrtc_func = decltype(&::__name); \
std::call_once(nvrtc_dso_flag, []() { \
nvrtc_dso_handle = paddle::platform::dynload::GetNVRTCDsoHandle(); \
}); \
static void* p_##__name = dlsym(nvrtc_dso_handle, #__name); \
return reinterpret_cast<nvrtc_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
#define PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
/**
* include all needed nvrtc functions
......@@ -56,9 +43,9 @@ extern bool HasNVRTC();
__macro(nvrtcGetProgramLog); \
__macro(nvrtcGetProgramLogSize)
NVRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVRTC_WRAP);
NVRTC_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP);
#undef DECLARE_DYNAMIC_LOAD_NVRTC_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP
} // namespace dynload
} // namespace platform
......
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag nvtx_dso_flag;
void *nvtx_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
NVTX_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -17,36 +17,23 @@ limitations under the License. */
#include <nvToolsExt.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/nvtx.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag nvtx_dso_flag;
extern void *nvtx_dso_handle;
#define DECLARE_DYNAMIC_LOAD_NVTX_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
int operator()(Args... args) { \
using nvtxFunc = decltype(&::__name); \
std::call_once(nvtx_dso_flag, []() { \
nvtx_dso_handle = paddle::platform::dynload::GetNvtxDsoHandle(); \
}); \
static void *p_##__name = dlsym(nvtx_dso_handle, #__name); \
return reinterpret_cast<nvtxFunc>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define NVTX_ROUTINE_EACH(__macro) \
__macro(nvtxRangePushA); \
__macro(nvtxRangePop);
NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
NVTX_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
#undef DECLARE_DYNAMIC_LOAD_NVTX_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
......
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag rccl_dso_flag;
void *rccl_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -16,28 +16,14 @@ limitations under the License. */
#include <rccl.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/rccl.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag rccl_dso_flag;
extern void* rccl_dso_handle;
#define DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using nccl_func = decltype(&::__name); \
std::call_once(rccl_dso_flag, []() { \
rccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \
}); \
static void* p_##__name = dlsym(rccl_dso_handle, #__name); \
return reinterpret_cast<nccl_func>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define RCCL_RAND_ROUTINE_EACH(__macro) \
......@@ -57,18 +43,18 @@ extern void* rccl_dso_handle;
__macro(ncclReduceScatter); \
__macro(ncclGetErrorString);
RCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
RCCL_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#if NCCL_VERSION_CODE >= 2212
#define RCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast);
RCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
RCCL_RAND_ROUTINE_EACH_AFTER_2212(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2703
#define RCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
__macro(ncclSend); \
__macro(ncclRecv);
RCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
RCCL_RAND_ROUTINE_EACH_AFTER_2703(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#endif
} // namespace dynload
......
......@@ -17,8 +17,6 @@ limitations under the License. */
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag rocblas_dso_flag;
void *rocblas_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
......
......@@ -19,16 +19,12 @@ limitations under the License. */
#include <mutex> // NOLINT
#include <type_traits>
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/rocblas.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag rocblas_dso_flag;
extern void *rocblas_dso_handle;
/**
* The following macro definition can generate structs
* (for each function) to dynamic load cublas routine
......@@ -36,18 +32,8 @@ extern void *rocblas_dso_handle;
*
* note: default dynamic linked libs
*/
#define DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
rocblas_status operator()(Args... args) { \
using rocblas_func = decltype(&::__name); \
std::call_once(rocblas_dso_flag, []() { \
rocblas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
}); \
static void *p_##__name = dlsym(rocblas_dso_handle, #__name); \
return reinterpret_cast<rocblas_func>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define ROCBLAS_BLAS_ROUTINE_EACH(__macro) \
......@@ -83,7 +69,7 @@ extern void *rocblas_dso_handle;
__macro(rocblas_set_pointer_mode); \
__macro(rocblas_get_pointer_mode);
ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
ROCBLAS_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
// APIs available after CUDA 8.0
#define ROCBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
......@@ -94,21 +80,21 @@ ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
__macro(rocblas_zgemm_strided_batched); \
__macro(rocblas_hgemm_strided_batched);
ROCBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
ROCBLAS_BLAS_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
// HIP not supported in ROCM3.5
// #define ROCBLAS_BLAS_ROUTINE_EACH_R3(__macro)
// __macro(cublasSetMathMode);
// __macro(cublasGetMathMode);
// ROCBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
// ROCBLAS_BLAS_ROUTINE_EACH_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
#define ROCBLAS_BLAS_ROUTINE_EACH_R4(__macro) \
__macro(rocblas_gemm_batched_ex); \
__macro(rocblas_gemm_strided_batched_ex);
ROCBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
ROCBLAS_BLAS_ROUTINE_EACH_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
#undef DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -13,22 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/rocm_driver.h"
#include "paddle/pten/backends/dynload/rocm_driver.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag rocm_dso_flag;
void* rocm_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
ROCM_ROUTINE_EACH(DEFINE_WRAP);
bool HasCUDADriver() {
std::call_once(rocm_dso_flag, []() { rocm_dso_handle = GetCUDADsoHandle(); });
return rocm_dso_handle != nullptr;
}
bool HasCUDADriver() { return pten::dynload::HasCUDADriver(); }
} // namespace dynload
} // namespace platform
......
......@@ -17,30 +17,17 @@ limitations under the License. */
#include <hip/hip_runtime.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/rocm_driver.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag rocm_dso_flag;
extern void* rocm_dso_handle;
extern bool HasCUDADriver();
#define DECLARE_DYNAMIC_LOAD_ROCM_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using rocm_func = decltype(&::__name); \
std::call_once(rocm_dso_flag, []() { \
rocm_dso_handle = paddle::platform::dynload::GetCUDADsoHandle(); \
}); \
static void* p_##__name = dlsym(rocm_dso_handle, #__name); \
return reinterpret_cast<rocm_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
#define PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
/**
* include all needed cuda driver functions
......@@ -59,9 +46,9 @@ extern bool HasCUDADriver();
__macro(hipGetDeviceCount); \
__macro(hipDevicePrimaryCtxGetState)
ROCM_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCM_WRAP);
ROCM_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP);
#undef DECLARE_DYNAMIC_LOAD_ROCM_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP
} // namespace dynload
} // namespace platform
......
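The change repeated across all of these dynload headers is the same: the per-library DECLARE_DYNAMIC_LOAD_*_WRAP macros, which each owned a dso handle and resolved symbols with std::call_once + dlsym, are replaced by PLATFORM_DECLARE_DYNAMIC_LOAD_*_WRAP macros that simply re-export the wrapper functors now owned by pten::dynload. The following is a minimal, self-contained sketch of that before/after pattern. It is not Paddle code: libm's cos stands in for the real NCCL/NVRTC/rocBLAS symbols, and the handle and macro names are illustrative placeholders.

// Minimal sketch of the wrapper refactor shown above (assumptions: glibc/Linux).
// Build: g++ -std=c++14 dynload_sketch.cc -ldl
#include <dlfcn.h>   // dlopen, dlsym
#include <math.h>    // ::cos, the symbol we load dynamically
#include <cstdio>
#include <mutex>

namespace pten {
namespace dynload {

std::once_flag math_dso_flag;
void* math_dso_handle = nullptr;

// Old style (what each fluid/platform/dynload header used to define itself):
// a functor that lazily dlopen()s the library once and dlsym()s the symbol.
#define DECLARE_DYNAMIC_LOAD_MATH_WRAP(__name)                        \
  struct DynLoad__##__name {                                          \
    template <typename... Args>                                       \
    auto operator()(Args... args) -> decltype(::__name(args...)) {    \
      using func_t = decltype(&::__name);                             \
      std::call_once(math_dso_flag, []() {                            \
        math_dso_handle = dlopen("libm.so.6", RTLD_LAZY);             \
      });                                                             \
      static void* p_##__name = dlsym(math_dso_handle, #__name);      \
      return reinterpret_cast<func_t>(p_##__name)(args...);           \
    }                                                                 \
  };                                                                  \
  DynLoad__##__name __name

DECLARE_DYNAMIC_LOAD_MATH_WRAP(cos);

}  // namespace dynload
}  // namespace pten

namespace paddle {
namespace platform {
namespace dynload {

// New style (what the PLATFORM_DECLARE_DYNAMIC_LOAD_*_WRAP macros in this
// diff do): no dso handle or dlsym here, just an alias that re-exports the
// functor owned by pten::dynload.
#define PLATFORM_DECLARE_DYNAMIC_LOAD_MATH_WRAP(__name)       \
  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
  DynLoad__##__name __name

PLATFORM_DECLARE_DYNAMIC_LOAD_MATH_WRAP(cos);

}  // namespace dynload
}  // namespace platform
}  // namespace paddle

int main() {
  // Both namespaces now resolve to the same dynamically loaded symbol.
  std::printf("%f\n", pten::dynload::cos(0.0));              // 1.000000
  std::printf("%f\n", paddle::platform::dynload::cos(0.0));  // 1.000000
  return 0;
}

The same delegation applies to the helper predicates in the diff (e.g. HasNVRTC and HasCUDADriver): instead of opening the dso themselves, the platform-level functions now forward to the pten::dynload implementations.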
(86 additional file diffs are collapsed and not shown.)