Unverified commit c5affb78, authored by Leo Chen and committed by GitHub

[pten] fit pten for amp (#39403)

* fit pten for amp

* fix typo
Parent db7d129e
@@ -25,6 +25,80 @@ namespace imperative {
class VarBase;
// According to the input `place` and `dtype`, this function returns a tuple
// consisting of three sets:
// 1) All operators registered in the Paddle framework.
// 2) All operators supported for `place` and `dtype`.
// 3) All operators unsupported for `place` and `dtype`.
// The input `place` is a string, which can only be `GPU`, `CPU`, `XPU`,
// `NPU`, or `MLU`.
// The input `dtype` is of type paddle::framework::proto::VarType::Type,
// which can be paddle::framework::proto::VarType::FP16,
// paddle::framework::proto::VarType::FP32, and so on.
std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>,
std::unordered_set<std::string>>
OpSupportedInfos(const std::string& place,
framework::proto::VarType::Type dtype) {
std::string query_place;
std::transform(place.begin(), place.end(), std::back_inserter(query_place),
[](unsigned char c) { return std::toupper(c); });
using fn_type = std::add_pointer<bool(const platform::Place&)>::type;
std::unordered_map<std::string, fn_type> is_target_place{
{"GPU", &platform::is_gpu_place}, {"CPU", &platform::is_cpu_place},
{"XPU", &platform::is_xpu_place}, {"NPU", &platform::is_npu_place},
{"MLU", &platform::is_mlu_place},
};
PADDLE_ENFORCE_NE(is_target_place.count(query_place), 0,
platform::errors::InvalidArgument(
"The argument `place` should be 'GPU', 'CPU', 'XPU', "
"'NPU', 'MLU', but got '%s'.",
place));
std::unordered_set<std::string> all_ops;
const auto& op_info = framework::OpInfoMap::Instance().map();
for (auto it = op_info.begin(); it != op_info.end(); it++) {
all_ops.emplace(it->first);
}
std::unordered_set<std::string> supported_ops;
auto& all_kernels = framework::OperatorWithKernel::AllOpKernels();
for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) {
for (auto& kernel_type : it->second) {
if (is_target_place[query_place](kernel_type.first.place_) &&
kernel_type.first.data_type_ == dtype) {
supported_ops.emplace(it->first);
}
}
}
auto pten_kernels = pten::KernelFactory::Instance().kernels();
for (auto& kernel_pair : pten_kernels) {
auto op_type = pten::TransToFluidOpName(kernel_pair.first);
for (auto& info_pair : kernel_pair.second) {
framework::OpKernelType kernel_type =
framework::TransPtenKernelKeyToOpKernelType(info_pair.first);
if (is_target_place[query_place](kernel_type.place_) &&
kernel_type.data_type_ == dtype && all_ops.count(op_type)) {
VLOG(4) << op_type << " " << supported_ops.size();
supported_ops.emplace(op_type);
}
}
}
std::unordered_set<std::string> unsupported_ops;
for (auto& op : all_ops) {
if (!supported_ops.count(op)) {
unsupported_ops.emplace(op);
}
}
VLOG(4) << "-- The size of all_ops: " << all_ops.size() << " --";
VLOG(4) << "-- The size of supported_ops: " << supported_ops.size() << " --";
VLOG(4) << "-- The size of unsupported_ops: " << unsupported_ops.size()
<< " --";
return std::make_tuple(std::move(all_ops), std::move(supported_ops),
std::move(unsupported_ops));
}
AutoCastGuard::AutoCastGuard(std::shared_ptr<Tracer> tracer, AmpLevel level)
: tracer_(tracer) {
pre_amp_level_ = tracer_->GetAmpLevel();
@@ -40,21 +114,25 @@ AmpOperators::AmpOperators()
: allow_ops_(new std::unordered_set<std::string>()),
block_ops_(new std::unordered_set<std::string>()),
unsupported_fp16_ops_(new std::unordered_set<std::string>()) {
auto& all_kernels = framework::OperatorWithKernel::AllOpKernels();
auto fp16_dtype = framework::proto::VarType::FP16;
for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) {
bool supported = false;
for (auto& kernel_type : it->second) {
if ((platform::is_gpu_place(kernel_type.first.place_) ||
platform::is_xpu_place(kernel_type.first.place_)) &&
kernel_type.first.data_type_ == fp16_dtype) {
supported = true;
}
}
if (!supported) {
unsupported_fp16_ops_->insert(it->first);
}
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto unsupported_ops_gpu = std::get<2>(
OpSupportedInfos("GPU", paddle::framework::proto::VarType::FP16));
unsupported_fp16_ops_->insert(unsupported_ops_gpu.begin(),
unsupported_ops_gpu.end());
// NOTE: GPU/NPU/XPU are compiled separately.
#elif defined(PADDLE_WITH_ASCEND_CL)
auto unsupported_ops_npu = std::get<2>(
OpSupportedInfos("NPU", paddle::framework::proto::VarType::FP16));
unsupported_fp16_ops_->insert(unsupported_ops_npu.begin(),
unsupported_ops_npu.end());
#elif defined(PADDLE_WITH_XPU)
auto unsupported_ops_xpu = std::get<2>(
OpSupportedInfos("XPU", paddle::framework::proto::VarType::FP16));
unsupported_fp16_ops_->insert(unsupported_ops_xpu.begin(),
unsupported_ops_xpu.end());
#endif
VLOG(4) << allow_ops_->size() << " " << block_ops_->size() << " "
<< unsupported_fp16_ops_->size();
}
AmpOperators::~AmpOperators() {}
...
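For context, here is a minimal usage sketch of the new OpSupportedInfos helper described in the comment above. The function name ReportFp16Coverage and the surrounding setup are hypothetical, and the sketch assumes the translation unit links against the imperative target:

// Hypothetical usage sketch (not part of this commit): query FP16 kernel
// coverage on GPU and report the sizes of the three returned sets.
#include <iostream>
#include <tuple>
#include "paddle/fluid/imperative/amp_auto_cast.h"

void ReportFp16Coverage() {
  // The tuple holds (all registered ops, supported ops, unsupported ops).
  // The place string is case-insensitive because OpSupportedInfos upper-cases it.
  auto infos = paddle::imperative::OpSupportedInfos(
      "gpu", paddle::framework::proto::VarType::FP16);
  std::cout << "all: " << std::get<0>(infos).size()
            << ", supported: " << std::get<1>(infos).size()
            << ", unsupported: " << std::get<2>(infos).size() << std::endl;
}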
@@ -19,6 +19,7 @@
#include <tuple>
#include <unordered_set>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/imperative/type_defs.h"
namespace paddle {
@@ -32,6 +33,11 @@ enum class AmpLevel {
O3, // fp16
};
std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>,
std::unordered_set<std::string>>
OpSupportedInfos(const std::string& place,
framework::proto::VarType::Type dtype);
class Tracer;
// Singleton implementation with C++ 11
...
@@ -59,6 +59,7 @@ limitations under the License. */
#include "paddle/fluid/framework/trainer.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/framework/version.h"
#include "paddle/fluid/imperative/amp_auto_cast.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/mmap_allocator.h"
@@ -304,66 +305,6 @@ bool SupportsVNNI() {
#endif
}
// According to the input `place` and `dtype`, this function returns a tuple
// consists of three sets:
// 1) All operators registered in the Paddle framework.
// 2) All operators supported for `place` and `dtype`.
// 3) All operators unsupported for `place` and `dtype`.
// The input `place` is a type of string, which can only be `GPU` or `CPU`.
// The input `dtype` is a type of paddle::framework::proto::VarType::Type,
// which can be paddle::framework::proto::VarType::FP16,
// paddle::framework::proto::VarType::FP32 and so on.
std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>,
std::unordered_set<std::string>>
OpSupportedInfos(const std::string &place,
framework::proto::VarType::Type dtype) {
std::string query_place;
std::transform(place.begin(), place.end(), std::back_inserter(query_place),
[](unsigned char c) { return std::toupper(c); });
using fn_type = std::add_pointer<bool(const platform::Place &)>::type;
std::unordered_map<std::string, fn_type> is_target_place{
{"GPU", &platform::is_gpu_place}, {"CPU", &platform::is_cpu_place},
{"XPU", &platform::is_xpu_place}, {"NPU", &platform::is_npu_place},
{"MLU", &platform::is_mlu_place},
};
PADDLE_ENFORCE_NE(
is_target_place.count(query_place), 0,
platform::errors::InvalidArgument(
"The argument `place` should be 'GPU' or 'CPU', but get '%s'.",
place));
std::unordered_set<std::string> all_ops;
const auto &op_info = framework::OpInfoMap::Instance().map();
for (auto it = op_info.begin(); it != op_info.end(); it++) {
all_ops.emplace(it->first);
}
std::unordered_set<std::string> supported_ops;
auto &all_kernels = framework::OperatorWithKernel::AllOpKernels();
for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) {
for (auto &kernel_type : it->second) {
if (is_target_place[query_place](kernel_type.first.place_) &&
kernel_type.first.data_type_ == dtype) {
supported_ops.emplace(it->first);
}
}
}
std::unordered_set<std::string> unsupported_ops;
for (auto &op : all_ops) {
if (!supported_ops.count(op)) {
unsupported_ops.emplace(op);
}
}
VLOG(4) << "-- The size of all_ops: " << all_ops.size() << " --";
VLOG(4) << "-- The size of supported_ops: " << supported_ops.size() << " --";
VLOG(4) << "-- The size of unsupported_ops: " << unsupported_ops.size()
<< " --";
return std::make_tuple(std::move(all_ops), std::move(supported_ops),
std::move(unsupported_ops));
}
bool IsCompiledWithBrpc() {
#ifndef PADDLE_WITH_DISTRIBUTE
return false;
@@ -2449,7 +2390,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("supports_bfloat16_fast_performance", SupportsBfloat16FastPerformance);
m.def("supports_int8", SupportsInt8);
m.def("supports_vnni", SupportsVNNI);
m.def("op_supported_infos", OpSupportedInfos); m.def("op_supported_infos", imperative::OpSupportedInfos);
m.def("is_compiled_with_brpc", IsCompiledWithBrpc); m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
m.def("is_compiled_with_dist", IsCompiledWithDIST); m.def("is_compiled_with_dist", IsCompiledWithDIST);
m.def("_cuda_synchronize", [](const platform::CUDAPlace &place) { m.def("_cuda_synchronize", [](const platform::CUDAPlace &place) {
......
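To illustrate the binding change above, here is a small standalone pybind11 sketch of the same pattern: a C++ function returning a tuple of string sets is exposed to Python, and pybind11's STL casters convert the result to a Python tuple of sets. The module name toy_core and the function ToySupportedInfos are hypothetical, not part of Paddle:

// Hypothetical standalone sketch mirroring
// m.def("op_supported_infos", imperative::OpSupportedInfos).
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include <string>
#include <tuple>
#include <unordered_set>

namespace py = pybind11;

// Toy stand-in that returns (supported ops, unsupported ops) for a place.
std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>>
ToySupportedInfos(const std::string& place) {
  std::unordered_set<std::string> supported{"matmul_v2", "relu"};
  std::unordered_set<std::string> unsupported{"cumsum"};
  return std::make_tuple(std::move(supported), std::move(unsupported));
}

PYBIND11_MODULE(toy_core, m) {
  // pybind11/stl.h converts unordered_set to a Python set and the
  // std::tuple to a Python tuple automatically.
  m.def("op_supported_infos", &ToySupportedInfos);
}

From Python, calling supported, unsupported = toy_core.op_supported_infos("GPU") would then yield two Python sets, which is the same shape of result the real op_supported_infos binding returns (plus the leading set of all registered ops).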