Unverified · Commit c5affb78 · Authored by Leo Chen · Committed by GitHub

[pten] fit pten for amp (#39403)

* fit pten for amp

* fix typo
Parent db7d129e
@@ -25,6 +25,80 @@ namespace imperative {
class VarBase;
// According to the input `place` and `dtype`, this function returns a tuple
// consisting of three sets:
// 1) All operators registered in the Paddle framework.
// 2) All operators supported for `place` and `dtype`.
// 3) All operators unsupported for `place` and `dtype`.
// The input `place` is a string, which can be `GPU`, `CPU`, `XPU`, `NPU`
// or `MLU`.
// The input `dtype` is a paddle::framework::proto::VarType::Type, which can
// be paddle::framework::proto::VarType::FP16,
// paddle::framework::proto::VarType::FP32 and so on.
std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>,
std::unordered_set<std::string>>
OpSupportedInfos(const std::string& place,
framework::proto::VarType::Type dtype) {
std::string query_place;
std::transform(place.begin(), place.end(), std::back_inserter(query_place),
[](unsigned char c) { return std::toupper(c); });
using fn_type = std::add_pointer<bool(const platform::Place&)>::type;
std::unordered_map<std::string, fn_type> is_target_place{
{"GPU", &platform::is_gpu_place}, {"CPU", &platform::is_cpu_place},
{"XPU", &platform::is_xpu_place}, {"NPU", &platform::is_npu_place},
{"MLU", &platform::is_mlu_place},
};
PADDLE_ENFORCE_NE(is_target_place.count(query_place), 0,
platform::errors::InvalidArgument(
"The argument `place` should be 'GPU', 'CPU', 'XPU', "
"'NPU', 'MLU', but got '%s'.",
place));
std::unordered_set<std::string> all_ops;
const auto& op_info = framework::OpInfoMap::Instance().map();
for (auto it = op_info.begin(); it != op_info.end(); it++) {
all_ops.emplace(it->first);
}
std::unordered_set<std::string> supported_ops;
auto& all_kernels = framework::OperatorWithKernel::AllOpKernels();
for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) {
for (auto& kernel_type : it->second) {
if (is_target_place[query_place](kernel_type.first.place_) &&
kernel_type.first.data_type_ == dtype) {
supported_ops.emplace(it->first);
}
}
}
auto pten_kernels = pten::KernelFactory::Instance().kernels();
for (auto& kernel_pair : pten_kernels) {
auto op_type = pten::TransToFluidOpName(kernel_pair.first);
for (auto& info_pair : kernel_pair.second) {
framework::OpKernelType kernel_type =
framework::TransPtenKernelKeyToOpKernelType(info_pair.first);
if (is_target_place[query_place](kernel_type.place_) &&
kernel_type.data_type_ == dtype && all_ops.count(op_type)) {
VLOG(4) << op_type << " " << supported_ops.size();
supported_ops.emplace(op_type);
}
}
}
std::unordered_set<std::string> unsupported_ops;
for (auto& op : all_ops) {
if (!supported_ops.count(op)) {
unsupported_ops.emplace(op);
}
}
VLOG(4) << "-- The size of all_ops: " << all_ops.size() << " --";
VLOG(4) << "-- The size of supported_ops: " << supported_ops.size() << " --";
VLOG(4) << "-- The size of unsupported_ops: " << unsupported_ops.size()
<< " --";
return std::make_tuple(std::move(all_ops), std::move(supported_ops),
std::move(unsupported_ops));
}
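// --- Illustrative usage sketch (editor's note, not part of this change) ----
// One possible way to consume the three sets returned by OpSupportedInfos
// above. The helper name and the example op name "conv2d" are assumptions
// made only for this sketch; <tuple> comes in via the header's includes.
void PrintFp16SupportOnGpuSketch() {
  auto infos = OpSupportedInfos("GPU", framework::proto::VarType::FP16);
  const auto& supported_ops = std::get<1>(infos);
  const auto& unsupported_ops = std::get<2>(infos);
  VLOG(4) << "fp16-supported ops on GPU: " << supported_ops.size()
          << ", fp16-unsupported ops on GPU: " << unsupported_ops.size();
  // Query a single op; "conv2d" is an illustrative name.
  VLOG(4) << "conv2d supports fp16 on GPU: "
          << (supported_ops.count("conv2d") ? "yes" : "no");
}
// ---------------------------------------------------------------------------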
AutoCastGuard::AutoCastGuard(std::shared_ptr<Tracer> tracer, AmpLevel level)
: tracer_(tracer) {
pre_amp_level_ = tracer_->GetAmpLevel();
@@ -40,21 +114,25 @@ AmpOperators::AmpOperators()
: allow_ops_(new std::unordered_set<std::string>()),
block_ops_(new std::unordered_set<std::string>()),
unsupported_fp16_ops_(new std::unordered_set<std::string>()) {
auto& all_kernels = framework::OperatorWithKernel::AllOpKernels();
auto fp16_dtype = framework::proto::VarType::FP16;
for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) {
bool supported = false;
for (auto& kernel_type : it->second) {
if ((platform::is_gpu_place(kernel_type.first.place_) ||
platform::is_xpu_place(kernel_type.first.place_)) &&
kernel_type.first.data_type_ == fp16_dtype) {
supported = true;
}
}
if (!supported) {
unsupported_fp16_ops_->insert(it->first);
}
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto unsupported_ops_gpu = std::get<2>(
OpSupportedInfos("GPU", paddle::framework::proto::VarType::FP16));
unsupported_fp16_ops_->insert(unsupported_ops_gpu.begin(),
unsupported_ops_gpu.end());
// NOTE: GPU/NPU/XPU are compiled separately.
#elif defined(PADDLE_WITH_ASCEND_CL)
auto unsupported_ops_npu = std::get<2>(
OpSupportedInfos("NPU", paddle::framework::proto::VarType::FP16));
unsupported_fp16_ops_->insert(unsupported_ops_npu.begin(),
unsupported_ops_npu.end());
#elif defined(PADDLE_WITH_XPU)
auto unsupported_ops_xpu = std::get<2>(
OpSupportedInfos("XPU", paddle::framework::proto::VarType::FP16));
unsupported_fp16_ops_->insert(unsupported_ops_xpu.begin(),
unsupported_ops_xpu.end());
#endif
VLOG(4) << allow_ops_->size() << " " << block_ops_->size() << " "
<< unsupported_fp16_ops_->size();
}
AmpOperators::~AmpOperators() {}
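// --- Illustrative sketch (editor's note, not part of this change) ----------
// A hypothetical helper showing how the per-device unsupported_fp16_ops_ set
// filled in the constructor above could be consulted when picking a cast
// dtype. The helper name and its logic are assumptions, not the AMP logic
// actually used in this file; the accessor GetMutableUnsupportedFp16Ops() is
// assumed from the member it exposes.
inline framework::proto::VarType::Type ChooseAmpDtypeSketch(
    const std::string& op_type) {
  // Ops without an fp16 kernel for the current device must stay in fp32.
  if (AmpOperators::Instance().GetMutableUnsupportedFp16Ops()->count(
          op_type)) {
    return framework::proto::VarType::FP32;
  }
  return framework::proto::VarType::FP16;
}
// ---------------------------------------------------------------------------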
@@ -19,6 +19,7 @@
#include <tuple>
#include <unordered_set>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/imperative/type_defs.h"
namespace paddle {
@@ -32,6 +33,11 @@ enum class AmpLevel {
O3, // fp16
};
std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>,
std::unordered_set<std::string>>
OpSupportedInfos(const std::string& place,
framework::proto::VarType::Type dtype);
class Tracer;
// Singleton implementation with C++ 11
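// --- Illustrative sketch (editor's note, not part of this change) ----------
// The comment above refers to the usual C++11 function-local-static singleton
// idiom. A generic, self-contained example of that pattern (the class name is
// illustrative only; the actual declaration is truncated in this view):
class SingletonSketch {
 public:
  static SingletonSketch& Instance() {
    // A function-local static is initialized exactly once and, since C++11,
    // that initialization is thread-safe.
    static SingletonSketch instance;
    return instance;
  }
  SingletonSketch(const SingletonSketch&) = delete;
  SingletonSketch& operator=(const SingletonSketch&) = delete;

 private:
  SingletonSketch() = default;
};
// ---------------------------------------------------------------------------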
@@ -59,6 +59,7 @@ limitations under the License. */
#include "paddle/fluid/framework/trainer.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/framework/version.h"
#include "paddle/fluid/imperative/amp_auto_cast.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/mmap_allocator.h"
@@ -304,66 +305,6 @@ bool SupportsVNNI() {
#endif
}
// According to the input `place` and `dtype`, this function returns a tuple
// consists of three sets:
// 1) All operators registered in the Paddle framework.
// 2) All operators supported for `place` and `dtype`.
// 3) All operators unsupported for `place` and `dtype`.
// The input `place` is a type of string, which can only be `GPU` or `CPU`.
// The input `dtype` is a type of paddle::framework::proto::VarType::Type,
// which can be paddle::framework::proto::VarType::FP16,
// paddle::framework::proto::VarType::FP32 and so on.
std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>,
std::unordered_set<std::string>>
OpSupportedInfos(const std::string &place,
framework::proto::VarType::Type dtype) {
std::string query_place;
std::transform(place.begin(), place.end(), std::back_inserter(query_place),
[](unsigned char c) { return std::toupper(c); });
using fn_type = std::add_pointer<bool(const platform::Place &)>::type;
std::unordered_map<std::string, fn_type> is_target_place{
{"GPU", &platform::is_gpu_place}, {"CPU", &platform::is_cpu_place},
{"XPU", &platform::is_xpu_place}, {"NPU", &platform::is_npu_place},
{"MLU", &platform::is_mlu_place},
};
PADDLE_ENFORCE_NE(
is_target_place.count(query_place), 0,
platform::errors::InvalidArgument(
"The argument `place` should be 'GPU' or 'CPU', but get '%s'.",
place));
std::unordered_set<std::string> all_ops;
const auto &op_info = framework::OpInfoMap::Instance().map();
for (auto it = op_info.begin(); it != op_info.end(); it++) {
all_ops.emplace(it->first);
}
std::unordered_set<std::string> supported_ops;
auto &all_kernels = framework::OperatorWithKernel::AllOpKernels();
for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) {
for (auto &kernel_type : it->second) {
if (is_target_place[query_place](kernel_type.first.place_) &&
kernel_type.first.data_type_ == dtype) {
supported_ops.emplace(it->first);
}
}
}
std::unordered_set<std::string> unsupported_ops;
for (auto &op : all_ops) {
if (!supported_ops.count(op)) {
unsupported_ops.emplace(op);
}
}
VLOG(4) << "-- The size of all_ops: " << all_ops.size() << " --";
VLOG(4) << "-- The size of supported_ops: " << supported_ops.size() << " --";
VLOG(4) << "-- The size of unsupported_ops: " << unsupported_ops.size()
<< " --";
return std::make_tuple(std::move(all_ops), std::move(supported_ops),
std::move(unsupported_ops));
}
bool IsCompiledWithBrpc() {
#ifndef PADDLE_WITH_DISTRIBUTE
return false;
@@ -2449,7 +2390,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("supports_bfloat16_fast_performance", SupportsBfloat16FastPerformance);
m.def("supports_int8", SupportsInt8);
m.def("supports_vnni", SupportsVNNI);
m.def("op_supported_infos", OpSupportedInfos);
m.def("op_supported_infos", imperative::OpSupportedInfos);
m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
m.def("is_compiled_with_dist", IsCompiledWithDIST);
m.def("_cuda_synchronize", [](const platform::CUDAPlace &place) {