未验证 提交 6e65fe02 编写于 作者: Z Zhen Wang 提交者: GitHub

The unsupported_fp16_list using in AMP will be created automatically during the runtime. (#32102)

* Use the runtime to create the unsupported_fp16_list using in AMP.

* Add more infos about supported ops.

* Add some comments for the function of OpSupportedInfos.

* Fix the unit test of test_multi_precision_fp16_train.
上级 72302033
......@@ -14,11 +14,15 @@ limitations under the License. */
#include <Python.h>
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <iterator>
#include <map>
#include <memory>
#include <mutex> // NOLINT // for call_once
#include <string>
#include <tuple>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <utility>
......@@ -189,6 +193,64 @@ bool SupportsBfloat16FastPerformance() {
#endif
}
// According to the input `place` and `dtype`, this function returns a tuple
// consists of three sets:
// 1) All operators registered in the Paddle framework.
// 2) All operators supported for `place` and `dtype`.
// 3) All operators unsupported for `place` and `dtype`.
// The input `place` is a type of string, which can only be `GPU` or `CPU`.
// The input `dtype` is a type of paddle::framework::proto::VarType::Type,
// which can be paddle::framework::proto::VarType::FP16,
// paddle::framework::proto::VarType::FP32 and so on.
std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>,
std::unordered_set<std::string>>
OpSupportedInfos(const std::string &place,
framework::proto::VarType::Type dtype) {
std::string query_place;
std::transform(place.begin(), place.end(), std::back_inserter(query_place),
[](unsigned char c) { return std::toupper(c); });
using fn_type = std::add_pointer<bool(const platform::Place &)>::type;
std::unordered_map<std::string, fn_type> is_target_place{
{"GPU", &platform::is_gpu_place}, {"CPU", &platform::is_cpu_place},
};
PADDLE_ENFORCE_NE(
is_target_place.count(query_place), 0,
platform::errors::InvalidArgument(
"The argument `place` should be 'GPU' or 'CPU', but get '%s'.",
place));
std::unordered_set<std::string> all_ops;
const auto &op_info = framework::OpInfoMap::Instance().map();
for (auto it = op_info.begin(); it != op_info.end(); it++) {
all_ops.emplace(it->first);
}
std::unordered_set<std::string> supported_ops;
auto &all_kernels = framework::OperatorWithKernel::AllOpKernels();
for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) {
for (auto &kernel_type : it->second) {
if (is_target_place[query_place](kernel_type.first.place_) &&
kernel_type.first.data_type_ == dtype) {
supported_ops.emplace(it->first);
}
}
}
std::unordered_set<std::string> unsupported_ops;
for (auto &op : all_ops) {
if (!supported_ops.count(op)) {
unsupported_ops.emplace(op);
}
}
VLOG(4) << "-- The size of all_ops: " << all_ops.size() << " --";
VLOG(4) << "-- The size of supported_ops: " << supported_ops.size() << " --";
VLOG(4) << "-- The size of unsupported_ops: " << unsupported_ops.size()
<< " --";
return std::make_tuple(std::move(all_ops), std::move(supported_ops),
std::move(unsupported_ops));
}
bool IsCompiledWithBrpc() {
#ifndef PADDLE_WITH_DISTRIBUTE
return false;
......@@ -1770,6 +1832,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
m.def("supports_bfloat16", SupportsBfloat16);
m.def("supports_bfloat16_fast_performance", SupportsBfloat16FastPerformance);
m.def("op_supported_infos", OpSupportedInfos);
m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
m.def("is_compiled_with_dist", IsCompiledWithDIST);
m.def("_cuda_synchronize", [](const platform::CUDAPlace &place) {
......
......@@ -13,6 +13,7 @@
# limitations under the License.
import copy
from ... import core
__all__ = ["CustomOpLists", "AutoMixedPrecisionLists"]
......@@ -147,147 +148,10 @@ gray_list = {
}
# The set of ops that don't support fp16 calculation
unsupported_fp16_list = {
# from python/paddle/fluid/layers/io.py
'send',
'send_barrier',
'recv',
'fetch_barrier',
'create_py_reader',
'create_double_buffer_reader',
'read',
'load',
# from python/paddle/fluid/control_flow.py
'increment',
'less_than',
'less_equal',
'greater_than',
'greater_equal',
'equal',
'not_equal',
'read_from_array',
'shrink_rnn_memory',
'lod_array_length',
'logical_and',
'logical_or',
'logical_xor',
'logical_not',
'print',
'conditional_block',
'while',
'ifelse',
'is_empty',
'lstm',
'cudnn_lstm',
'lstmp',
'gru',
'gru_unit',
'linear_chain_crf',
'crf_decoding',
'bpr_loss',
'chunk_eval',
'sequence_conv',
'sequence_softmax',
# Depthwise conv2d isn't fast and safe currently.
# ref: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h#L79
'depthwise_conv2d',
# Tensor Core kernels are not available for 3D convolutions currently.
'conv3d',
'sequence_pool',
'sequence_concat',
'sequence_slice',
'data_norm',
'group_norm',
'spectral_norm',
'depthwise_conv2d_transpose',
'sequence_expand',
'conv_transposed2d',
'conv_transposed3d',
'sequence_expand_as',
'sequence_pad',
'sequence_unpad',
'sequence_erase',
'beam_search',
'beam_search_decode',
'lstm_unit',
'reduce_sum',
'reduce_mean',
'reduce_max',
'reduce_min',
'reduce_prod',
'reduce_all',
'reduce_any',
'split',
'edit_distance',
'ctc_align',
'warpctc',
'sequence_reshape',
'nce',
'hierarchical_sigmoid',
'im2sequence',
'row_conv',
'multiplex',
'sample_logits',
'one_hot',
'smooth_l1_loss',
'squeeze2',
'unsqueeze2',
'lod_reset',
'lrn',
'pad',
'pad_constant_like',
'label_smooth',
'scatter',
'sequence_scatter',
'random_crop',
'mean_iou',
'selu',
'crop',
'affine_grid',
'rank_loss',
'margin_rank_loss',
'pad2d',
'elu',
'pow',
'stanh',
'hard_sigmoid',
'swish',
'prelu',
'brelu',
'sequence_enumerate',
'sequence_mask',
'expand',
'sampling_id',
'maxout',
'space_to_depth',
'sequence_reverse',
'similarity_focus',
'hash',
'grid_sampler',
'log_loss',
'teacher_student_sigmoid_loss',
'add_position_encoding',
'bilinear_tensor_product',
'shuffle_channel',
'temporal_shift',
'psroi_pool',
'huber_loss',
'kldiv_loss',
'tree_conv',
'pixel_shuffle',
'fsp',
'cvm',
'affine_channel',
'roi_pool',
'roi_align',
'anchor_generator',
'generate_proposals',
'generate_proposal_labels',
'generate_mask_labels',
# fp16 is slower than fp32, though fp16 is supported.
'lookup_table',
'lookup_table_v2',
}
# lookup_table fp16 is slower than fp32, though fp16 is supported.
_, _, _sys_unsupported_fp16_list = core.op_supported_infos(
'GPU', core.VarDesc.VarType.FP16)
unsupported_fp16_list = {'lookup_table',
'lookup_table_v2'} | _sys_unsupported_fp16_list
CustomOpLists = AutoMixedPrecisionLists
......@@ -258,7 +258,8 @@ class TestAmpWithNonIterableDataLoader(unittest.TestCase):
cast_model_to_fp16(main_prog, use_fp16_guard=False)
def test_non_iterable_dataloader(self):
self.decorate_with_data_loader()
if fluid.core.is_compiled_with_cuda():
self.decorate_with_data_loader()
if __name__ == '__main__':
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册