未验证 提交 aff7397b 编写于 作者: W wanghuancoder 提交者: GitHub

[Eager] coreops to 495 (#37926)

* refine a test case, test=develop

* publish python c api for eager, test=develop

* revert modify about test_allclose_layer.py, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* delete numpy includes, use pybind11 numpy.h, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* support eager error msg, and add grad test case, test=develop

* refine, test=develop

* refine, test=develop

* generate eager core ops, only 4 ops, test=develop

* Rearranged Eager AutoCodeGen directory structure

* Removed USE_OP in Eager AutoCodeGen

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* open 500 list

* refine, test=develop

* refine, test=develop

* refine, test=develop

* fix auto code gen, test=develop

* Enabled generation for Operators without Grad/Inputs/Outputs

* refine, test=develop

* refine, test=develop

* refine, test=develop

* add to pyobject, test=develop

* Resolved operators without input

* merge pr 37837

* refine

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine,test=develop
Co-authored-by: jim19930609 <jim19930609@gmail.com>
上级 b72e6021
cc_library(performance_benchmark_utils SRCS benchmark_utils.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node scale_op matmul_v2_op)
cc_library(performance_benchmark_utils SRCS benchmark_utils.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node scale_op matmul_v2_op dygraph_function)
cc_test(test_egr_performance_benchmark_eager_cpu SRCS benchmark_eager_cpu.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps})
cc_test(test_egr_performance_benchmark_fluid_cpu SRCS benchmark_fluid_cpu.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps})
......
......@@ -32,8 +32,502 @@
#endif
#include "paddle/fluid/pybind/op_function_generator.h"
// Set of operator names; being a std::set, the entries are unique and kept in
// sorted order regardless of the order written here.
// NOTE(review): a second, much longer `gen_list` definition follows right
// after this one; this four-entry version looks like the pre-change side of a
// diff -- confirm which definition is the live one.
std::set<std::string> gen_list = {"elementwise_add", "reduce_sum", "matmul_v2",
"sigmoid"};
// Operator names collected in a std::set, so lookups are by name and the
// textual order below carries no meaning.
// NOTE(review): per the commit description this is the list of core ops enabled
// for eager code generation -- confirm against the generator that consumes it.
// Each name is listed exactly once; std::set would silently discard a repeated
// entry, which makes the list look longer than the set really is.
std::set<std::string> gen_list = {
    "sigmoid",
    "matmul_v2",
    "reduce_sum",
    "elementwise_add",
    "rsqrt",
    "multihead_matmul",
    "addmm",
    "gru",
    "round",
    "push_dense",
    "rank_attention",
    "fused_embedding_fc_lstm",
    "where_index",
    "bicubic_interp",
    "arg_min",
    "tile",
    "bilinear_tensor_product",
    "ctc_align",
    "pow2_decay_with_linear_warmup",
    "marker",
    "split",
    "fc",
    "clear_float_status",
    "load",
    "elementwise_max",
    "adadelta",
    "sparse_momentum",
    "tan",
    "adam",
    "fsp",
    "where",
    "logical_xor",
    "multiclass_nms3",
    "one_hot_v2",
    "sequence_softmax",
    "affine_channel",
    "triangular_solve",
    "sequence_topk_avg_pooling",
    "space_to_depth",
    "reverse",
    "fused_embedding_eltwise_layernorm",
    "expand_v2",
    "lgamma",
    "solve",
    "deformable_psroi_pooling",
    "instance_norm",
    "decode_jpeg",
    "gather_nd",
    "reduce_prod",
    "matrix_rank",
    "asin",
    "lstmp",
    "iou_similarity",
    "huber_loss",
    "one_hot",
    "sequence_slice",
    "lookup_table",
    "softplus",
    "depthwise_conv2d",
    "fused_fc_elementwise_layernorm",
    "sigmoid_cross_entropy_with_logits",
    "exp",
    "scatter",
    "equal_all",
    "searchsorted",
    "fusion_squared_mat_sub",
    "unique",
    "log",
    "conv_shift",
    "smooth_l1_loss",
    "linear_interp_v2",
    "momentum",
    "temporal_shift",
    "nce",
    "mv",
    "proximal_gd",
    "memcpy_h2d",
    "add_position_encoding",
    "cosh",
    "hash",
    "grad_add",
    "sign",
    "prelu",
    "linspace",
    "fill_diagonal",
    "logsigmoid",
    "load_combine",
    "fetch_v2",
    "randperm",
    "sequence_scatter",
    "partial_sum",
    "relu6",
    "conv3d",
    "lstm_unit",
    "not_equal",
    "transpose2",
    "uniform_random_batch_size_like",
    "unfold",
    "lrn",
    "softmax_with_cross_entropy",
    "isfinite_v2",
    "bernoulli",
    "max_pool3d_with_index",
    "gaussian_random",
    "flatten2",
    "matmul",
    "cvm",
    "adamax",
    "masked_select",
    "range",
    "bitwise_not",
    "trace",
    "multinomial",
    "modified_huber_loss",
    "roll",
    "squared_l2_distance",
    "conv3d_transpose",
    "share_data",
    "fake_quantize_abs_max",
    "unique_with_counts",
    "fill",
    "concat",
    "fill_zeros_like",
    "hierarchical_sigmoid",
    "isinf_v2",
    "squeeze",
    "multiclass_nms2",
    "bpr_loss",
    "fft_c2c",
    "bicubic_interp_v2",
    "reshape",
    "coalesce_tensor",
    "roi_align",
    "reshape2",
    "reduce_any",
    "unstack",
    "scatter_nd_add",
    "sequence_reshape",
    "bilateral_slice",
    "fill_any_like",
    "empty",
    "pad_constant_like",
    "pool2d",
    "size",
    "imag",
    "eigh",
    "stack",
    "dgc_momentum",
    "lamb",
    "generate_proposals_v2",
    "bitwise_or",
    "gru_unit",
    "fake_channel_wise_quantize_dequantize_abs_max",
    "sampling_id",
    "unsqueeze2",
    "average_accumulates",
    "sequence_enumerate",
    "fusion_seqconv_eltadd_relu",
    "bce_loss",
    "generate_proposal_labels",
    "im2sequence",
    "isinf",
    "adagrad",
    "linear_chain_crf",
    "retinanet_target_assign",
    "fusion_group",
    "teacher_student_sigmoid_loss",
    "random_crop",
    "lookup_table_v2",
    "detection_map",
    "l1_norm",
    "sqrt",
    "fused_elemwise_activation",
    "slogdeterminant",
    "share_buffer",
    "bitwise_and",
    "diag_embed",
    "unbind",
    "dropout",
    "moving_average_abs_max_scale",
    "beam_search",
    "log_loss",
    "greater_than",
    "kron",
    "sigmoid_focal_loss",
    "rmsprop",
    "conv2d",
    "uniform_random_inplace",
    "maxout",
    "linear_interp",
    "auc",
    "logical_or",
    "batch_norm",
    "acos",
    "unpool",
    "cumprod",
    "sample_logits",
    "crop_tensor",
    "fill_constant",
    "deformable_conv",
    "generate_mask_labels",
    "locality_aware_nms",
    "expand_as",
    "matrix_power",
    "greater_equal",
    "generate_proposals",
    "bilinear_interp",
    "inplace_abn",
    "softshrink",
    "mul",
    "data_norm",
    "get_tensor_from_selected_rows",
    "spp",
    "floor",
    "gelu",
    "retinanet_detection_output",
    "silu",
    "sequence_erase",
    "real",
    "nearest_interp_v2",
    "dgc_clip_by_norm",
    "squeeze2",
    "strided_slice",
    "conj",
    "precision_recall",
    "save",
    "fusion_seqexpand_concat_fc",
    "fake_quantize_range_abs_max",
    "depthwise_conv2d_transpose",
    "positive_negative_pair",
    "square",
    "var_conv_2d",
    "log1p",
    "fused_softmax_mask_upper_triangle",
    "clip_by_norm",
    "atan2",
    "box_decoder_and_assign",
    "fft_r2c",
    "roi_pool",
    "overlap_add",
    "fill_constant_batch_size_like",
    "fill_any",
    "dequantize_log",
    "max_pool2d_with_index",
    "pad3d",
    "norm",
    "viterbi_decode",
    "mish",
    "box_coder",
    "flatten",
    "elementwise_mod",
    "margin_cross_entropy",
    "logical_and",
    "pow",
    "stanh",
    "label_smooth",
    "merged_momentum",
    "ascend_trigger",
    "fused_feedforward",
    "rpn_target_assign",
    "roi_perspective_transform",
    "expand",
    "prroi_pool",
    "pool3d",
    "memcpy",
    "distribute_fpn_proposals",
    "frame",
    "bincount",
    "shape",
    "group_norm",
    "resnet_unit",
    "sequence_expand_as",
    "cos_sim",
    "eigvals",
    "save_combine",
    "class_center_sample",
    "read_file",
    "isfinite",
    "arg_max",
    "equal",
    "fake_dequantize_max_abs",
    "qr",
    "anchor_generator",
    "layer_norm",
    "merge_selected_rows",
    "less_equal",
    "rnn",
    "fusion_lstm",
    "lars_momentum",
    "hard_sigmoid",
    "isnan",
    "elementwise_floordiv",
    "correlation",
    "histogram",
    "gather_tree",
    "segment_pool",
    "sync_batch_norm",
    "fusion_repeated_fc_relu",
    "nop",
    "expand_as_v2",
    "filter_by_instag",
    "nll_loss",
    "dot",
    "scale",
    "ncclBcast",
    "shuffle_batch",
    "ncclReduce",
    "diag",
    "multiplex",
    "leaky_relu",
    "allclose",
    "adamw",
    "elementwise_pow",
    "prior_box",
    "p_norm",
    "unique_consecutive",
    "lod_reset",
    "pad",
    "sequence_conv",
    "log10",
    "set_value",
    "bitwise_xor",
    "center_loss",
    "randint",
    "attention_lstm",
    "uniform_random",
    "slice",
    "meshgrid",
    "hard_swish",
    "sin",
    "mean_iou",
    "pad2d",
    "inverse",
    "spectral_norm",
    "shuffle_channel",
    "psroi_pool",
    "seed",
    "ceil",
    "eig",
    "reduce_min",
    "cos",
    "ncclAllReduce",
    "cudnn_lstm",
    "digamma",
    "assign_value",
    "increment",
    "tdm_sampler",
    "fused_softmax_mask",
    "sequence_reverse",
    "eigvalsh",
    "diagonal",
    "trunc",
    "log2",
    "tanh",
    "yolov3_loss",
    "graph_send_recv",
    "accuracy",
    "atan",
    "less_than",
    "unsqueeze",
    "crf_decoding",
    "log_softmax",
    "ftrl",
    "matrix_nms",
    "top_k_v2",
    "cast",
    "tanh_shrink",
    "hard_shrink",
    "multiclass_nms",
    "fusion_transpose_flatten_concat",
    "sequence_unpad",
    "fused_elemwise_add_activation",
    "frobenius_norm",
    "crop",
    "cross_entropy2",
    "skip_layernorm",
    "tdm_child",
    "fused_embedding_seq_pool",
    "erf",
    "conv2d_inception_fusion",
    "trilinear_interp",
    "logsumexp",
    "fusion_seqpool_concat",
    "alloc_float_status",
    "sequence_concat",
    "fusion_seqpool_cvm_concat",
    "similarity_focus",
    "argsort",
    "sequence_expand",
    "sgd",
    "fused_bn_add_activation",
    "bilinear_interp_v2",
    "clip",
    "deformable_conv_v1",
    "hinge_loss",
    "determinant",
    "conv2d_transpose",
    "memcpy_d2h",
    "softsign",
    "fake_quantize_dequantize_abs_max",
    "broadcast_tensors",
    "grid_sampler",
    "fft_c2r",
    "pyramid_hash",
    "fake_quantize_dequantize_moving_average_abs_max",
    "multi_dot",
    "sequence_pool",
    "transpose",
    "top_k",
    "dist",
    "affine_grid",
    "gaussian_random_batch_size_like",
    "fake_channel_wise_dequantize_max_abs",
    "reciprocal",
    "sequence_mask",
    "fill_diagonal_tensor",
    "abs",
    "partial_concat",
    "elu",
    "index_select",
    "row_conv",
    "cross",
    "elementwise_mul",
    "decayed_adagrad",
    "bipartite_match",
    "fake_quantize_moving_average_abs_max",
    "mine_hard_examples",
    "target_assign",
    "lstm",
    "truncated_gaussian_random",
    "match_matrix_tensor",
    "elementwise_div",
    "kldiv_loss",
    "cumsum",
    "sum",
    "proximal_adagrad",
    "shard_index",
    "selu",
    "mean",
    "gumbel_softmax",
    "sequence_pad",
    "tree_conv",
    "assign",
    "flatten_contiguous_range",
    "tril_triu",
    "brelu",
    "celu",
    "reduce_mean",
    "sinh",
    "rank_loss",
    "reduce_max",
    "fusion_gru",
    "fill_zeros_like2",
    "expm1",
    "squared_l2_norm",
    "elementwise_sub",
    "margin_rank_loss",
    "faster_tokenizer",
    "relu",
    "is_empty",
    "reduce_all",
    "edit_distance",
    "bmm",
    "yolo_box",
    "soft_relu",
    "density_prior_box",
    "eye",
    "swish",
    "cross_entropy",
    "dpsgd",
    "cholesky",
    "batch_fc",
    "nearest_interp",
    "gather",
    "trilinear_interp_v2",
    "box_clip",
    "isnan_v2",
    "softmax",
    "conv2d_fusion",
    "fused_batch_norm_act",
    "get_float_status",
    "index_sample",
    "elementwise_min",
    "logical_not",
    "collect_fpn_proposals",
    "pixel_shuffle",
    "thresholded_relu",
    "polygon_box_transform",
    "lookup_table_dequant",
    "warpctc",
    "fake_channel_wise_quantize_abs_max",
    "dequantize_abs_max",
    "svd",
    "flip"};
// clang-format off
const char* OUT_INITIALIZER_TEMPLATE =
......@@ -178,16 +672,8 @@ std::string GenerateOpFunctionsBody(
ins_cast_str += paddle::string::Sprintf(in_cast_type, in_name, op_type,
in_name, arg_idx++, dispensable);
if (input.dispensable()) {
const auto in_template = input.duplicable()
? INPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST
: INPUT_INITIALIZER_TEMPLATE_WITH_NULL;
ins_initializer_with_null +=
paddle::string::Sprintf(in_template, in_name, in_name, in_name);
} else {
call_api_str += in_name + ", ";
}
}
if (!input_args.empty() && input_args.back() == ',') {
input_args.pop_back();
......@@ -237,6 +723,8 @@ std::string GenerateOpFunctionsBody(
auto dispensable = output.dispensable() ? "true" : "false";
ins_cast_str += paddle::string::Sprintf(in_cast_type, out_name, op_type,
out_name, arg_idx++, dispensable);
// call_api_str += out_name + ", ";
} else {
// There are few Operators that have duplicable output, like `Out` in
// split op. We need to specify the number of variables for the
......@@ -281,11 +769,9 @@ std::string GenerateOpFunctionsBody(
HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT, viwe_input_name, viwe_output_name,
viwe_input_name, viwe_output_name);
}
if (outs_num == 0) {
return_str = "Py_INCREF(Py_None);\n return Py_None;";
} else {
return_str = "return ToPyObject(out);";
}
std::string function_args = "";
if (input_args == "") {
function_args = FUNCTION_ARGS_NO_INPUT;
......
......@@ -370,6 +370,15 @@ PyObject* ToPyObject(const platform::Place& value) {
return obj.ptr();
}
// Maps an untyped pointer onto a Python object. Only the null pointer has a
// Python counterpart here (None); any non-null pointer is rejected with a
// Fatal error because its pointee type is unknown.
PyObject* ToPyObject(const void* value) {
  if (value != nullptr) {
    PADDLE_THROW(
        platform::errors::Fatal("ToPyObject do not support void* with value."));
  }
  // Take a reference on the None singleton before handing it to the caller.
  Py_INCREF(Py_None);
  return Py_None;
}
egr::EagerTensor GetEagerTensorFromArgs(const std::string& op_type,
const std::string& arg_name,
PyObject* args, ssize_t arg_idx,
......
......@@ -51,6 +51,7 @@ PyObject* ToPyObject(const std::vector<float>& value);
PyObject* ToPyObject(const std::vector<double>& value);
PyObject* ToPyObject(const std::vector<egr::EagerTensor>& value);
PyObject* ToPyObject(const platform::Place& value);
// Returns Python None (with its reference count incremented) for a null
// pointer; throws a Fatal error for any non-null pointer.
PyObject* ToPyObject(const void* value);
template <typename Tuple, size_t N>
struct TupleEagerTensorResult {
......
......@@ -75,7 +75,9 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/pybind/cuda_streams_py.h"
#ifndef PADDLE_ON_INFERENCE
#include "paddle/fluid/pybind/eager.h"
#endif
#include "paddle/fluid/pybind/io.h"
#include "paddle/utils/none.h"
#ifdef PADDLE_WITH_ASCEND
......@@ -541,7 +543,9 @@ PYBIND11_MODULE(core_avx, m) {
PYBIND11_MODULE(core_noavx, m) {
#endif
#ifndef PADDLE_ON_INFERENCE
BindEager(&m);
#endif
BindCudaStream(&m);
// Not used, just make sure cpu_info.cc is linked.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册