Add fused_ops.yaml and fused_backward.yaml (#52010)

* add fused_yaml fused_backward * fix eager_function bug * add some comment of fused yaml file * add 'support_dygraph_mode' configuration in fused yaml * delete some 'fused_api.h' in include file * add fused flag in api_gen

Add fused_ops.yaml and fused_backward.yaml (#52010)
* add fused_yaml fused_backward * fix eager_function bug * add some comment of fused yaml file * add 'support_dygraph_mode' configuration in fused yaml * delete some 'fused_api.h' in include file * add fused flag in api_gen
10145cb6 · HappyHeavyRain · GitHub · 7d416161 · 10145cb6 · 10145cb6
17 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -7,16 +7,20 @@ paddle/fluid/op_use_default_grad_maker_DEV.spec
 paddle/fluid/op_use_default_grad_maker_PR.spec
 paddle/fluid/operators/ops_extra_info.cc
 paddle/phi/api/backward/backward_api.h
+paddle/phi/api/backward/fused_backward_api.h
 paddle/phi/api/backward/sparse_bw_api.h
 paddle/phi/api/include/api.h
+paddle/phi/api/include/fused_api.h
 paddle/phi/api/include/operants_base.h
 paddle/phi/api/include/operants_manager.h
 paddle/phi/api/include/sparse_api.h
 paddle/phi/api/include/strings_api.h
 paddle/phi/api/include/tensor_operants.h
 paddle/phi/api/lib/api.cc
+paddle/phi/api/lib/fused_api.cc
 paddle/phi/api/lib/dygraph_api.*
 paddle/phi/api/lib/backward_api.cc
+paddle/phi/api/lib/fused_backward_api.cc
 paddle/phi/api/lib/operants_manager.cc
 paddle/phi/api/lib/sparse_api.cc
 paddle/phi/api/lib/strings_api.cc
@@ -85,6 +89,7 @@ tools/nvcc_lazy
 paddle/fluid/operators/generated_op*.cc
 paddle/fluid/operators/generated_sparse_op.cc
 paddle/fluid/operators/generated_static_op.cc
+paddle/fluid/operators/generated_fused_op.cc
 paddle/phi/ops/compat/generated_*.cc
 paddle/phi/api/yaml/parsed_apis/
 paddle/fluid/operators/generator/parsed_ops/

--- a/paddle/fluid/distributed/collective/reducer.h
+++ b/paddle/fluid/distributed/collective/reducer.h
@@ -26,6 +26,7 @@
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/phi/api/include/api.h"
+#include "paddle/phi/api/include/fused_api.h"
 #include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

--- a/paddle/fluid/eager/auto_code_generator/generator/CMakeLists.txt
+++ b/paddle/fluid/eager/auto_code_generator/generator/CMakeLists.txt
 set(api_yaml_path
-    "${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/ops.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_ops.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_ops.yaml"
+    "${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/ops.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_ops.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_ops.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/fused_ops.yaml"
 )
 set(backward_yaml_path
-    "${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/backward.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_backward.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_backward.yaml"
+    "${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/backward.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_backward.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_backward.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/fused_backward.yaml"
 )
 set(tmp_forwards_cc_path
    "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.cc"

--- a/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py
+++ b/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py
@@ -119,12 +119,31 @@ def ReadFwdFile(filepath):
    # empty file loaded by yaml is None
    contents = yaml.load(f, Loader=yaml.FullLoader)
    f.close()
+    # not all fused ops support dygraph
+    if filepath.endswith("fused_ops.yaml") is True:
+        new_apis = [
+            api
+            for api in contents
+            if "support_dygraph_mode" in api
+            and api["support_dygraph_mode"] is True
+        ]
+        contents = new_apis
    return contents if contents is not None else []


 def ReadBwdFile(filepath):
    f = open(filepath, 'r')
    contents = yaml.load(f, Loader=yaml.FullLoader)
+    # not all fused ops support dygraph
+    if filepath.endswith("fused_backward.yaml") is True:
+        new_apis = [
+            api
+            for api in contents
+            if "support_dygraph_mode" in api
+            and api["support_dygraph_mode"] is True
+        ]
+        contents = new_apis
+
    ret = {}
    if contents is not None:
        for content in contents:

--- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
@@ -336,6 +336,7 @@ NODE_CC_FILE_TEMPLATE = """
 #include "glog/logging.h"
 #include "paddle/phi/api/all.h"
 #include "paddle/phi/api/backward/backward_api.h"
+#include "paddle/phi/api/backward/fused_backward_api.h"
 #include "paddle/phi/api/backward/sparse_bw_api.h"
 #include "paddle/fluid/imperative/tracer.h"
 #include "paddle/fluid/framework/op_registry.h"

--- a/paddle/fluid/operators/generator/CMakeLists.txt
+++ b/paddle/fluid/operators/generator/CMakeLists.txt
@@ -6,6 +6,7 @@ set(op_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/ops.yaml)
 set(legacy_op_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_ops.yaml)
 set(bw_op_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/backward.yaml)
 set(static_op_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/static_ops.yaml)
+set(fused_op_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/fused_ops.yaml)
 set(legacy_bw_op_yaml_file
    ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_backward.yaml)
 set(sparse_op_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_ops.yaml)
@@ -13,6 +14,8 @@ set(sparse_bw_op_yaml_file
    ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_backward.yaml)
 set(static_bw_op_yaml_file
    ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/static_backward.yaml)
+set(fused_bw_op_yaml_file
+    ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/fused_backward.yaml)

 if(NOT PYTHONINTERP_FOUND)
  find_package(PythonInterp REQUIRED)
@@ -40,10 +43,14 @@ set(generated_op_path_4
    ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generated_op4.cc)
 set(generated_static_op_path
    ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generated_static_op.cc)
+set(generated_fused_op_path
+    ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generated_fused_op.cc)
 set(generated_sparse_ops_path
    ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generated_sparse_op.cc)
 set(generated_argument_mapping_path
    ${CMAKE_SOURCE_DIR}/paddle/phi/ops/compat/generated_sig.cc)
+set(generated_fused_argument_mapping_path
+    ${CMAKE_SOURCE_DIR}/paddle/phi/ops/compat/generated_fused_sig.cc)
 set(generated_static_argument_mapping_path
    ${CMAKE_SOURCE_DIR}/paddle/phi/ops/compat/generated_static_sig.cc)
 set(generated_sparse_argument_mapping_path
@@ -54,7 +61,9 @@ message(
 - ${op_yaml_file}
 - ${legacy_op_yaml_file}
 - ${bw_op_yaml_file}
- ${legacy_bw_op_yaml_file}")
+- ${legacy_bw_op_yaml_file}
+- ${fused_op_yaml_file}
+- ${static_op_yaml_file}")
 execute_process(
  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generator
  COMMAND ${CMAKE_COMMAND} -E make_directory ${parsed_op_dir}
@@ -69,6 +78,8 @@ execute_process(
    --output_path ./parsed_ops/legacy_backward_ops.parsed.yaml --backward
  COMMAND ${PYTHON_EXECUTABLE} parse_op.py --op_yaml_path ${static_op_yaml_file}
          --output_path ./parsed_ops/static_ops.parsed.yaml
+  COMMAND ${PYTHON_EXECUTABLE} parse_op.py --op_yaml_path ${fused_op_yaml_file}
+          --output_path ./parsed_ops/fused_ops.parsed.yaml
  COMMAND ${PYTHON_EXECUTABLE} parse_op.py --op_yaml_path ${sparse_op_yaml_file}
          --output_path ./parsed_ops/sparse_ops.parsed.yaml
  COMMAND
@@ -77,6 +88,9 @@ execute_process(
  COMMAND
    ${PYTHON_EXECUTABLE} parse_op.py --op_yaml_path ${static_bw_op_yaml_file}
    --output_path ./parsed_ops/static_backward.parsed.yaml --backward
+  COMMAND
+    ${PYTHON_EXECUTABLE} parse_op.py --op_yaml_path ${fused_bw_op_yaml_file}
+    --output_path ./parsed_ops/fused_backward.parsed.yaml --backward
    RESULTS_VARIABLE _results)
 foreach(_result in ${_results})
  if(${_result})
@@ -111,6 +125,17 @@ if(${_result})
  message(FATAL_ERROR "static ops validation failed, exiting.")
 endif()

+execute_process(
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generator
+  COMMAND
+    ${PYTHON_EXECUTABLE} cross_validate.py --forward_yaml_paths
+    ./parsed_ops/fused_ops.parsed.yaml --backward_yaml_paths
+    ./parsed_ops/fused_backward.parsed.yaml
+  RESULT_VARIABLE _result)
+if(${_result})
+  message(FATAL_ERROR "fused ops validation failed, exiting.")
+endif()
+
 execute_process(
  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generator
  COMMAND
@@ -158,6 +183,21 @@ if(${_result})
  message(FATAL_ERROR "operator codegen failed, exiting.")
 endif()

+execute_process(
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generator
+  COMMAND
+    ${PYTHON_EXECUTABLE} generate_op.py --ops_yaml_path
+    ./parsed_ops/fused_ops.parsed.yaml --backward_yaml_path
+    ./parsed_ops/fused_backward.parsed.yaml --op_version_yaml_path
+    ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/op_version.yaml
+    --op_compat_yaml_path ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/op_compat.yaml
+    --output_op_path "${generated_fused_op_path}.tmp" --output_arg_map_path
+    "${generated_fused_argument_mapping_path}.tmp"
+  RESULT_VARIABLE _result)
+if(${_result})
+  message(FATAL_ERROR "operator codegen failed, exiting.")
+endif()
+
 execute_process(
  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generator
  COMMAND
@@ -177,10 +217,12 @@ set(generated_static_files
    "${generated_op_path_3}"
    "${generated_op_path_4}"
    "${generated_static_op_path}"
+    "${generated_fused_op_path}"
    "${generated_sparse_ops_path}"
    "${generated_argument_mapping_path}"
    "${generated_static_argument_mapping_path}"
-    "${generated_sparse_argument_mapping_path}")
+    "${generated_sparse_argument_mapping_path}"
+    "${generated_fused_argument_mapping_path}")

 foreach(generated_static_file ${generated_static_files})
  if(EXISTS "${generated_static_file}.tmp" AND EXISTS

--- a/paddle/fluid/operators/generator/parse_utils.py
+++ b/paddle/fluid/operators/generator/parse_utils.py
@@ -340,6 +340,7 @@ def check_op_config(op_entry, op_name):
        'no_need_buffer',
        'data_transform',
        'composite',
+        'support_dygraph_mode',
    )
    infer_meta_key_set = ('func', 'param')
    kernel_key_set = (

--- a/paddle/phi/api/all.h
+++ b/paddle/phi/api/all.h
@@ -27,6 +27,7 @@ limitations under the License. */
 // new phi apis
 #include "paddle/phi/api/include/api.h"
 #include "paddle/phi/api/include/context_pool.h"
+#include "paddle/phi/api/include/fused_api.h"
 #include "paddle/phi/api/include/sparse_api.h"
 #include "paddle/phi/api/include/tensor.h"


--- a/paddle/phi/api/lib/CMakeLists.txt
+++ b/paddle/phi/api/lib/CMakeLists.txt
@@ -67,6 +67,26 @@ set(dygraph_api_source_file
 set(dygraph_api_header_file_tmp ${dygraph_api_header_file}.tmp)
 set(dygraph_api_source_file_tmp ${dygraph_api_source_file}.tmp)

+# fused_op forward api file
+set(fused_api_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/fused_ops.yaml)
+set(fused_api_header_file
+    ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/fused_api.h)
+set(fused_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/fused_api.cc)
+set(fused_api_header_file_tmp ${fused_api_header_file}.tmp)
+set(fused_api_source_file_tmp ${fused_api_source_file}.tmp)
+
+# fused_op backward api file
+set(fused_bw_api_gen_file
+    ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/backward_api_gen.py)
+set(fused_bw_api_yaml_file
+    ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/fused_backward.yaml)
+set(fused_bw_api_header_file
+    ${CMAKE_SOURCE_DIR}/paddle/phi/api/backward/fused_backward_api.h)
+set(fused_bw_api_source_file
+    ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/fused_backward_api.cc)
+set(fused_bw_api_header_file_tmp ${fused_bw_api_header_file}.tmp)
+set(fused_bw_api_source_file_tmp ${fused_bw_api_source_file}.tmp)
+
 # sparse api file
 set(sparse_api_gen_file
    ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/sparse_api_gen.py)
@@ -171,6 +191,40 @@ add_custom_command(
          ${legacy_bw_api_yaml_file}
  VERBATIM)

+# generate fused_op api
+add_custom_command(
+  OUTPUT ${fused_api_header_file} ${fused_api_source_file}
+  COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml
+  COMMAND
+    ${PYTHON_EXECUTABLE} ${api_gen_file} --api_yaml_path ${fused_api_yaml_file}
+    --is_fused_ops_yaml --api_header_path ${fused_api_header_file_tmp}
+    --api_source_path ${fused_api_source_file_tmp}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${fused_api_header_file_tmp}
+          ${fused_api_header_file}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${fused_api_source_file_tmp}
+          ${fused_api_source_file}
+  COMMENT "copy_if_different ${fused_api_header_file} ${fused_api_source_file}"
+  DEPENDS ${fused_api_yaml_file} ${api_gen_file} ${api_gen_base}
+  VERBATIM)
+
+# generate fused_op backward api
+add_custom_command(
+  OUTPUT ${fused_bw_api_header_file} ${fused_bw_api_source_file}
+         ${fused_bw_api_header_file_tmp} ${fused_bw_api_source_file_tmp}
+  COMMAND
+    ${PYTHON_EXECUTABLE} ${fused_bw_api_gen_file} --backward_yaml_path
+    ${fused_bw_api_yaml_file} --is_fused_backward_yaml --backward_header_path
+    ${fused_bw_api_header_file_tmp} --backward_source_path
+    ${fused_bw_api_source_file_tmp}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${fused_bw_api_header_file_tmp}
+          ${fused_bw_api_header_file}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${fused_bw_api_source_file_tmp}
+          ${fused_bw_api_source_file}
+  COMMENT
+    "copy_if_different ${fused_bw_api_header_file} ${fused_bw_api_source_file}"
+  DEPENDS ${fused_bw_api_yaml_file} ${bw_api_gen_file} ${api_gen_base}
+  VERBATIM)
+
 # generate sparse api
 add_custom_command(
  OUTPUT ${sparse_api_header_file} ${sparse_api_source_file}
@@ -333,7 +387,7 @@ cc_library(
       phi_profiler)
 cc_library(
  phi_function_api
-  SRCS ${api_source_file}
+  SRCS ${api_source_file} ${fused_api_source_file}
  DEPS phi_tensor_raw
       phi
       kernel_dispatch
@@ -344,7 +398,7 @@ cc_library(
       phi_profiler)
 cc_library(
  phi_bw_function_api
-  SRCS ${bw_api_source_file}
+  SRCS ${bw_api_source_file} ${fused_bw_api_source_file}
  DEPS phi_tensor_raw
       phi
       kernel_dispatch

--- a/paddle/phi/api/yaml/backward.yaml
+++ b/paddle/phi/api/yaml/backward.yaml
@@ -605,16 +605,6 @@
  kernel :
    func : frame_grad

- backward_op : fused_dropout_add_grad
-  forward : fused_dropout_add (Tensor x, Tensor y, Scalar p, bool is_test, str mode, int seed, bool fix_seed) -> Tensor(out), Tensor(seed_offset)
-  args : (Tensor seed_offset, Tensor out_grad, Scalar p, bool is_test, str mode, bool fix_seed)
-  output : Tensor(x_grad), Tensor(y_grad)
-  infer_meta :
-    func : GeneralBinaryGradInferMeta
-    param : [out_grad, out_grad]
-  kernel :
-    func : fused_dropout_add_grad
-
 - backward_op : gather_nd_grad
  forward : gather_nd (Tensor x, Tensor index) -> Tensor(out)
  args : (Tensor x, Tensor index, Tensor out_grad)

--- a/paddle/phi/api/yaml/fused_backward.yaml
+++ b/paddle/phi/api/yaml/fused_backward.yaml
+# This file is designed for fusion C++ backward operators; it manages the
+# generated code for dynamic mode and static mode.
+# The operators in this file have an extra configuration item "support_dygraph_mode".
+# If an operator has "support_dygraph_mode : true", it supports dygraph mode.
+
+- backward_op : fused_dropout_add_grad
+  forward : fused_dropout_add (Tensor x, Tensor y, Scalar p, bool is_test, str mode, int seed, bool fix_seed) -> Tensor(out), Tensor(seed_offset)
+  args : (Tensor seed_offset, Tensor out_grad, Scalar p, bool is_test, str mode, bool fix_seed)
+  output : Tensor(x_grad), Tensor(y_grad)
+  infer_meta :
+    func : GeneralBinaryGradInferMeta
+    param : [out_grad, out_grad]
+  kernel :
+    func : fused_dropout_add_grad
+  support_dygraph_mode : true
--- a/paddle/phi/api/yaml/fused_ops.yaml
+++ b/paddle/phi/api/yaml/fused_ops.yaml
+# This file is designed for fusion C++ forward operators; it manages the
+# generated code for dynamic mode and static mode.
+# The operators in this file have an extra configuration item "support_dygraph_mode".
+# If an operator has "support_dygraph_mode : true", it supports dygraph mode.
+
+- op : embedding_with_eltwise_add_xpu
+  args : (Tensor[] ids, Tensor[] tables, int64_t padding_idx)
+  output: Tensor
+  infer_meta :
+    func: EmbeddingWithEltwiseAddXPUInferMeta
+  kernel:
+    func: embedding_with_eltwise_add_xpu
+    data_type: tables
+
+- op : fc_xpu
+  args : (Tensor x, Tensor x_max, Tensor w, Tensor w_max, Tensor bias, int in_num_col_dims, bool transpose_x, float alpha, float beta, int act_type, float act_alpha)
+  output : Tensor(out), Tensor(out_max)
+  infer_meta :
+    func : FcXPUInferMeta
+  kernel :
+    func : fc_xpu
+    data_type : x
+  optional : bias, x_max
+
+- op : fused_dropout_add
+  args : (Tensor x, Tensor y, Scalar p, bool is_test, str mode, int seed, bool fix_seed)
+  output : Tensor(out), Tensor(seed_offset)
+  infer_meta :
+    func : FusedDropoutAddInferMeta
+  kernel :
+    func : fused_dropout_add
+    data_type : x
+  backward : fused_dropout_add_grad
+  support_dygraph_mode : true
+
+- op : fused_linear_param_grad_add
+  args : (Tensor x, Tensor dout, Tensor dweight, Tensor dbias, bool multi_precision = true)
+  output : Tensor(dweight_out), Tensor(dbias_out)
+  infer_meta:
+    func : FusedLinearParamGradAddInferMeta
+  optional : dweight, dbias
+  kernel:
+    func : fused_linear_param_grad_add
+    data_type : dout
+  support_dygraph_mode : true
+
+- op : fused_multi_transformer_xpu
+  args : (Tensor x, Tensor[] ln_scale, Tensor[] ln_bias, Tensor[] qkvw, Tensor[] qkvw_max, Tensor[] qkv_bias, Tensor[] out_linear_w, Tensor[] out_linear_wmax, Tensor[] out_linear_bias, Tensor[] ffn_ln_scale, Tensor[] ffn_ln_bias, Tensor[] ffn1_weight, Tensor[] ffn1_weight_max, Tensor[] ffn1_bias, Tensor[] ffn2_weight, Tensor[] ffn2_weight_max, Tensor[] ffn2_bias, Tensor[] cache_kv, Tensor[] pre_caches, Tensor rotary_pos_emb, Tensor time_step, Tensor seq_lengths, Tensor src_mask, bool pre_layer_norm, int rotary_emb_dims, float epsilon, float dropout_rate, bool is_test, str dropout_implementation, str act_method, bool trans_qkvw, int ring_id)
+  output : Tensor(out), Tensor[](cache_kv_out){out_linear_w.size()}
+  infer_meta :
+    func : FusedMultiTransformerXpuInferMeta
+  kernel :
+    func : fused_multi_transformer_xpu
+    data_type : x
+  optional : cache_kv, pre_caches, rotary_pos_emb, time_step, seq_lengths, src_mask
+
+- op : generate_sequence_xpu
+  args : (Tensor x, DataType dtype)
+  output : Tensor
+  infer_meta :
+    func : GenerateSequenceXPUInferMeta
+  kernel :
+    func : generate_sequence_xpu
+    data_type : dtype
+
+- op : multi_encoder_xpu
+  args : (Tensor x, Tensor[] fc_weight, Tensor[] fc_weight_max, Tensor[] fc_bias, Tensor[] ln_scale, Tensor[] ln_bias, Tensor mask, int layer_num, bool norm_before, int hidden_dim, int head_num, int size_per_head, int ffn_hidden_dim_scale, int act_type, int relative_type, int slice_idx)
+  output : Tensor(out), Tensor(x_fp16), Tensor(out_fp16)
+  infer_meta :
+    func : MultiEncoderXPUInferMeta
+  kernel :
+    func : multi_encoder_xpu
+    data_type : x
+  optional : mask, x_fp16, out_fp16
--- a/paddle/phi/api/yaml/generator/api_gen.py
+++ b/paddle/phi/api/yaml/generator/api_gen.py
@@ -406,7 +406,9 @@ PD_DECLARE_API(from_blob);
 """


-def generate_api(api_yaml_path, header_file_path, source_file_path):
+def generate_api(
+    api_yaml_path, is_fused_ops_yaml, header_file_path, source_file_path
+):
    apis = []

    for each_api_yaml in api_yaml_path:
@@ -424,7 +426,21 @@ def generate_api(api_yaml_path, header_file_path, source_file_path):
    header_file.write(header_include())
    header_file.write(namespace[0])

-    include_header_file = "paddle/phi/api/include/api.h"
+    include_header_file = (
+        "paddle/phi/api/include/fused_api.h"
+        if is_fused_ops_yaml is True
+        else "paddle/phi/api/include/api.h"
+    )
+    # not all fused ops support dygraph
+    if is_fused_ops_yaml is True:
+        new_apis = [
+            api
+            for api in apis
+            if "support_dygraph_mode" in api
+            and api["support_dygraph_mode"] is True
+        ]
+        apis = new_apis
+
    source_file.write(source_include(include_header_file))
    source_file.write(namespace[0])

@@ -456,6 +472,12 @@ def main():
        default=['paddle/phi/api/yaml/ops.yaml'],
    )

+    parser.add_argument(
+        '--is_fused_ops_yaml',
+        help='flag of fused ops yaml',
+        action='store_true',
+    )
+
    parser.add_argument(
        '--api_header_path',
        help='output of generated api header code file',
@@ -471,10 +493,13 @@ def main():
    options = parser.parse_args()

    api_yaml_path = options.api_yaml_path
+    is_fused_ops_yaml = options.is_fused_ops_yaml
    header_file_path = options.api_header_path
    source_file_path = options.api_source_path

-    generate_api(api_yaml_path, header_file_path, source_file_path)
+    generate_api(
+        api_yaml_path, is_fused_ops_yaml, header_file_path, source_file_path
+    )


 if __name__ == '__main__':

--- a/paddle/phi/api/yaml/generator/backward_api_gen.py
+++ b/paddle/phi/api/yaml/generator/backward_api_gen.py
@@ -269,7 +269,7 @@ def header_include():
 """


-def source_include(header_file_path):
+def source_include(header_file_path, fw_header_file_path):
    return f"""
 #include "{header_file_path}"
 #include <memory>
@@ -282,7 +282,7 @@ def source_include(header_file_path):
 #include "paddle/phi/api/lib/kernel_dispatch.h"
 #include "paddle/phi/common/type_traits.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/api/include/api.h"
+#include "{fw_header_file_path}"
 #include "paddle/phi/infermeta/backward.h"
 #include "paddle/phi/infermeta/unary.h"

@@ -310,7 +310,10 @@ namespace experimental {


 def generate_backward_api(
-    backward_yaml_path, header_file_path, source_file_path
+    backward_yaml_path,
+    is_fused_backward_yaml,
+    header_file_path,
+    source_file_path,
 ):

    bw_apis = []
@@ -329,9 +332,29 @@ def generate_backward_api(
    header_file.write(header_include())
    header_file.write(namespace[0])

-    include_header_file = "paddle/phi/api/backward/backward_api.h"
-    source_file.write(source_include(include_header_file))
+    include_header_file = (
+        "paddle/phi/api/backward/fused_backward_api.h"
+        if is_fused_backward_yaml
+        else "paddle/phi/api/backward/backward_api.h"
+    )
+    include_fw_header_file = (
+        "paddle/phi/api/include/fused_api.h"
+        if is_fused_backward_yaml
+        else "paddle/phi/api/include/api.h"
+    )
+    source_file.write(
+        source_include(include_header_file, include_fw_header_file)
+    )
    source_file.write(namespace[0])
+    # not all fused ops support dygraph
+    if is_fused_backward_yaml is True:
+        new_bw_apis = [
+            bw_api
+            for bw_api in bw_apis
+            if "support_dygraph_mode" in bw_api
+            and bw_api["support_dygraph_mode"] is True
+        ]
+        bw_apis = new_bw_apis

    for bw_api in bw_apis:
        bw_api = BackwardAPI(bw_api)
@@ -355,6 +378,13 @@ def main():
        nargs='+',
        default=['paddle/phi/api/yaml/backward.yaml'],
    )
+
+    parser.add_argument(
+        '--is_fused_backward_yaml',
+        help='flag of fused backward yaml',
+        action='store_true',
+    )
+
    parser.add_argument(
        '--backward_header_path',
        help='output of generated backward header code file',
@@ -370,11 +400,15 @@ def main():
    options = parser.parse_args()

    backward_yaml_path = options.backward_yaml_path
+    is_fused_backward_yaml = options.is_fused_backward_yaml
    header_file_path = options.backward_header_path
    source_file_path = options.backward_source_path

    generate_backward_api(
-        backward_yaml_path, header_file_path, source_file_path
+        backward_yaml_path,
+        is_fused_backward_yaml,
+        header_file_path,
+        source_file_path,
    )



--- a/paddle/phi/api/yaml/legacy_ops.yaml
+++ b/paddle/phi/api/yaml/legacy_ops.yaml
@@ -726,16 +726,6 @@
  optional : skip_update, master_params
  inplace : (params -> params_out), (moments1 -> moments1_out), (moments2 -> moments2_out), (beta1_pows -> beta1_pows_out), (beta2_pows -> beta2_pows_out), (master_params -> master_params_out)

- op : fused_linear_param_grad_add
-  args : (Tensor x, Tensor dout, Tensor dweight, Tensor dbias, bool multi_precision = true)
-  output : Tensor(dweight_out), Tensor(dbias_out)
-  infer_meta:
-    func : FusedLinearParamGradAddInferMeta
-  optional : dweight, dbias
-  kernel:
-    func : fused_linear_param_grad_add
-    data_type : dout
-
 - op : gather
  args : (Tensor x, Tensor index, Scalar(int) axis=0)
  output : Tensor(out)

--- a/paddle/phi/api/yaml/ops.yaml
+++ b/paddle/phi/api/yaml/ops.yaml
@@ -584,16 +584,6 @@
    func : frame
  backward : frame_grad

- op : fused_dropout_add
-  args : (Tensor x, Tensor y, Scalar p, bool is_test, str mode, int seed, bool fix_seed)
-  output : Tensor(out), Tensor(seed_offset)
-  infer_meta :
-    func : FusedDropoutAddInferMeta
-  kernel :
-    func : fused_dropout_add
-    data_type : x
-  backward : fused_dropout_add_grad
-
 - op : gather_nd
  args : (Tensor x, Tensor index)
  output : Tensor

--- a/paddle/phi/api/yaml/static_ops.yaml
+++ b/paddle/phi/api/yaml/static_ops.yaml
@@ -47,15 +47,6 @@
    func : broadcast
    param: [x, root]

- op : embedding_with_eltwise_add_xpu
-  args : (Tensor[] ids, Tensor[] tables, int64_t padding_idx)
-  output: Tensor
-  infer_meta :
-    func: EmbeddingWithEltwiseAddXPUInferMeta
-  kernel:
-    func: embedding_with_eltwise_add_xpu
-    data_type: tables
-
 - op : equal
  args : (Tensor x, Tensor y, int axis = -1, bool force_cpu=false)
  output : Tensor(out)
@@ -68,16 +59,6 @@
    backend : x
    force_backend : force_cpu

- op : fc_xpu
-  args : (Tensor x, Tensor x_max, Tensor w, Tensor w_max, Tensor bias, int in_num_col_dims, bool transpose_x, float alpha, float beta, int act_type, float act_alpha)
-  output : Tensor(out), Tensor(out_max)
-  infer_meta :
-    func : FcXPUInferMeta
-  kernel :
-    func : fc_xpu
-    data_type : x
-  optional : bias, x_max
-
 - op : frobenius_norm
  args : (Tensor x, IntArray axis={0}, bool keepdim=false, bool reduce_all=false, int in_dtype=-1, int out_dtype=-1)
  output : Tensor(out)
@@ -88,25 +69,6 @@
    param : [x, axis, keepdim, reduce_all]
  backward : frobenius_norm_grad

- op : fused_multi_transformer_xpu
-  args : (Tensor x, Tensor[] ln_scale, Tensor[] ln_bias, Tensor[] qkvw, Tensor[] qkvw_max, Tensor[] qkv_bias, Tensor[] out_linear_w, Tensor[] out_linear_wmax, Tensor[] out_linear_bias, Tensor[] ffn_ln_scale, Tensor[] ffn_ln_bias, Tensor[] ffn1_weight, Tensor[] ffn1_weight_max, Tensor[] ffn1_bias, Tensor[] ffn2_weight, Tensor[] ffn2_weight_max, Tensor[] ffn2_bias, Tensor[] cache_kv, Tensor[] pre_caches, Tensor rotary_pos_emb, Tensor time_step, Tensor seq_lengths, Tensor src_mask, bool pre_layer_norm, int rotary_emb_dims, float epsilon, float dropout_rate, bool is_test, str dropout_implementation, str act_method, bool trans_qkvw, int ring_id)
-  output : Tensor(out), Tensor[](cache_kv_out){out_linear_w.size()}
-  infer_meta :
-    func : FusedMultiTransformerXpuInferMeta
-  kernel :
-    func : fused_multi_transformer_xpu
-    data_type : x
-  optional : cache_kv, pre_caches, rotary_pos_emb, time_step, seq_lengths, src_mask
-
- op : generate_sequence_xpu
-  args : (Tensor x, DataType dtype)
-  output : Tensor
-  infer_meta :
-    func : GenerateSequenceXPUInferMeta
-  kernel :
-    func : generate_sequence_xpu
-    data_type : dtype
-
 - op : greater_equal
  args : (Tensor x, Tensor y, int axis = -1, bool force_cpu=false)
  output : Tensor(out)
@@ -155,16 +117,6 @@
    backend : x
    force_backend : force_cpu

- op : multi_encoder_xpu
-  args : (Tensor x, Tensor[] fc_weight, Tensor[] fc_weight_max, Tensor[] fc_bias, Tensor[] ln_scale, Tensor[] ln_bias, Tensor mask, int layer_num, bool norm_before, int hidden_dim, int head_num, int size_per_head, int ffn_hidden_dim_scale, int act_type, int relative_type, int slice_idx)
-  output : Tensor(out), Tensor(x_fp16), Tensor(out_fp16)
-  infer_meta :
-    func : MultiEncoderXPUInferMeta
-  kernel :
-    func : multi_encoder_xpu
-    data_type : x
-  optional : mask, x_fp16, out_fp16
-
 - op : not_equal
  args : (Tensor x, Tensor y, int axis = -1, bool force_cpu=false)
  output : Tensor(out)