未验证 提交 af986bd5 编写于 作者: L lzydev 提交者: GitHub

Add CMake function to singly compile GPU kernels (#53305)

* support register single .cu file

* add register GPU kernel function
上级 6c152472
......@@ -51,6 +51,58 @@ function(find_phi_register FILENAME ADD_PATH PATTERN)
endif()
endfunction()
# register_cu_kernel(TARGET SRCS <file.cu ...> DEPS <dep ...>)
#
# Build a standalone library from .cu GPU kernel files located under
# "fluid/operators/", such as 'class_center_sample_op.cu', and append the
# required kernel-registration statements to ${pybind_file}.
# Add other file modes if needed in the future.
function(register_cu_kernel TARGET)
  set(options "")
  set(oneValueArgs "")
  set(multiValueArgs SRCS DEPS)
  cmake_parse_arguments(register_cu_kernel "${options}" "${oneValueArgs}"
                        "${multiValueArgs}" ${ARGN})

  # Keep only the .cu sources; anything else passed via SRCS is ignored.
  set(cu_srcs)
  set(op_common_deps operator op_registry math_function layer
      common_infer_shape_functions)
  foreach(cu_src ${register_cu_kernel_SRCS})
    if("${cu_src}" MATCHES ".*\\.cu$")
      list(APPEND cu_srcs "${cu_src}")
    endif()
  endforeach()

  list(LENGTH cu_srcs cu_srcs_len)
  if(${cu_srcs_len} EQUAL 0)
    message(
      FATAL_ERROR
        "The GPU kernel file of ${TARGET} should contains at least one .cu file"
    )
  endif()

  # NOTE: use the parsed DEPS (register_cu_kernel_DEPS); the previous code
  # referenced ${op_library_DEPS}, which is not set by this function, so
  # caller-supplied DEPS were silently dropped.
  if(WITH_GPU)
    nv_library(
      ${TARGET}
      SRCS ${cu_srcs}
      DEPS ${register_cu_kernel_DEPS} ${op_common_deps})
  elseif(WITH_ROCM)
    hip_library(
      ${TARGET}
      SRCS ${cu_srcs}
      DEPS ${register_cu_kernel_DEPS} ${op_common_deps})
  endif()

  # Record the target in the global op-library list consumed elsewhere.
  set(OP_LIBRARY
      ${TARGET} ${OP_LIBRARY}
      CACHE INTERNAL "op libs")

  # Emit registration hints for each kernel source into the pybind file so
  # the kernels are pulled into the Python extension.
  foreach(cu_src ${cu_srcs})
    set(op_name "")
    # Add PHI Kernel Registry Message
    find_phi_register(${cu_src} ${pybind_file} "PD_REGISTER_KERNEL")
    find_phi_register(${cu_src} ${pybind_file} "PD_REGISTER_STRUCT_KERNEL")
    find_phi_register(${cu_src} ${pybind_file}
                      "PD_REGISTER_KERNEL_FOR_ALL_DTYPE")
    find_register(${cu_src} "REGISTER_OP_CUDA_KERNEL" op_name)
    # String comparison must use STREQUAL (EQUAL is numeric-only) and the
    # expansion must be quoted so an empty op_name is handled safely.
    if(NOT "${op_name}" STREQUAL "")
      file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n")
    endif()
  endforeach()
endfunction()
function(op_library TARGET)
# op_library is a function to create op library. The interface is same as
# cc_library. But it handle split GPU/CPU code and link some common library
......
......@@ -102,6 +102,10 @@ op_library(quantize_linear_op DEPS phi)
op_library(save_combine_op DEPS string_array phi)
op_library(load_combine_op DEPS string_array)
# Compile class_center_sample_op.cu as a separately-registered GPU kernel
# library when building with CUDA or ROCm.
if (WITH_GPU OR WITH_ROCM)
register_cu_kernel(class_center_sample_op SRCS class_center_sample_op.cu DEPS ${OP_HEADER_DEPS})
endif()
if (WITH_GPU OR WITH_ROCM)
op_library(activation_op SRCS activation_op.cc activation_op.kps soft_relu_op.cu DEPS ${OP_HEADER_DEPS})
elseif (WITH_XPU_KP)
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
// Operator definition for class_center_sample. Shape inference and the
// device kernels are provided through PHI (see the DECLARE_INFER_SHAPE_FUNCTOR
// and REGISTER_OP_WITHOUT_GRADIENT calls at the bottom of this file).
class ClassCenterSampleOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
// The kernel's data type follows the "Label" input; the place is taken
// from the current device context.
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Label"),
ctx.device_context().GetPlace());
}
};
// Declares the op's proto: inputs (Label), outputs (RemappedLabel,
// SampledLocalClassCenter), attributes, and the user-facing documentation.
class ClassCenterSampleOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
// Input / output tensors.
AddInput(
"Label",
"(Tensor<int|int64>) The input of ClassCenterSample op. Each value "
"of Label is an integer label.");
AddOutput("RemappedLabel",
"(Tensor<int|int64>) Output tensor with same shape as Label. "
"Each label is remap using sampled class.");
AddOutput("SampledLocalClassCenter",
"(Tensor<int|int64>) The sampled class center for local rank,"
"value in [0, num_classes).");
// Sampling configuration attributes.
AddAttr<int>(
"num_classes",
"A positive integer to specify the number of classes at local rank. "
"Note that num_classes of each GPU can be different.");
AddAttr<int>(
"num_samples",
"A positive integer to specify the number of class center to sample.");
// Distributed-training attributes (multi-GPU case).
AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
.SetDefault(0);
AddAttr<int>("nranks", "(int default 1) The total number of GPUs.")
.SetDefault(1);
AddAttr<int>("rank", "(int default 0) The rank id in nranks.")
.SetDefault(0);
// Randomness control; fix_seed is intended for tests/debugging only.
AddAttr<bool>("fix_seed",
"A flag indicating whether to use a fixed seed to generate "
"random negative class center. NOTE: DO NOT set this flag to"
"true in training. Setting this flag to true is only useful "
"in unittest or for debug")
.SetDefault(false);
AddAttr<int>("seed",
"Random seed used to generate random negative class center. "
"[default 0].")
.SetDefault(0);
AddComment(R"DOC(
Class center sample method is proposed from the paper PartialFC that only sample a subset of the class centers.
The process of sampling subset class centers is straightforward: 1) First select the positive class centers;
2) Randomly sample negative class centers. Specifically, given a Label tensor, shape [batch_size], select all
the positive class centers and randomly sample negative class centers, then remap the input label tensor using
the sampled class centers. Note that if the number of the positive class centers is greater than the input
num_samples, it keeps all the positive class centers and the shape of SampledLocalClassCenter will be
[num_positive_class_centers]. The op supports CPU, single GPU and multi GPU.
For more information, Partial FC: Training 10 Million Identities on a Single Machine
arxiv: https://arxiv.org/abs/2010.05222
Examples:
For CPU or only one GPU
Given:
Label: [11, 5 , 1 , 3 , 12, 2 , 15, 19, 18, 19]
num_classes = 20
num_samples = 6
Then:
RemappedLabel: [4, 3, 0, 2, 5, 1, 6, 8, 7, 8]
SampledLocalClassCenter: [1 , 2 , 3 , 5 , 11, 12, 15, 18, 19]
For multi GPU
Given:
rank0:
Label: [10, 17, 15, 11, 9 , 12, 18, 18, 17, 18, 19, 2 , 8 , 13, 11, 13, 9 , 10, 0 , 4 ]
num_classes = 10
num_samples = 6
ring_id = 0
nranks = 2
rank = 0
rank1:
Label: [10, 17, 15, 11, 9 , 12, 18, 18, 17, 18, 19, 2 , 8 , 13, 11, 13, 9 , 10, 0 , 4 ]
num_classes = 10
num_samples = 6
ring_id = 0
nranks = 2
rank = 1
Then:
rank0:
RemappedLabel: [6 , 11, 10, 7 , 4 , 8 , 12, 12, 11, 12, 13, 1 , 3 , 9 , 7 , 9 , 4 , 6 , 0 , 2 ]
SampledLocalClassCenter: [0, 2, 4, 8, 9, 3]
rank1:
RemappedLabel: [6 , 11, 10, 7 , 4 , 8 , 12, 12, 11, 12, 13, 1 , 3 , 9 , 7 , 9 , 4 , 6 , 0 , 2 ]
SampledLocalClassCenter: [0, 1, 2, 3, 5, 7, 8]
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// Bind PHI's ClassCenterSampleInferMeta as this op's shape-inference functor.
DECLARE_INFER_SHAPE_FUNCTOR(class_center_sample,
ClassCenterSampleInferShapeFunctor,
PD_INFER_META(phi::ClassCenterSampleInferMeta));
// class_center_sample has no gradient op (label remapping is not
// differentiated through).
REGISTER_OP_WITHOUT_GRADIENT(class_center_sample,
ops::ClassCenterSampleOp,
ops::ClassCenterSampleOpMaker,
ClassCenterSampleInferShapeFunctor);
......@@ -180,14 +180,6 @@
data_type : x
inplace : (x -> out), (input_found_infinite -> output_found_infinite)
- op : class_center_sample
args : (Tensor label, int num_classes, int num_samples, int ring_id, int rank, int nranks, bool fix_seed, int seed)
output : Tensor(remapped_label), Tensor(sampled_local_class_center)
infer_meta :
func : ClassCenterSampleInferMeta
kernel :
func : class_center_sample
- op : coalesce_tensor
args : (Tensor[] input, DataType dtype, bool copy_data = false, bool set_constant = false, bool persist_output = false, float constant = 0.0, bool use_align = true, int align_size = -1, int size_of_dtype = -1, int64_t[] concated_shapes = {}, int64_t[] concated_ranks = {})
output : Tensor[](output){input.size()}, Tensor(fused_output)
......
......@@ -379,6 +379,12 @@
outputs :
out : Out
- op : class_center_sample
inputs :
label : Label
outputs :
{remapped_label : RemappedLabel, sampled_local_class_center : SampledLocalClassCenter}
- op : clip
backward : clip_grad, clip_double_grad
inputs :
......
......@@ -377,6 +377,15 @@
func : cholesky_solve
backward : cholesky_solve_grad
- op : class_center_sample
args : (Tensor label, int num_classes, int num_samples, int ring_id = 0, int rank = 0, int nranks = 1, bool fix_seed = false, int seed = 0)
output : Tensor(remapped_label), Tensor(sampled_local_class_center)
infer_meta :
func : ClassCenterSampleInferMeta
kernel :
func : class_center_sample
data_type : label
- op : clip
args : (Tensor x, Scalar(float) min, Scalar(float) max)
output : Tensor(out)
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
// Maps the legacy fluid op's input/attribute/output names onto the PHI
// "class_center_sample" kernel signature: inputs, then attributes in the
// order the kernel expects, then outputs.
KernelSignature ClassCenterSampleOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("class_center_sample",
{"Label"},
{"num_classes",
"num_samples",
"ring_id",
"rank",
"nranks",
"fix_seed",
"seed"},
{"RemappedLabel", "SampledLocalClassCenter"});
}
} // namespace phi
// Register the mapping so the framework can route the fluid op to PHI.
PD_REGISTER_ARG_MAPPING_FN(class_center_sample,
phi::ClassCenterSampleOpArgumentMapping);
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册