[Fluid] move assign_pos to phi (#55794)

9d899273 · Sonder · GitHub · 7c9abfb2 · 9d899273 · 7c9abfb2
7 changed files
--- a/paddle/fluid/operators/assign_pos_op.cc
+++ b/paddle/fluid/operators/assign_pos_op.cc
@@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/assign_pos_op.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
 namespace paddle {
 namespace operators {
@@ -78,6 +80,3 @@ namespace plat = paddle::platform;
 REGISTER_OP_WITHOUT_GRADIENT(assign_pos,
                             ops::AssignPosOp,
                             ops::AssignPosOpMaker);
-PD_REGISTER_STRUCT_KERNEL(
-    assign_pos, CPU, ALL_LAYOUT, ops::AssignPosOpCPUKernel, int, int64_t) {}
--- a/paddle/fluid/operators/assign_pos_op.cu
+++ b/paddle/fluid/operators/assign_pos_op.cu
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-The file has been adapted from the two files:
-     https://github.com/laekov/fastmoe/blob/master/cuda/local_exchange.cu
-     https://github.com/laekov/fastmoe/blob/master/cuda/local_exchange.cuh
-     Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4
-We retain the following license from the original files:
-         Copyright 2021, Jiaao He
-   Licensed under the Apache License, Version 2.0 (the "License").
-*/
-#include "paddle/fluid/operators/assign_pos_op.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/phi/backends/gpu/gpu_primitives.h"
-DECLARE_bool(avoid_op_randomness);
-namespace paddle {
-namespace operators {
-static constexpr int kNumCUDAThreads = 512;
-static constexpr int kNumMaxinumNumBlocks = 4096;
-static inline int NumBlocks(const int N) {
-  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
-                  kNumMaxinumNumBlocks);
-}
-template <typename T>
-__global__ void AssignPos(T* cum_count,
-                          const T* numbers,
-                          T* out,
-                          int64_t limit) {
-  CUDA_KERNEL_LOOP(i, limit) {
-    int number_idx = numbers[i];
-    if (number_idx > -1) {
-      int p = phi::CudaAtomicAdd(cum_count + number_idx, -1);
-      out[p - 1] = i;
-    }
-  }
-}
-template <typename T, typename DeviceContext>
-class AssignPosCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    // assign pos decides which tokens should be fetched belong to specially
-    // counter orderingly.
-    auto cum_count = context.Input<phi::DenseTensor>(
-        "cum_count");  // (counter number) int32 | int64
-    auto numbers = context.Input<phi::DenseTensor>(
-        "X");  // (batch_size * seq_len, topk) int32
-    auto eff_num_len =
-        context.Input<phi::DenseTensor>("eff_num_len");  // (sum(cum_count))
-    auto out =
-        context.Output<phi::DenseTensor>("Out");  // (cum_count) value ranges
-                                                  // from 0 to batch_size *
-                                                  // seq_len * topk
-    auto place = context.GetPlace();
-    auto numel = numbers->numel();
-    T* cum_data = const_cast<T*>(cum_count->data<T>());
-    auto cum_size = cum_count->numel();
-    phi::DenseTensor cpu_eff_num_len;
-    int64_t cpu_eff_num_len_data = 0;
-    if (platform::is_cpu_place(eff_num_len->place())) {
-      cpu_eff_num_len_data = eff_num_len->data<T>()[0];
-    } else {
-      framework::TensorCopySync(
-          *eff_num_len, platform::CPUPlace(), &cpu_eff_num_len);
-      cpu_eff_num_len_data = cpu_eff_num_len.data<T>()[0];
-    }
-    const auto& dev_ctx = context.template device_context<phi::GPUContext>();
-    framework::DDim out_dims = phi::make_ddim({cpu_eff_num_len_data});
-    auto out_data = out->mutable_data<T>(out_dims, place);
-    const T* num_data = numbers->data<T>();
-    int blocks = NumBlocks(numel);
-    int threads = kNumCUDAThreads;
-    AssignPos<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-        cum_data, num_data, out_data, numel);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-PD_REGISTER_STRUCT_KERNEL(
-    assign_pos, GPU, ALL_LAYOUT, ops::AssignPosCUDAKernel, int64_t) {}
--- a/paddle/phi/kernels/assign_pos_kernel.h
+++ b/paddle/phi/kernels/assign_pos_kernel.h
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+namespace phi {
+// Entry point of the assign_pos kernel. This header only declares it; the
+// definitions are provided per backend.
+//
+// dev_ctx:     device context the kernel runs on.
+// x:           input tensor.
+// cum_count:   input tensor.
+// eff_num_len: input tensor.
+// out:         output tensor, written by the kernel.
+template <typename T, typename Context>
+void AssignPosKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& cum_count,
+                     const DenseTensor& eff_num_len,
+                     DenseTensor* out);
+}  // namespace phi
--- a/paddle/phi/kernels/cpu/assign_pos_kernel.cc
+++ b/paddle/phi/kernels/cpu/assign_pos_kernel.cc
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/kernels/assign_pos_kernel.h"
+#include "paddle/phi/core/errors.h"
+#include "paddle/phi/core/kernel_registry.h"
+namespace phi {
+// CPU placeholder for assign_pos: no CPU implementation is provided, so every
+// call throws phi::errors::Unavailable. None of the arguments are used.
+template <typename T, typename Context>
+void AssignPosKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& cum_count,
+                     const DenseTensor& eff_num_len,
+                     DenseTensor* out) {
+  PADDLE_THROW(phi::errors::Unavailable(
+      "Do not support assign pos op for cpu kernel now."));
+}
+}  // namespace phi
+// Registers phi::AssignPosKernel as the CPU kernel of assign_pos for the int
+// and int64_t element types. The registration body is intentionally empty.
+PD_REGISTER_KERNEL(
+    assign_pos, CPU, ALL_LAYOUT, phi::AssignPosKernel, int, int64_t) {}
--- a/paddle/phi/kernels/gpu/assign_pos_kernel.cu
+++ b/paddle/phi/kernels/gpu/assign_pos_kernel.cu
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/kernels/assign_pos_kernel.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/tensor_utils.h"
+namespace phi {
+// Threads per block used for the launch in this file.
+static constexpr int kNumCUDAThreads = 512;
+// Upper bound on the number of blocks returned by NumBlocks.
+static constexpr int kNumMaxinumNumBlocks = 4096;
+// Returns the grid size for N elements: ceil(N / kNumCUDAThreads), capped at
+// kNumMaxinumNumBlocks.
+//
+// N is taken as int64_t because the caller passes a tensor element count.
+// Narrowing that count to int before the division could wrap for very large
+// tensors and yield a zero or negative grid size, so the arithmetic is done in
+// 64 bits and only the clamped result is converted to int.
+static inline int NumBlocks(const int64_t N) {
+  const int64_t blocks = (N + kNumCUDAThreads - 1) / kNumCUDAThreads;
+  return blocks < kNumMaxinumNumBlocks ? static_cast<int>(blocks)
+                                       : kNumMaxinumNumBlocks;
+}
+// Scatters element indices of `numbers` into `out`, grouped by the value
+// stored in `numbers`.
+//
+// For each i in [0, limit) with numbers[i] > -1, the kernel adds -1 to
+// cum_count[numbers[i]] through phi::CudaAtomicAdd and writes i to
+// out[p - 1], where p is the value returned by that call. Entries that are
+// not greater than -1 are skipped.
+//
+// cum_count: counters indexed by the values of `numbers`; modified in place.
+// numbers:   `limit` entries used as indices into cum_count.
+// out:       destination buffer for the element indices.
+// limit:     number of entries of `numbers` to process.
+//
+// NOTE(review): neither numbers[i] nor the slot p - 1 is bounds-checked here;
+// presumably callers guarantee consistent inputs -- confirm.
+template <typename T>
+__global__ void AssignPos(T* cum_count,
+                          const T* numbers,
+                          T* out,
+                          int64_t limit) {
+  CUDA_KERNEL_LOOP(i, limit) {
+    // Keep the counter index and the slot in T: holding them in int would
+    // truncate 64-bit values.
+    const T number_idx = numbers[i];
+    if (number_idx > -1) {
+      const T p =
+          phi::CudaAtomicAdd(cum_count + number_idx, static_cast<T>(-1));
+      out[p - 1] = i;
+    }
+  }
+}
+// GPU implementation of assign_pos.
+//
+// Reads the output length from element 0 of `eff_num_len`, resizes and
+// allocates `out` to that length, then launches AssignPos over every element
+// of `x`.
+//
+// dev_ctx:     GPU context; provides the allocator and the launch stream.
+// x:           entries read as T and used as counter indices
+//              (batch_size * seq_len, topk).
+// cum_count:   (counter number) counters. NOTE: although passed as const, its
+//              buffer is modified in place by the AssignPos kernel.
+// eff_num_len: holds the output length in element 0; may live on the CPU or
+//              on the device.
+// out:         resized to {eff_num_len[0]}; values range from 0 to
+//              batch_size * seq_len * topk.
+template <typename T, typename Context>
+void AssignPosKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& cum_count,
+                     const DenseTensor& eff_num_len,
+                     DenseTensor* out) {
+  const auto numel = x.numel();
+  // The kernel decrements the counters in place, hence the const_cast.
+  T* cum_data = const_cast<T*>(cum_count.data<T>());
+
+  // The output length must be known on the host before `out` can be sized.
+  int64_t cpu_eff_num_len_data = 0;
+  if (eff_num_len.place() == phi::CPUPlace()) {
+    cpu_eff_num_len_data = eff_num_len.data<T>()[0];
+  } else {
+    // Blocking copy: the value is read on the host immediately afterwards, so
+    // the transfer has to be complete when phi::Copy returns.
+    phi::DenseTensor cpu_eff_num_len;
+    phi::Copy(dev_ctx, eff_num_len, phi::CPUPlace(), true, &cpu_eff_num_len);
+    cpu_eff_num_len_data = cpu_eff_num_len.data<T>()[0];
+  }
+
+  out->Resize(phi::make_ddim({cpu_eff_num_len_data}));
+  T* out_data = dev_ctx.template Alloc<T>(out);
+
+  // Nothing to scatter, and a launch with zero blocks is not a valid
+  // configuration.
+  if (numel == 0) {
+    return;
+  }
+
+  const T* num_data = x.data<T>();
+  const int blocks = NumBlocks(numel);
+  const int threads = kNumCUDAThreads;
+  AssignPos<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
+      cum_data, num_data, out_data, numel);
+}
+}  // namespace phi
+// Registers phi::AssignPosKernel as the GPU kernel of assign_pos for the
+// int64_t element type only. The registration body is intentionally empty.
+PD_REGISTER_KERNEL(assign_pos, GPU, ALL_LAYOUT, phi::AssignPosKernel, int64_t) {
+}
--- a/paddle/fluid/operators/assign_pos_op.h
+++ b/paddle/fluid/operators/assign_pos_op.h
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,22 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once
+#include "paddle/phi/core/compat/op_utils.h"
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
+namespace phi {
-#include "paddle/fluid/framework/op_registry.h"
+// Builds the KernelSignature for "assign_pos" from the argument names "X",
+// "cum_count", "eff_num_len" and "Out"; the mapping context is not consulted.
+KernelSignature AssignPosOpArgumentMapping(
-namespace paddle {
+    const ArgumentMappingContext& ctx UNUSED) {
-namespace operators {
+  return KernelSignature(
+      "assign_pos", {"X", "cum_count", "eff_num_len"}, {}, {"Out"});
-template <typename T, typename DeviceContext>
+}
-class AssignPosOpCPUKernel : public framework::OpKernel<T> {
- public:
+}  // namespace phi
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_THROW(platform::errors::Unavailable(
+PD_REGISTER_ARG_MAPPING_FN(assign_pos, phi::AssignPosOpArgumentMapping);
-        "Do not support assign pos op for cpu kernel now."));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
--- a/test/legacy_test/test_assign_pos_op.py
+++ b/test/legacy_test/test_assign_pos_op.py
@@ -87,8 +87,9 @@ class TestAssignPosOpInt64(eager_op_test.OpTest):
        self.cum_count = cum_count
    def test_forward(self):
+        paddle.enable_static()
        np.testing.assert_allclose = get_redefined_allclose(self.cum_count)
-        self.check_output_with_place(paddle.CUDAPlace(0))
+        self.check_output_with_place(paddle.CUDAPlace(0), check_dygraph=False)
 @unittest.skipIf(