move Masked select to pten (#39193)

* move masked select cpu kernel * add masked selected gpu kernel; test=develop * fix bugs; test=develop * bug fix; test=develop * bug fix; test=develop * add namespace to set mask array; test=develop * fix bug; test=develop * fix bugs; test=develop * fix ddim bug; test=develop * fix npu op bug; test=develop * fix xpu dependecy bug; test=develop * move kernel args to sig.cc; test=develop

move Masked select to pten (#39193)
* move masked select cpu kernel * add masked selected gpu kernel; test=develop * fix bugs; test=develop * bug fix; test=develop * bug fix; test=develop * add namespace to set mask array; test=develop * fix bug; test=develop * fix bugs; test=develop * fix ddim bug; test=develop * fix npu op bug; test=develop * fix xpu dependecy bug; test=develop * move kernel args to sig.cc; test=develop
e2ad433b · hong · GitHub · 8b58862a · e2ad433b · 8b58862a
12 changed file
--- a/paddle/fluid/operators/masked_select_op.cc
+++ b/paddle/fluid/operators/masked_select_op.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/masked_select_op.h"
 #include "paddle/fluid/framework/op_registry.h"
 namespace paddle {
@@ -106,16 +105,3 @@ REGISTER_OPERATOR(masked_select, ops::MaskedSelectOp, ops::MaskedSelectOpMaker,
                  ops::MaskedSelectGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(masked_select_grad, ops::MaskedSelectOpGrad,
                  ops::MaskedSelectedGradNoNeedBufferVarsInferer);
-REGISTER_OP_CPU_KERNEL(
-    masked_select,
-    ops::MaskedSelectKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MaskedSelectKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::MaskedSelectKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::MaskedSelectKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    masked_select_grad,
-    ops::MaskedSelectGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MaskedSelectGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::MaskedSelectGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::MaskedSelectGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
--- a/paddle/fluid/operators/masked_select_op.cu
+++ b/paddle/fluid/operators/masked_select_op.cu
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <thrust/device_ptr.h>
-#include <thrust/device_vector.h>
-#include <thrust/reverse.h>
-#include <thrust/scan.h>
-#include "paddle/fluid/operators/masked_select_op.h"
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using DDim = framework::DDim;
-__global__ void SetMaskArray(const bool* mask, int32_t* mask_array, int size) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  for (; idx < size; idx += blockDim.x * gridDim.x) {
-    if (mask[idx])
-      mask_array[idx] = 1;
-    else
-      mask_array[idx] = 0;
-  }
-}
-template <typename T>
-__global__ void SelectWithPrefixMask(const int32_t* mask_prefix_sum,
-                                     const bool* mask, const T* input, T* out,
-                                     int size) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  for (; idx < size; idx += blockDim.x * gridDim.x) {
-    if (mask[idx]) {
-      int index = mask_prefix_sum[idx];
-      out[index] = input[idx];
-    }
-  }
-}
-template <typename T>
-__global__ void SelectGradWithPrefixMask(const int32_t* mask_prefix_sum,
-                                         const bool* mask, const T* input,
-                                         T* out, int size) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  for (; idx < size; idx += blockDim.x * gridDim.x) {
-    if (mask[idx]) {
-      int index = mask_prefix_sum[idx];
-      out[idx] = input[index];
-    } else {
-      out[idx] = 0;
-    }
-  }
-}
-template <typename DeviceContext, typename T>
-class MaskedSelectCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto input = ctx.Input<framework::Tensor>("X");
-    auto mask = ctx.Input<framework::Tensor>("Mask");
-    auto out = ctx.Output<framework::Tensor>("Y");
-    auto* mask_data = mask->data<bool>();
-    auto input_data = input->data<T>();
-    auto mask_size = mask->numel();
-    auto input_dim = input->dims();
-    auto mask_dim = mask->dims();
-    PADDLE_ENFORCE_EQ(
-        input_dim, mask_dim,
-        platform::errors::InvalidArgument(
-            "The dim size of input and mask in OP(masked_selected) "
-            "must be equal, but got input dim:(%ld), mask dim: "
-            "(%ld). Please check input "
-            "value.",
-            input_dim, mask_dim));
-    thrust::device_ptr<const bool> mask_dev_ptr =
-        thrust::device_pointer_cast(mask_data);
-    thrust::device_vector<T> mask_vec(mask_dev_ptr, mask_dev_ptr + mask_size);
-    auto out_size = thrust::count(mask_vec.begin(), mask_vec.end(), true);
-    framework::DDim out_dim{out_size};
-    out->Resize(out_dim);
-    auto out_data = out->mutable_data<T>(ctx.GetPlace());
-    Tensor mask_array;
-    Tensor mask_prefix_sum;
-    mask_array.Resize(mask_dim);
-    mask_prefix_sum.Resize(mask_dim);
-    int32_t* mask_array_data = mask_array.mutable_data<int32_t>(ctx.GetPlace());
-    int32_t* mask_prefix_sum_data =
-        mask_prefix_sum.mutable_data<int32_t>(ctx.GetPlace());
-    int threads = 512;
-    int grid = (mask_size + threads - 1) / threads;
-    auto stream = ctx.cuda_device_context().stream();
-    SetMaskArray<<<grid, threads, 0, stream>>>(mask_data, mask_array_data,
-                                               mask_size);
-    thrust::device_ptr<int32_t> mask_array_dev_ptr =
-        thrust::device_pointer_cast(mask_array_data);
-    thrust::device_vector<int32_t> mask_array_vec(
-        mask_array_dev_ptr, mask_array_dev_ptr + mask_size);
-    thrust::exclusive_scan(thrust::device, mask_array_vec.begin(),
-                           mask_array_vec.end(), mask_prefix_sum_data);
-    SelectWithPrefixMask<T><<<grid, threads, 0, stream>>>(
-        mask_prefix_sum_data, mask_data, input_data, out_data, mask_size);
-  }
-};
-template <typename DeviceContext, typename T>
-class MaskedSelectGradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto input = ctx.Input<framework::Tensor>(framework::GradVarName("Y"));
-    auto mask = ctx.Input<framework::Tensor>("Mask");
-    auto out = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto* mask_data = mask->data<bool>();
-    auto* input_data = input->data<T>();
-    auto* out_data = out->mutable_data<T>(ctx.GetPlace());
-    auto input_size = input->numel();
-    auto mask_size = mask->numel();
-    auto mask_dim = mask->dims();
-    auto out_size = mask_size;
-    Tensor mask_array;
-    Tensor mask_prefix_sum;
-    mask_array.Resize(mask_dim);
-    mask_prefix_sum.Resize(mask_dim);
-    int32_t* mask_array_data = mask_array.mutable_data<int32_t>(ctx.GetPlace());
-    int32_t* mask_prefix_sum_data =
-        mask_prefix_sum.mutable_data<int32_t>(ctx.GetPlace());
-    int threads = 512;
-    int grid = (mask_size + threads - 1) / threads;
-    auto stream = ctx.cuda_device_context().stream();
-    SetMaskArray<<<grid, threads, 0, stream>>>(mask_data, mask_array_data,
-                                               mask_size);
-    thrust::device_ptr<int32_t> mask_array_dev_ptr =
-        thrust::device_pointer_cast(mask_array_data);
-    thrust::device_vector<int32_t> mask_array_vec(
-        mask_array_dev_ptr, mask_array_dev_ptr + mask_size);
-    thrust::exclusive_scan(thrust::device, mask_array_vec.begin(),
-                           mask_array_vec.end(), mask_prefix_sum_data);
-    SelectGradWithPrefixMask<T><<<grid, threads, 0, stream>>>(
-        mask_prefix_sum_data, mask_data, input_data, out_data, mask_size);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    masked_select,
-    ops::MaskedSelectCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MaskedSelectCUDAKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::MaskedSelectCUDAKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::MaskedSelectCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    masked_select_grad,
-    ops::MaskedSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MaskedSelectGradCUDAKernel<paddle::platform::CUDADeviceContext,
-                                    double>,
-    ops::MaskedSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::MaskedSelectGradCUDAKernel<paddle::platform::CUDADeviceContext,
-                                    int64_t>);
--- a/paddle/fluid/operators/masked_select_op.h
+++ b/paddle/fluid/operators/masked_select_op.h
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using DDim = framework::DDim;
-template <typename DeviceContext, typename T>
-class MaskedSelectKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto input = context.Input<framework::Tensor>("X");
-    auto mask = context.Input<framework::Tensor>("Mask");
-    auto out = context.Output<framework::Tensor>("Y");
-    auto* mask_data = mask->data<bool>();
-    auto input_data = input->data<T>();
-    auto mask_size = mask->numel();
-    auto input_dim = input->dims();
-    auto mask_dim = mask->dims();
-    PADDLE_ENFORCE_EQ(
-        input_dim, mask_dim,
-        platform::errors::InvalidArgument(
-            "The dim size of input and mask in OP(masked_selected) "
-            "must be equal, but got input dim:(%ld), mask dim: "
-            "(%ld). Please check input "
-            "value.",
-            input_dim, mask_dim));
-    int out_size = 0;
-    for (int i = 0; i < mask_size; i++) {
-      if (mask_data[i]) out_size++;
-    }
-    framework::DDim out_dim{out_size};
-    out->Resize(out_dim);
-    auto out_data = out->mutable_data<T>(context.GetPlace());
-    int index = 0;
-    for (int i = 0; i < mask_size; i++) {
-      if (mask_data[i]) {
-        out_data[index] = input_data[i];
-        index++;
-      }
-    }
-  }
-};
-template <typename DeviceContext, typename T>
-class MaskedSelectGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto out = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto mask = context.Input<framework::Tensor>("Mask");
-    auto input = context.Input<framework::Tensor>(framework::GradVarName("Y"));
-    auto* mask_data = mask->data<bool>();
-    auto* input_data = input->data<T>();
-    auto* out_data = out->mutable_data<T>(context.GetPlace());
-    int mask_size = mask->numel();
-    int index = 0;
-    for (int i = 0; i < mask_size; i++) {
-      if (mask_data[i]) {
-        out_data[i] = input_data[index];
-        index++;
-      } else {
-        out_data[i] = 0;
-      }
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/fluid/operators/masked_select_op_npu.cc
+++ b/paddle/fluid/operators/masked_select_op_npu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/masked_select_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 namespace paddle {

--- a/paddle/fluid/operators/masked_select_op_xpu.cc
+++ b/paddle/fluid/operators/masked_select_op_xpu.cc
@@ -11,7 +11,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/operators/masked_select_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 namespace paddle {

--- a/paddle/pten/kernels/cpu/masked_select_grad_kernel.cc
+++ b/paddle/pten/kernels/cpu/masked_select_grad_kernel.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/pten/kernels/masked_select_grad_kernel.h"
+#include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/core/kernel_registry.h"
+namespace pten {
+template <typename T, typename Context>
+void MaskedSelectGradKernel(const Context& dev_ctx,
+                            const DenseTensor& out_grad,
+                            const DenseTensor& x,
+                            const DenseTensor& mask,
+                            DenseTensor* x_grad) {
+  auto* mask_data = mask.data<bool>();
+  auto* input_data = out_grad.data<T>();
+  auto* out_data = x_grad->mutable_data<T>(dev_ctx.GetPlace());
+  int mask_size = mask.numel();
+  int index = 0;
+  for (int i = 0; i < mask_size; i++) {
+    if (mask_data[i]) {
+      out_data[i] = input_data[index];
+      index++;
+    } else {
+      out_data[i] = 0;
+    }
+  }
+}
+}  // namespace pten
+PT_REGISTER_KERNEL(masked_select_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   pten::MaskedSelectGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
--- a/paddle/pten/kernels/cpu/masked_select_kernel.cc
+++ b/paddle/pten/kernels/cpu/masked_select_kernel.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/pten/kernels/masked_select_kernel.h"
+#include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/core/kernel_registry.h"
+namespace pten {
+template <typename T, typename Context>
+void MaskedSelectKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& mask,
+                        DenseTensor* out) {
+  auto* mask_data = mask.data<bool>();
+  auto input_data = x.data<T>();
+  auto mask_size = mask.numel();
+  auto input_dim = x.dims();
+  auto mask_dim = mask.dims();
+  PADDLE_ENFORCE_EQ(input_dim,
+                    mask_dim,
+                    pten::errors::InvalidArgument(
+                        "The dim size of input and mask in OP(masked_selected) "
+                        "must be equal, but got input dim:(%ld), mask dim: "
+                        "(%ld). Please check input "
+                        "value.",
+                        input_dim,
+                        mask_dim));
+  int out_size = 0;
+  for (int i = 0; i < mask_size; i++) {
+    if (mask_data[i]) out_size++;
+  }
+  framework::DDim out_dim{out_size};
+  out->Resize(out_dim);
+  auto out_data = out->mutable_data<T>(paddle::platform::CPUPlace());
+  int index = 0;
+  for (int i = 0; i < mask_size; i++) {
+    if (mask_data[i]) {
+      out_data[index] = input_data[i];
+      index++;
+    }
+  }
+}
+}  // namespace pten
+PT_REGISTER_KERNEL(masked_select,
+                   CPU,
+                   ALL_LAYOUT,
+                   pten::MaskedSelectKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {
+  kernel->InputAt(1).SetDataType(pten::DataType::BOOL);
+}
--- a/paddle/pten/kernels/gpu/masked_select_grad_kernel.cu
+++ b/paddle/pten/kernels/gpu/masked_select_grad_kernel.cu
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <thrust/device_ptr.h>
+#include <thrust/device_vector.h>
+#include <thrust/reverse.h>
+#include <thrust/scan.h>
+#include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/kernels/masked_select_grad_kernel.h"
+namespace pten {
+__global__ void SetMaskArrayT(const bool* mask, int32_t* mask_array, int size) {
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  for (; idx < size; idx += blockDim.x * gridDim.x) {
+    if (mask[idx])
+      mask_array[idx] = 1;
+    else
+      mask_array[idx] = 0;
+  }
+}
+template <typename T>
+__global__ void SelectGradWithPrefixMask(const int32_t* mask_prefix_sum,
+                                         const bool* mask,
+                                         const T* input,
+                                         T* out,
+                                         int size) {
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  for (; idx < size; idx += blockDim.x * gridDim.x) {
+    if (mask[idx]) {
+      int index = mask_prefix_sum[idx];
+      out[idx] = input[index];
+    } else {
+      out[idx] = 0;
+    }
+  }
+}
+template <typename T, typename Context>
+void MaskedSelectGradKernel(const Context& dev_ctx,
+                            const DenseTensor& out_grad,
+                            const DenseTensor& x,
+                            const DenseTensor& mask,
+                            DenseTensor* x_grad) {
+  auto* mask_data = mask.data<bool>();
+  auto* input_data = out_grad.data<T>();
+  auto* out_data = x_grad->mutable_data<T>(dev_ctx.GetPlace());
+  auto input_size = out_grad.numel();
+  auto mask_size = mask.numel();
+  auto mask_dim = mask.dims();
+  auto out_size = mask_size;
+  DenseTensor mask_array;
+  DenseTensor mask_prefix_sum;
+  mask_array.Resize(mask_dim);
+  mask_prefix_sum.Resize(mask_dim);
+  int32_t* mask_array_data =
+      mask_array.mutable_data<int32_t>(dev_ctx.GetPlace());
+  int32_t* mask_prefix_sum_data =
+      mask_prefix_sum.mutable_data<int32_t>(dev_ctx.GetPlace());
+  int threads = 512;
+  int grid = (mask_size + threads - 1) / threads;
+  auto stream = dev_ctx.stream();
+  SetMaskArrayT<<<grid, threads, 0, stream>>>(
+      mask_data, mask_array_data, mask_size);
+  thrust::device_ptr<int32_t> mask_array_dev_ptr =
+      thrust::device_pointer_cast(mask_array_data);
+  thrust::device_vector<int32_t> mask_array_vec(mask_array_dev_ptr,
+                                                mask_array_dev_ptr + mask_size);
+  thrust::exclusive_scan(thrust::device,
+                         mask_array_vec.begin(),
+                         mask_array_vec.end(),
+                         mask_prefix_sum_data);
+  SelectGradWithPrefixMask<T><<<grid, threads, 0, stream>>>(
+      mask_prefix_sum_data, mask_data, input_data, out_data, mask_size);
+}
+}  // namespace pten
+PT_REGISTER_KERNEL(masked_select_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   pten::MaskedSelectGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
--- a/paddle/pten/kernels/gpu/masked_select_kernel.cu
+++ b/paddle/pten/kernels/gpu/masked_select_kernel.cu
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <thrust/device_ptr.h>
+#include <thrust/device_vector.h>
+#include <thrust/reverse.h>
+#include <thrust/scan.h>
+#include "paddle/pten/backends/gpu/gpu_context.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/kernels/masked_select_kernel.h"
+namespace pten {
+__global__ void SetMaskArray(const bool* mask, int32_t* mask_array, int size) {
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  for (; idx < size; idx += blockDim.x * gridDim.x) {
+    if (mask[idx])
+      mask_array[idx] = 1;
+    else
+      mask_array[idx] = 0;
+  }
+}
+template <typename T>
+__global__ void SelectWithPrefixMask(const int32_t* mask_prefix_sum,
+                                     const bool* mask,
+                                     const T* input,
+                                     T* out,
+                                     int size) {
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  for (; idx < size; idx += blockDim.x * gridDim.x) {
+    if (mask[idx]) {
+      int index = mask_prefix_sum[idx];
+      out[index] = input[idx];
+    }
+  }
+}
+template <typename T, typename Context>
+void MaskedSelectKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& mask,
+                        DenseTensor* out) {
+  auto* mask_data = mask.data<bool>();
+  auto input_data = x.data<T>();
+  auto mask_size = mask.numel();
+  auto input_dim = x.dims();
+  auto mask_dim = mask.dims();
+  PADDLE_ENFORCE_EQ(input_dim,
+                    mask_dim,
+                    pten::errors::InvalidArgument(
+                        "The dim size of input and mask in OP(masked_selected) "
+                        "must be equal, but got input dim:(%ld), mask dim: "
+                        "(%ld). Please check input "
+                        "value.",
+                        input_dim,
+                        mask_dim));
+  thrust::device_ptr<const bool> mask_dev_ptr =
+      thrust::device_pointer_cast(mask_data);
+  thrust::device_vector<T> mask_vec(mask_dev_ptr, mask_dev_ptr + mask_size);
+  auto out_size = thrust::count(mask_vec.begin(), mask_vec.end(), true);
+  framework::DDim out_dim{out_size};
+  out->Resize(out_dim);
+  auto out_data = out->mutable_data<T>(dev_ctx.GetPlace());
+  DenseTensor mask_array;
+  DenseTensor mask_prefix_sum;
+  mask_array.Resize(mask_dim);
+  mask_prefix_sum.Resize(mask_dim);
+  int32_t* mask_array_data =
+      mask_array.mutable_data<int32_t>(dev_ctx.GetPlace());
+  int32_t* mask_prefix_sum_data =
+      mask_prefix_sum.mutable_data<int32_t>(dev_ctx.GetPlace());
+  int threads = 512;
+  int grid = (mask_size + threads - 1) / threads;
+  auto stream = dev_ctx.stream();
+  SetMaskArray<<<grid, threads, 0, stream>>>(
+      mask_data, mask_array_data, mask_size);
+  thrust::device_ptr<int32_t> mask_array_dev_ptr =
+      thrust::device_pointer_cast(mask_array_data);
+  thrust::device_vector<int32_t> mask_array_vec(mask_array_dev_ptr,
+                                                mask_array_dev_ptr + mask_size);
+  thrust::exclusive_scan(thrust::device,
+                         mask_array_vec.begin(),
+                         mask_array_vec.end(),
+                         mask_prefix_sum_data);
+  SelectWithPrefixMask<T><<<grid, threads, 0, stream>>>(
+      mask_prefix_sum_data, mask_data, input_data, out_data, mask_size);
+}
+}  // namespace pten
+PT_REGISTER_KERNEL(masked_select,
+                   GPU,
+                   ALL_LAYOUT,
+                   pten::MaskedSelectKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {
+  kernel->InputAt(1).SetDataType(pten::DataType::BOOL);
+}
--- a/paddle/pten/kernels/masked_select_grad_kernel.h
+++ b/paddle/pten/kernels/masked_select_grad_kernel.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/pten/core/dense_tensor.h"
+namespace pten {
+template <typename T, typename Context>
+void MaskedSelectGradKernel(const Context& dev_ctx,
+                            const DenseTensor& out_grad,
+                            const DenseTensor& x,
+                            const DenseTensor& mask,
+                            DenseTensor* x_grad);
+}  // namspace pten
--- a/paddle/pten/kernels/masked_select_kernel.h
+++ b/paddle/pten/kernels/masked_select_kernel.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/pten/core/dense_tensor.h"
+namespace pten {
+template <typename T, typename Context>
+void MaskedSelectKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& mask,
+                        DenseTensor* out);
+}  // namspace pten
--- a/paddle/pten/ops/compat/masked_select_sig.cc
+++ b/paddle/pten/ops/compat/masked_select_sig.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/pten/core/compat/op_utils.h"
+namespace pten {
+KernelSignature MaskedSelectOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("masked_select", {"X", "Mask"}, {}, {"Y"});
+}
+KernelSignature MaskedSelectGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("masked_select_grad",
+                         {GradVarName("Y"), "X", "Mask"},
+                         {},
+                         {GradVarName("X")});
+}
+}  // namespace pten
+PT_REGISTER_ARG_MAPPING_FN(masked_select, pten::MaskedSelectOpArgumentMapping);
+PT_REGISTER_ARG_MAPPING_FN(masked_select_grad,
+                           pten::MaskedSelectGradOpArgumentMapping);