From ea5b2f26e26075112b49fcba7287103ffeb93cde Mon Sep 17 00:00:00 2001
From: danleifeng <52735331+danleifeng@users.noreply.github.com>
Date: Mon, 28 Mar 2022 14:10:07 +0800
Subject: [PATCH] add fused_seqpool_cvm op (#37928)

* add fused_seqpool_cvm op;test=develop
---
 .../operators/fused/fused_seqpool_cvm_op.cc   | 237 +++++++++
 .../operators/fused/fused_seqpool_cvm_op.cu   | 492 ++++++++++++++++++
 .../operators/fused/fused_seqpool_cvm_op.h    |  48 ++
 python/paddle/fluid/contrib/layers/nn.py      |  60 ++-
 4 files changed, 836 insertions(+), 1 deletion(-)
 create mode 100644 paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc
 create mode 100644 paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu
 create mode 100644 paddle/fluid/operators/fused/fused_seqpool_cvm_op.h

diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc
new file mode 100644
index 00000000000..23b82ac5d96
--- /dev/null
+++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc
@@ -0,0 +1,237 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fused/fused_seqpool_cvm_op.h"
+#include <string>
+namespace paddle {
+namespace operators {
+
+class FusedSeqpoolCVMOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE_GE(
+        ctx->Inputs("X").size(), 1UL,
+        platform::errors::InvalidArgument(
+            "Inputs(X) of FusedSeqpoolCVMOp should not be empty."));
+    PADDLE_ENFORCE_GE(
+        ctx->Outputs("Out").size(), 1UL,
+        platform::errors::InvalidArgument(
+            "Outputs(Out) of FusedSeqpoolCVMOp should not be empty."));
+
+    auto cvm_dims = ctx->GetInputDim("CVM");
+    PADDLE_ENFORCE_EQ(
+        cvm_dims.size(), 2UL,
+        platform::errors::InvalidArgument("Input(CVM)'s rank should be 2."));
+    PADDLE_ENFORCE_EQ(cvm_dims[1], 2UL, platform::errors::InvalidArgument(
+                                            "The 2nd dimension of "
+                                            "Input(CVM) should be 2."));
+
+    auto ins_dims = ctx->GetInputsDim("X");
+    const int cvm_offset = ctx->Attrs().Get<int>("cvm_offset");
+    const size_t num_inputs = ins_dims.size();
+    std::vector<framework::DDim> outs_dims;
+    outs_dims.resize(num_inputs);
+    bool use_cvm = ctx->Attrs().Get<bool>("use_cvm");
+
+    PADDLE_ENFORCE_GT(num_inputs, 0UL,
+                      platform::errors::InvalidArgument(
+                          "Input tensors count should be greater than 0, "
+                          "but received value is %d.",
+                          num_inputs));
+
+    // The output height should be confirmed in Compute,
+    // since input lod is not accessible here.
+    PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2,
+                      platform::errors::InvalidArgument(
+                          "The rank of the first input should be equal to 2, "
+                          "but received value is %d.",
+                          ins_dims[0].size()));
+
+    for (size_t i = 0; i < num_inputs; ++i) {
+      const auto dims = ins_dims[i];
+      int rank = dims.size();
+      if (use_cvm) {
+        PADDLE_ENFORCE_GT(
+            dims[rank - 1], 2,
+            platform::errors::InvalidArgument(
+                "Shape error in the %lu-th input: the last dimension "
+                "(embedding) of the 'X' tensor must be larger than 2.",
+                i));
+      }
+      // input lod is not accessible here
+      std::vector<int64_t> out_dim;
+      if (use_cvm) {
+        out_dim = {-1, dims[rank - 1]};
+      } else {
+        out_dim = {-1, dims[rank - 1] - cvm_offset};
+      }
+      outs_dims[i] = phi::make_ddim(out_dim);
+    }
+    ctx->SetOutputsDim("Out", outs_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto inputs = ctx.MultiInput<LoDTensor>("X");
+    auto input_data_type = framework::proto::VarType::Type(0);
+    bool flag = 0;
+    for (auto* input : inputs) {
+      if (input->IsInitialized() && input->numel() > 0) {
+        input_data_type = framework::TransToProtoVarType(input->dtype());
+        flag = 1;
+        break;
+      }
+    }
+    PADDLE_ENFORCE_EQ(flag, 1,
+                      platform::errors::InvalidArgument(
+                          "All inputs of fused_seqpool_cvm OP are empty."));
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
+};
+
+class FusedSeqpoolCVMOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(vector<LoDTensor>) The input LoDTensors of"
+             " this operator.")
+        .AsDuplicable();
+    AddInput("CVM",
+             "(Tensor), a 2-D Tensor with shape [N x 2], where N is the batch "
+             "size and the two columns are show and click.");
+    AddOutput("Out",
+              "(vector<Tensor>) The output tensors of this Op; they do not "
+              "contain LoD information.")
+        .AsDuplicable();
+    AddAttr<std::string>("pooltype",
+                         "(string, default 'SUM') the pooling type of "
+                         "SequencePoolOp; only SUM is supported now.")
+        .SetDefault("SUM")
+        .InEnum({"SUM"});
+    AddAttr<float>("pad_value",
+                   "(float, default 0.0) The value to pad for empty sequences.")
+        .SetDefault(0.0);
+    AddAttr<bool>("use_cvm", "bool, use cvm or not").SetDefault(true);
+    AddAttr<int>("cvm_offset", "(int, default 2)").SetDefault(2);
+
+    AddComment(R"DOC(
+Fuse multiple pairs of Sequence Pool and CVM operators.
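+
+Each input LoDTensor is SUM-pooled over its LoD sequences first (an empty sequence
+yields pad_value). When use_cvm is true, the first column (show) of each pooled result
+is replaced by log(show + 1), the second column (click) by log(click + 1) - log(show + 1),
+and the remaining embedding columns are passed through, so every output keeps its input
+width. When use_cvm is false, the leading cvm_offset columns are dropped and every output
+width is the input width minus cvm_offset.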
+
+)DOC");
+  }
+};
+
+class FusedSeqpoolCVMGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    auto og_dims = ctx->GetInputsDim(framework::GradVarName("Out"));
+    auto x_dims = ctx->GetInputsDim("X");
+    auto cvm_dims = ctx->GetInputDim("CVM");
+    const int cvm_offset = ctx->Attrs().Get<int>("cvm_offset");
+    bool use_cvm = ctx->Attrs().Get<bool>("use_cvm");
+
+    PADDLE_ENFORCE_EQ(
+        cvm_dims.size(), 2,
+        platform::errors::InvalidArgument("Input(CVM)'s rank should be 2."));
+
+    for (size_t i = 0; i < og_dims.size(); i++) {
+      PADDLE_ENFORCE_EQ(
+          og_dims[i].size(), x_dims[i].size(),
+          platform::errors::InvalidArgument(
+              "The rank of Output(Out@GRAD) must equal the rank of "
+              "Input(X). But received: Out@GRAD rank %u, shape [%s].",
+              og_dims[i].size(), og_dims[i]));
+      if (use_cvm) {
+        auto o_dim = og_dims[i][og_dims[i].size() - 1];
+        PADDLE_ENFORCE_EQ(
+            o_dim, x_dims[i][og_dims[i].size() - 1],
+            platform::errors::InvalidArgument(
+                "The dimension mismatch between Input(OUT@GRAD) and "
+                "Input(X). Received Input(OUT@GRAD): input rank %u, "
+                "input shape [%s]; received Input(X): input rank %u, "
+                "input shape [%s].",
+                og_dims[i].size(), og_dims[i], x_dims[i].size(), x_dims[i]));
+      } else {
+        PADDLE_ENFORCE_EQ(
+            og_dims[i][og_dims[i].size() - 1],
+            x_dims[i][og_dims[i].size() - 1] - cvm_offset,
+            platform::errors::InvalidArgument(
+                "The dimension mismatch between Input(OUT@GRAD) and "
+                "Input(X). Received Input(OUT@GRAD): input rank %u, "
+                "input shape [%s]; received Input(X): input rank %u, "
+                "input shape [%s].",
+                og_dims[i].size(), og_dims[i], x_dims[i].size(), x_dims[i]));
+      }
+    }
+    for (size_t i = 0; i < x_dims.size(); ++i) {
+      ctx->ShareLoD("X", framework::GradVarName("X"), i, i);
+      ctx->ShareDim("X", framework::GradVarName("X"), i, i);
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Out")),
+                                   ctx.device_context());
+  }
+};
+
+template <typename T>
+class FusedSeqpoolCVMGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op_desc_ptr) const override {
+    op_desc_ptr->SetType("fused_seqpool_cvm_grad");
+    op_desc_ptr->SetInput("X", this->Input("X"));
+    op_desc_ptr->SetInput("CVM", this->Input("CVM"));
+
+    op_desc_ptr->SetInput(framework::GradVarName("Out"),
+                          this->OutputGrad("Out"));
+    op_desc_ptr->SetOutput(framework::GradVarName("X"),
+                           this->InputGrad("X", false));
+    op_desc_ptr->SetOutput(framework::GradVarName("CVM"),
+                           this->InputGrad("CVM"));
+    op_desc_ptr->SetAttrMap(this->Attrs());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OPERATOR(fused_seqpool_cvm, ops::FusedSeqpoolCVMOp,
+                  ops::FusedSeqpoolCVMOpMaker,
+                  ops::FusedSeqpoolCVMGradOpMaker<paddle::framework::OpDesc>,
+                  ops::FusedSeqpoolCVMGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(fused_seqpool_cvm_grad, ops::FusedSeqpoolCVMGradOp);
+
+REGISTER_OP_CPU_KERNEL(fused_seqpool_cvm,
+                       ops::FusedSeqpoolCVMOpCPUKernel<float>);
+REGISTER_OP_CPU_KERNEL(fused_seqpool_cvm_grad,
+                       ops::FusedSeqpoolCVMGradOpCPUKernel<float>);
diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu
new file mode 100644
index 00000000000..3770a536a8f
--- /dev/null
+++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu
@@ -0,0 +1,492 @@
+//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+#include "paddle/fluid/framework/mixed_vector.h"
+#include "paddle/fluid/operators/fused/fused_seqpool_cvm_op.h"
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+using Vector = framework::Vector<T>;
+
+#define CUDA_KERNEL_LOOP(i, n)                                  \
+  for (auto i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+// normal sum pooling: each thread sums one embedding column of one
+// instance over its LoD-delimited sequence
+template <typename T>
+__global__ void FusedSeqpoolKernelNormal(const size_t N, T **input_values,
+                                         T **seqpool_output_values,
+                                         size_t **lods_values,
+                                         const int batch_size,
+                                         const int embedding_size,
+                                         const float pad_value) {
+  CUDA_KERNEL_LOOP(i, N) {
+    int key = i / embedding_size;
+    int offset = i % embedding_size;
+    int x = key / batch_size;  // slot id
+    int y = key % batch_size;  // ins id
+    auto &start = *(lods_values[x] + y);
+    auto &end = *(lods_values[x] + y + 1);
+
+    T val = static_cast<T>(pad_value);
+    for (auto k = start; k < end; ++k) {
+      val += *(input_values[x] + k * embedding_size + offset);
+    }
+    *(seqpool_output_values[x] + y * embedding_size + offset) = val;
+  }
+}
+
+// join: the show/click columns are required in the input and kept in the output
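+// Per pooled row: out[0] = log(show + 1), out[1] = log(click + 1) - log(show + 1),
+// and the remaining embedding columns are copied through unchanged.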
+template <typename T>
+__global__ void FusedCVMKernelWithCVM(const size_t N, T **output_values,
+                                      T **seqpool_output_values,
+                                      const int batch_size,
+                                      const int embedding_size,
+                                      const int cvm_offset) {
+  CUDA_KERNEL_LOOP(i, N) {
+    int key = i / embedding_size;
+    int offset = i % embedding_size;
+    int x = key / batch_size;  // slot id
+    int y = key % batch_size;  // ins id
+    if (offset == 0) {         // show
+      *(output_values[x] + y * embedding_size) =
+          log(*(seqpool_output_values[x] + y * embedding_size) + 1);
+    } else if (offset == 1) {  // click
+      *(output_values[x] + y * embedding_size + offset) =
+          log(*(seqpool_output_values[x] + y * embedding_size + 1) + 1) -
+          log(*(seqpool_output_values[x] + y * embedding_size) + 1);
+    } else {
+      *(output_values[x] + y * embedding_size + offset) =
+          *(seqpool_output_values[x] + y * embedding_size + offset);
+    }
+  }
+}
+
+// update: the show/click columns are dropped from the output
+template <typename T>
+__global__ void FusedCVMKernelNoCVM(const size_t N, T **output_values,
+                                    T **seqpool_output_values,
+                                    const int batch_size,
+                                    const int no_cvm_embedding_size,
+                                    const int cvm_offset) {
+  CUDA_KERNEL_LOOP(i, N) {
+    int key = i / no_cvm_embedding_size;
+    int offset = i % no_cvm_embedding_size;
+    int x = key / batch_size;  // slot id
+    int y = key % batch_size;  // ins id
+    // no cvm
+    *(output_values[x] + y * no_cvm_embedding_size + offset) =
+        *(seqpool_output_values[x] + y * (no_cvm_embedding_size + cvm_offset) +
+          offset + cvm_offset);
+  }
+}
+
+template <typename T>
+void FusedSeqpoolCVM(const framework::ExecutionContext &ctx,
+                     const std::vector<const T *> &input_data,
+                     const std::vector<T *> &output_data,
+                     const std::vector<T *> &seqpool_output_data,
+                     std::vector<const size_t *> lods, const int batch_size,
+                     const int slot_num, const int embedding_size,
+                     const float padding_value, const bool use_cvm,
+                     const int cvm_offset) {
+  auto stream =
+      ctx.template device_context<platform::CUDADeviceContext>().stream();
+  auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+  size_t total_ptr_len = input_data.size() + output_data.size() +
+                         seqpool_output_data.size() + lods.size();
+  auto temp_ptr =
+      memory::AllocShared(ctx.GetPlace(), total_ptr_len * sizeof(void *));
+  void *ptr = temp_ptr->ptr();
+
+#ifdef PADDLE_WITH_HIP
+  T **gpu_input_values = reinterpret_cast<T **>(temp_ptr->ptr());
+  platform::GpuMemcpyAsync(gpu_input_values, input_data.data(),
+                           input_data.size() * sizeof(T *),
+                           hipMemcpyHostToDevice, stream);
+  T **gpu_output_values =
+      reinterpret_cast<T **>(&gpu_input_values[input_data.size()]);
+  platform::GpuMemcpyAsync(gpu_output_values, output_data.data(),
+                           output_data.size() * sizeof(T *),
+                           hipMemcpyHostToDevice, stream);
+  T **gpu_seqpool_output_values =
+      reinterpret_cast<T **>(&gpu_output_values[output_data.size()]);
+  platform::GpuMemcpyAsync(
+      gpu_seqpool_output_values, seqpool_output_data.data(),
+      seqpool_output_data.size() * sizeof(T *), hipMemcpyHostToDevice, stream);
+  size_t **lods_values = reinterpret_cast<size_t **>(
+      &gpu_seqpool_output_values[seqpool_output_data.size()]);
+  platform::GpuMemcpyAsync(lods_values, lods.data(),
+                           lods.size() * sizeof(size_t *),
+                           hipMemcpyHostToDevice, stream);
+#else
+  T **gpu_input_values = reinterpret_cast<T **>(temp_ptr->ptr());
+  platform::GpuMemcpyAsync(gpu_input_values, input_data.data(),
+                           input_data.size() * sizeof(T *),
+                           cudaMemcpyHostToDevice, stream);
+  T **gpu_output_values =
+      reinterpret_cast<T **>(&gpu_input_values[input_data.size()]);
+  platform::GpuMemcpyAsync(gpu_output_values, output_data.data(),
+                           output_data.size() * sizeof(T *),
+                           cudaMemcpyHostToDevice, stream);
+  T **gpu_seqpool_output_values =
+      reinterpret_cast<T **>(&gpu_output_values[output_data.size()]);
+  platform::GpuMemcpyAsync(
+      gpu_seqpool_output_values, seqpool_output_data.data(),
+      seqpool_output_data.size() * sizeof(T *), cudaMemcpyHostToDevice, stream);
+  size_t **lods_values = reinterpret_cast<size_t **>(
+      &gpu_seqpool_output_values[seqpool_output_data.size()]);
+  platform::GpuMemcpyAsync(lods_values, lods.data(),
+                           lods.size() * sizeof(size_t *),
+                           cudaMemcpyHostToDevice, stream);
+#endif
+
+  size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
+  platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
+  // first sum pool
+  FusedSeqpoolKernelNormal<<<config.block_per_grid.x, config.thread_per_block.x,
+                             0, stream>>>(
+      N, gpu_input_values, gpu_seqpool_output_values, lods_values, batch_size,
+      embedding_size, padding_value);
+  // second log
+  if (use_cvm) {
+    FusedCVMKernelWithCVM<<<config.block_per_grid.x, config.thread_per_block.x,
+                            0, stream>>>(N, gpu_output_values,
+                                         gpu_seqpool_output_values, batch_size,
+                                         embedding_size, cvm_offset);
+  } else {
+    // the show/click columns are not needed in the output
+    N = static_cast<size_t>(batch_size * slot_num *
+                            (embedding_size - cvm_offset));
+    platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
+    FusedCVMKernelNoCVM<<<config.block_per_grid.x, config.thread_per_block.x, 0,
+                          stream>>>(N, gpu_output_values,
+                                    gpu_seqpool_output_values, batch_size,
+                                    (embedding_size - cvm_offset), cvm_offset);
+  }
+}
+
+// join grad
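+// The pooled-output gradient is broadcast back to every row of the corresponding
+// LoD sequence; the leading cvm_offset columns take their gradient from the CVM
+// input (show/click) rather than from the output gradient.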
+template <typename T>
+__global__ void FusedSeqpoolCVMGradKernelWithCVM(
+    const size_t N, T **out_grads_values, T **in_grads_values, T **cvm_values,
+    size_t **lods_values, const int batch_size, const int embedding_size,
+    const int cvm_offset) {
+  CUDA_KERNEL_LOOP(i, N) {
+    int key = i / embedding_size;
+    int offset = i % embedding_size;  // embedx offset
+    int x = key / batch_size;         // slot id
+    int y = key % batch_size;         // ins id
+
+    T &val = (offset < cvm_offset)
+                 ? *(cvm_values[x] + y * cvm_offset + offset)
+                 : *(out_grads_values[x] + y * embedding_size + offset);
+
+    auto &start = *(lods_values[x] + y);
+    auto &end = *(lods_values[x] + y + 1);
+    for (auto k = start; k < end; ++k) {
+      *(in_grads_values[x] + k * embedding_size + offset) = val;
+    }
+  }
+}
+
+// join grad variant: the output keeps the show column but not click
+template <typename T>
+__global__ void FusedSeqpoolCVMGradKernelWithShow(
+    const size_t N, T **out_grads_values, T **in_grads_values, T **cvm_values,
+    size_t **lods_values, const int batch_size, const int embedding_size,
+    const int cvm_offset) {
+  CUDA_KERNEL_LOOP(i, N) {
+    int key = i / embedding_size;
+    int offset = i % embedding_size;  // embedx offset
+    int x = key / batch_size;         // slot id
+    int y = key % batch_size;         // ins id
+
+    T &val =
+        (offset < cvm_offset)
+            ? *(cvm_values[x] + y * cvm_offset + offset)
+            : *(out_grads_values[x] + y * (embedding_size - 1) + offset - 1);
+
+    auto &start = *(lods_values[x] + y);
+    auto &end = *(lods_values[x] + y + 1);
+    for (auto k = start; k < end; ++k) {
+      *(in_grads_values[x] + k * embedding_size + offset) = val;
+    }
+  }
+}
+
+// update grad
+template <typename T>
+__global__ void FusedSeqpoolCVMGradKernelNoCVM(
+    const size_t N, T **out_grads_values, T **in_grads_values, T **cvm_values,
+    size_t **lods_values, const int batch_size, const int embedding_size,
+    const int cvm_offset) {
+  CUDA_KERNEL_LOOP(i, N) {
+    int key = i / embedding_size;
+    int offset = i % embedding_size;  // embedx offset
+    int x = key / batch_size;         // slot id
+    int y = key % batch_size;         // ins id
+
+    T &val = (offset < cvm_offset)
+                 ? *(cvm_values[x] + y * cvm_offset + offset)
+                 : *(out_grads_values[x] + y * (embedding_size - cvm_offset) +
+                     offset - cvm_offset);
+
+    auto &start = *(lods_values[x] + y);
+    auto &end = *(lods_values[x] + y + 1);
+    for (auto k = start; k < end; ++k) {
+      *(in_grads_values[x] + k * embedding_size + offset) = val;
+    }
+  }
+}
+
+template <typename T>
+void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx,
+                         const std::vector<const T *> &out_grads_data,
+                         const std::vector<T *> &in_grads_data,
+                         const std::vector<const T *> &cvm_data,
+                         const std::vector<const size_t *> &lods,
+                         const int batch_size, const int slot_num,
+                         const int embedding_size, const bool use_cvm,
+                         const int cvm_offset) {
+  auto stream =
+      ctx.template device_context<platform::CUDADeviceContext>().stream();
+  auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+  size_t total_ptr_len = out_grads_data.size() + in_grads_data.size() +
+                         cvm_data.size() + lods.size();
+  auto temp_ptr =
+      memory::AllocShared(ctx.GetPlace(), total_ptr_len * sizeof(void *));
+#ifdef PADDLE_WITH_HIP
+  T **gpu_out_grads_values = reinterpret_cast<T **>(temp_ptr->ptr());
+  platform::GpuMemcpyAsync(gpu_out_grads_values, out_grads_data.data(),
+                           out_grads_data.size() * sizeof(T *),
+                           hipMemcpyHostToDevice, stream);
+
+  T **gpu_in_grads_values =
+      reinterpret_cast<T **>(&gpu_out_grads_values[out_grads_data.size()]);
+  platform::GpuMemcpyAsync(gpu_in_grads_values, in_grads_data.data(),
+                           in_grads_data.size() * sizeof(T *),
+                           hipMemcpyHostToDevice, stream);
+
+  T **gpu_cvm_values =
+      reinterpret_cast<T **>(&gpu_in_grads_values[in_grads_data.size()]);
+  platform::GpuMemcpyAsync(gpu_cvm_values, cvm_data.data(),
+                           cvm_data.size() * sizeof(T *), hipMemcpyHostToDevice,
+                           stream);
+
+  size_t **lods_values =
+      reinterpret_cast<size_t **>(&gpu_cvm_values[cvm_data.size()]);
+  platform::GpuMemcpyAsync(lods_values, lods.data(),
+                           lods.size() * sizeof(size_t *),
+                           hipMemcpyHostToDevice, stream);
+#else
+  T **gpu_out_grads_values = reinterpret_cast<T **>(temp_ptr->ptr());
+  platform::GpuMemcpyAsync(gpu_out_grads_values, out_grads_data.data(),
+                           out_grads_data.size() * sizeof(T *),
+                           cudaMemcpyHostToDevice, stream);
+
+  T **gpu_in_grads_values =
+      reinterpret_cast<T **>(&gpu_out_grads_values[out_grads_data.size()]);
+  platform::GpuMemcpyAsync(gpu_in_grads_values, in_grads_data.data(),
+                           in_grads_data.size() * sizeof(T *),
+                           cudaMemcpyHostToDevice, stream);
+
+  T **gpu_cvm_values =
+      reinterpret_cast<T **>(&gpu_in_grads_values[in_grads_data.size()]);
+  platform::GpuMemcpyAsync(gpu_cvm_values, cvm_data.data(),
+                           cvm_data.size() * sizeof(T *),
+                           cudaMemcpyHostToDevice, stream);
+
+  size_t **lods_values =
+      reinterpret_cast<size_t **>(&gpu_cvm_values[cvm_data.size()]);
+  platform::GpuMemcpyAsync(lods_values, lods.data(),
+                           lods.size() * sizeof(size_t *),
+                           cudaMemcpyHostToDevice, stream);
+#endif
+
+  size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
+  auto config = GetGpuLaunchConfig1D(dev_ctx, N);
+  if (use_cvm) {
+    // join grad
+    FusedSeqpoolCVMGradKernelWithCVM<<<config.block_per_grid.x,
+                                       config.thread_per_block.x, 0, stream>>>(
+        N, gpu_out_grads_values, gpu_in_grads_values, gpu_cvm_values,
+        lods_values, batch_size, embedding_size, cvm_offset);
+  } else {
+    // update grad
+    FusedSeqpoolCVMGradKernelNoCVM<<<config.block_per_grid.x,
+                                     config.thread_per_block.x, 0, stream>>>(
+        N, gpu_out_grads_values, gpu_in_grads_values, gpu_cvm_values,
+        lods_values, batch_size, embedding_size, cvm_offset);
+  }
+}
+
+template <typename T>
+class FusedSeqpoolCVMCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto inputs = ctx.MultiInput<LoDTensor>("X");
+    auto outputs = ctx.MultiOutput<framework::Tensor>("Out");
+
+    const auto slot_size = inputs.size();
+    std::vector<const float *> input_data(slot_size);
+    std::vector<const size_t *> lods_data(slot_size);
+    std::vector<T *> output_data(slot_size);
+
+    std::vector<LoDTensor> seqpool_outputs(slot_size);
+    std::vector<T *> seqpool_output_data(slot_size);
+
+    auto padding_value = ctx.Attr<float>("pad_value");
+    auto use_cvm = ctx.Attr<bool>("use_cvm");
+    const int cvm_offset = ctx.Attr<int>("cvm_offset");
+
+    int embedding_size = inputs[0]->numel() / inputs[0]->dims()[0];
+    int batch_size = -1;
+    std::vector<paddle::framework::MixVector<size_t> *> mix_lods_v(slot_size);
+
+    for (size_t i = 0; i < slot_size; ++i) {
+      const auto *input = inputs[i];
+
+      Vector<size_t> lods;
+      if (input->lod().size() != 0) {
+        auto lod = input->lod();
+        lods = lod[0];
+      } else {
+        lods.push_back(0);
+        for (int j = 0; j < input->dims()[0]; ++j) {
+          lods.push_back(j + 1);
+        }
+      }
+      int cur_batch_size =
+          input->lod().size() ? input->lod()[0].size() - 1 : input->dims()[0];
+      if (batch_size == -1) {
+        batch_size = cur_batch_size;
+      } else {
+        PADDLE_ENFORCE_EQ(batch_size, cur_batch_size,
+                          platform::errors::PreconditionNotMet(
+                              "The batch size of all inputs should be the "
+                              "same, please check; last batch size is %d, "
+                              "current batch size is %d.",
+                              batch_size, cur_batch_size));
+      }
+      input_data[i] = reinterpret_cast<const T *>(input->data<T>());
+
+      auto *output = outputs[i];
+      if (use_cvm) {
+        output->Resize({batch_size, embedding_size});
+      } else {
+        output->Resize({batch_size, embedding_size - cvm_offset});
+      }
+      output_data[i] =
+          reinterpret_cast<T *>(output->mutable_data<T>(ctx.GetPlace()));
+      mix_lods_v[i] = new paddle::framework::MixVector<size_t>(&lods);
+      lods_data[i] = mix_lods_v[i]->CUDAData(ctx.GetPlace());
+      seqpool_output_data[i] =
+          reinterpret_cast<T *>(seqpool_outputs[i].mutable_data<T>(
+              {batch_size, embedding_size}, ctx.GetPlace()));
+    }
+
+    FusedSeqpoolCVM(ctx, input_data, output_data, seqpool_output_data,
+                    lods_data, batch_size, slot_size, embedding_size,
+                    padding_value, use_cvm, cvm_offset);
+
+    for (size_t i = 0; i < slot_size; ++i) {
+      delete mix_lods_v[i];
+    }
+  }
+};
+
+template <typename T>
+class FusedSeqpoolCVMGradCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto out_grads = ctx.MultiInput<LoDTensor>(framework::GradVarName("Out"));
+    auto in_grads = ctx.MultiOutput<LoDTensor>(framework::GradVarName("X"));
+    auto *cvm = ctx.Input<LoDTensor>("CVM");
+
+    std::string pooltype = ctx.Attr<std::string>("pooltype");
+    auto use_cvm = ctx.Attr<bool>("use_cvm");
+    const int cvm_offset = ctx.Attr<int>("cvm_offset");
+
+    const auto slot_size = in_grads.size();
+    std::vector<const T *> out_grads_data(slot_size);
+    std::vector<T *> in_grads_data(slot_size);
+    std::vector<const T *> cvm_data(slot_size);
+    std::vector<const size_t *> lods_data(slot_size);
+
+    int embedding_size = in_grads[0]->numel() / in_grads[0]->dims()[0];
+    int batch_size = -1;
+    std::vector<paddle::framework::MixVector<size_t> *> mix_lods_v(slot_size);
+
+    for (size_t i = 0; i < slot_size; ++i) {
+      auto *in_grad = in_grads[i];
+
+      Vector<size_t> lods;
+      if (in_grad->lod().size() != 0) {
+        auto lod = in_grad->lod();
+        lods = lod[0];
+      } else {
+        lods.push_back(0);
+        for (int j = 0; j < in_grad->dims()[0]; ++j) {
+          lods.push_back(j + 1);
+        }
+      }
+
+      int cur_batch_size = in_grad->lod().size() ? in_grad->lod()[0].size() - 1
+                                                 : in_grad->dims()[0];
+      if (batch_size == -1) {
+        batch_size = cur_batch_size;
+      } else {
+        PADDLE_ENFORCE_EQ(batch_size, cur_batch_size,
+                          platform::errors::PreconditionNotMet(
+                              "The batch size of all inputs should be the "
+                              "same, please check; last batch size is %d, "
+                              "current batch size is %d.",
+                              batch_size, cur_batch_size));
+      }
+
+      auto *out_grad = out_grads[i];
+      out_grads_data[i] = reinterpret_cast<const T *>(out_grad->data<T>());
+
+      in_grads_data[i] =
+          reinterpret_cast<T *>(in_grad->mutable_data<T>(ctx.GetPlace()));
+      mix_lods_v[i] = new paddle::framework::MixVector<size_t>(&lods);
+      lods_data[i] = mix_lods_v[i]->CUDAData(ctx.GetPlace());
+      cvm_data[i] = reinterpret_cast<const T *>(cvm->data<T>());
+    }
+    FusedSeqpoolCVMGrad(ctx, out_grads_data, in_grads_data, cvm_data, lods_data,
+                        batch_size, slot_size, embedding_size, use_cvm,
+                        cvm_offset);
+
+    for (size_t i = 0; i < slot_size; ++i) {
+      delete mix_lods_v[i];
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(fused_seqpool_cvm,
+                        ops::FusedSeqpoolCVMCUDAKernel<float>);
+
+REGISTER_OP_CUDA_KERNEL(fused_seqpool_cvm_grad,
+                        ops::FusedSeqpoolCVMGradCUDAKernel<float>);
diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.h b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.h
new file mode 100644
index 00000000000..6042772adb0
--- /dev/null
+++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.h
@@ -0,0 +1,48 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <memory>
+#include <vector>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+
+template <typename T>
+class FusedSeqpoolCVMOpCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "The CPU kernel of FusedSeqpoolCVMOp is not implemented; only the "
+        "GPU kernel is supported now."));
+  }
+};
+
+template <typename T>
+class FusedSeqpoolCVMGradOpCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "The CPU kernel of FusedSeqpoolCVMGradOp is not implemented; only "
+        "the GPU kernel is supported now."));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py
index de7a9d95429..fde42e35e07 100644
--- a/python/paddle/fluid/contrib/layers/nn.py
+++ b/python/paddle/fluid/contrib/layers/nn.py
@@ -44,7 +44,7 @@ __all__ = [
     'multiclass_nms2', 'search_pyramid_hash', 'shuffle_batch', 'partial_concat',
     'sparse_embedding', 'partial_sum', 'tdm_child', 'rank_attention',
     'tdm_sampler', 'batch_fc', '_pull_box_extended_sparse', 'bilateral_slice',
-    'correlation', 'fused_bn_add_act'
+    'correlation', 'fused_bn_add_act', 'fused_seqpool_cvm'
 ]
 
 
@@ -523,6 +523,64 @@ def fused_embedding_seq_pool(input,
     return out
 
 
+def fused_seqpool_cvm(input,
+                      pool_type,
+                      cvm,
+                      pad_value=0.0,
+                      use_cvm=True,
+                      cvm_offset=2):
+    """
+    **Fused Sequence Pool and CVM**
+
+    This layer is the fusion of sequence_pool and continuous_value_model.
+
+    **Notes: The Op only receives a list of LoDTensor as input and only supports SUM pooling now.**
+
+    Args:
+        input(Variable|list of Variable): Input is a list of LoDTensor.
+        pool_type(str): pooling type; only SUM pooling is supported now.
+        cvm(Variable): a 2-D Tensor with shape [N x 2], holding show and click.
+        pad_value(float): padding value for empty sequences in the sequence pool.
+        use_cvm(bool): whether to apply the CVM transform.
+        cvm_offset(int): the number of leading show/click columns in each input. Default: 2.
+    Returns:
+        Variable|list of Variable: The tensor variables storing the fused
+        sequence pool and CVM results of the inputs.
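+
+    Examples:
+        .. code-block:: python
+
+            # a minimal sketch: slot names and the embedding size (11) are
+            # illustrative, and only the GPU kernel is implemented, so run
+            # the program on a CUDA place.
+            import paddle
+            import paddle.fluid as fluid
+
+            paddle.enable_static()
+            show_click = fluid.layers.data(
+                name="show_click", shape=[2], dtype="float32")
+            slot0 = fluid.layers.data(
+                name="slot0", shape=[11], dtype="float32", lod_level=1)
+            slot1 = fluid.layers.data(
+                name="slot1", shape=[11], dtype="float32", lod_level=1)
+            outs = fluid.contrib.layers.fused_seqpool_cvm(
+                [slot0, slot1], "SUM", show_click)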
+    """
+    helper = LayerHelper('fused_seqpool_cvm', **locals())
+
+    if pool_type.upper() != 'SUM':
+        raise ValueError(
+            "fused_seqpool_cvm only support SUM pooling now, and your type is: "
+            + pool_type)
+
+    check_type(input, 'input', list, 'fused_seqpool_cvm')
+    if isinstance(input, list):
+        for _input in input:
+            check_variable_and_dtype(_input, 'input', ['float32'],
+                                     'fused_seqpool_cvm')
+
+    dtype = helper.input_dtype()
+    inputs = helper.multiple_input()
+    outs = [
+        helper.create_variable_for_type_inference(dtype)
+        for i in range(len(inputs))
+    ]
+
+    helper.append_op(
+        type="fused_seqpool_cvm",
+        inputs={"X": inputs,
+                "CVM": cvm},
+        outputs={"Out": outs},
+        attrs={
+            "pooltype": pool_type.upper(),
+            "pad_value": pad_value,
+            "use_cvm": use_cvm,
+            "cvm_offset": cvm_offset,
+        })
+
+    return outs
+
+
 def multiclass_nms2(bboxes,
                     scores,
                     score_threshold,
-- 