sequence_mask_op.h 4.4 KB
Newer Older
Q
qingqing01 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

17
#if defined(__NVCC__) || defined(__HIPCC__)
S
sneaxiy 已提交
18 19 20 21 22 23 24
#include <thrust/device_ptr.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#else
#include <algorithm>
#endif

Q
qingqing01 已提交
25 26 27 28 29 30
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/for_range.h"

namespace paddle {
namespace operators {

31 32
// Namespace-level shorthands for the framework tensor types.
using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;
Q
qingqing01 已提交
33 34 35

// Per-element functor that fills a flattened mask buffer.
//
// The output buffer `y` is addressed with a flat index that is decomposed as
// (row, col) = (y_idx / maxlen, y_idx % maxlen). The element is set to 1 when
// `col < x[row]` and to 0 otherwise, converted to the output type Ty.
//
// Tx: element type of the input buffer `x`.
// Ty: element type of the output buffer `y`.
template <typename Tx, typename Ty>
struct SequenceMaskForRangeFunctor {
  // x:      input values, indexed by row.
  // y:      output buffer, indexed by the flat index passed to operator().
  // maxlen: number of columns per row; used as divisor, so it must be
  //         non-zero whenever operator() is invoked.
  HOSTDEVICE SequenceMaskForRangeFunctor(const Tx *x, Ty *y, int maxlen)
      : x_(x), y_(y), maxlen_(maxlen) {}

  // Writes the mask value for the flat output position `y_idx`.
  HOSTDEVICE void operator()(int y_idx) const {
    const int row = y_idx / maxlen_;
    const int col = y_idx % maxlen_;
    const bool inside = col < x_[row];
    y_[y_idx] = static_cast<Ty>(inside ? 1 : 0);
  }

 private:
  const Tx *x_;
  Ty *y_;
  int maxlen_;
};

template <typename DeviceContext, typename Tx>
struct SequenceMaskFunctor {
  SequenceMaskFunctor(const DeviceContext &ctx, const Tx *x, Tensor *y,
S
sneaxiy 已提交
54 55
                      int limits, int maxlen)
      : ctx_(ctx), x_(x), y_(y), limits_(limits), maxlen_(maxlen) {}
Q
qingqing01 已提交
56 57

  template <typename Ty>
D
dzhwinter 已提交
58
  void apply() const {
Q
qingqing01 已提交
59 60
    auto *y_data = y_->mutable_data<Ty>(ctx_.GetPlace());
    platform::ForRange<DeviceContext> for_range(ctx_, limits_);
S
sneaxiy 已提交
61
    for_range(SequenceMaskForRangeFunctor<Tx, Ty>(x_, y_data, maxlen_));
Q
qingqing01 已提交
62 63 64 65 66 67 68
  }

 private:
  const DeviceContext &ctx_;
  const Tx *x_;
  Tensor *y_;
  int limits_;
S
sneaxiy 已提交
69
  int maxlen_;
Q
qingqing01 已提交
70 71 72 73 74 75 76 77 78 79
};

// Kernel that builds a mask tensor Y from input X such that, for every element
// i of X and every column j in [0, maxlen), Y[i, j] = (j < X[i]) converted to
// the type selected by the "out_dtype" attribute.
//
// The number of columns `maxlen` is resolved in this order:
//   1. the first int32 value of Input(MaxLenTensor), when that input is
//      present (must be > 0);
//   2. the "maxlen" attribute, when it is non-negative;
//   3. otherwise the maximum value found in X (never less than 0).
// In cases 1 and 3 the output is resized here to X.dims() + [maxlen].
template <typename DeviceContext, typename Tx>
class SequenceMaskKernel : public framework::OpKernel<Tx> {
  using Tensor = framework::LoDTensor;

 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    auto *x = ctx.Input<Tensor>("X");
    auto *y = ctx.Output<Tensor>("Y");
    int maxlen = ctx.Attr<int>("maxlen");
    if (ctx.HasInput("MaxLenTensor")) {
      auto max_len_tensor = ctx.Input<Tensor>("MaxLenTensor");
      PADDLE_ENFORCE_NOT_NULL(max_len_tensor,
                              platform::errors::InvalidArgument(
                                  "Input(MaxLenTensor) should not be NULL. "
                                  "But received Input(MaxLenTensor) is NULL"));
      if (platform::is_gpu_place(max_len_tensor->place())) {
        // The value lives on the device: copy it to the host before reading.
        framework::Tensor temp;
        TensorCopySync(*max_len_tensor, platform::CPUPlace(), &temp);
        maxlen = *temp.data<int32_t>();
      } else {
        maxlen = *max_len_tensor->data<int32_t>();
      }

      // Validate before the value is used to shape the output, so that an
      // invalid length is reported with this message instead of reaching
      // Resize() with a non-positive dimension.
      PADDLE_ENFORCE_GT(
          maxlen, 0,
          platform::errors::InvalidArgument(
              "Input(MaxLenTensor) value should be greater than 0. But "
              "received Input(MaxLenTensor) value = %d.",
              maxlen));

      auto y_dim = framework::vectorize<int>(x->dims());
      y_dim.push_back(maxlen);
      y->Resize(framework::make_ddim(y_dim));
    }

    auto *x_data = x->data<Tx>();
    auto x_numel = x->numel();
    if (maxlen < 0) {
      // No explicit length was given: derive it from the data.
#if defined(__NVCC__) || defined(__HIPCC__)
      VLOG(10)
          << "SequenceMaskOp on GPU may be slow when maxlen is not provided.";
      maxlen = static_cast<int>(
          thrust::reduce(thrust::device_pointer_cast(x_data),
                         thrust::device_pointer_cast(x_data) + x_numel,
                         static_cast<Tx>(0), thrust::maximum<Tx>()));
#else
      // Mirror the device branch, whose reduction is seeded with 0: an empty
      // input or an input without positive values yields maxlen == 0.
      // std::max_element returns the end iterator for an empty range, so it
      // must not be dereferenced in that case.
      maxlen = 0;
      if (x_numel > 0) {
        maxlen = static_cast<int>(
            std::max(static_cast<Tx>(0),
                     *std::max_element(x_data, x_data + x_numel)));
      }
#endif
      auto y_dim = framework::vectorize<int>(x->dims());
      y_dim.push_back(maxlen);
      y->Resize(framework::make_ddim(y_dim));
    }

    // Dispatch on the requested output element type; the visitor fills
    // x_numel * maxlen output elements.
    auto out_dtype = static_cast<framework::proto::VarType::Type>(
        ctx.Attr<int>("out_dtype"));
    auto &dev_ctx = ctx.template device_context<DeviceContext>();
    framework::VisitDataType(out_dtype,
                             SequenceMaskFunctor<DeviceContext, Tx>(
                                 dev_ctx, x_data, y, x_numel * maxlen, maxlen));
  }
};

}  // namespace operators
}  // namespace paddle