/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/gru_op.h"
#include <string>

namespace paddle {
namespace operators {

using framework::Tensor;

// Forward GRU operator definition. Its only job here is shape inference:
// it verifies that the required inputs/outputs are present, validates their
// shapes against each other, and sets the dimensions of the four outputs.
class GRUOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Validates inputs and infers output shapes.
  //
  // Required inputs:  Input [T, 3 * frame_size], Weight [frame_size,
  //                   3 * frame_size].
  // Optional inputs:  H0 [*, frame_size], Bias [1, 3 * frame_size].
  // Outputs set:      BatchGate (same dims as Input) and
  //                   BatchResetHiddenPrev / BatchHidden / Hidden
  //                   ([Input.dims[0], frame_size]).
  // Hidden shares its LoD with Input.
  //
  // Any violated requirement is reported through PADDLE_ENFORCE*.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Input"),
                   "Input(%s) of GRUOp should not be null.", "Input");
    PADDLE_ENFORCE(ctx->HasInput("Weight"),
                   "Input(%s) of GRUOp should not be null.", "Weight");
    PADDLE_ENFORCE(ctx->HasOutput("BatchGate"),
                   "Output(%s) of GRUOp should not be null.", "BatchGate");
    PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"),
                   "Output(%s) of GRUOp should not be null.",
                   "BatchResetHiddenPrev");
    PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"),
                   "Output(%s) of GRUOp should not be null.", "BatchHidden");
    PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
                   "Output(%s) of GRUOp should not be null.", "Hidden");

    auto input_dims = ctx->GetInputDim("Input");
    auto weight_dims = ctx->GetInputDim("Weight");
    // Check the rank before indexing dims[0]/dims[1] below, so that a
    // mis-shaped tensor yields a descriptive error instead of an
    // out-of-range index into the dimension vector.
    PADDLE_ENFORCE_EQ(input_dims.size(), 2,
                      "The rank of Input in GRUOp must be 2.");
    PADDLE_ENFORCE_EQ(weight_dims.size(), 2,
                      "The rank of Weight in GRUOp must be 2.");

    int input_size = static_cast<int>(input_dims[1]);
    // The hidden (frame) size is taken from the height of Weight.
    int frame_size = static_cast<int>(weight_dims[0]);
    PADDLE_ENFORCE_EQ(input_size, frame_size * 3,
                      "The input_size must be 3 times of frame_size in GRUOp.");
    PADDLE_ENFORCE_EQ(
        weight_dims[1], frame_size * 3,
        "The shape of Weight matrix must be [frame_size, frame_size * 3].");

    if (ctx->HasInput("H0")) {
      auto h0_dims = ctx->GetInputDim("H0");
      PADDLE_ENFORCE_EQ(h0_dims.size(), 2,
                        "The rank of H0 in GRUOp must be 2.");
      PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
                        "The width of H0 must be equal to frame_size.");
    }

    if (ctx->HasInput("Bias")) {
      auto bias_dims = ctx->GetInputDim("Bias");
      PADDLE_ENFORCE_EQ(bias_dims.size(), 2,
                        "The rank of Bias in GRUOp must be 2.");
      int bias_height = static_cast<int>(bias_dims[0]);
      int bias_width = static_cast<int>(bias_dims[1]);
      PADDLE_ENFORCE_EQ(bias_height, 1,
                        "The shape of Bias must be [1, frame_size * 3].");
      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
                        "The shape of Bias must be [1, frame_size * 3].");
    }

    ctx->SetOutputDim("BatchGate", input_dims);
    ctx->SetOutputDim("BatchResetHiddenPrev", {input_dims[0], frame_size});
    ctx->SetOutputDim("BatchHidden", {input_dims[0], frame_size});
    ctx->SetOutputDim("Hidden", {input_dims[0], frame_size});
    ctx->ShareLoD("Input", "Hidden");
  }
};

// Declares the proto of the gru operator: its inputs, outputs, attributes
// and the user-facing documentation string.
class GRUOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  // Registers every input/output/attribute of the operator.
  //   - H0 and Bias are dispensable (optional) inputs.
  //   - BatchGate, BatchResetHiddenPrev and BatchHidden are intermediate
  //     outputs; Hidden is the regular output.
  //   - Attributes: activation (default "tanh"), gate_activation (default
  //     "sigmoid"), is_reverse (default false).
  void Make() override {
    AddInput("Input",
             "(LoDTensor) The first input is a LoDTensor, which supports "
             "variable-time length input sequence. The underlying tensor in "
             "this LoDTensor is a matrix with shape (T X 3D), where, T is the "
             "total time steps in this mini-batch, D is the hidden size.");
    AddInput("H0",
             "(Tensor, optional) The initial hidden state is an optional "
             "input. This is a tensor with shape (N x D), where N is the "
             "batch size, D is the hidden size.")
        .AsDispensable();
    AddInput(
        "Weight",
        "(Tensor) The learnable hidden-hidden weight matrix with shape "
        "(D x 3D), where D is the hidden size. The elements continuous in "
        "memory can be divided into two parts. The first part are weights of "
        "the update gate and reset gate with shape (D x 2D), and the second "
        "part are weights of output candidate with shape (D x D).");
    AddInput("Bias",
             "(Tensor, optional) Bias vector with shape (1 x 3D) concatenating "
             "bias of the update gate, reset gate and output candidate.")
        .AsDispensable();
    AddOutput("BatchGate",
              "(LoDTensor) To compute with batches, sequence data will be "
              "reorganized into several successive batches each containing "
              "data from the same time step. The LoDTensor BatchGate contains "
              "the update gate, reset gate and output candidate values "
              "organized in batches. The LoD size is 2. The first LoD contains "
              "the batch offsets and the second LoD contains the indexes in "
              "the raw sequence data.")
        .AsIntermediate();
    AddOutput(
        "BatchResetHiddenPrev",
        "(LoDTensor) The reset hidden state LoDTensor organized in batches. "
        "This LoDTensor is a matrix with shape (T X D) and has the same LoD "
        "with `BatchGate`.")
        .AsIntermediate();
    AddOutput(
        "BatchHidden",
        "(LoDTensor) The hidden state LoDTensor organized in batches.  "
        "This LoDTensor is a matrix with shape (T X D) and has the same LoD "
        "with `BatchGate`.")
        .AsIntermediate();
    // Hidden is laid out in sequence order and GRUOp::InferShape shares its
    // LoD from Input, so the documentation refers to `Input` here.
    AddOutput(
        "Hidden",
        "(LoDTensor) the hidden state LoDTensor organized in sequences. "
        "This LoDTensor is a matrix with shape (T X D) and has the same LoD "
        "with `Input`.");
    AddAttr<std::string>("activation",
                         "(string, default tanh) "
                         "The activation type used for output candidate {h}_t.")
        .SetDefault("tanh");
    AddAttr<std::string>(
        "gate_activation",
        "(string, default sigmoid) "
        "The activation type used in update gate and reset gate.")
        .SetDefault("sigmoid");
    AddAttr<bool>("is_reverse",
                  "(bool, default: False) "
                  "whether to compute reversed GRU.")
        .SetDefault(false);
    AddComment(R"DOC(
GRU Operator implements part calculations of the complete GRU as following:

$$
update\_gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
reset\_gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r)  \\
output\_candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
$$

@note To implement the complete GRU, fully-connected operator must be used
before to feed xu, xr and xc as the Input of GRU operator.
)DOC");
  }
};

// Backward GRU operator definition. Shape inference verifies that the
// forward inputs/outputs and the gradient of Hidden are available, validates
// their shapes, and sets the dimensions of whichever gradient outputs exist.
class GRUGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Validates inputs and sets gradient output shapes.
  //
  // Required inputs: Input, Weight, BatchGate, BatchResetHiddenPrev,
  //                  BatchHidden, Hidden and Hidden@GRAD.
  // Optional inputs: H0, Bias.
  // Each gradient output (Input@GRAD, Weight@GRAD, and, when the matching
  // input is present, H0@GRAD / Bias@GRAD) gets the dims of its forward
  // counterpart, but only if that output is actually requested.
  //
  // Any violated requirement is reported through PADDLE_ENFORCE*.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Input"),
                   "Input(%s) of GRUGradOp should not be null.", "Input");
    PADDLE_ENFORCE(ctx->HasInput("Weight"),
                   "Input(%s) of GRUGradOp should not be null.", "Weight");
    PADDLE_ENFORCE(ctx->HasInput("BatchGate"),
                   "Input(%s) of GRUGradOp should not be null.", "BatchGate");
    PADDLE_ENFORCE(ctx->HasInput("BatchResetHiddenPrev"),
                   "Input(%s) of GRUGradOp should not be null.",
                   "BatchResetHiddenPrev");
    PADDLE_ENFORCE(ctx->HasInput("BatchHidden"),
                   "Input(%s) of GRUGradOp should not be null.", "BatchHidden");
    PADDLE_ENFORCE(ctx->HasInput("Hidden"),
                   "Input(%s) of GRUGradOp should not be null.", "Hidden");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
                   "Input(%s@GRAD) of GRUGradOp should not be null.", "Hidden");

    auto input_dims = ctx->GetInputDim("Input");
    auto weight_dims = ctx->GetInputDim("Weight");
    int input_size = input_dims[1];
    // frame_size is defined as the height of Weight, so only the width of
    // Weight needs to be checked against it.
    int frame_size = weight_dims[0];
    PADDLE_ENFORCE_EQ(input_size, frame_size * 3,
                      "The input_size must be 3 times of frame_size in GRUOp.");
    PADDLE_ENFORCE_EQ(
        weight_dims[1], frame_size * 3,
        "The shape of Weight matrix must be [frame_size, frame_size * 3].");

    if (ctx->HasInput("H0")) {
      auto h0_dims = ctx->GetInputDim("H0");
      PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
                        "The width of H0 must be equal to frame_size.");
      auto h0_grad_name = framework::GradVarName("H0");
      if (ctx->HasOutput(h0_grad_name)) {
        ctx->SetOutputDim(h0_grad_name, h0_dims);
      }
    }

    if (ctx->HasInput("Bias")) {
      auto bias_dims = ctx->GetInputDim("Bias");
      int bias_height = bias_dims[0];
      int bias_width = bias_dims[1];
      PADDLE_ENFORCE_EQ(bias_height, 1,
                        "The shape of Bias must be [1, frame_size * 3].");
      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
                        "The shape of Bias must be [1, frame_size * 3].");
      auto bias_grad_name = framework::GradVarName("Bias");
      if (ctx->HasOutput(bias_grad_name)) {
        ctx->SetOutputDim(bias_grad_name, bias_dims);
      }
    }

    auto input_grad_name = framework::GradVarName("Input");
    if (ctx->HasOutput(input_grad_name)) {
      ctx->SetOutputDim(input_grad_name, input_dims);
    }
    auto weight_grad_name = framework::GradVarName("Weight");
    if (ctx->HasOutput(weight_grad_name)) {
      ctx->SetOutputDim(weight_grad_name, weight_dims);
    }
  }
};

214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346
template <typename T>
class GRUCPUKernel : public framework::OpKernel<T> {
 public:
  void BatchCompute(const framework::ExecutionContext& context) const {
    using DeviceContext = paddle::platform::CPUDeviceContext;
    auto* input = context.Input<LoDTensor>("Input");
    auto* h0 = context.Input<Tensor>("H0");
    auto* weight = context.Input<Tensor>("Weight");
    const T* weight_data = weight->data<T>();
    auto* bias = context.Input<Tensor>("Bias");
    auto* batch_gate = context.Output<LoDTensor>("BatchGate");
    batch_gate->mutable_data<T>(context.GetPlace());
    auto* batch_reset_hidden_prev =
        context.Output<LoDTensor>("BatchResetHiddenPrev");
    batch_reset_hidden_prev->mutable_data<T>(context.GetPlace());
    auto* batch_hidden = context.Output<LoDTensor>("BatchHidden");
    batch_hidden->mutable_data<T>(context.GetPlace());
    auto* hidden = context.Output<LoDTensor>("Hidden");
    hidden->mutable_data<T>(context.GetPlace());

    auto hidden_dims = hidden->dims();

    bool is_reverse = context.Attr<bool>("is_reverse");
    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
    auto& dev_ctx = context.template device_context<DeviceContext>();
    to_batch(dev_ctx, *input, batch_gate, true, is_reverse);

    if (bias) {
      math::RowwiseAdd<DeviceContext, T> add_bias;
      add_bias(dev_ctx, *batch_gate, *bias, batch_gate);
    }

    int frame_size = hidden_dims[1];
    math::GRUMetaValue<T> gru_value;
    gru_value.gate_weight = const_cast<T*>(weight_data);
    gru_value.state_weight =
        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
    Tensor ordered_h0;

    framework::Vector<size_t> order(batch_gate->lod()[2]);

    if (h0) {
      // Since the batch computing for GRU reorders the input sequences
      // according to their length. The initialized cell state also needs
      // to reorder.
      ReorderInitState<DeviceContext, T>(
          context.template device_context<DeviceContext>(), *h0, order,
          &ordered_h0, true);
      gru_value.prev_out_value = ordered_h0.data<T>();
    } else {
      gru_value.prev_out_value = nullptr;
    }
    auto batch_starts = batch_gate->lod()[0];
    size_t num_batch = batch_starts.size() - 1;
    auto active_node = math::detail::GetActivationType(
        context.Attr<std::string>("activation"));
    auto active_gate = math::detail::GetActivationType(
        context.Attr<std::string>("gate_activation"));

#ifdef PADDLE_WITH_MKLML
    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
    // TODO(TJ): make a class
    T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
                                     frame_size * 2 /*width of weight*/,
                                     frame_size /*height of height*/);
    PADDLE_ENFORCE(packed_gate);
    blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size * 2,
                   frame_size, T(1.0), gru_value.gate_weight, frame_size * 2,
                   packed_gate);
    T* packed_state = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
                                      frame_size /*width of weight*/,
                                      frame_size /*height of height*/);
    PADDLE_ENFORCE(packed_state);
    blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size,
                   frame_size, T(1.0), gru_value.state_weight, frame_size,
                   packed_state);
#endif
    for (size_t n = 0; n < num_batch; n++) {
      int bstart = static_cast<int>(batch_starts[n]);
      int bend = static_cast<int>(batch_starts[n + 1]);
      int cur_batch_size = bend - bstart;

      Tensor gate_t = batch_gate->Slice(bstart, bend);
      Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
      Tensor hidden_t = batch_hidden->Slice(bstart, bend);
      gru_value.output_value = hidden_t.data<T>();
      gru_value.gate_value = gate_t.data<T>();
      gru_value.reset_output_value = reset_hidden_prev_t.data<T>();

#ifdef PADDLE_WITH_MKLML
      if (gru_value.prev_out_value) {
        blas.GEMM_COMPUTE(CblasNoTrans, CblasPacked, cur_batch_size,
                          frame_size * 2, frame_size, gru_value.prev_out_value,
                          frame_size, packed_gate, frame_size * 2, T(1),
                          gru_value.gate_value, frame_size * 3);
      }

      math::detail::forward_reset_output(
          math::detail::forward::gru_resetOutput<T>(), gru_value, frame_size,
          cur_batch_size, active_gate);

      if (gru_value.prev_out_value) {
        blas.GEMM_COMPUTE(
            CblasNoTrans, CblasPacked, cur_batch_size, frame_size, frame_size,
            gru_value.reset_output_value, frame_size, packed_state, frame_size,
            T(1), gru_value.gate_value + frame_size * 2, frame_size * 3);
      }

      math::detail::forward_final_output(
          math::detail::forward::gru_finalOutput<T>(), gru_value, frame_size,
          cur_batch_size, active_node);
#else
      math::GRUUnitFunctor<DeviceContext, T>::compute(
          dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
          active_gate);
#endif
      gru_value.prev_out_value = gru_value.output_value;
    }
#ifdef PADDLE_WITH_MKLML
    blas.GEMM_FREE(packed_gate);
    blas.GEMM_FREE(packed_state);
#endif

    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
    batch_hidden->set_lod(batch_gate->lod());
    to_seq(dev_ctx, *batch_hidden, hidden);
  }

  void Compute(const framework::ExecutionContext& context) const override {
    BatchCompute(context);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

// Register the forward operator with its proto maker and the default
// grad-op description maker, and the backward operator gru_grad.
REGISTER_OPERATOR(gru, ops::GRUOp, ops::GRUOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(gru_grad, ops::GRUGradOp);

// CPU kernels for float and double. GRUGradKernel is not defined in this
// file (presumably it comes from gru_op.h).
REGISTER_OP_CPU_KERNEL(gru, ops::GRUCPUKernel<float>,
                       ops::GRUCPUKernel<double>);
REGISTER_OP_CPU_KERNEL(
    gru_grad, ops::GRUGradKernel<paddle::platform::CPUDeviceContext, float>,
    ops::GRUGradKernel<paddle::platform::CPUDeviceContext, double>);