/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <type_traits>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/gru_compute.h"

namespace paddle {
namespace operators {
namespace math {
namespace detail {
using Array1 = Eigen::DSizes<int64_t, 1>;
template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;

#if !defined(__NVCC__) && !defined(__HIPCC__)  // @{ Group for GRU CPU
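// Scalar fallback for the GRU reset-output stage: applies op_reset_output
// element by element over one frame. With old_version the gate layout is
// [update, reset, frame_state]; otherwise it is [reset, update, frame_state]
// and reset_bias is also consumed.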
template <class OpResetOutput, typename T>
void hl_naive_gru_forward_reset_output(
    OpResetOutput op_reset_output, T *gate_value, T *reset_output_value,
    const T *prev_output_value, int frame_size, ActivationType active_gate,
    bool old_version = true, const T *reset_bias = nullptr) {
  T r_value_update_gate;
  T r_value_reset_gate;
  T r_value_reset_output;
  T r_prev_out = 0;
  T r_reset_bias = 0;
  T *update_gate = nullptr;
  T *reset_gate = nullptr;
  if (old_version) {
    update_gate = gate_value;
    reset_gate = gate_value + frame_size;
  } else {
    reset_gate = gate_value;
    update_gate = gate_value + frame_size;
  }
  for (int i = 0; i < frame_size; i++) {
    r_value_update_gate = update_gate[i];
    r_value_reset_gate = reset_gate[i];
    if (!old_version) {
      r_value_reset_output = reset_output_value[i];
      r_reset_bias = reset_bias[i];
    }
    if (prev_output_value) {
      r_prev_out = prev_output_value[i];
    }

    op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out,
                    &r_value_reset_output, active_gate, &r_reset_bias,
                    old_version);

    update_gate[i] = r_value_update_gate;
    reset_gate[i] = r_value_reset_gate;
    reset_output_value[i] = r_value_reset_output;
  }
}

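// Scalar fallback for the GRU final-output stage: combines the update gate,
// candidate frame state and previous hidden state into output_value, one
// element at a time, via op_final_output.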
template <class OpFinalOutput, typename T>
void hl_naive_gru_forward_final_output(
    OpFinalOutput op_final_output, T *gate_value, const T *prev_output_value,
    T *output_value, int frame_size, ActivationType active_node,
    bool origin_mode, bool old_version = true) {
  T r_value_update_gate;
  T r_value_frame_state;
  T r_prev_out = 0;
  T r_output;
  T *update_gate;
  if (old_version) {
    update_gate = gate_value;
  } else {
    update_gate = gate_value + frame_size;
  }
  T *frame_state = gate_value + frame_size * 2;

  for (int i = 0; i < frame_size; i++) {
    r_value_update_gate = update_gate[i];
    r_value_frame_state = frame_state[i];
    if (prev_output_value) {
      r_prev_out = prev_output_value[i];
    }

    op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
                    &r_output, active_node, origin_mode);

    frame_state[i] = r_value_frame_state;
    output_value[i] = r_output;
  }
}

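// AVX version of the reset-output stage, processing 8 floats per iteration.
// When frame_size is not a multiple of 8, the last (possibly overlapping)
// 8-wide block is reprocessed after the main loop using gate and
// previous-output values preloaded before the loop.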
template <class OpResetOutput, typename T>
void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
                                     T *gate_value, T *reset_output_value,
                                     const T *prev_output_value, int frame_size,
                                     ActivationType active_gate,
                                     bool old_version = true,
                                     const T *reset_bias = nullptr) {
#ifdef __AVX__
  __m256 r_value_update_gate, r_value_update_gate_last = _mm256_set1_ps(0.0f);
  __m256 r_value_reset_gate, r_value_reset_gate_last = _mm256_set1_ps(0.0f);
  __m256 r_value_reset_output;
  __m256 r_prev_out = _mm256_set1_ps(0.0f),
         r_prev_out_last = _mm256_set1_ps(0.0f);
  __m256 r_reset_bias = _mm256_set1_ps(0.0f);
  T *update_gate;
  T *reset_gate;
  if (old_version) {
    update_gate = gate_value;
    reset_gate = gate_value + frame_size;
  } else {
    reset_gate = gate_value;
    update_gate = gate_value + frame_size;
  }
  int block = 8;
  const int n = frame_size;
  const int rest = n % block;
  const int end = n - rest;
  int i = 0;

  if (rest > 0) {
    i = n - block;
    r_value_update_gate_last =
        _mm256_loadu_ps((const float *)(update_gate + i));
    r_value_reset_gate_last = _mm256_loadu_ps((const float *)(reset_gate + i));
    if (prev_output_value) {
      r_prev_out_last = _mm256_loadu_ps((const float *)(prev_output_value + i));
    }
  }

  for (i = 0; i < end; i += block) {
    r_value_update_gate = _mm256_loadu_ps((const float *)(update_gate + i));
    r_value_reset_gate = _mm256_loadu_ps((const float *)(reset_gate + i));
    if (prev_output_value) {
      r_prev_out = _mm256_loadu_ps((const float *)(prev_output_value + i));
    }
    if (!old_version) {
      r_reset_bias = _mm256_loadu_ps((const float *)(reset_bias + i));
      r_value_reset_output =
          _mm256_loadu_ps((const float *)(reset_output_value + i));
    }

    op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out,
                    &r_value_reset_output, active_gate, &r_reset_bias,
                    old_version);

    _mm256_storeu_ps(reinterpret_cast<float *>(update_gate + i),
                     r_value_update_gate);
    _mm256_storeu_ps(reinterpret_cast<float *>(reset_gate + i),
                     r_value_reset_gate);
    _mm256_storeu_ps(reinterpret_cast<float *>(reset_output_value + i),
                     r_value_reset_output);
  }

  if (rest > 0) {
    i = n - block;

    op_reset_output(&r_value_update_gate_last, &r_value_reset_gate_last,
                    &r_prev_out_last, &r_value_reset_output, active_gate,
                    &r_reset_bias, old_version);

    _mm256_storeu_ps(reinterpret_cast<float *>(update_gate + i),
                     r_value_update_gate_last);
    _mm256_storeu_ps(reinterpret_cast<float *>(reset_gate + i),
                     r_value_reset_gate_last);
    _mm256_storeu_ps(reinterpret_cast<float *>(reset_output_value + i),
                     r_value_reset_output);
  }
#endif
}

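// AVX version of the final-output stage; uses the same 8-wide main loop and
// preloaded tail block as hl_avx_gru_forward_reset_output.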
template <class OpFinalOutput, typename T>
void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
                                     T *gate_value, const T *prev_output_value,
                                     T *output_value, int frame_size,
                                     ActivationType active_node,
                                     bool origin_mode,
                                     bool old_version = true) {
#ifdef __AVX__
  __m256 r_value_update_gate, r_value_update_gate_last = _mm256_set1_ps(0.0f);
  __m256 r_value_frame_state, r_value_frame_state_last = _mm256_set1_ps(0.0f);
  __m256 r_prev_out = _mm256_set1_ps(0.0f),
         r_prev_out_last = _mm256_set1_ps(0.0f);
  __m256 r_output;
  T *update_gate;
  if (old_version) {
    update_gate = gate_value;
  } else {
    update_gate = gate_value + frame_size;
  }

  T *frame_state = gate_value + frame_size * 2;
  int block = 8;
  const int n = frame_size;
  const int rest = n % block;
  const int end = n - rest;
  int i = 0;

  if (rest > 0) {
    i = n - block;
    r_value_update_gate_last =
        _mm256_loadu_ps((const float *)(update_gate + i));
    r_value_frame_state_last =
        _mm256_loadu_ps((const float *)(frame_state + i));
    if (prev_output_value) {
      r_prev_out_last = _mm256_loadu_ps((const float *)(prev_output_value + i));
    }
  }

  for (i = 0; i < end; i += block) {
    r_value_update_gate = _mm256_loadu_ps((const float *)(update_gate + i));
    r_value_frame_state = _mm256_loadu_ps((const float *)(frame_state + i));
    if (prev_output_value) {
      r_prev_out = _mm256_loadu_ps((const float *)(prev_output_value + i));
    }

    op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
                    &r_output, active_node, origin_mode);

    _mm256_storeu_ps(reinterpret_cast<float *>(frame_state + i),
                     r_value_frame_state);
    _mm256_storeu_ps(reinterpret_cast<float *>(output_value + i), r_output);
  }

  if (rest > 0) {
    i = n - block;
    op_final_output(&r_value_update_gate_last, &r_value_frame_state_last,
                    &r_prev_out_last, &r_output, active_node, origin_mode);

    _mm256_storeu_ps(reinterpret_cast<float *>(frame_state + i),
                     r_value_frame_state_last);
    _mm256_storeu_ps(reinterpret_cast<float *>(output_value + i), r_output);
  }

#endif
}

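// Eigen implementation of the reset-output stage for the new gate layout
// [reset, update, frame_state]: applies sigmoid to both gates and computes
// reset_output = (reset_output + reset_bias) * reset_gate for one frame.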
template <typename T>
inline void forward_reset_outputV2(const platform::CPUDeviceContext &context,
                                   GRUMetaValue<T> value, int frame_size) {
  auto &place = *context.eigen_device();
  auto value_reset_gate =
      typename EigenVector<T>::Type(value.gate_value, Array1(frame_size));
  auto value_update_gate = typename EigenVector<T>::Type(
      value.gate_value + frame_size, Array1(frame_size));
  auto value_reset_output = typename EigenVector<T>::Type(
      value.reset_output_value, Array1(frame_size));
  auto value_reset_bias =
      typename EigenVector<T>::ConstType(value.reset_bias, Array1(frame_size));
  SigmoidFunctor<T>()(place, value_reset_gate, value_reset_gate);
  SigmoidFunctor<T>()(place, value_update_gate, value_update_gate);
  value_reset_output.device(place) =
      (value_reset_output + value_reset_bias) * value_reset_gate;
}

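// Batch-level dispatcher for the reset-output stage: for each batch row it
// runs either the Eigen path (new version, requires a CPU device context) or
// the AVX/naive kernels, then advances the value pointers by one frame.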
template <class OpResetOutput, typename T>
inline void forward_reset_output(
    OpResetOutput op_reset_output, GRUMetaValue<T> value, int frame_size,
    int batch_size, ActivationType active_gate, bool old_version = true,
    const platform::CPUDeviceContext *context = nullptr) {
  for (int b = 0; b < batch_size; b++) {
    if (!old_version) {
      // use eigen
      forward_reset_outputV2(*context, value, frame_size);
    } else {
      if (OpResetOutput::avx && (frame_size > static_cast<int>(8 - 1)) &&
          (sizeof(T) == 4)) {
        hl_avx_gru_forward_reset_output(
            op_reset_output, value.gate_value, value.reset_output_value,
            value.prev_out_value, frame_size, active_gate, old_version,
            value.reset_bias);
      } else {
        hl_naive_gru_forward_reset_output(
            op_reset_output, value.gate_value, value.reset_output_value,
            value.prev_out_value, frame_size, active_gate, old_version,
            value.reset_bias);
      }
    }
    value.gate_value += frame_size * 3;
    value.reset_output_value += frame_size;
    if (value.prev_out_value) {
      value.prev_out_value += frame_size;
    }
  }
}

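// Eigen implementation of the final-output stage for the new gate layout:
// output = (1 - update_gate) * tanh(frame_state) + update_gate * prev_out.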
template <typename T>
inline void forward_final_outputV2(const platform::CPUDeviceContext &context,
                                   GRUMetaValue<T> value, int frame_size) {
  auto &place = *context.eigen_device();
  auto value_update_gate = typename EigenVector<T>::Type(
      value.gate_value + frame_size, Array1(frame_size));
  auto value_frame_state = typename EigenVector<T>::Type(
      value.gate_value + 2 * frame_size, Array1(frame_size));
  auto value_output =
      typename EigenVector<T>::Type(value.output_value, Array1(frame_size));
  TanhFunctor<T>()(place, value_frame_state, value_frame_state);
  value_output.device(place) =
      (static_cast<T>(1.0) - value_update_gate) * value_frame_state;
  if (value.prev_out_value) {
    auto value_prev_out = typename EigenVector<T>::ConstType(
        value.prev_out_value, Array1(frame_size));
    value_output.device(place) =
        value_output + value_update_gate * value_prev_out;
  }
}

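// Batch-level dispatcher for the final-output stage, mirroring
// forward_reset_output: Eigen path for the new version, AVX/naive otherwise.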
template <class OpFinalOutput, typename T>
inline void forward_final_output(
    OpFinalOutput op_final_output, GRUMetaValue<T> value, int frame_size,
    int batch_size, ActivationType active_node, bool origin_mode,
    bool old_version = true,
    const platform::CPUDeviceContext *context = nullptr) {
  for (int b = 0; b < batch_size; b++) {
    if (!old_version) {
      // eigen
      forward_final_outputV2(*context, value, frame_size);
    } else {
      if (OpFinalOutput::avx && (frame_size > static_cast<int>(8 - 1)) &&
          (sizeof(T) == 4)) {
        hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
                                        value.prev_out_value,
                                        value.output_value, frame_size,
                                        active_node, origin_mode, old_version);
      } else {
        hl_naive_gru_forward_final_output(
            op_final_output, value.gate_value, value.prev_out_value,
            value.output_value, frame_size, active_node, origin_mode,
            old_version);
      }
    }
    value.gate_value += frame_size * 3;
    value.output_value += frame_size;
    if (value.prev_out_value) {
      value.prev_out_value += frame_size;
    }
  }
}

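// Scalar backward pass through the final-output (state) stage: from the
// output gradient, op_state_grad produces gradients for the update gate,
// candidate frame state and previous hidden state, element by element.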
template <class OpStateGrad, typename T>
void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
                                      T *gate_grad, const T *prev_out_value,
                                      T *prev_out_grad, T *output_grad,
                                      int frame_size,
                                      ActivationType active_node,
                                      bool origin_mode) {
  T r_update_gate_value;
  T r_update_gate_grad;
  T r_frame_state_value;
  T r_frame_state_grad;
  T r_out_grad;
  T r_prev_out_value = 0;
  T r_prev_out_grad = 0;
  T *update_gate_value = gate_value;
  T *update_gate_grad = gate_grad;
  T *frame_state_value = gate_value + frame_size * 2;
  T *frame_state_grad = gate_grad + frame_size * 2;

  for (int i = 0; i < frame_size; i++) {
    r_update_gate_value = update_gate_value[i];
    r_frame_state_value = frame_state_value[i];
    r_out_grad = output_grad[i];
    if (prev_out_value) {
      r_prev_out_value = prev_out_value[i];
    }
    if (prev_out_grad) {
      r_prev_out_grad = prev_out_grad[i];
    }

    op_state_grad(&r_update_gate_value, &r_update_gate_grad,
                  &r_frame_state_value, &r_frame_state_grad, &r_prev_out_value,
                  &r_prev_out_grad, &r_out_grad, active_node, origin_mode);

    update_gate_grad[i] = r_update_gate_grad;
    frame_state_grad[i] = r_frame_state_grad;
    if (prev_out_grad) {
      prev_out_grad[i] = r_prev_out_grad;
    }
  }
}

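// Scalar backward pass through the reset-output stage: propagates the reset
// output gradient into the update gate, reset gate and previous hidden state
// gradients.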
template <class OpResetGrad, typename T>
void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
                                      T *gate_grad, const T *prev_out_value,
                                      T *prev_out_grad, T *reset_output_grad,
                                      int frame_size,
                                      ActivationType active_gate) {
  T r_update_gate_value;
  T r_update_gate_grad;
  T r_reset_gate_value;
  T r_reset_gate_grad;
  T r_reset_output_grad = 0;
  T r_prev_out_value = 0;
  T r_prev_out_grad = 0;
  T *update_gate_value = gate_value;
  T *update_gate_grad = gate_grad;
  T *reset_gate_value = gate_value + frame_size;
  T *reset_gate_grad = gate_grad + frame_size;

  for (int i = 0; i < frame_size; i++) {
    r_update_gate_value = update_gate_value[i];
    r_update_gate_grad = update_gate_grad[i];
    r_reset_gate_value = reset_gate_value[i];

    if (prev_out_value && prev_out_grad) {
      r_reset_output_grad = reset_output_grad[i];
    }
    if (prev_out_value) {
      r_prev_out_value = prev_out_value[i];
    }
    if (prev_out_grad) {
      r_prev_out_grad = prev_out_grad[i];
    }

    op_reset_grad(&r_update_gate_value, &r_update_gate_grad,
                  &r_reset_gate_value, &r_reset_gate_grad, &r_prev_out_value,
                  &r_prev_out_grad, &r_reset_output_grad, active_gate);

    update_gate_grad[i] = r_update_gate_grad;
    reset_gate_grad[i] = r_reset_gate_grad;
    if (prev_out_grad) {
      prev_out_grad[i] = r_prev_out_grad;
    }
  }
}

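// AVX backward state-grad kernel; it expects frame_size to be a multiple of
// 8, so the gate buffers are reinterpreted as __m256 arrays.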
template <class OpStateGrad, typename T>
void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
                                    T *gate_grad, const T *prev_out_value,
                                    T *prev_out_grad, T *output_grad,
                                    int frame_size, ActivationType active_node,
                                    bool origin_mode) {
#ifdef __AVX__
  __m256 r_update_gate_value;
  __m256 r_update_gate_grad;
  __m256 r_frame_state_value;
  __m256 r_frame_state_grad;
  __m256 r_out_grad;
  __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
  __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
  __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value);
  __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad);
  __m256 *frame_state_value =
      reinterpret_cast<__m256 *>(gate_value + frame_size * 2);
  __m256 *frame_state_grad =
      reinterpret_cast<__m256 *>(gate_grad + frame_size * 2);

  for (int i = 0; i < frame_size / 8; i++) {
    r_update_gate_value = update_gate_value[i];
    r_frame_state_value = frame_state_value[i];
    r_out_grad = (reinterpret_cast<__m256 *>(output_grad))[i];
    if (prev_out_value) {
      r_prev_out_value = (reinterpret_cast<const __m256 *>(prev_out_value))[i];
    }
    if (prev_out_grad) {
      r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i];
    }

    op_state_grad(&r_update_gate_value, &r_update_gate_grad,
                  &r_frame_state_value, &r_frame_state_grad, &r_prev_out_value,
                  &r_prev_out_grad, &r_out_grad, active_node, origin_mode);

    update_gate_grad[i] = r_update_gate_grad;
    frame_state_grad[i] = r_frame_state_grad;
    if (prev_out_grad) {
      (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad;
    }
  }
#endif
}

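// AVX backward reset-grad kernel; same layout assumptions as
// hl_avx_gru_backward_state_grad.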
template <class OpResetGrad, typename T>
void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
                                    T *gate_grad, const T *prev_out_value,
                                    T *prev_out_grad, T *reset_output_grad,
                                    int frame_size,
                                    ActivationType active_gate) {
#ifdef __AVX__
  __m256 r_update_gate_value;
  __m256 r_update_gate_grad;
  __m256 r_reset_gate_value;
  __m256 r_reset_gate_grad;
  __m256 r_reset_output_grad = _mm256_set1_ps(0.0f);
  __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
  __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
  __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value);
  __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad);
  __m256 *reset_gate_value =
      reinterpret_cast<__m256 *>(gate_value + frame_size);
  __m256 *reset_gate_grad = reinterpret_cast<__m256 *>(gate_grad + frame_size);

  for (int i = 0; i < frame_size / 8; i++) {
    r_update_gate_value = update_gate_value[i];
    r_update_gate_grad = update_gate_grad[i];
    r_reset_gate_value = reset_gate_value[i];

    if (prev_out_value && prev_out_grad) {
      r_reset_output_grad = (reinterpret_cast<__m256 *>(reset_output_grad))[i];
    }
    if (prev_out_value) {
      r_prev_out_value = (reinterpret_cast<const __m256 *>(prev_out_value))[i];
    }
    if (prev_out_grad) {
      r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i];
    }

    op_reset_grad(&r_update_gate_value, &r_update_gate_grad,
                  &r_reset_gate_value, &r_reset_gate_grad, &r_prev_out_value,
                  &r_prev_out_grad, &r_reset_output_grad, active_gate);

    update_gate_grad[i] = r_update_gate_grad;
    reset_gate_grad[i] = r_reset_gate_grad;
    if (prev_out_grad) {
      (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad;
    }
  }
#endif
}

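// Scalar backward kernel for the new GRU formulation: a single op_gru_grad
// call per element updates the reset gate, update gate, frame state,
// previous hidden state and reset output gradients together.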
template <class OpGruGrad, typename T>
inline void hl_naive_gru_backward(OpGruGrad op_gru_grad, T *gate_value,
                                  T *gate_grad, const T *prev_out_value,
                                  T *prev_out_grad, T *reset_output_value,
                                  T *reset_output_grad, T *output_grad,
                                  int frame_size, ActivationType active_node,
                                  ActivationType active_gate) {
  T r_value_reset_gate;
  T r_grad_reset_gate;
  T r_value_update_gate;
  T r_grad_update_gate;
  T r_value_frame_state;
  T r_grad_frame_state;
  T r_value_prev_out = 0;
  T r_grad_prev_out = 0;
  T r_grad_output;
  T r_value_reset_output;
  T r_grad_reset_output = 0;
  T *reset_gate_value = gate_value;
  T *reset_gate_grad = gate_grad;
  T *update_gate_value = gate_value + frame_size;
  T *update_gate_grad = gate_grad + frame_size;
  T *frame_state_value = gate_value + 2 * frame_size;
  T *frame_state_grad = gate_grad + 2 * frame_size;

  for (int i = 0; i < frame_size; ++i) {
    r_value_reset_gate = reset_gate_value[i];
    r_grad_reset_gate = reset_gate_grad[i];
    r_value_update_gate = update_gate_value[i];
    r_grad_update_gate = update_gate_grad[i];
    r_value_frame_state = frame_state_value[i];
    r_grad_frame_state = frame_state_grad[i];
    if (prev_out_value) {
      r_value_prev_out = prev_out_value[i];
    }
    if (prev_out_grad) {
      r_grad_prev_out = prev_out_grad[i];
    }
    r_grad_output = output_grad[i];
    r_value_reset_output = reset_output_value[i];
    if (prev_out_value && prev_out_grad) {
      r_grad_reset_output = reset_output_grad[i];
    }

    op_gru_grad(&r_value_reset_gate, &r_grad_reset_gate, &r_value_update_gate,
                &r_grad_update_gate, &r_value_frame_state, &r_grad_frame_state,
                &r_value_prev_out, &r_grad_prev_out, &r_grad_output,
                &r_value_reset_output, &r_grad_reset_output, active_node,
                active_gate);

    reset_gate_grad[i] = r_grad_reset_gate;
    update_gate_grad[i] = r_grad_update_gate;
    frame_state_grad[i] = r_grad_frame_state;
    if (prev_out_grad) {
      prev_out_grad[i] = r_grad_prev_out;
    }
    if (prev_out_value && prev_out_grad) {
      reset_output_grad[i] = r_grad_reset_output;
    }
  }
}

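// AVX counterpart of hl_naive_gru_backward, processing 8 floats per step;
// it assumes frame_size is a multiple of 8.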
template <class OpGruGrad, typename T>
inline void hl_avx_gru_backward(OpGruGrad op_gru_grad, T *gate_value,
                                T *gate_grad, const T *prev_out_value,
                                T *prev_out_grad, T *reset_output_value,
                                T *reset_output_grad, T *output_grad,
                                int frame_size, ActivationType active_node,
                                ActivationType active_gate) {
#ifdef __AVX__
  __m256 r_value_reset_gate;
  __m256 r_grad_reset_gate;
  __m256 r_value_update_gate;
  __m256 r_grad_update_gate;
  __m256 r_value_frame_state;
  __m256 r_grad_frame_state;
  __m256 r_value_prev_out = _mm256_set1_ps(0.0f);
  __m256 r_grad_prev_out = _mm256_set1_ps(0.0f);
  __m256 r_grad_output;
  __m256 r_value_reset_output;
  __m256 r_grad_reset_output = _mm256_set1_ps(0.0f);
  __m256 *reset_gate_value = reinterpret_cast<__m256 *>(gate_value);
  __m256 *reset_gate_grad = reinterpret_cast<__m256 *>(gate_grad);
  __m256 *update_gate_value =
      reinterpret_cast<__m256 *>(gate_value + frame_size);
  __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad + frame_size);
  __m256 *frame_state_value =
      reinterpret_cast<__m256 *>(gate_value + 2 * frame_size);
  __m256 *frame_state_grad =
      reinterpret_cast<__m256 *>(gate_grad + 2 * frame_size);

  for (int i = 0; i < frame_size / 8; ++i) {
    r_value_reset_gate = reset_gate_value[i];
    r_grad_reset_gate = reset_gate_grad[i];
    r_value_update_gate = update_gate_value[i];
    r_grad_update_gate = update_gate_grad[i];
    r_value_frame_state = frame_state_value[i];
    r_grad_frame_state = frame_state_grad[i];
    if (prev_out_value) {
      r_value_prev_out = (reinterpret_cast<const __m256 *>(prev_out_value))[i];
    }
    if (prev_out_grad) {
      r_grad_prev_out = (reinterpret_cast<__m256 *>(prev_out_grad))[i];
    }
    r_grad_output = (reinterpret_cast<__m256 *>(output_grad))[i];
    r_value_reset_output = (reinterpret_cast<__m256 *>(reset_output_value))[i];
    if (prev_out_value && prev_out_grad) {
      r_grad_reset_output = (reinterpret_cast<__m256 *>(reset_output_grad))[i];
    }

    op_gru_grad(&r_value_reset_gate, &r_grad_reset_gate, &r_value_update_gate,
                &r_grad_update_gate, &r_value_frame_state, &r_grad_frame_state,
                &r_value_prev_out, &r_grad_prev_out, &r_grad_output,
                &r_value_reset_output, &r_grad_reset_output, active_node,
                active_gate);

    reset_gate_grad[i] = r_grad_reset_gate;
    update_gate_grad[i] = r_grad_update_gate;
    frame_state_grad[i] = r_grad_frame_state;
    if (prev_out_grad) {
      (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_grad_prev_out;
    }
    if (prev_out_value && prev_out_grad) {
      (reinterpret_cast<__m256 *>(reset_output_grad))[i] = r_grad_reset_output;
    }
  }
#endif
}

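// Batch-level dispatcher for the state-grad backward stage: uses the AVX
// kernel when frame_size is a multiple of 8 and T is float, otherwise the
// scalar kernel, advancing the value and grad pointers per batch row.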
template <class OpStateGrad, typename T>
inline void backward_state_grad(OpStateGrad op_state_grad,
                                GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                                int frame_size, int batch_size,
                                ActivationType active_node, bool origin_mode) {
  for (int b = 0; b < batch_size; b++) {
    if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
      hl_avx_gru_backward_state_grad(op_state_grad, value.gate_value,
                                     grad.gate_grad, value.prev_out_value,
                                     grad.prev_out_grad, grad.output_grad,
                                     frame_size, active_node, origin_mode);
    } else {
      hl_naive_gru_backward_state_grad(op_state_grad, value.gate_value,
                                       grad.gate_grad, value.prev_out_value,
                                       grad.prev_out_grad, grad.output_grad,
                                       frame_size, active_node, origin_mode);
    }

    value.gate_value += frame_size * 3;
    if (value.prev_out_value) {
      value.prev_out_value += frame_size;
    }

    grad.gate_grad += frame_size * 3;
    grad.output_grad += frame_size;
    if (grad.prev_out_grad) {
      grad.prev_out_grad += frame_size;
    }
  }
}

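// Batch-level dispatcher for the reset-grad backward stage, mirroring
// backward_state_grad.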
template <class OpResetGrad, typename T>
inline void backward_reset_grad(OpResetGrad op_reset_grad,
                                GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                                int frame_size, int batch_size,
                                ActivationType active_gate) {
  for (int b = 0; b < batch_size; b++) {
    if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
      hl_avx_gru_backward_reset_grad(
          op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
          grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate);
    } else {
      hl_naive_gru_backward_reset_grad(
          op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
          grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate);
    }

    value.gate_value += frame_size * 3;
    if (value.prev_out_value) {
      value.prev_out_value += frame_size;
    }

    grad.gate_grad += frame_size * 3;
    grad.reset_output_grad += frame_size;
    if (grad.prev_out_grad) {
      grad.prev_out_grad += frame_size;
    }
  }
}

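// Eigen backward pass for one frame of the new GRU formulation: derives the
// update gate, frame state, reset gate, previous hidden state and reset
// output gradients from grad.output_grad.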
template <typename T>
inline void gru_backward(const platform::CPUDeviceContext &context,
                         GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                         int frame_size) {
  auto &place = *context.eigen_device();

  auto value_reset_gate =
      typename EigenVector<T>::Type(value.gate_value, Array1(frame_size));
  auto grad_reset_gate =
      typename EigenVector<T>::Type(grad.gate_grad, Array1(frame_size));
  auto value_update_gate = typename EigenVector<T>::Type(
      value.gate_value + frame_size, Array1(frame_size));
  auto grad_update_gate = typename EigenVector<T>::Type(
      grad.gate_grad + frame_size, Array1(frame_size));
  auto value_frame_state = typename EigenVector<T>::Type(
      value.gate_value + frame_size * 2, Array1(frame_size));
  auto grad_frame_state = typename EigenVector<T>::Type(
      grad.gate_grad + frame_size * 2, Array1(frame_size));

  auto grad_output =
      typename EigenVector<T>::Type(grad.output_grad, Array1(frame_size));
  auto value_reset_output = typename EigenVector<T>::Type(
      value.reset_output_value, Array1(frame_size));
  auto grad_reset_output =
      typename EigenVector<T>::Type(grad.reset_output_grad, Array1(frame_size));

  if (value.prev_out_value) {
    auto value_prev_out = typename EigenVector<T>::ConstType(
        value.prev_out_value, Array1(frame_size));
    SigmoidGradFunctor<T>()(place, 1 /*useless*/, value_update_gate,
                            (value_prev_out - value_frame_state) * grad_output,
                            grad_update_gate);
  } else {
    SigmoidGradFunctor<T>()(
        place, 1 /*useless*/, value_update_gate,
        static_cast<T>(-1) * value_frame_state * grad_output, grad_update_gate);
  }
  if (grad.prev_out_grad) {
    auto grad_prev_out =
        typename EigenVector<T>::Type(grad.prev_out_grad, Array1(frame_size));
    grad_prev_out.device(place) =
        grad_prev_out + grad_output * value_update_gate;
  }
  TanhGradFunctor<T>()(place, 1 /*useless*/, value_frame_state,
                       grad_output * (static_cast<T>(1.0) - value_update_gate),
                       grad_frame_state);
  SigmoidGradFunctor<T>()(
      place, 1 /*useless*/, value_reset_gate,
      value_reset_output / value_reset_gate * grad_frame_state,
      grad_reset_gate);
  if (value.prev_out_value && grad.prev_out_grad) {
    grad_reset_output.device(place) = value_reset_gate * grad_frame_state;
  }
}

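// Batch-level driver for the Eigen backward pass; op_gru_grad, active_node
// and active_gate are kept for interface compatibility but are not used on
// this path, which delegates to gru_backward.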
template <class OpGruGrad, typename T>
inline void cpu_gru_backward(const platform::CPUDeviceContext &context,
                             OpGruGrad op_gru_grad, GRUMetaValue<T> value,
                             GRUMetaGrad<T> grad, int frame_size,
                             int batch_size, ActivationType active_node,
                             ActivationType active_gate) {
  for (int b = 0; b < batch_size; ++b) {
    // eigen
    gru_backward(context, value, grad, frame_size);

    value.gate_value += frame_size * 3;
    value.reset_output_value += frame_size;
    if (value.prev_out_value) {
      value.prev_out_value += frame_size;
    }

    grad.gate_grad += frame_size * 3;
    grad.output_grad += frame_size;
    grad.reset_output_grad += frame_size;
    if (grad.prev_out_grad) {
      grad.prev_out_grad += frame_size;
    }
  }
}

#endif  // @} End Group for GRU CPU

}  // namespace detail
}  // namespace math
}  // namespace operators
}  // namespace paddle