/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <type_traits>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/gru_compute.h"

namespace paddle {
namespace operators {
namespace math {
namespace detail {
using Array1 = Eigen::DSizes<int64_t, 1>;
template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;

#ifndef __NVCC__
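// CPU GRU forward kernels. gate_value packs three gates per frame: with
// old_version == true the layout is [update, reset, cell]; otherwise it is
// [reset, update, cell]. The cell candidate (frame_state) always starts at
// gate_value + 2 * frame_size.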

template <class OpResetOutput, typename T>
void hl_naive_gru_forward_reset_output(
    OpResetOutput op_reset_output, T *gate_value, T *reset_output_value,
    const T *prev_output_value, int frame_size, ActivationType active_gate,
    bool old_version = true, const T *reset_bias = nullptr) {
  T r_value_update_gate;
  T r_value_reset_gate;
  T r_value_reset_output;
  T r_prev_out = 0;
  T r_reset_bias = 0;
  T *update_gate = nullptr;
  T *reset_gate = nullptr;
  if (old_version) {
    update_gate = gate_value;
    reset_gate = gate_value + frame_size;
  } else {
    reset_gate = gate_value;
    update_gate = gate_value + frame_size;
  }
  for (int i = 0; i < frame_size; i++) {
    r_value_update_gate = update_gate[i];
    r_value_reset_gate = reset_gate[i];
    if (!old_version) {
      r_value_reset_output = reset_output_value[i];
      r_reset_bias = reset_bias[i];
    }
    if (prev_output_value) {
      r_prev_out = prev_output_value[i];
    }

    op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out,
                    &r_value_reset_output, active_gate, &r_reset_bias,
                    old_version);

    update_gate[i] = r_value_update_gate;
    reset_gate[i] = r_value_reset_gate;
    reset_output_value[i] = r_value_reset_output;
  }
}

template <class OpFinalOutput, typename T>
void hl_naive_gru_forward_final_output(
    OpFinalOutput op_final_output, T *gate_value, const T *prev_output_value,
    T *output_value, int frame_size, ActivationType active_node,
    bool origin_mode, bool old_version = true) {
  T r_value_update_gate;
  T r_value_frame_state;
  T r_prev_out = 0;
  T r_output;
  T *update_gate;
  if (old_version) {
    update_gate = gate_value;
  } else {
    update_gate = gate_value + frame_size;
  }
  T *frame_state = gate_value + frame_size * 2;

  for (int i = 0; i < frame_size; i++) {
    r_value_update_gate = update_gate[i];
    r_value_frame_state = frame_state[i];
    if (prev_output_value) {
      r_prev_out = prev_output_value[i];
    }

    op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
                    &r_output, active_node, origin_mode);

    frame_state[i] = r_value_frame_state;
    output_value[i] = r_output;
  }
}
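// AVX variants of the forward kernels: each step processes a block of 8
// floats. When frame_size is not a multiple of 8, the final 8 elements are
// reloaded and handled as one overlapping block, so these paths assume
// frame_size >= 8.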

template <class OpResetOutput, typename T>
void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
                                     T *gate_value, T *reset_output_value,
                                     const T *prev_output_value, int frame_size,
                                     ActivationType active_gate,
                                     bool old_version = true,
                                     const T *reset_bias = nullptr) {
#ifdef __AVX__
  __m256 r_value_update_gate, r_value_update_gate_last = _mm256_set1_ps(0.0f);
  __m256 r_value_reset_gate, r_value_reset_gate_last = _mm256_set1_ps(0.0f);
  __m256 r_value_reset_output;
  __m256 r_prev_out = _mm256_set1_ps(0.0f),
         r_prev_out_last = _mm256_set1_ps(0.0f);
  __m256 r_reset_bias = _mm256_set1_ps(0.0f);
  T *update_gate;
  T *reset_gate;
  if (old_version) {
    update_gate = gate_value;
    reset_gate = gate_value + frame_size;
  } else {
    reset_gate = gate_value;
    update_gate = gate_value + frame_size;
  }
  int block = 8;
  const int n = frame_size;
  const int rest = n % block;
  const int end = n - rest;
  int i = 0;

  if (rest > 0) {
    i = n - block;
    r_value_update_gate_last =
        _mm256_loadu_ps((const float *)(update_gate + i));
    r_value_reset_gate_last = _mm256_loadu_ps((const float *)(reset_gate + i));
    if (prev_output_value) {
      r_prev_out_last = _mm256_loadu_ps((const float *)(prev_output_value + i));
    }
  }

  for (i = 0; i < end; i += block) {
    r_value_update_gate = _mm256_loadu_ps((const float *)(update_gate + i));
    r_value_reset_gate = _mm256_loadu_ps((const float *)(reset_gate + i));
    if (prev_output_value) {
      r_prev_out = _mm256_loadu_ps((const float *)(prev_output_value + i));
    }
    if (!old_version) {
      r_reset_bias = _mm256_loadu_ps((const float *)(reset_bias + i));
      r_value_reset_output =
          _mm256_loadu_ps((const float *)(reset_output_value + i));
    }

    op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out,
                    &r_value_reset_output, active_gate, &r_reset_bias,
                    old_version);

    _mm256_storeu_ps(reinterpret_cast<float *>(update_gate + i),
                     r_value_update_gate);
    _mm256_storeu_ps(reinterpret_cast<float *>(reset_gate + i),
                     r_value_reset_gate);
    _mm256_storeu_ps(reinterpret_cast<float *>(reset_output_value + i),
                     r_value_reset_output);
  }

  if (rest > 0) {
    i = n - block;

    op_reset_output(&r_value_update_gate_last, &r_value_reset_gate_last,
                    &r_prev_out_last, &r_value_reset_output, active_gate,
                    &r_reset_bias, old_version);

    _mm256_storeu_ps(reinterpret_cast<float *>(update_gate + i),
                     r_value_update_gate_last);
    _mm256_storeu_ps(reinterpret_cast<float *>(reset_gate + i),
                     r_value_reset_gate_last);
    _mm256_storeu_ps(reinterpret_cast<float *>(reset_output_value + i),
                     r_value_reset_output);
  }
#endif
}

template <class OpFinalOutput, typename T>
void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
                                     T *gate_value, const T *prev_output_value,
                                     T *output_value, int frame_size,
                                     ActivationType active_node,
                                     bool origin_mode,
                                     bool old_version = true) {
#ifdef __AVX__
  __m256 r_value_update_gate, r_value_update_gate_last = _mm256_set1_ps(0.0f);
  __m256 r_value_frame_state, r_value_frame_state_last = _mm256_set1_ps(0.0f);
  __m256 r_prev_out = _mm256_set1_ps(0.0f),
         r_prev_out_last = _mm256_set1_ps(0.0f);
  __m256 r_output;
  T *update_gate;
  if (old_version) {
    update_gate = gate_value;
  } else {
    update_gate = gate_value + frame_size;
  }

  T *frame_state = gate_value + frame_size * 2;
  int block = 8;
  const int n = frame_size;
  const int rest = n % block;
  const int end = n - rest;
  int i = 0;

  if (rest > 0) {
    i = n - block;
    r_value_update_gate_last =
        _mm256_loadu_ps((const float *)(update_gate + i));
    r_value_frame_state_last =
        _mm256_loadu_ps((const float *)(frame_state + i));
    if (prev_output_value) {
      r_prev_out_last = _mm256_loadu_ps((const float *)(prev_output_value + i));
    }
  }

  for (i = 0; i < end; i += block) {
    r_value_update_gate = _mm256_loadu_ps((const float *)(update_gate + i));
    r_value_frame_state = _mm256_loadu_ps((const float *)(frame_state + i));
    if (prev_output_value) {
      r_prev_out = _mm256_loadu_ps((const float *)(prev_output_value + i));
    }

    op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
                    &r_output, active_node, origin_mode);

    _mm256_storeu_ps(reinterpret_cast<float *>(frame_state + i),
                     r_value_frame_state);
    _mm256_storeu_ps(reinterpret_cast<float *>(output_value + i), r_output);
  }

  if (rest > 0) {
    i = n - block;
    op_final_output(&r_value_update_gate_last, &r_value_frame_state_last,
                    &r_prev_out_last, &r_output, active_node, origin_mode);

    _mm256_storeu_ps(reinterpret_cast<float *>(frame_state + i),
                     r_value_frame_state_last);
    _mm256_storeu_ps(reinterpret_cast<float *>(output_value + i), r_output);
  }

#endif
}

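// Eigen-based reset-output stage of the updated GRU formulation:
// r = sigmoid(r), u = sigmoid(u),
// reset_output = (reset_output + reset_bias) * r.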
template <typename T>
inline void forward_reset_outputV2(const platform::CPUDeviceContext &context,
                                   GRUMetaValue<T> value, int frame_size) {
  auto &place = *context.eigen_device();
  auto value_reset_gate =
      typename EigenVector<T>::Type(value.gate_value, Array1(frame_size));
  auto value_update_gate = typename EigenVector<T>::Type(
      value.gate_value + frame_size, Array1(frame_size));
  auto value_reset_output = typename EigenVector<T>::Type(
      value.reset_output_value, Array1(frame_size));
  auto value_reset_bias =
      typename EigenVector<T>::ConstType(value.reset_bias, Array1(frame_size));
  SigmoidFunctor<T>()(place, value_reset_gate, value_reset_gate);
  SigmoidFunctor<T>()(place, value_update_gate, value_update_gate);
  value_reset_output.device(place) =
      (value_reset_output + value_reset_bias) * value_reset_gate;
}

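// Per-batch-row dispatcher for the reset-output stage: the Eigen path is taken
// when old_version is false (context must be non-null there); otherwise the
// AVX kernel handles float data with frame_size >= 8 and the scalar kernel
// covers the rest.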
template <class OpResetOutput, typename T>
inline void forward_reset_output(
    OpResetOutput op_reset_output, GRUMetaValue<T> value, int frame_size,
    int batch_size, ActivationType active_gate, bool old_version = true,
    const platform::CPUDeviceContext *context = nullptr) {
  for (int b = 0; b < batch_size; b++) {
    if (!old_version) {
      // use eigen
      forward_reset_outputV2(*context, value, frame_size);
    } else {
      if (OpResetOutput::avx && (frame_size > static_cast<int>(8 - 1)) &&
          (sizeof(T) == 4)) {
        hl_avx_gru_forward_reset_output(
            op_reset_output, value.gate_value, value.reset_output_value,
            value.prev_out_value, frame_size, active_gate, old_version,
            value.reset_bias);
      } else {
        hl_naive_gru_forward_reset_output(
            op_reset_output, value.gate_value, value.reset_output_value,
            value.prev_out_value, frame_size, active_gate, old_version,
            value.reset_bias);
      }
    }
    value.gate_value += frame_size * 3;
    value.reset_output_value += frame_size;
    if (value.prev_out_value) {
      value.prev_out_value += frame_size;
    }
  }
}

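// Eigen-based output stage of the updated GRU formulation:
// h_t = (1 - u) * tanh(c) + u * h_{t-1}, where the second term is added only
// when a previous output is available.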
template <typename T>
inline void forward_final_outputV2(const platform::CPUDeviceContext &context,
                                   GRUMetaValue<T> value, int frame_size) {
  auto &place = *context.eigen_device();
  auto value_update_gate = typename EigenVector<T>::Type(
      value.gate_value + frame_size, Array1(frame_size));
  auto value_frame_state = typename EigenVector<T>::Type(
      value.gate_value + 2 * frame_size, Array1(frame_size));
  auto value_output =
      typename EigenVector<T>::Type(value.output_value, Array1(frame_size));
  TanhFunctor<T>()(place, value_frame_state, value_frame_state);
  value_output.device(place) =
      (static_cast<T>(1.0) - value_update_gate) * value_frame_state;
  if (value.prev_out_value) {
    auto value_prev_out = typename EigenVector<T>::ConstType(
        value.prev_out_value, Array1(frame_size));
    value_output.device(place) =
        value_output + value_update_gate * value_prev_out;
  }
}

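// Per-batch-row dispatcher for the output stage, mirroring
// forward_reset_output: Eigen when old_version is false, AVX for float data
// with frame_size >= 8, scalar otherwise.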
template <class OpFinalOutput, typename T>
inline void forward_final_output(
    OpFinalOutput op_final_output, GRUMetaValue<T> value, int frame_size,
    int batch_size, ActivationType active_node, bool origin_mode,
    bool old_version = true,
    const platform::CPUDeviceContext *context = nullptr) {
  for (int b = 0; b < batch_size; b++) {
    if (!old_version) {
      // eigen
      forward_final_outputV2(*context, value, frame_size);
    } else {
      if (OpFinalOutput::avx && (frame_size > static_cast<int>(8 - 1)) &&
          (sizeof(T) == 4)) {
        hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
                                        value.prev_out_value,
                                        value.output_value, frame_size,
                                        active_node, origin_mode, old_version);
      } else {
        hl_naive_gru_forward_final_output(
            op_final_output, value.gate_value, value.prev_out_value,
            value.output_value, frame_size, active_node, origin_mode,
            old_version);
      }
    }
    value.gate_value += frame_size * 3;
    value.output_value += frame_size;
    if (value.prev_out_value) {
      value.prev_out_value += frame_size;
    }
  }
}
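// Backward kernels for the original GRU formulation (scalar and AVX). The AVX
// variants carry no remainder handling, so the dispatchers below take them
// only when frame_size is a multiple of 8.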

template <class OpStateGrad, typename T>
void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
                                      T *gate_grad, const T *prev_out_value,
                                      T *prev_out_grad, T *output_grad,
                                      int frame_size,
                                      ActivationType active_node,
                                      bool origin_mode) {
  T r_update_gate_value;
  T r_update_gate_grad;
  T r_frame_state_value;
  T r_frame_state_grad;
  T r_out_grad;
  T r_prev_out_value = 0;
  T r_prev_out_grad = 0;
  T *update_gate_value = gate_value;
  T *update_gate_grad = gate_grad;
  T *frame_state_value = gate_value + frame_size * 2;
  T *frame_state_grad = gate_grad + frame_size * 2;

  for (int i = 0; i < frame_size; i++) {
    r_update_gate_value = update_gate_value[i];
    r_frame_state_value = frame_state_value[i];
    r_out_grad = output_grad[i];
    if (prev_out_value) {
      r_prev_out_value = prev_out_value[i];
    }
    if (prev_out_grad) {
      r_prev_out_grad = prev_out_grad[i];
    }

    op_state_grad(&r_update_gate_value, &r_update_gate_grad,
                  &r_frame_state_value, &r_frame_state_grad, &r_prev_out_value,
                  &r_prev_out_grad, &r_out_grad, active_node, origin_mode);

    update_gate_grad[i] = r_update_gate_grad;
    frame_state_grad[i] = r_frame_state_grad;
    if (prev_out_grad) {
      prev_out_grad[i] = r_prev_out_grad;
    }
  }
}

template <class OpResetGrad, typename T>
void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
                                      T *gate_grad, const T *prev_out_value,
                                      T *prev_out_grad, T *reset_output_grad,
                                      int frame_size,
                                      ActivationType active_gate) {
  T r_update_gate_value;
  T r_update_gate_grad;
  T r_reset_gate_value;
  T r_reset_gate_grad;
  T r_reset_output_grad = 0;
  T r_prev_out_value = 0;
  T r_prev_out_grad = 0;
  T *update_gate_value = gate_value;
  T *update_gate_grad = gate_grad;
  T *reset_gate_value = gate_value + frame_size;
  T *reset_gate_grad = gate_grad + frame_size;

  for (int i = 0; i < frame_size; i++) {
    r_update_gate_value = update_gate_value[i];
    r_update_gate_grad = update_gate_grad[i];
    r_reset_gate_value = reset_gate_value[i];

    if (prev_out_value && prev_out_grad) {
      r_reset_output_grad = reset_output_grad[i];
    }
    if (prev_out_value) {
      r_prev_out_value = prev_out_value[i];
    }
    if (prev_out_grad) {
      r_prev_out_grad = prev_out_grad[i];
    }

    op_reset_grad(&r_update_gate_value, &r_update_gate_grad,
                  &r_reset_gate_value, &r_reset_gate_grad, &r_prev_out_value,
                  &r_prev_out_grad, &r_reset_output_grad, active_gate);

    update_gate_grad[i] = r_update_gate_grad;
    reset_gate_grad[i] = r_reset_gate_grad;
    if (prev_out_grad) {
      prev_out_grad[i] = r_prev_out_grad;
    }
  }
}

template <class OpStateGrad, typename T>
void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
                                    T *gate_grad, const T *prev_out_value,
                                    T *prev_out_grad, T *output_grad,
                                    int frame_size, ActivationType active_node,
                                    bool origin_mode) {
#ifdef __AVX__
  __m256 r_update_gate_value;
  __m256 r_update_gate_grad;
  __m256 r_frame_state_value;
  __m256 r_frame_state_grad;
  __m256 r_out_grad;
  __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
  __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
  __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value);
  __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad);
  __m256 *frame_state_value =
      reinterpret_cast<__m256 *>(gate_value + frame_size * 2);
  __m256 *frame_state_grad =
      reinterpret_cast<__m256 *>(gate_grad + frame_size * 2);

  for (int i = 0; i < frame_size / 8; i++) {
    r_update_gate_value = update_gate_value[i];
    r_frame_state_value = frame_state_value[i];
    r_out_grad = (reinterpret_cast<__m256 *>(output_grad))[i];
    if (prev_out_value) {
      r_prev_out_value = (reinterpret_cast<const __m256 *>(prev_out_value))[i];
    }
    if (prev_out_grad) {
      r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i];
    }

    op_state_grad(&r_update_gate_value, &r_update_gate_grad,
                  &r_frame_state_value, &r_frame_state_grad, &r_prev_out_value,
                  &r_prev_out_grad, &r_out_grad, active_node, origin_mode);

    update_gate_grad[i] = r_update_gate_grad;
    frame_state_grad[i] = r_frame_state_grad;
    if (prev_out_grad) {
      (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad;
    }
  }
#endif
}

template <class OpResetGrad, typename T>
void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
                                    T *gate_grad, const T *prev_out_value,
                                    T *prev_out_grad, T *reset_output_grad,
                                    int frame_size,
                                    ActivationType active_gate) {
#ifdef __AVX__
  __m256 r_update_gate_value;
  __m256 r_update_gate_grad;
  __m256 r_reset_gate_value;
  __m256 r_reset_gate_grad;
  __m256 r_reset_output_grad = _mm256_set1_ps(0.0f);
  __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
  __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
  __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value);
  __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad);
  __m256 *reset_gate_value =
      reinterpret_cast<__m256 *>(gate_value + frame_size);
  __m256 *reset_gate_grad = reinterpret_cast<__m256 *>(gate_grad + frame_size);

  for (int i = 0; i < frame_size / 8; i++) {
    r_update_gate_value = update_gate_value[i];
    r_update_gate_grad = update_gate_grad[i];
    r_reset_gate_value = reset_gate_value[i];

    if (prev_out_value && prev_out_grad) {
      r_reset_output_grad = (reinterpret_cast<__m256 *>(reset_output_grad))[i];
    }
    if (prev_out_value) {
      r_prev_out_value = (reinterpret_cast<const __m256 *>(prev_out_value))[i];
    }
    if (prev_out_grad) {
      r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i];
    }

    op_reset_grad(&r_update_gate_value, &r_update_gate_grad,
                  &r_reset_gate_value, &r_reset_gate_grad, &r_prev_out_value,
                  &r_prev_out_grad, &r_reset_output_grad, active_gate);

    update_gate_grad[i] = r_update_gate_grad;
    reset_gate_grad[i] = r_reset_gate_grad;
    if (prev_out_grad) {
      (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad;
    }
  }
#endif
}

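// Fused scalar backward kernel for the updated GRU formulation (gate layout
// [reset, update, cell]); the AVX counterpart follows.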
template <class OpGruGrad, typename T>
inline void hl_naive_gru_backward(OpGruGrad op_gru_grad, T *gate_value,
                                  T *gate_grad, const T *prev_out_value,
                                  T *prev_out_grad, T *reset_output_value,
                                  T *reset_output_grad, T *output_grad,
                                  int frame_size, ActivationType active_node,
                                  ActivationType active_gate) {
  T r_value_reset_gate;
  T r_grad_reset_gate;
  T r_value_update_gate;
  T r_grad_update_gate;
  T r_value_frame_state;
  T r_grad_frame_state;
  T r_value_prev_out = 0;
  T r_grad_prev_out = 0;
  T r_grad_output;
  T r_value_reset_output;
  T r_grad_reset_output = 0;
  T *reset_gate_value = gate_value;
  T *reset_gate_grad = gate_grad;
  T *update_gate_value = gate_value + frame_size;
  T *update_gate_grad = gate_grad + frame_size;
  T *frame_state_value = gate_value + 2 * frame_size;
  T *frame_state_grad = gate_grad + 2 * frame_size;

  for (int i = 0; i < frame_size; ++i) {
    r_value_reset_gate = reset_gate_value[i];
    r_grad_reset_gate = reset_gate_grad[i];
    r_value_update_gate = update_gate_value[i];
    r_grad_update_gate = update_gate_grad[i];
    r_value_frame_state = frame_state_value[i];
    r_grad_frame_state = frame_state_grad[i];
    if (prev_out_value) {
      r_value_prev_out = prev_out_value[i];
    }
    if (prev_out_grad) {
      r_grad_prev_out = prev_out_grad[i];
    }
    r_grad_output = output_grad[i];
    r_value_reset_output = reset_output_value[i];
    if (prev_out_value && prev_out_grad) {
      r_grad_reset_output = reset_output_grad[i];
    }

    op_gru_grad(&r_value_reset_gate, &r_grad_reset_gate, &r_value_update_gate,
                &r_grad_update_gate, &r_value_frame_state, &r_grad_frame_state,
                &r_value_prev_out, &r_grad_prev_out, &r_grad_output,
                &r_value_reset_output, &r_grad_reset_output, active_node,
                active_gate);

    reset_gate_grad[i] = r_grad_reset_gate;
    update_gate_grad[i] = r_grad_update_gate;
    frame_state_grad[i] = r_grad_frame_state;
    if (prev_out_grad) {
      prev_out_grad[i] = r_grad_prev_out;
    }
    if (prev_out_value && prev_out_grad) {
      reset_output_grad[i] = r_grad_reset_output;
    }
  }
}

template <class OpGruGrad, typename T>
inline void hl_avx_gru_backward(OpGruGrad op_gru_grad, T *gate_value,
                                T *gate_grad, const T *prev_out_value,
                                T *prev_out_grad, T *reset_output_value,
                                T *reset_output_grad, T *output_grad,
                                int frame_size, ActivationType active_node,
                                ActivationType active_gate) {
#ifdef __AVX__
  __m256 r_value_reset_gate;
  __m256 r_grad_reset_gate;
  __m256 r_value_update_gate;
  __m256 r_grad_update_gate;
  __m256 r_value_frame_state;
  __m256 r_grad_frame_state;
  __m256 r_value_prev_out = _mm256_set1_ps(0.0f);
  __m256 r_grad_prev_out = _mm256_set1_ps(0.0f);
  __m256 r_grad_output;
  __m256 r_value_reset_output;
  __m256 r_grad_reset_output = _mm256_set1_ps(0.0f);
  __m256 *reset_gate_value = reinterpret_cast<__m256 *>(gate_value);
  __m256 *reset_gate_grad = reinterpret_cast<__m256 *>(gate_grad);
  __m256 *update_gate_value =
      reinterpret_cast<__m256 *>(gate_value + frame_size);
  __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad + frame_size);
  __m256 *frame_state_value =
      reinterpret_cast<__m256 *>(gate_value + 2 * frame_size);
  __m256 *frame_state_grad =
      reinterpret_cast<__m256 *>(gate_grad + 2 * frame_size);

  for (int i = 0; i < frame_size / 8; ++i) {
    r_value_reset_gate = reset_gate_value[i];
    r_grad_reset_gate = reset_gate_grad[i];
    r_value_update_gate = update_gate_value[i];
    r_grad_update_gate = update_gate_grad[i];
    r_value_frame_state = frame_state_value[i];
    r_grad_frame_state = frame_state_grad[i];
    if (prev_out_value) {
      r_value_prev_out = (reinterpret_cast<const __m256 *>(prev_out_value))[i];
    }
    if (prev_out_grad) {
      r_grad_prev_out = (reinterpret_cast<__m256 *>(prev_out_grad))[i];
    }
    r_grad_output = (reinterpret_cast<__m256 *>(output_grad))[i];
    r_value_reset_output = (reinterpret_cast<__m256 *>(reset_output_value))[i];
    if (prev_out_value && prev_out_grad) {
      r_grad_reset_output = (reinterpret_cast<__m256 *>(reset_output_grad))[i];
    }

    op_gru_grad(&r_value_reset_gate, &r_grad_reset_gate, &r_value_update_gate,
                &r_grad_update_gate, &r_value_frame_state, &r_grad_frame_state,
                &r_value_prev_out, &r_grad_prev_out, &r_grad_output,
                &r_value_reset_output, &r_grad_reset_output, active_node,
                active_gate);

    reset_gate_grad[i] = r_grad_reset_gate;
    update_gate_grad[i] = r_grad_update_gate;
    frame_state_grad[i] = r_grad_frame_state;
    if (prev_out_grad) {
      (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_grad_prev_out;
    }
    if (prev_out_value && prev_out_grad) {
      (reinterpret_cast<__m256 *>(reset_output_grad))[i] = r_grad_reset_output;
    }
  }
#endif
}

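// Dispatches the state-gradient step across the batch, using the AVX kernel
// only for float data whose frame_size is a multiple of 8.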
template <class OpStateGrad, typename T>
inline void backward_state_grad(OpStateGrad op_state_grad,
                                GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                                int frame_size, int batch_size,
                                ActivationType active_node, bool origin_mode) {
  for (int b = 0; b < batch_size; b++) {
    if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
      hl_avx_gru_backward_state_grad(op_state_grad, value.gate_value,
                                     grad.gate_grad, value.prev_out_value,
                                     grad.prev_out_grad, grad.output_grad,
                                     frame_size, active_node, origin_mode);
    } else {
      hl_naive_gru_backward_state_grad(op_state_grad, value.gate_value,
                                       grad.gate_grad, value.prev_out_value,
                                       grad.prev_out_grad, grad.output_grad,
                                       frame_size, active_node, origin_mode);
    }

    value.gate_value += frame_size * 3;
    if (value.prev_out_value) {
      value.prev_out_value += frame_size;
    }

    grad.gate_grad += frame_size * 3;
    grad.output_grad += frame_size;
    if (grad.prev_out_grad) {
      grad.prev_out_grad += frame_size;
    }
  }
}

template <class OpResetGrad, typename T>
inline void backward_reset_grad(OpResetGrad op_reset_grad,
                                GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                                int frame_size, int batch_size,
                                ActivationType active_gate) {
  for (int b = 0; b < batch_size; b++) {
    if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
      hl_avx_gru_backward_reset_grad(
          op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
          grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate);
    } else {
      hl_naive_gru_backward_reset_grad(
          op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
          grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate);
    }

    value.gate_value += frame_size * 3;
    if (value.prev_out_value) {
      value.prev_out_value += frame_size;
    }

    grad.gate_grad += frame_size * 3;
    grad.reset_output_grad += frame_size;
    if (grad.prev_out_grad) {
      grad.prev_out_grad += frame_size;
    }
  }
}

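// Eigen-based backward pass for one batch row of the updated GRU formulation:
// gradients for the update gate, frame state, reset gate and reset output are
// derived from output_grad, and prev_out_grad is accumulated when present.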
template <typename T>
inline void gru_backward(const platform::CPUDeviceContext &context,
                         GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                         int frame_size) {
  auto &place = *context.eigen_device();

  auto value_reset_gate =
      typename EigenVector<T>::Type(value.gate_value, Array1(frame_size));
  auto grad_reset_gate =
      typename EigenVector<T>::Type(grad.gate_grad, Array1(frame_size));
  auto value_update_gate = typename EigenVector<T>::Type(
      value.gate_value + frame_size, Array1(frame_size));
  auto grad_update_gate = typename EigenVector<T>::Type(
      grad.gate_grad + frame_size, Array1(frame_size));
  auto value_frame_state = typename EigenVector<T>::Type(
      value.gate_value + frame_size * 2, Array1(frame_size));
  auto grad_frame_state = typename EigenVector<T>::Type(
      grad.gate_grad + frame_size * 2, Array1(frame_size));

  auto grad_output =
      typename EigenVector<T>::Type(grad.output_grad, Array1(frame_size));
  auto value_reset_output = typename EigenVector<T>::Type(
      value.reset_output_value, Array1(frame_size));
  auto grad_reset_output =
      typename EigenVector<T>::Type(grad.reset_output_grad, Array1(frame_size));

  if (value.prev_out_value) {
    auto value_prev_out = typename EigenVector<T>::ConstType(
        value.prev_out_value, Array1(frame_size));
    SigmoidGradFunctor<T>()(place, 1 /*useless*/, value_update_gate,
                            (value_prev_out - value_frame_state) * grad_output,
                            grad_update_gate);
  } else {
    SigmoidGradFunctor<T>()(
        place, 1 /*useless*/, value_update_gate,
        static_cast<T>(-1) * value_frame_state * grad_output, grad_update_gate);
  }
  if (grad.prev_out_grad) {
    auto grad_prev_out =
        typename EigenVector<T>::Type(grad.prev_out_grad, Array1(frame_size));
    grad_prev_out.device(place) =
        grad_prev_out + grad_output * value_update_gate;
  }
  TanhGradFunctor<T>()(place, 1 /*useless*/, value_frame_state,
                       grad_output * (static_cast<T>(1.0) - value_update_gate),
                       grad_frame_state);
  SigmoidGradFunctor<T>()(
      place, 1 /*useless*/, value_reset_gate,
      value_reset_output / value_reset_gate * grad_frame_state,
      grad_reset_gate);
  if (value.prev_out_value && grad.prev_out_grad) {
    grad_reset_output.device(place) = value_reset_gate * grad_frame_state;
  }
}

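// Runs the Eigen-based backward pass row by row over the batch; op_gru_grad
// and the activation arguments are not used on this path.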
template <class OpGruGrad, typename T>
inline void cpu_gru_backward(const platform::CPUDeviceContext &context,
                             OpGruGrad op_gru_grad, GRUMetaValue<T> value,
                             GRUMetaGrad<T> grad, int frame_size,
                             int batch_size, ActivationType active_node,
                             ActivationType active_gate) {
  for (int b = 0; b < batch_size; ++b) {
    // eigen
    gru_backward(context, value, grad, frame_size);

    value.gate_value += frame_size * 3;
    value.reset_output_value += frame_size;
    if (value.prev_out_value) {
      value.prev_out_value += frame_size;
    }

    grad.gate_grad += frame_size * 3;
    grad.output_grad += frame_size;
    grad.reset_output_grad += frame_size;
    if (grad.prev_out_grad) {
      grad.prev_out_grad += frame_size;
    }
  }
}

#endif

}  // namespace detail
}  // namespace math
}  // namespace operators
}  // namespace paddle