/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

#include "paddle/fluid/operators/jit/more/mix/mix.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"

namespace paddle {
namespace operators {
namespace jit {
namespace more {
namespace mix {

using CPUPlace = platform::CPUPlace;

void VSigmoid(const T* x, T* y, int n) {
  const float min = SIGMOID_THRESHOLD_MIN;
  const float max = SIGMOID_THRESHOLD_MAX;
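  // Clamp x to [SIGMOID_THRESHOLD_MIN, SIGMOID_THRESHOLD_MAX], then negate in
  // place so that the VExp kernel below produces exp(-x).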
  for (int i = 0; i < n; ++i) {
    y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
    y[i] = static_cast<T>(0) - y[i];
  }
  auto compute = KernelFuncs<VExpTuple<T>, CPUPlace>::Cache().At(n);
  compute(y, y, n);
  for (int i = 0; i < n; ++i) {
    y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
  }
}

void VTanh(const T* x, T* y, int n) {
  const T a = 2, b = -1;
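  // tanh(x) = 2 * sigmoid(2x) - 1, composed from the VScal, VSigmoid, and
  // VAddBias kernels.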
  auto compute_scal = KernelFuncs<VScalTuple<T>, CPUPlace>::Cache().At(n);
  auto compute_addbias = KernelFuncs<VAddBiasTuple<T>, CPUPlace>::Cache().At(n);
  auto compute_sigmoid = KernelFuncs<VSigmoidTuple<T>, CPUPlace>::Cache().At(n);
  compute_scal(&a, x, y, n);
  compute_sigmoid(y, y, n);
  compute_scal(&a, y, y, n);
  compute_addbias(&b, y, y, n);
}

// remain is the product of the dimension sizes after the softmax axis
void Softmax(const T* x, T* y, int n, int bs, int remain) {
  auto compute_hmax = KernelFuncs<HMaxTuple<T>, CPUPlace>::Cache().At(n);
  auto compute_hsum = KernelFuncs<HSumTuple<T>, CPUPlace>::Cache().At(n);
  auto compute_vscal = KernelFuncs<VScalTuple<T>, CPUPlace>::Cache().At(n);
  auto compute_strideasum =
      KernelFuncs<StrideASumTuple<T>, CPUPlace>::Cache().At(n);
  auto compute_stridescal =
      KernelFuncs<StrideScalTuple<T>, CPUPlace>::Cache().At(n);
  auto compute_vaddbias =
      KernelFuncs<VAddBiasTuple<T>, CPUPlace>::Cache().At(n);
  auto compute_vexp = KernelFuncs<VExpTuple<T>, CPUPlace>::Cache().At(n);

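  // For each of the bs rows: subtract the row max (for numerical stability),
  // exponentiate, then normalize so the entries sum to one.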
  for (int i = 0; i < bs; ++i) {
    T scalar;
    compute_hmax(x, &scalar, n);
    scalar = static_cast<T>(0) - scalar;
    compute_vaddbias(&scalar, x, y, n);  // x - max
    compute_vexp(y, y, n);
    if (remain == 1) {
      compute_hsum(y, &scalar, n);
      scalar = static_cast<T>(1) / scalar;
      compute_vscal(&scalar, y, y, n);
    } else {
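      // Softmax along a strided axis: each of the `remain` interleaved slices
      // is normalized independently.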
      for (int j = 0; j < remain; ++j) {
        compute_strideasum(&y[j], &scalar, n, remain);
        scalar = static_cast<T>(1) / scalar;
        compute_stridescal(&scalar, &y[j], &y[j], n, remain);
      }
    }
    x += n;
    y += n;
  }
}

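// Look up an elementwise activation kernel of size d by its KernelType tag.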
void (*getActFunc(KernelType type, int d))(const T*, T*, int) {  // NOLINT
  if (type == kVSigmoid) {
    return KernelFuncs<VSigmoidTuple<T>, CPUPlace>::Cache().At(d);
  } else if (type == kVRelu) {
    return KernelFuncs<VReluTuple<T>, CPUPlace>::Cache().At(d);
  } else if (type == kVTanh) {
    return KernelFuncs<VTanhTuple<T>, CPUPlace>::Cache().At(d);
  } else if (type == kVIdentity) {
    return KernelFuncs<VIdentityTuple<T>, CPUPlace>::Cache().At(d);
  }
  PADDLE_THROW("Unsupported type: %s", type);
  return nullptr;
}

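// One LSTM step: compute C_t and H_t from the pre-projected gates.
// gates layout: {candidate, input, forget, output}, each segment of length d.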
void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) {
  T* gates = reinterpret_cast<T*>(step->gates);
  const T* ct_1 = reinterpret_cast<const T*>(step->ct_1);
  T* ct = reinterpret_cast<T*>(step->ct);
  T* ht = reinterpret_cast<T*>(step->ht);
  const T* wp = reinterpret_cast<const T*>(step->wp);
  T* checked = reinterpret_cast<T*>(step->checked);
  const int d = attr->d;
  const int d2 = d * 2;
  const int d3 = d * 3;
  auto vmul_d = KernelFuncs<VMulTuple<T>, CPUPlace>::Cache().At(d);
  auto vadd_d = KernelFuncs<VAddTuple<T>, CPUPlace>::Cache().At(d);
  auto vadd_d2 = KernelFuncs<VAddTuple<T>, CPUPlace>::Cache().At(d2);
  auto act_gate_d = getActFunc(attr->act_gate, d);
  auto act_gate_d2 = getActFunc(attr->act_gate, d2);
  auto act_gate_d3 = getActFunc(attr->act_gate, d3);
  auto act_cand_d = getActFunc(attr->act_cand, d);
  auto act_cell_d = getActFunc(attr->act_cell, d);

  if (attr->use_peephole) {
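    // Peephole connections: add W_ic * C_{t-1} and W_fc * C_{t-1} to the input
    // and forget gate pre-activations before applying act_gate.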
    vmul_d(wp, ct_1, checked, d);
    vmul_d(wp + d, ct_1, checked + d, d);
    vadd_d2(checked, gates + d, gates + d, d2);
    act_gate_d2(gates + d, gates + d, d2);
  } else {
    act_gate_d3(gates + d, gates + d, d3);
  }

  // C_t = C_{t-1} * fgated + cand_gated * igated
  act_cand_d(gates, gates, d);
  vmul_d(gates, gates + d, gates + d, d);
  vmul_d(ct_1, gates + d2, gates + d2, d);
  vadd_d(gates + d, gates + d2, ct, d);

  if (attr->use_peephole) {
    // get ogated
    vmul_d(wp + d2, ct, gates + d, d);
    vadd_d(gates + d, gates + d3, gates + d3, d);
    act_gate_d(gates + d3, gates + d3, d);
  }
  // H_t = act_cell(C_t) * ogated
  act_cell_d(ct, gates + d2, d);
  vmul_d(gates + d2, gates + d3, ht, d);
}

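// First LSTM step: compute C_1 and H_1 when there is no previous cell state.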
void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) {
  T* gates = reinterpret_cast<T*>(step->gates);
  T* ct = reinterpret_cast<T*>(step->ct);
  T* ht = reinterpret_cast<T*>(step->ht);
  int d = attr->d;
  int d2 = d * 2;
  int d3 = d * 3;
  auto vmul_d = KernelFuncs<VMulTuple<T>, CPUPlace>::Cache().At(d);
  auto vadd_d = KernelFuncs<VAddTuple<T>, CPUPlace>::Cache().At(d);
  auto act_gate_d = getActFunc(attr->act_gate, d);
  auto act_cand_d = getActFunc(attr->act_cand, d);
  auto act_cell_d = getActFunc(attr->act_cell, d);
  /* C_t = igated * cgated */
  act_gate_d(gates + d, gates + d, d);
  act_cand_d(gates, gates, d);
  vmul_d(gates, gates + d, ct, d);
  if (attr->use_peephole) {
    // get outgated, put W_oc * C_t on igated
    const T* wp = reinterpret_cast<const T*>(step->wp);
    vmul_d(wp + d2, ct, gates + d, d);
    vadd_d(gates + d, gates + d3, gates + d3, d);
  }
  /* H_t = act_cell(C_t) * ogated */
  act_gate_d(gates + d3, gates + d3, d);
  act_cell_d(ct, gates + d2, d);
  vmul_d(gates + d2, gates + d3, ht, d);
}

// compute h1 when there is no previous hidden state h0
void GRUH1(gru_t* step, const gru_attr_t* attr) {
  T* gates = reinterpret_cast<T*>(step->gates);
  T* ht = reinterpret_cast<T*>(step->ht);
  int d = attr->d;
  int d2 = d * 2;
  auto act_gate = getActFunc(attr->act_gate, d);
  auto act_cand = getActFunc(attr->act_cand, d);
  auto vmul_d = KernelFuncs<VMulTuple<T>, CPUPlace>::Cache().At(d);
  act_gate(gates, gates, d);
  act_cand(gates + d2, gates + d2, d);
  vmul_d(gates, gates + d2, ht, d);
}

// compute the first part of GRU: ht = act_gate(r) * ht_1
void GRUHtPart1(gru_t* step, const gru_attr_t* attr) {
  // W: {W_update, W_reset; W_state}
  T* gates = reinterpret_cast<T*>(step->gates);
  T* ht = reinterpret_cast<T*>(step->ht);
  const T* ht_1 = reinterpret_cast<const T*>(step->ht_1);
  auto act_gate = getActFunc(attr->act_gate, attr->d);
  auto vmul_d = KernelFuncs<VMulTuple<T>, CPUPlace>::Cache().At(attr->d);
  act_gate(gates + attr->d, gates + attr->d, attr->d);
  vmul_d(ht_1, gates + attr->d, ht, attr->d);
}

// compute the second part of GRU:
// ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1
void GRUHtPart2(gru_t* step, const gru_attr_t* attr) {
  T* gates = reinterpret_cast<T*>(step->gates);
  T* ht = reinterpret_cast<T*>(step->ht);
  const T* ht_1 = reinterpret_cast<const T*>(step->ht_1);
  int d = attr->d;
  auto act_gate = getActFunc(attr->act_gate, d);
  auto act_cand = getActFunc(attr->act_cand, d);
  T* y = gates + d * 2;
  act_gate(gates, gates, d);
  act_cand(y, y, d);
  // out = zt*ht~ + (1-zt)*ht_1
  for (int i = 0; i < d; ++i) {
    ht[i] = gates[i] * y[i] + (static_cast<T>(1) - gates[i]) * ht_1[i];
  }
}

// TODO(TJ): tuning me
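// CanBeUsed reports whether a mix kernel may serve the given size or
// attributes; every mix kernel is unconditionally enabled for now.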
bool VSigmoidKernel::CanBeUsed(const int& d) const { return true; }

bool VTanhKernel::CanBeUsed(const int& d) const { return true; }

bool SoftmaxKernel::CanBeUsed(const int& d) const { return true; }

bool LSTMCtHtKernel::CanBeUsed(const lstm_attr_t& attr) const { return true; }

bool LSTMC1H1Kernel::CanBeUsed(const lstm_attr_t& attr) const { return true; }

bool GRUH1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; }

bool GRUHtPart1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; }

bool GRUHtPart2Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; }

}  // namespace mix
}  // namespace more
}  // namespace jit
}  // namespace operators
}  // namespace paddle

namespace mix = paddle::operators::jit::more::mix;

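// Register each kernel above as a "more" implementation of its kernel type,
// e.g. kVSigmoid is served by mix::VSigmoidKernel.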
#define REGISTER_MORE_KERNEL(func) \
  REGISTER_JITKERNEL_MORE(k##func, mix, mix::func##Kernel)

REGISTER_MORE_KERNEL(VSigmoid);
REGISTER_MORE_KERNEL(VTanh);
REGISTER_MORE_KERNEL(Softmax);
REGISTER_MORE_KERNEL(LSTMCtHt);
REGISTER_MORE_KERNEL(LSTMC1H1);
REGISTER_MORE_KERNEL(GRUH1);
REGISTER_MORE_KERNEL(GRUHtPart1);
REGISTER_MORE_KERNEL(GRUHtPart2);

#undef REGISTER_MORE_KERNEL