act.cc 5.0 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

#include "paddle/fluid/operators/jit/gen/act.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"

namespace paddle {
namespace operators {
namespace jit {
namespace gen {

T
tensor-tang 已提交
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
// Table of float constants, one REPEAT_8TIMES(...) group per constant.
// NOTE(review): each group presumably expands to 8 copies of its argument
// (one per float lane of a 256-bit register) and ALIGN32_BEG / ALIGN32_END
// presumably request 32-byte alignment -- both are defined outside this
// file; confirm there.
// NOTE(review): consumers presumably locate a constant by the position of its
// group, so treat the order of the entries as part of the layout -- verify
// against act.h before inserting or reordering.
const float ALIGN32_BEG exp_float_consts[] ALIGN32_END = {
    REPEAT_8TIMES(1.f),
    REPEAT_8TIMES(2.f),
    REPEAT_8TIMES(0.5f),
    REPEAT_8TIMES(EXP_HIG),
    REPEAT_8TIMES(EXP_LOW),
    REPEAT_8TIMES(CEPHES_LOG2EF),
    REPEAT_8TIMES(CEPHES_EXP_C1),
    REPEAT_8TIMES(CEPHES_EXP_C2),
    REPEAT_8TIMES(CEPHES_EXP_P0),
    REPEAT_8TIMES(CEPHES_EXP_P1),
    REPEAT_8TIMES(CEPHES_EXP_P2),
    REPEAT_8TIMES(CEPHES_EXP_P3),
    REPEAT_8TIMES(CEPHES_EXP_P4),
    REPEAT_8TIMES(CEPHES_EXP_P5),
    REPEAT_8TIMES(EXP_MAX_INPUT),
    REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
    REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
42

T
tensor-tang 已提交
43 44
// Integer constant 0x7f wrapped in REPEAT_8TIMES.
// NOTE(review): 0x7f == 127 is presumably the single-precision exponent bias
// used when the exp kernel builds 2^n -- confirm against act.h.
const int ALIGN32_BEG exp_int_0x7f[] ALIGN32_END = {REPEAT_8TIMES(0x7f)};
// Mutable 16-int buffer, initialized to zero. It is not referenced anywhere
// in this file.
// NOTE(review): presumably scratch memory for generated code; as a shared
// mutable global, check thread-safety at its use sites.
int ALIGN32_BEG g_tmp_mem[16] ALIGN32_END = {0};
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83

void VActJitCode::genCode() {
  int offset = 0;
  for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
    vmovups(ymm_src, ptr[param1 + offset]);
    act<ymm_t>(ymm_dst, ymm_src, type_);
    vmovups(ptr[param2 + offset], ymm_dst);
    offset += sizeof(float) * YMM_FLOAT_BLOCK;
  }
  int rest = num_ % YMM_FLOAT_BLOCK;
  while (rest > 0) {
    int block = XMM_FLOAT_BLOCK;
    if (rest >= 4) {
      block = 4;
      vmovups(xmm_src, ptr[param1 + offset]);
    } else if (rest >= 2) {
      block = 2;
      vmovq(xmm_src, ptr[param1 + offset]);
    } else {
      block = 1;
      vmovss(xmm_src, ptr[param1 + offset]);
    }
    act<xmm_t>(xmm_dst, xmm_src, type_);
    if (rest >= 4) {
      vmovups(ptr[param2 + offset], xmm_dst);
    } else if (rest >= 2) {
      vmovq(ptr[param2 + offset], xmm_dst);
    } else {
      vmovss(ptr[param2 + offset], xmm_dst);
    }
    offset += sizeof(float) * block;
    rest -= block;
  }
  ret();
}

// Declares `name##Creator`, a JitCodeCreator<int> for the `name` kernel.
// UseMe and CodeSize are only declared here and must be defined out of line
// for every kernel. CreateJitCode constructs a `name##JitCode`, passing the
// attribute and CodeSize(attr) as the two constructor arguments.
// Every line of the macro body except the last must end in a backslash; any
// line without one terminates the definition early.
#define DECLARE_ACT_CREATOR(name)                                            \
  class name##Creator : public JitCodeCreator<int> {                         \
   public:                                                                   \
    bool UseMe(const int& attr) const override;                              \
    size_t CodeSize(const int& d) const override;                            \
    std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override { \
      return make_unique<name##JitCode>(attr, CodeSize(attr));               \
    }                                                                        \
  }

// Declare one creator class per activation kernel. The UseMe and CodeSize
// members of each class are defined further down in this file.
DECLARE_ACT_CREATOR(VRelu);
DECLARE_ACT_CREATOR(VSquare);
DECLARE_ACT_CREATOR(VIdentity);
DECLARE_ACT_CREATOR(VExp);
DECLARE_ACT_CREATOR(VSigmoid);
DECLARE_ACT_CREATOR(VTanh);

// TODO(TJ): tune the UseMe conditions below.
T
tensor-tang 已提交
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122
// The VRelu JIT kernel is selected whenever platform::MayIUse reports AVX;
// the size argument `d` is not consulted.
bool VReluCreator::UseMe(const int& d) const {
  const bool has_avx = platform::MayIUse(platform::avx);
  return has_avx;
}

// The VSquare JIT kernel is selected whenever platform::MayIUse reports AVX;
// the size argument `d` is not consulted.
bool VSquareCreator::UseMe(const int& d) const {
  const bool has_avx = platform::MayIUse(platform::avx);
  return has_avx;
}

// The VIdentity JIT kernel is selected whenever platform::MayIUse reports
// AVX; the size argument `d` is not consulted.
bool VIdentityCreator::UseMe(const int& d) const {
  const bool has_avx = platform::MayIUse(platform::avx);
  return has_avx;
}

// The VExp JIT kernel is selected only when platform::MayIUse reports AVX
// and the size `d` is below 32.
bool VExpCreator::UseMe(const int& d) const {
  if (!platform::MayIUse(platform::avx)) {
    return false;
  }
  return d < 32;
}

// The VSigmoid JIT kernel is selected whenever platform::MayIUse reports
// AVX; the size argument `d` is not consulted.
bool VSigmoidCreator::UseMe(const int& d) const {
  const bool has_avx = platform::MayIUse(platform::avx);
  return has_avx;
}

// The VTanh JIT kernel is selected whenever platform::MayIUse reports AVX;
// the size argument `d` is not consulted.
bool VTanhCreator::UseMe(const int& d) const {
  const bool has_avx = platform::MayIUse(platform::avx);
  return has_avx;
}

123 124 125 126 127 128
// Estimated code size in bytes for the VRelu kernel: a fixed initial size
// plus, for each of (d / YMM_FLOAT_BLOCK + 3) blocks, 4 instructions at an
// average of 8 bytes per instruction.
size_t VReluCreator::CodeSize(const int& d) const {
  const int init_size = 96;
  const int insns_per_block = 4;
  const int avg_bytes_per_insn = 8;
  const auto blocks = d / YMM_FLOAT_BLOCK + 3;
  return init_size + blocks * insns_per_block * avg_bytes_per_insn;
}

T
tensor-tang 已提交
129 130 131 132
// Estimated code size for the VSquare kernel: 96 plus
// (d / YMM_FLOAT_BLOCK + 3) blocks times 4 times 8. The terms mirror the
// init-size / instructions / bytes-per-instruction breakdown annotated on
// VReluCreator::CodeSize.
size_t VSquareCreator::CodeSize(const int& d) const {
  const int init_size = 96;
  const int insns_per_block = 4;
  const int avg_bytes_per_insn = 8;
  const auto blocks = d / YMM_FLOAT_BLOCK + 3;
  return init_size + blocks * insns_per_block * avg_bytes_per_insn;
}

133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157
// Estimated code size for the VIdentity kernel: 96 plus
// (d / YMM_FLOAT_BLOCK + 3) blocks times 4 times 8. The terms mirror the
// init-size / instructions / bytes-per-instruction breakdown annotated on
// VReluCreator::CodeSize.
size_t VIdentityCreator::CodeSize(const int& d) const {
  const int init_size = 96;
  const int insns_per_block = 4;
  const int avg_bytes_per_insn = 8;
  const auto blocks = d / YMM_FLOAT_BLOCK + 3;
  return init_size + blocks * insns_per_block * avg_bytes_per_insn;
}

// Estimated code size for the VExp kernel: 96 plus
// (d / YMM_FLOAT_BLOCK + 3) blocks times 70 times 8. The terms mirror the
// init-size / instructions / bytes-per-instruction breakdown annotated on
// VReluCreator::CodeSize.
size_t VExpCreator::CodeSize(const int& d) const {
  const int init_size = 96;
  const int insns_per_block = 70;
  const int avg_bytes_per_insn = 8;
  const auto blocks = d / YMM_FLOAT_BLOCK + 3;
  return init_size + blocks * insns_per_block * avg_bytes_per_insn;
}

// Estimated code size for the VSigmoid kernel: 96 plus
// (d / YMM_FLOAT_BLOCK + 3) blocks times 82 times 8. The terms mirror the
// init-size / instructions / bytes-per-instruction breakdown annotated on
// VReluCreator::CodeSize.
size_t VSigmoidCreator::CodeSize(const int& d) const {
  const int init_size = 96;
  const int insns_per_block = 82;
  const int avg_bytes_per_insn = 8;
  const auto blocks = d / YMM_FLOAT_BLOCK + 3;
  return init_size + blocks * insns_per_block * avg_bytes_per_insn;
}

// Estimated code size for the VTanh kernel: 96 plus
// (d / YMM_FLOAT_BLOCK + 3) blocks times 84 times 8. The terms mirror the
// init-size / instructions / bytes-per-instruction breakdown annotated on
// VReluCreator::CodeSize.
size_t VTanhCreator::CodeSize(const int& d) const {
  const int init_size = 96;
  const int insns_per_block = 84;
  const int avg_bytes_per_insn = 8;
  const auto blocks = d / YMM_FLOAT_BLOCK + 3;
  return init_size + blocks * insns_per_block * avg_bytes_per_insn;
}

// DECLARE_ACT_CREATOR is only used for the creator declarations above;
// undefine it so the macro does not leak past this point.
#undef DECLARE_ACT_CREATOR

}  // namespace gen
}  // namespace jit
}  // namespace operators
}  // namespace paddle

// Short alias so the registrations below can name the creator classes
// without spelling out the full namespace path.
namespace gen = paddle::operators::jit::gen;

// One registration per activation kernel, pairing a kernel key with the
// creator class declared for it in this file. REGISTER_JITKERNEL_GEN itself
// is defined outside this file (registry.h is included above).
REGISTER_JITKERNEL_GEN(kVRelu, gen::VReluCreator);
REGISTER_JITKERNEL_GEN(kVSquare, gen::VSquareCreator);
REGISTER_JITKERNEL_GEN(kVIdentity, gen::VIdentityCreator);
REGISTER_JITKERNEL_GEN(kVExp, gen::VExpCreator);
REGISTER_JITKERNEL_GEN(kVSigmoid, gen::VSigmoidCreator);
REGISTER_JITKERNEL_GEN(kVTanh, gen::VTanhCreator);