sgd.cc 4.7 KB
Newer Older
T
tensor-tang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

15
#include "paddle/phi/kernels/funcs/jit/gen/sgd.h"
W
wanghuancoder 已提交
16

T
tensor-tang 已提交
17
#include <stddef.h>  // offsetof
W
wanghuancoder 已提交
18

19
#include "paddle/phi/backends/cpu/cpu_info.h"
20
#include "paddle/phi/kernels/funcs/jit/registry.h"
T
tensor-tang 已提交
21

22
namespace phi {
T
tensor-tang 已提交
23 24 25
namespace jit {
namespace gen {

26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
void SgdJitCode::mainCode(int num_regs) {
  constexpr size_t block_size = sizeof(float) * YMM_FLOAT_BLOCK;
  // load grad
  for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
    vmovups(ymm_t(reg_i), ptr[reg_ptr_grad_i]);
    add(reg_ptr_grad_i, block_size);
  }
  // load param
  for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
    vmovups(ymm_t(reg_i + num_regs), ptr[reg_ptr_param_i]);
    add(reg_ptr_param_i, block_size);
  }
  // compute out
  for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
    vmulps(ymm_t(reg_i), ymm_t(reg_i), ymm_lr);
    vsubps(ymm_t(reg_i + num_regs), ymm_t(reg_i + num_regs), ymm_t(reg_i));
  }
  // save out
  for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
    vmovups(ptr[reg_ptr_out_i], ymm_t(reg_i + num_regs));
    add(reg_ptr_out_i, block_size);
  }
}

T
tensor-tang 已提交
50 51 52 53 54 55 56
void SgdJitCode::genCode() {
  preCode();
  constexpr int block = YMM_FLOAT_BLOCK;
  constexpr int max_num_regs = 7;
  const int num_block = w_ / block;
  const int num_groups = num_block / max_num_regs;
  int rest_num_regs = num_block % max_num_regs;
57
  const size_t width_size = w_ * sizeof(float);
T
tensor-tang 已提交
58 59

  vbroadcastss(ymm_lr, ptr[param_lr]);
60

T
tensor-tang 已提交
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
  mov(reg_ptr_grad_i, param_grad);
  mov(reg_ptr_rows_i, param_rows);

  mov(reg_rows_size_in_byte,
      qword[param_attr + offsetof(sgd_attr_t, selected_rows_size)]);
  mov(rax, sizeof(int64_t));
  mul(reg_rows_size_in_byte);
  mov(reg_rows_size_in_byte, rax);
  add(reg_rows_size_in_byte, reg_ptr_rows_i);

  Label l_next_row;
  L(l_next_row);
  {
    mov(reg_row, qword[reg_ptr_rows_i]);
    mov(rax, width_size);
    mul(reg_row);
    mov(reg_row, rax);

    mov(reg_ptr_param_i, param_param);
    mov(reg_ptr_out_i, param_out);
    add(reg_ptr_param_i, reg_row);
    add(reg_ptr_out_i, reg_row);

84 85
    Label inner_loop;
    Label escape_loop;
86
    mov(rax, 0);  // NOLINT
87 88 89 90 91 92 93 94 95
    L(inner_loop);
    {
      cmp(rax, num_groups);
      jnb(escape_loop, T_NEAR);

      mainCode(max_num_regs);

      inc(rax);
      jmp(inner_loop, T_NEAR);
T
tensor-tang 已提交
96
    }
97 98
    L(escape_loop);
    mainCode(rest_num_regs);
T
tensor-tang 已提交
99 100

    add(reg_ptr_rows_i, sizeof(int64_t));
101

T
tensor-tang 已提交
102 103 104 105 106 107 108 109
    cmp(reg_ptr_rows_i, reg_rows_size_in_byte);
    jl(l_next_row, T_NEAR);
  }
  postCode();
}

// Creator that decides whether the generated SGD kernel is applicable for a
// given attribute set and, if so, builds the SgdJitCode instance.
class SgdCreator : public JitCodeCreator<sgd_attr_t> {
 public:
  // The generated kernel needs AVX and a gradient width that is a whole
  // number of YMM float blocks.
  bool CanBeUsed(const sgd_attr_t& attr) const override {
    if (!phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
      return false;
    }
    return attr.grad_width % YMM_FLOAT_BLOCK == 0;
  }

  // Fixed size estimate, independent of `attr`.
  // NOTE(review): the meaning of the 96 and 32 * 8 terms is not documented
  // here -- confirm how the base class uses this value.
  size_t CodeSize(const sgd_attr_t& attr) const override {
    return 96 + 32 * 8;
  }

  // Checks the attribute invariants, in this order:
  //   1. param_width == grad_width
  //   2. selected_rows_size <= grad_height
  //   3. selected_rows_size >= 0
  // and then constructs the code generator with CodeSize(attr).
  std::unique_ptr<GenBase> CreateJitCode(
      const sgd_attr_t& attr) const override {
    PADDLE_ENFORCE_EQ(
        attr.param_width,
        attr.grad_width,
        phi::errors::InvalidArgument(
            "The attribute param_width of Sgd should be "
            "equal to the attribute grad_width. But param_width "
            "is %d and grad_width is %d.",
            attr.param_width,
            attr.grad_width));

    PADDLE_ENFORCE_LE(
        attr.selected_rows_size,
        attr.grad_height,
        phi::errors::InvalidArgument(
            "The attribute selected_rows_size of Sgd should be "
            "equal to or less than the attribute grad_height. "
            "But selected_rows_size is %d and grad_height is %d.",
            attr.selected_rows_size,
            attr.grad_height));

    PADDLE_ENFORCE_GE(
        attr.selected_rows_size,
        0,
        phi::errors::InvalidArgument(
            "The attribute selected_rows_size of Sgd should be "
            "equal to or larger than 0. But selected_rows_size is %d.",
            attr.selected_rows_size));

    return make_unique<SgdJitCode>(attr, CodeSize(attr));
  }
};

}  // namespace gen
}  // namespace jit
146
}  // namespace phi
T
tensor-tang 已提交
147

148
// Global-scope shorthand so the creator can be named as gen::SgdCreator
// outside the phi::jit::gen namespace.
namespace gen = phi::jit::gen;
T
tensor-tang 已提交
149 150

// NOTE(review): presumably registers SgdCreator as the JIT code generator for
// the kSgd kernel type -- the macro is defined elsewhere (registry.h); confirm.
REGISTER_JITKERNEL_GEN(kSgd, gen::SgdCreator);