提交 40402d5e 编写于 作者: T tensor-tang

add emb seqpool jitcode

test=develop
上级 2ccbcb15
...@@ -31,3 +31,4 @@ USE_JITKERNEL_GEN(kNCHW16CMulNC) ...@@ -31,3 +31,4 @@ USE_JITKERNEL_GEN(kNCHW16CMulNC)
USE_JITKERNEL_GEN(kSeqPool) USE_JITKERNEL_GEN(kSeqPool)
USE_JITKERNEL_GEN(kHMax) USE_JITKERNEL_GEN(kHMax)
USE_JITKERNEL_GEN(kHSum) USE_JITKERNEL_GEN(kHSum)
USE_JITKERNEL_GEN(kEmbSeqPool)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/embseqpool.h"
#include <stddef.h> // offsetof
#include <vector>
#include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
// Emits the AVX code for the fused "embedding lookup + sum pool" kernel.
//
// Shape of the generated code, as driven by the loops below:
//   for each group of at most 8 YMM-wide column blocks of the table
//     for each index column w in [0, index_width)
//       acc  = table[idx[0 * index_width + w]] (this group's columns)
//       for h in [1, index_height)
//         acc += table[idx[h * index_width + w]] (this group's columns)
//       dst[w * tbl_w_ + group columns] = acc
//
// Register plan inside one group: ymm[0, num_regs) are scratch loads and
// ymm[num_regs, 2 * num_regs) are the accumulators, so at most ymm0..ymm15
// are touched.
//
// `mul` multiplies rax by its operand and writes the product to rdx:rax, so
// every `mul` below clobbers rdx. param_dst is therefore copied into
// reg_ptr_param_dst before the first `mul`.
//
// preCode()/postCode() are defined outside this file; presumably they emit
// the prologue/epilogue that preserves the callee-saved registers used here
// -- TODO confirm.
void EmbSeqPoolJitCode::genCode() {
  preCode();
  constexpr int block = YMM_FLOAT_BLOCK;
  constexpr int max_num_regs = 8;
  const int num_block = tbl_w_ / block;
  const int num_groups = num_block / max_num_regs;
  const size_t block_size = sizeof(float) * block;
  // Full groups of max_num_regs blocks, plus one trailing partial group.
  std::vector<int> groups(num_groups, max_num_regs);
  const int rest_num_regs = num_block % max_num_regs;
  if (rest_num_regs > 0) {
    groups.push_back(rest_num_regs);
  }

  // protect param_dst
  mov(reg_ptr_param_dst, param_dst);
  mov(reg_idx_width_in_byte,
      qword[param_attr + offsetof(emb_seq_pool_attr_t, index_width)]);
  mov(reg_idx_height,
      qword[param_attr + offsetof(emb_seq_pool_attr_t, index_height)]);
  // Convert index_width from elements to bytes (one index is an int64_t).
  mov(rax, sizeof(int64_t));
  mul(reg_idx_width_in_byte);
  mov(reg_idx_width_in_byte, rax);

  const size_t tbl_width_in_byte = sizeof(float) * tbl_w_;
  int acc_num_regs = 0;
  for (int num_regs : groups) {
    Label l_next_idx_w, l_next_idx_h, l_save_now;
    xor_(reg_idx_w_i_in_byte, reg_idx_w_i_in_byte);
    // dst is re-derived from the saved base every group, so the cumulative
    // block count is the right offset here.
    mov(reg_ptr_dst_i, reg_ptr_param_dst);
    add(reg_ptr_dst_i, acc_num_regs * block_size);

    L(l_next_idx_w);
    {
      // h == 0: initialise the accumulators straight from the first row.
      mov(reg_ptr_idx_i, param_idx);
      add(reg_ptr_idx_i, reg_idx_w_i_in_byte);
      mov(reg_idx, qword[reg_ptr_idx_i]);
      mov(rax, tbl_width_in_byte);
      mul(reg_idx);
      mov(reg_ptr_tbl_i, rax);        // reg is offset now
      add(reg_ptr_tbl_i, param_tbl);  // reg is ptr_i now
      size_t w_offset = 0;
      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
        vmovups(ymm_t(reg_i + num_regs), ptr[reg_ptr_tbl_i + w_offset]);
        w_offset += block_size;
      }
      // Step to the same column in the next index row.
      add(reg_ptr_idx_i, reg_idx_width_in_byte);

      // end condition of idx h:
      //   param_idx + index_height * width_in_byte + current column offset
      mov(reg_idx_h_end, reg_idx_height);
      mov(rax, reg_idx_width_in_byte);
      mul(reg_idx_h_end);
      mov(reg_idx_h_end, rax);
      add(reg_idx_h_end, reg_idx_w_i_in_byte);
      add(reg_idx_h_end, param_idx);

      // index_height == 1: nothing left to accumulate.
      cmp(reg_ptr_idx_i, reg_idx_h_end);
      jge(l_save_now, T_NEAR);
      L(l_next_idx_h);
      {
        // reg_idx and reg_ptr_tbl_i share one register: the index is loaded,
        // consumed by `mul`, then replaced by the row pointer.
        mov(reg_idx, qword[reg_ptr_idx_i]);
        mov(rax, tbl_width_in_byte);
        mul(reg_idx);
        mov(reg_ptr_tbl_i, rax);
        add(reg_ptr_tbl_i, param_tbl);
        size_t w_offset = 0;
        for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
          vmovups(ymm_t(reg_i), ptr[reg_ptr_tbl_i + w_offset]);
          vaddps(ymm_t(reg_i + num_regs), ymm_t(reg_i + num_regs),
                 ymm_t(reg_i));
          w_offset += block_size;
        }
        add(reg_ptr_idx_i, reg_idx_width_in_byte);
        cmp(reg_ptr_idx_i, reg_idx_h_end);
        jl(l_next_idx_h, T_NEAR);
      }  // end of idx h
      L(l_save_now);
      // avg or sqrt here, if needed
      w_offset = 0;
      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
        vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i + num_regs));
        w_offset += block_size;
      }
      add(reg_ptr_dst_i, tbl_width_in_byte);
      add(reg_idx_w_i_in_byte, sizeof(int64_t));
      cmp(reg_idx_w_i_in_byte, reg_idx_width_in_byte);
      jl(l_next_idx_w, T_NEAR);
    }  // end of idx w

    acc_num_regs += num_regs;
    // param_tbl is advanced in place and keeps its value across groups, so
    // step it by this group's width only. Adding the cumulative acc_num_regs
    // would double count earlier groups and read the wrong columns from the
    // third group on.
    add(param_tbl, num_regs * block_size);
  }  // end of groups
  postCode();
}
// Creator for the JIT-generated embedding sequence-pool kernel: decides
// whether the generated kernel applies to an attribute set, estimates the
// code buffer size, and builds the EmbSeqPoolJitCode instance.
class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
 public:
  // The generated kernel needs AVX and a table width that is a whole number
  // of YMM float blocks.
  bool UseMe(const emb_seq_pool_attr_t& attr) const override {
    if (!platform::MayIUse(platform::avx)) {
      return false;
    }
    return attr.table_width % YMM_FLOAT_BLOCK == 0;
  }

  // Code buffer size estimate: a fixed 96 bytes plus 96 * 8 bytes per YMM
  // block of the table width.
  size_t CodeSize(const emb_seq_pool_attr_t& attr) const override {
    const auto num_blocks = attr.table_width / YMM_FLOAT_BLOCK;
    return 96 + num_blocks * 96 * 8;
  }

  // Checks that every dimension in attr is positive, then generates the code.
  std::unique_ptr<GenBase> CreateJitCode(
      const emb_seq_pool_attr_t& attr) const override {
    PADDLE_ENFORCE_GT(attr.table_height, 0);
    PADDLE_ENFORCE_GT(attr.table_width, 0);
    PADDLE_ENFORCE_GT(attr.index_height, 0);
    PADDLE_ENFORCE_GT(attr.index_width, 0);
    PADDLE_ENFORCE_GT(attr.out_width, 0);
    const size_t code_size = CodeSize(attr);
    return make_unique<EmbSeqPoolJitCode>(attr, code_size);
  }
};
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
// Short alias so the registration below can name the creator concisely.
namespace gen = paddle::operators::jit::gen;
// Associates EmbSeqPoolCreator with the kEmbSeqPool kernel key. The macro is
// defined outside this file (presumably in jit/registry.h -- TODO confirm).
REGISTER_JITKERNEL_GEN(kEmbSeqPool, gen::EmbSeqPoolCreator);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/operators/jit/gen/jitcode.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
// JIT code holder for the fused embedding-lookup + sequence-pool kernel.
// The constructor records the table width and pool type from attr, rejects
// every pool type other than sum, and then generates the code immediately.
class EmbSeqPoolJitCode : public JitCode {
 public:
  explicit EmbSeqPoolJitCode(const emb_seq_pool_attr_t& attr,
                             size_t code_size = 256 * 1024,
                             void* code_ptr = nullptr)
      : JitCode(code_size, code_ptr),
        tbl_w_(attr.table_width),
        type_(attr.pool_type) {
    const bool is_sum_pool = (type_ == SeqPoolType::kSum);
    if (!is_sum_pool) {
      LOG(FATAL) << "Only support sum pool yet ";
    }
    this->genCode();
  }

  // Builds "EmbSeqPoolJitCode[_Sum|_Avg|_Sqrt]_W<table width>"; no pool
  // suffix is added for any other pool type.
  std::string name() const override {
    const char* pool_suffix = "";
    switch (type_) {
      case SeqPoolType::kSum:
        pool_suffix = "_Sum";
        break;
      case SeqPoolType::kAvg:
        pool_suffix = "_Avg";
        break;
      case SeqPoolType::kSqrt:
        pool_suffix = "_Sqrt";
        break;
      default:
        break;
    }
    std::string result = "EmbSeqPoolJitCode";
    result += pool_suffix;
    result += "_W";
    result += std::to_string(tbl_w_);
    return result;
  }

  void genCode() override;

 private:
  int tbl_w_;          // table width in floats, taken from attr.table_width
  SeqPoolType type_;   // pool type, taken from attr.pool_type

  // Kernel arguments, in ABI parameter order.
  reg64_t param_tbl{abi_param1};
  reg64_t param_idx{abi_param2};
  reg64_t param_dst{abi_param3};
  reg64_t param_attr{abi_param4};

  // Working registers.
  reg64_t reg_tmp{rax};
  reg64_t reg_idx_width_in_byte{r8};
  reg64_t reg_idx_height{r9};
  reg64_t reg_ptr_tbl_i{r10};
  reg64_t reg_idx{r10};  // could use same of reg_ptr_tbl_i
  reg64_t reg_ptr_idx_i{r11};
  reg64_t reg_ptr_dst_i{r12};
  reg64_t reg_ptr_param_dst{r13};  // rdx is used in mul so protect param_dst
  reg64_t reg_idx_w_i_in_byte{r14};
  reg64_t reg_idx_h_end{r15};
};
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
...@@ -32,7 +32,7 @@ class SeqPoolJitCode : public JitCode { ...@@ -32,7 +32,7 @@ class SeqPoolJitCode : public JitCode {
: JitCode(code_size, code_ptr), w_(attr.w), type_(attr.type) { : JitCode(code_size, code_ptr), w_(attr.w), type_(attr.type) {
if (!(type_ == SeqPoolType::kSum || type_ == SeqPoolType::kAvg || if (!(type_ == SeqPoolType::kSum || type_ == SeqPoolType::kAvg ||
type_ == SeqPoolType::kSqrt)) { type_ == SeqPoolType::kSqrt)) {
LOG(FATAL) << "Only support sum pool yet "; LOG(FATAL) << "Only supported pool type: sum, avg and sqrt.";
} }
fp_h_[0] = 1.f; fp_h_[0] = 1.f;
this->genCode(); this->genCode();
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册