未验证 提交 ba02ac46 编写于 作者: T tensor-tang 提交者: GitHub

use mat attr and refine test (#15448)

* use mat attr and refine test

test=develop

* add matmul jitcode

test=develop

* fix mac compile

test=develop
上级 b5ebca47
...@@ -79,17 +79,17 @@ void FusionRepeatedFCReluOpMaker::Make() { ...@@ -79,17 +79,17 @@ void FusionRepeatedFCReluOpMaker::Make() {
} }
template <typename T> template <typename T>
static void fc_relu(const T* x, const T* w, const T* b, T* y, int m, int n, static void fc_relu(const T* x, const T* w, const T* b, T* y,
int k) { const jit::matmul_attr_t& attr) {
auto matmul = auto matmul =
jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(k); jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(attr);
auto addbias_relu = auto addbias_relu =
jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(n); jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(attr.n);
matmul(x, w, y, m, n, k); matmul(x, w, y, &attr);
T* dst = y; T* dst = y;
for (int i = 0; i < m; ++i) { for (int i = 0; i < attr.m; ++i) {
addbias_relu(b, dst, dst, n); addbias_relu(b, dst, dst, attr.n);
dst += n; dst += attr.n;
} }
} }
...@@ -107,32 +107,33 @@ class FusionRepeatedFCReluKernel : public framework::OpKernel<T> { ...@@ -107,32 +107,33 @@ class FusionRepeatedFCReluKernel : public framework::OpKernel<T> {
auto i_dims = in->dims(); auto i_dims = in->dims();
auto w_dims = weights[0]->dims(); auto w_dims = weights[0]->dims();
int m = i_dims[0]; jit::matmul_attr_t attr;
int n = w_dims[1]; attr.m = i_dims[0];
int k = w_dims[0]; attr.n = w_dims[1];
relus[0]->Resize({m, n}); attr.k = w_dims[0];
relus[0]->Resize({attr.m, attr.n});
fc_relu(in->data<T>(), weights[0]->data<T>(), biases[0]->data<T>(), fc_relu(in->data<T>(), weights[0]->data<T>(), biases[0]->data<T>(),
relus[0]->mutable_data<T>(place), m, n, k); relus[0]->mutable_data<T>(place), attr);
for (int i = 1; i < weight_sz - 1; ++i) { for (int i = 1; i < weight_sz - 1; ++i) {
auto i_dims = relus[i - 1]->dims(); auto i_dims = relus[i - 1]->dims();
auto w_dims = weights[i]->dims(); auto w_dims = weights[i]->dims();
int m = i_dims[0]; attr.m = i_dims[0];
int n = w_dims[1]; attr.n = w_dims[1];
int k = w_dims[0]; attr.k = w_dims[0];
relus[i]->Resize({m, n}); relus[i]->Resize({attr.m, attr.n});
fc_relu(relus[i - 1]->data<T>(), weights[i]->data<T>(), fc_relu(relus[i - 1]->data<T>(), weights[i]->data<T>(),
biases[i]->data<T>(), relus[i]->mutable_data<T>(place), m, n, k); biases[i]->data<T>(), relus[i]->mutable_data<T>(place), attr);
} }
auto i_dims_last = relus[weight_sz - 2]->dims(); auto i_dims_last = relus[weight_sz - 2]->dims();
auto w_dims_last = weights[weight_sz - 1]->dims(); auto w_dims_last = weights[weight_sz - 1]->dims();
m = i_dims_last[0]; attr.m = i_dims_last[0];
n = w_dims_last[1]; attr.n = w_dims_last[1];
k = w_dims_last[0]; attr.k = w_dims_last[0];
fc_relu(relus[weight_sz - 2]->data<T>(), weights[weight_sz - 1]->data<T>(), fc_relu(relus[weight_sz - 2]->data<T>(), weights[weight_sz - 1]->data<T>(),
biases[weight_sz - 1]->data<T>(), out->mutable_data<T>(place), m, n, biases[weight_sz - 1]->data<T>(), out->mutable_data<T>(place),
k); attr);
} }
}; };
......
...@@ -87,15 +87,18 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> { ...@@ -87,15 +87,18 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
auto x_dims = x->dims(); auto x_dims = x->dims();
auto y_dims = y->dims(); auto y_dims = y->dims();
int m = x_dims[0]; jit::matmul_attr_t attr;
int k = x_dims[1]; attr.m = x_dims[0];
int n = y_dims[1]; attr.k = x_dims[1];
int o_numel = m * n; attr.n = y_dims[1];
int o_numel = attr.m * attr.n;
auto vsquare_x = auto vsquare_x =
jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(m * k); jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(attr.m *
attr.k);
auto vsquare_y = auto vsquare_y =
jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(k * n); jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(attr.k *
attr.n);
auto vsquare_xy = auto vsquare_xy =
jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(o_numel); jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(o_numel);
auto vsub = auto vsub =
...@@ -103,7 +106,7 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> { ...@@ -103,7 +106,7 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
auto vscal = auto vscal =
jit::Get<jit::kVScal, jit::AXYNTuples<T>, platform::CPUPlace>(o_numel); jit::Get<jit::kVScal, jit::AXYNTuples<T>, platform::CPUPlace>(o_numel);
auto matmul = auto matmul =
jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(k); jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(attr);
const T* x_data = x->data<T>(); const T* x_data = x->data<T>();
const T* y_data = y->data<T>(); const T* y_data = y->data<T>();
...@@ -112,12 +115,12 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> { ...@@ -112,12 +115,12 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
T* squared_xy_data = squared_xy->mutable_data<T>(place); T* squared_xy_data = squared_xy->mutable_data<T>(place);
T* o_data = out->mutable_data<T>(place); T* o_data = out->mutable_data<T>(place);
matmul(x_data, y_data, squared_xy_data, m, n, k); matmul(x_data, y_data, squared_xy_data, &attr);
vsquare_xy(squared_xy_data, squared_xy_data, o_numel); vsquare_xy(squared_xy_data, squared_xy_data, o_numel);
vsquare_x(x_data, squared_x_data, m * k); vsquare_x(x_data, squared_x_data, attr.m * attr.k);
vsquare_y(y_data, squared_y_data, k * n); vsquare_y(y_data, squared_y_data, attr.k * attr.n);
matmul(squared_x_data, squared_y_data, o_data, m, n, k); matmul(squared_x_data, squared_y_data, o_data, &attr);
vsub(squared_xy_data, o_data, o_data, o_numel); vsub(squared_xy_data, o_data, o_data, o_numel);
vscal(&scalar, o_data, o_data, o_numel); vscal(&scalar, o_data, o_data, o_numel);
......
...@@ -311,8 +311,9 @@ void BenchMatMulKernel() { ...@@ -311,8 +311,9 @@ void BenchMatMulKernel() {
const T* a_data = a.data<T>(); const T* a_data = a.data<T>();
const T* b_data = b.data<T>(); const T* b_data = b.data<T>();
T* c_data = c.mutable_data<T>(PlaceType()); T* c_data = c.mutable_data<T>(PlaceType());
BenchAllImpls<KT, jit::MatMulTuples<T>, PlaceType>(k, a_data, b_data, const jit::matmul_attr_t attr{m, n, k};
c_data, m, n, k); BenchAllImpls<KT, jit::MatMulTuples<T>, PlaceType>(attr, a_data, b_data,
c_data, &attr);
} }
} }
} }
......
...@@ -9,6 +9,7 @@ function(USE_JITKERNEL_GEN TARGET) ...@@ -9,6 +9,7 @@ function(USE_JITKERNEL_GEN TARGET)
endfunction() endfunction()
# use gen jitcode kernel by name # use gen jitcode kernel by name
USE_JITKERNEL_GEN(kMatMul)
USE_JITKERNEL_GEN(kVMul) USE_JITKERNEL_GEN(kVMul)
USE_JITKERNEL_GEN(kVAdd) USE_JITKERNEL_GEN(kVAdd)
USE_JITKERNEL_GEN(kVSub) USE_JITKERNEL_GEN(kVSub)
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/matmul.h"
#include <stddef.h> // offsetof
#include <vector>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
// Emits the code computing z(1,N) = x(1,K) * y(K,N) for float data.
//
// The N output columns are split by packed_groups() into groups of vector
// registers. For each group, every x[k] is broadcast once and multiplied with
// the matching slice of row k of y, accumulating into registers
// [0, groups[g]). The accumulators are stored to z after the last k.
//
// y is read with the plain row-major layout (row stride of n_ floats), which
// is the layout the reference and MKL MatMul implementations use as well.
void MatMulJitCode::genCode() {
  preCode();
  int block, rest;
  const auto groups = packed_groups(n_, k_, &block, &rest);
  PADDLE_ENFORCE_GT(groups.front(), 0);

  const int block_len = sizeof(float) * block;
  // The two highest registers hold the broadcast x value and the loaded
  // weight; the lower ones are accumulators for z.
  const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1;
  const int w_reg_idx = x_reg_idx - 1;
  // The weight is taken from the second argument. Loading a packed weight
  // from the attribute would instead be:
  //   mov(reg_ptr_wgt,
  //       ptr[param_attr + offsetof(matmul_attr_t, packed_weight)]);
  mov(reg_ptr_wgt, param_y);

  // Byte length of one row of y.
  const size_t wgt_row_len = sizeof(float) * n_;
  // Byte offset of the current group's first column; valid for both z and
  // for a row of y because both have n_ columns.
  size_t z_offset = 0;
  for (size_t g = 0; g < groups.size(); ++g) {
    size_t x_offset = 0;
    for (int k = 0; k < k_; ++k) {
      // Row k of y, starting at this group's first column. Advancing one
      // continuous offset across rows would only be correct for a single
      // group whose width equals n_.
      size_t wgt_offset = z_offset + k * wgt_row_len;
      vbroadcastss(zmm_t(x_reg_idx), ptr[param_x + x_offset]);
      // clean the accumulators before the first multiply-add
      if (k == 0) {
        for (int i = 0; i < groups[g]; ++i) {
          vxorps(zmm_t(i), zmm_t(i), zmm_t(i));
        }
      }
      for (int i = 0; i < groups[g]; ++i) {
        vmovups(zmm_t(w_reg_idx), ptr[reg_ptr_wgt + wgt_offset]);
        vfmadd231ps(zmm_t(i), zmm_t(w_reg_idx), zmm_t(x_reg_idx));
        wgt_offset += block_len;
      }
      // last one, save
      if (k == k_ - 1) {
        for (int i = 0; i < groups[g]; ++i) {
          // The final, partially filled register is written by the tail
          // code below so that nothing is stored past the end of z.
          if (rest != 0 && g == groups.size() - 1 && i == groups[g] - 1) {
            break;
          }
          vmovups(ptr[param_z + z_offset + i * block_len], zmm_t(i));
        }
      }
      x_offset += sizeof(float);
    }
    z_offset += block_len * groups[g];
  }

  if (rest != 0) {
    // below should refine with mask
    // NOTE(review): every store below reads the low lanes of the same
    // register, so a tail needing more than one store looks incomplete (see
    // the "shift" remark). MatMulCreator::UseMe only accepts n that is a
    // multiple of ZMM_FLOAT_BLOCK, so this path is not selected through it.
    int reg_idx = groups.back() - 1;
    z_offset = (n_ - rest) * sizeof(float);
    int inner_block = 8;
    while (rest > 0) {
      if (rest >= 8) {
        inner_block = 8;
        vmovups(ptr[param_z + z_offset], ymm_t(reg_idx));
        // shift zmm of inner_block, change reg_idx if update
      } else if (rest >= 4) {
        inner_block = 4;
        vmovups(ptr[param_z + z_offset], xmm_t(reg_idx));
      } else if (rest >= 2) {
        inner_block = 2;
        vmovq(ptr[param_z + z_offset], xmm_t(reg_idx));
      } else {
        inner_block = 1;
        vmovss(ptr[param_z + z_offset], xmm_t(reg_idx));
      }
      z_offset += inner_block * sizeof(float);
      rest -= inner_block;
    }
  }

  postCode();
}
// Creator for the generated MatMul kernel: decides whether the generated code
// can serve a given attribute, estimates the code buffer size and builds the
// code object.
class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
 public:
  // The generated code only handles a single-row left operand, needs AVX512F,
  // a column count that fills whole ZMM registers, and k below 512.
  bool UseMe(const matmul_attr_t& attr) const override {
    return attr.m == 1 && platform::MayIUse(platform::avx512f) &&
           attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512;
  }

  // Returns the number of bytes reserved for the generated code.
  // The product is evaluated in size_t so that large n cannot overflow int.
  size_t CodeSize(const matmul_attr_t& attr) const override {
    const int block = platform::MayIUse(platform::avx512f) ? ZMM_FLOAT_BLOCK
                                                           : YMM_FLOAT_BLOCK;
    const size_t num_blocks = static_cast<size_t>(attr.n / block + 1);
    return 96 + 4 * static_cast<size_t>(attr.k) * num_blocks * 8;
  }

  // Validates that all dimensions are positive and builds the code object.
  std::unique_ptr<GenBase> CreateJitCode(
      const matmul_attr_t& attr) const override {
    PADDLE_ENFORCE_GT(attr.m, 0);
    PADDLE_ENFORCE_GT(attr.n, 0);
    PADDLE_ENFORCE_GT(attr.k, 0);
    return make_unique<MatMulJitCode>(attr, CodeSize(attr));
  }
};
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
namespace gen = paddle::operators::jit::gen;
REGISTER_JITKERNEL_GEN(kMatMul, gen::MatMulCreator);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <stdlib.h> // for malloc and free
#include <string>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/operators/jit/gen/jitcode.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
// Code generator for a float matrix multiplication z = x * y with the
// dimensions taken from a matmul_attr_t. Only m == 1 is accepted; the code is
// generated in the constructor.
class MatMulJitCode : public JitCode {
 public:
  // attr:      the m, n, k dimensions of the multiplication.
  // code_size: size passed to JitCode for the code buffer.
  // code_ptr:  optional buffer passed to JitCode.
  explicit MatMulJitCode(const matmul_attr_t& attr,
                         size_t code_size = 256 * 1024,
                         void* code_ptr = nullptr)
      : JitCode(code_size, code_ptr),
        m_(attr.m),
        n_(attr.n),
        k_(attr.k),
        name_("MatMulJitCode_M" + std::to_string(attr.m) + "_N" +
              std::to_string(attr.n) + "_K" + std::to_string(attr.k)) {
    PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet");
    this->genCode();
  }

  // Returns "MatMulJitCode_M<m>_N<n>_K<k>". The string is owned by this
  // object, so the pointer stays valid for the object's lifetime (returning
  // c_str() of a function-local string would dangle).
  virtual const char* name() const { return name_.c_str(); }

  void genCode() override;

 private:
  int m_, n_, k_;
  // Built once in the constructor, before genCode() runs.
  std::string name_;

  // Registers carrying the four call arguments: x, y, z and the attribute.
  reg64_t param_x{abi_param1};
  reg64_t param_y{abi_param2};
  reg64_t param_z{abi_param3};
  reg64_t param_attr{abi_param4};
  reg64_t reg_tmp{rax};

  // Base pointer used for every weight load in genCode().
  reg64_t reg_ptr_wgt{r10};
};
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
...@@ -16,6 +16,8 @@ ...@@ -16,6 +16,8 @@
#include <fstream> #include <fstream>
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include <vector>
#include "paddle/fluid/platform/cpu_info.h"
DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
...@@ -38,6 +40,35 @@ void GenBase::dumpCode(const unsigned char* code) const { ...@@ -38,6 +40,35 @@ void GenBase::dumpCode(const unsigned char* code) const {
} }
} }
// Splits the n columns into register-sized blocks and bundles those blocks
// into groups that fit the available vector registers.
//
// Returns one entry per group holding the number of blocks (registers) in it.
// block_out (optional) receives the number of floats per block and rest_out
// (optional) receives n % block. The parameter k is not used here.
std::vector<int> packed_groups(int n, int k, int* block_out, int* rest_out) {
  const bool has_avx512 = platform::MayIUse(platform::avx512f);
  const int lanes = has_avx512 ? ZMM_FLOAT_BLOCK : YMM_FLOAT_BLOCK;
  const int total_regs = has_avx512 ? 32 : 16;
  // one register for x, one for y, all the others for z
  const int regs_per_group = total_regs - 2;

  const int remainder = n % lanes;
  // n rounded up to a whole number of blocks
  const int padded_n = remainder == 0 ? n : (n / lanes + 1) * lanes;
  const int block_count = padded_n / lanes;

  std::vector<int> groups(block_count / regs_per_group, regs_per_group);
  const int tail_regs = block_count % regs_per_group;
  if (tail_regs != 0) {
    groups.push_back(tail_regs);
  }

  if (block_out != nullptr) {
    *block_out = lanes;
  }
  if (rest_out != nullptr) {
    *rest_out = remainder;
  }
  return groups;
}
} // namespace jit } // namespace jit
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include <memory> // for unique_ptr #include <memory> // for unique_ptr
#include <vector>
#include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/operators/jit/kernel_base.h"
DECLARE_bool(dump_jitcode); DECLARE_bool(dump_jitcode);
...@@ -67,6 +68,11 @@ class JitCodeCreator : public GenCreator { ...@@ -67,6 +68,11 @@ class JitCodeCreator : public GenCreator {
virtual std::unique_ptr<GenBase> CreateJitCode(const Attr& attr) const = 0; virtual std::unique_ptr<GenBase> CreateJitCode(const Attr& attr) const = 0;
}; };
// Unified helper for computing the packed groups.
// Returns the packed groups used for the weights and outputs the block size
// and the rest size through the optional pointers.
std::vector<int> packed_groups(int n, int k, int* block = nullptr,
int* rest = nullptr);
} // namespace jit } // namespace jit
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
#include "paddle/fluid/operators/jit/helper.h" #include "paddle/fluid/operators/jit/helper.h"
#include <algorithm> // tolower #include <algorithm> // tolower
#include <numeric>
#include <string>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
...@@ -91,6 +93,41 @@ KernelType to_kerneltype(const std::string& act) { ...@@ -91,6 +93,41 @@ KernelType to_kerneltype(const std::string& act) {
return kNone; return kNone;
} }
// Repacks a row-major (k x n) float matrix group by group, following the
// grouping returned by packed_groups(). For every group, the k row slices
// belonging to that group are written back to back into dst, each padded with
// zeros up to the group's full width.
//
// dst must have room for k * (sum of groups) * block floats.
template <>
void pack_weights<float>(const float* src, float* dst, int n, int k) {
  int block, rest;
  const auto groups = packed_groups(n, k, &block, &rest);
  for (const int regs : groups) {
    PADDLE_ENFORCE_GT(regs, 0,
                      "each element of groups should be larger than 0.");
  }

  const int sum = std::accumulate(groups.begin(), groups.end(), 0);
  // Zero everything first so the padding of a partial last block stays zero.
  std::memset(dst, 0, k * sum * block * sizeof(float));
  PADDLE_ENFORCE_GE(sum * block, n,
                    "The packed n should be equal to or larger than n");

  const int block_len = sizeof(float) * block;
  const size_t last_group = groups.size() - 1;
  int n_offset = 0;
  for (size_t g = 0; g < groups.size(); ++g) {
    const int group_width = groups[g] * block;
    // The last group copies only the valid floats of its partial block.
    size_t copy_sz = groups[g] * block_len;
    if (g == last_group && rest != 0) {
      copy_sz = (groups[g] - 1) * block_len + rest * sizeof(float);
    }
    const float* from = src + n_offset;
    for (int row = 0; row < k; ++row) {
      std::memcpy(dst, from + row * n, copy_sz);
      dst += group_width;
    }
    n_offset += group_width;
  }
}
// Overload enabled (through std::enable_if) for every T other than float.
// Packing is only implemented for float, so this reports an error through
// PADDLE_THROW instead of packing anything.
template <typename T>
typename std::enable_if<!std::is_same<T, float>::value>::type pack_weights(
const T* src, T* dst, int n, int k) {
PADDLE_THROW("Only support pack with float type.");
}
} // namespace jit } // namespace jit
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -152,17 +152,28 @@ inline std::ostream& operator<<(std::ostream& os, const lstm_attr_t& attr) { ...@@ -152,17 +152,28 @@ inline std::ostream& operator<<(std::ostream& os, const lstm_attr_t& attr) {
<< (attr.use_peephole ? "True" : "False") << "]"; << (attr.use_peephole ? "True" : "False") << "]";
return os; return os;
} }
inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) { inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) {
os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate) os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate)
<< "],act_cand[" << to_string(attr.act_cand) << "]"; << "],act_cand[" << to_string(attr.act_cand) << "]";
return os; return os;
} }
inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) { inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) {
os << "height_size[" << attr.h << "],width_size[" << attr.w << "],pool_type[" os << "height_size[" << attr.h << "],width_size[" << attr.w << "],pool_type["
<< to_string(attr.type) << "]"; << to_string(attr.type) << "]";
return os; return os;
} }
// Streams the dimensions of a matmul_attr_t as "M[m],N[n],K[k]".
inline std::ostream& operator<<(std::ostream& os, const matmul_attr_t& attr) {
  return os << "M[" << attr.m << "],N[" << attr.n << "],K[" << attr.k << "]";
}
// expose the method to pack matmul weight
template <typename T>
void pack_weights(const T* src, T* dst, int n, int k);
} // namespace jit } // namespace jit
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -145,11 +145,19 @@ struct SeqPoolTuples { ...@@ -145,11 +145,19 @@ struct SeqPoolTuples {
typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*); typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*);
}; };
// Attribute of a matrix multiplication A(m,k) * B(k,n) = C(m,n).
// All dimensions default to 0 so that a default-constructed attribute never
// carries indeterminate values.
typedef struct matmul_attr_s {
  int m{0};
  int n{0};
  int k{0};
  // Optional pointer to a packed weight; nullptr when none is attached.
  void* packed_weight{nullptr};
  matmul_attr_s() = default;
  explicit matmul_attr_s(int m_, int n_, int k_, void* packed_weight_ = nullptr)
      : m(m_), n(n_), k(k_), packed_weight(packed_weight_) {}
} matmul_attr_t;
template <typename T> template <typename T>
struct MatMulTuples { struct MatMulTuples {
typedef T data_type; typedef T data_type;
typedef int attr_type; typedef matmul_attr_t attr_type;
typedef void (*func_type)(const T*, const T*, T*, int, int, int); typedef void (*func_type)(const T*, const T*, T*, const matmul_attr_t*);
}; };
template <typename T> template <typename T>
......
...@@ -49,6 +49,13 @@ size_t JitCodeKey<seq_pool_attr_t>(const seq_pool_attr_t& attr) { ...@@ -49,6 +49,13 @@ size_t JitCodeKey<seq_pool_attr_t>(const seq_pool_attr_t& attr) {
return (key << pool_type_shift) + static_cast<int>(attr.type); return (key << pool_type_shift) + static_cast<int>(attr.type);
} }
// Builds the cache key of a matmul attribute by placing m, n and k into
// consecutive 21-bit wide positions: m above n above k.
template <>
size_t JitCodeKey<matmul_attr_t>(const matmul_attr_t& attr) {
  constexpr int shift = 21;
  const size_t m_part = static_cast<size_t>(attr.m) << (shift * 2);
  const size_t n_part = static_cast<size_t>(attr.n) << shift;
  return m_part + n_part + attr.k;
}
} // namespace jit } // namespace jit
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -25,17 +25,19 @@ namespace more { ...@@ -25,17 +25,19 @@ namespace more {
namespace mkl { namespace mkl {
template <> template <>
void MatMul<float>(const float* a, const float* b, float* c, int m, int n, void MatMul<float>(const float* a, const float* b, float* c,
int k) { const matmul_attr_t* attr) {
platform::dynload::cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, platform::dynload::cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
n, k, 1.f, a, k, b, n, 0.f, c, n); attr->m, attr->n, attr->k, 1.f, a, attr->k, b,
attr->n, 0.f, c, attr->n);
} }
template <> template <>
void MatMul<double>(const double* a, const double* b, double* c, int m, int n, void MatMul<double>(const double* a, const double* b, double* c,
int k) { const matmul_attr_t* attr) {
platform::dynload::cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, platform::dynload::cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
n, k, 1.0, a, k, b, n, 0.0, c, n); attr->m, attr->n, attr->k, 1.0, a, attr->k, b,
attr->n, 0.0, c, attr->n);
} }
template <> template <>
...@@ -127,11 +129,6 @@ void ASum<double>(const double* x, double* res, int n) { ...@@ -127,11 +129,6 @@ void ASum<double>(const double* x, double* res, int n) {
} }
// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
template <>
bool MatMulKernel<float>::UseMe(const int& d) const {
return platform::MayIUse(platform::avx);
}
template <> template <>
bool VMulKernel<float>::UseMe(const int& d) const { bool VMulKernel<float>::UseMe(const int& d) const {
return platform::MayIUse(platform::avx512f) && d > 512; return platform::MayIUse(platform::avx512f) && d > 512;
...@@ -177,6 +174,16 @@ bool SeqPoolKernel<double>::UseMe(const seq_pool_attr_t& attr) const { ...@@ -177,6 +174,16 @@ bool SeqPoolKernel<double>::UseMe(const seq_pool_attr_t& attr) const {
return true; return true;
} }
// The float MatMul kernel is offered whenever the CPU reports AVX support;
// the attribute itself is not inspected.
template <>
bool MatMulKernel<float>::UseMe(const matmul_attr_t& attr) const {
  const bool has_avx = platform::MayIUse(platform::avx);
  return has_avx;
}

// The double MatMul kernel is always offered, whatever the attribute is.
template <>
bool MatMulKernel<double>::UseMe(const matmul_attr_t& attr) const {
  constexpr bool always_usable = true;
  return always_usable;
}
template <> template <>
bool SoftmaxKernel<float>::UseMe(const int& d) const { bool SoftmaxKernel<float>::UseMe(const int& d) const {
// tuned on avx2 // tuned on avx2
...@@ -189,7 +196,6 @@ bool SoftmaxKernel<float>::UseMe(const int& d) const { ...@@ -189,7 +196,6 @@ bool SoftmaxKernel<float>::UseMe(const int& d) const {
return true; \ return true; \
} }
AWALYS_USE_ME_WITH_DOUBLE(MatMul);
AWALYS_USE_ME_WITH_DOUBLE(VMul); AWALYS_USE_ME_WITH_DOUBLE(VMul);
AWALYS_USE_ME_WITH_DOUBLE(VAdd); AWALYS_USE_ME_WITH_DOUBLE(VAdd);
AWALYS_USE_ME_WITH_DOUBLE(VScal); AWALYS_USE_ME_WITH_DOUBLE(VScal);
......
...@@ -26,7 +26,7 @@ namespace more { ...@@ -26,7 +26,7 @@ namespace more {
namespace mkl { namespace mkl {
template <typename T> template <typename T>
void MatMul(const T* a, const T* b, T* c, int m, int n, int k); void MatMul(const T* a, const T* b, T* c, const matmul_attr_t* attr);
template <typename T> template <typename T>
void VMul(const T* x, const T* y, T* z, int n); void VMul(const T* x, const T* y, T* z, int n);
......
...@@ -363,17 +363,19 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { ...@@ -363,17 +363,19 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
// A(M,K) * B(K,N) = C(M,N) // A(M,K) * B(K,N) = C(M,N)
template <typename T> template <typename T>
void MatMul(const T* A, const T* B, T* C, int M, int N, int K) { void MatMul(const T* A, const T* B, T* C, const matmul_attr_t* attr) {
int M = attr->m;
int N = attr->n;
int K = attr->k;
for (int m = 0; m < M; ++m) { for (int m = 0; m < M; ++m) {
const T* pa = A + m * K; const T* pa = A + m * K;
T* pc = C + m * N; T* pc = C + m * N;
for (int n = 0; n < N; ++n) { for (int n = 0; n < N; ++n) {
const T* pb = B + n; const T* pb = B + n;
T sum = static_cast<T>(0); pc[n] = pa[0] * pb[0];
for (int k = 0; k < K; ++k) { for (int k = 1; k < K; ++k) {
sum += (pa[k] * pb[k * N]); pc[n] += pa[k] * pb[k * N];
} }
*(pc + n) = sum;
} }
} }
} }
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
static double acc = 1e-5; DEFINE_double(acc, 1e-5, "Test accuracy threshold.");
template <typename T> template <typename T>
void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f), void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
...@@ -39,7 +39,7 @@ template <typename T> ...@@ -39,7 +39,7 @@ template <typename T>
void ExpectEQ(const T* target, const T* refer, int n) { void ExpectEQ(const T* target, const T* refer, int n) {
if (std::is_floating_point<T>::value) { if (std::is_floating_point<T>::value) {
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
EXPECT_NEAR(target[i], refer[i], acc); EXPECT_NEAR(target[i], refer[i], FLAGS_acc);
} }
} else { } else {
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
...@@ -272,21 +272,23 @@ struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>, ...@@ -272,21 +272,23 @@ struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>,
template <typename T> template <typename T>
struct TestFuncWithRefer<jit::MatMulTuples<T>, std::vector<T>, std::vector<T>, struct TestFuncWithRefer<jit::MatMulTuples<T>, std::vector<T>, std::vector<T>,
std::vector<T>, int, int, int> { std::vector<T>,
typename jit::MatMulTuples<T>::attr_type> {
void operator()(const typename jit::MatMulTuples<T>::func_type tgt, void operator()(const typename jit::MatMulTuples<T>::func_type tgt,
const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& a, const std::vector<T>& b,
const std::vector<T>& cref, int m, int n, int k) { const std::vector<T>& cref,
const typename jit::MatMulTuples<T>::attr_type& attr) {
EXPECT_TRUE(tgt != nullptr); EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(a.size(), static_cast<size_t>(m * k)); EXPECT_EQ(a.size(), static_cast<size_t>(attr.m * attr.k));
EXPECT_EQ(b.size(), static_cast<size_t>(k * n)); EXPECT_EQ(b.size(), static_cast<size_t>(attr.k * attr.n));
EXPECT_EQ(cref.size(), static_cast<size_t>(m * n)); EXPECT_EQ(cref.size(), static_cast<size_t>(attr.m * attr.n));
std::vector<T> c(cref.size()); std::vector<T> c(cref.size());
const T* a_data = a.data(); const T* a_data = a.data();
const T* b_data = b.data(); const T* b_data = b.data();
const T* cref_data = cref.data(); const T* cref_data = cref.data();
T* c_data = c.data(); T* c_data = c.data();
tgt(a_data, b_data, c_data, m, n, k); tgt(a_data, b_data, c_data, &attr);
ExpectEQ<T>(c_data, cref_data, m * n); ExpectEQ<T>(c_data, cref_data, attr.m * attr.n);
} }
}; };
...@@ -383,8 +385,8 @@ void TestAXYNKernel() { ...@@ -383,8 +385,8 @@ void TestAXYNKernel() {
template <jit::KernelType KT, typename T, typename PlaceType> template <jit::KernelType KT, typename T, typename PlaceType>
void TestXRNKernel() { void TestXRNKernel() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
auto last_acc = acc; auto last_acc = FLAGS_acc;
acc = 1e-4; FLAGS_acc = 1e-4;
for (int d : TestSizes()) { for (int d : TestSizes()) {
auto ref = jit::GetRefer<KT, jit::XRNTuples<T>>(); auto ref = jit::GetRefer<KT, jit::XRNTuples<T>>();
EXPECT_TRUE(ref != nullptr); EXPECT_TRUE(ref != nullptr);
...@@ -395,7 +397,7 @@ void TestXRNKernel() { ...@@ -395,7 +397,7 @@ void TestXRNKernel() {
TestAllImpls<KT, jit::XRNTuples<T>, PlaceType, std::vector<T>, T>(d, x, TestAllImpls<KT, jit::XRNTuples<T>, PlaceType, std::vector<T>, T>(d, x,
ref_res); ref_res);
} }
acc = last_acc; FLAGS_acc = last_acc;
} }
template <jit::KernelType KT, typename T, typename PlaceType> template <jit::KernelType KT, typename T, typename PlaceType>
...@@ -535,9 +537,10 @@ void TestSeqPoolKernel() { ...@@ -535,9 +537,10 @@ void TestSeqPoolKernel() {
template <jit::KernelType KT, typename T, typename PlaceType> template <jit::KernelType KT, typename T, typename PlaceType>
void TestMatMulKernel() { void TestMatMulKernel() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
auto last_acc = acc; auto last_acc = FLAGS_acc;
// TODO(intel): this should be acc issue of MKL // TODO(intel): fix MKL acc issue
acc = 1e-3; // https://github.com/PaddlePaddle/Paddle/issues/15447
FLAGS_acc = 1e-3;
for (int m : {1, 2, 3, 4}) { for (int m : {1, 2, 3, 4}) {
for (int n : {1, 2, 3, 4}) { for (int n : {1, 2, 3, 4}) {
for (int k : TestSizes()) { for (int k : TestSizes()) {
...@@ -549,13 +552,14 @@ void TestMatMulKernel() { ...@@ -549,13 +552,14 @@ void TestMatMulKernel() {
const T* a_data = a.data(); const T* a_data = a.data();
const T* b_data = b.data(); const T* b_data = b.data();
T* c_data = c.data(); T* c_data = c.data();
ref(a_data, b_data, c_data, m, n, k); const jit::matmul_attr_t attr{m, n, k};
ref(a_data, b_data, c_data, &attr);
TestAllImpls<KT, jit::MatMulTuples<T>, PlaceType, std::vector<T>, TestAllImpls<KT, jit::MatMulTuples<T>, PlaceType, std::vector<T>,
std::vector<T>, std::vector<T>>(k, a, b, c, m, n, k); std::vector<T>, std::vector<T>>(attr, a, b, c, attr);
} }
} }
} }
acc = last_acc; FLAGS_acc = last_acc;
} }
template <jit::KernelType KT, typename T, typename PlaceType> template <jit::KernelType KT, typename T, typename PlaceType>
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册