diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc index a35ee8a09ed5ddcc4ac465d200b84358fa65b2f3..e9e2a3b1f5c1c00bb2e95b6171ecd09bfe7a0d21 100644 --- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc @@ -79,17 +79,17 @@ void FusionRepeatedFCReluOpMaker::Make() { } template -static void fc_relu(const T* x, const T* w, const T* b, T* y, int m, int n, - int k) { +static void fc_relu(const T* x, const T* w, const T* b, T* y, + const jit::matmul_attr_t& attr) { auto matmul = - jit::Get, platform::CPUPlace>(k); + jit::Get, platform::CPUPlace>(attr); auto addbias_relu = - jit::Get, platform::CPUPlace>(n); - matmul(x, w, y, m, n, k); + jit::Get, platform::CPUPlace>(attr.n); + matmul(x, w, y, &attr); T* dst = y; - for (int i = 0; i < m; ++i) { - addbias_relu(b, dst, dst, n); - dst += n; + for (int i = 0; i < attr.m; ++i) { + addbias_relu(b, dst, dst, attr.n); + dst += attr.n; } } @@ -107,32 +107,33 @@ class FusionRepeatedFCReluKernel : public framework::OpKernel { auto i_dims = in->dims(); auto w_dims = weights[0]->dims(); - int m = i_dims[0]; - int n = w_dims[1]; - int k = w_dims[0]; - relus[0]->Resize({m, n}); + jit::matmul_attr_t attr; + attr.m = i_dims[0]; + attr.n = w_dims[1]; + attr.k = w_dims[0]; + relus[0]->Resize({attr.m, attr.n}); fc_relu(in->data(), weights[0]->data(), biases[0]->data(), - relus[0]->mutable_data(place), m, n, k); + relus[0]->mutable_data(place), attr); for (int i = 1; i < weight_sz - 1; ++i) { auto i_dims = relus[i - 1]->dims(); auto w_dims = weights[i]->dims(); - int m = i_dims[0]; - int n = w_dims[1]; - int k = w_dims[0]; - relus[i]->Resize({m, n}); + attr.m = i_dims[0]; + attr.n = w_dims[1]; + attr.k = w_dims[0]; + relus[i]->Resize({attr.m, attr.n}); fc_relu(relus[i - 1]->data(), weights[i]->data(), - biases[i]->data(), relus[i]->mutable_data(place), m, n, k); + biases[i]->data(), relus[i]->mutable_data(place), attr); } auto i_dims_last = relus[weight_sz - 2]->dims(); auto w_dims_last = weights[weight_sz - 1]->dims(); - m = i_dims_last[0]; - n = w_dims_last[1]; - k = w_dims_last[0]; + attr.m = i_dims_last[0]; + attr.n = w_dims_last[1]; + attr.k = w_dims_last[0]; fc_relu(relus[weight_sz - 2]->data(), weights[weight_sz - 1]->data(), - biases[weight_sz - 1]->data(), out->mutable_data(place), m, n, - k); + biases[weight_sz - 1]->data(), out->mutable_data(place), + attr); } }; diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc index 00dafdead53bbd4614c70875441c565724fca46d..8c8b079633aacb711aa304ec7016c37c6bec61ce 100644 --- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc +++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc @@ -87,15 +87,18 @@ class FusionSquaredMatSubKernel : public framework::OpKernel { auto x_dims = x->dims(); auto y_dims = y->dims(); - int m = x_dims[0]; - int k = x_dims[1]; - int n = y_dims[1]; - int o_numel = m * n; + jit::matmul_attr_t attr; + attr.m = x_dims[0]; + attr.k = x_dims[1]; + attr.n = y_dims[1]; + int o_numel = attr.m * attr.n; auto vsquare_x = - jit::Get, platform::CPUPlace>(m * k); + jit::Get, platform::CPUPlace>(attr.m * + attr.k); auto vsquare_y = - jit::Get, platform::CPUPlace>(k * n); + jit::Get, platform::CPUPlace>(attr.k * + attr.n); auto vsquare_xy = jit::Get, platform::CPUPlace>(o_numel); auto vsub = @@ -103,7 +106,7 @@ class FusionSquaredMatSubKernel : public framework::OpKernel { auto vscal = jit::Get, platform::CPUPlace>(o_numel); auto matmul = - jit::Get, platform::CPUPlace>(k); + jit::Get, platform::CPUPlace>(attr); const T* x_data = x->data(); const T* y_data = y->data(); @@ -112,12 +115,12 @@ class FusionSquaredMatSubKernel : public framework::OpKernel { T* squared_xy_data = squared_xy->mutable_data(place); T* o_data = out->mutable_data(place); - matmul(x_data, y_data, squared_xy_data, m, n, k); + matmul(x_data, y_data, squared_xy_data, &attr); vsquare_xy(squared_xy_data, squared_xy_data, o_numel); - vsquare_x(x_data, squared_x_data, m * k); - vsquare_y(y_data, squared_y_data, k * n); - matmul(squared_x_data, squared_y_data, o_data, m, n, k); + vsquare_x(x_data, squared_x_data, attr.m * attr.k); + vsquare_y(y_data, squared_y_data, attr.k * attr.n); + matmul(squared_x_data, squared_y_data, o_data, &attr); vsub(squared_xy_data, o_data, o_data, o_numel); vscal(&scalar, o_data, o_data, o_numel); diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 5c5a61f64093802697eb21452267471129c7fcf3..1b9360afcecf63ff0c3e306cdf303cc426e80f1e 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -311,8 +311,9 @@ void BenchMatMulKernel() { const T* a_data = a.data(); const T* b_data = b.data(); T* c_data = c.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>(k, a_data, b_data, - c_data, m, n, k); + const jit::matmul_attr_t attr{m, n, k}; + BenchAllImpls, PlaceType>(attr, a_data, b_data, + c_data, &attr); } } } diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index 2ea8f927e1a13867fa2065841fac05e766735237..efc7eb79d36c5cf9fac4ac40db4e2e28cb242e22 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -9,6 +9,7 @@ function(USE_JITKERNEL_GEN TARGET) endfunction() # use gen jitcode kernel by name +USE_JITKERNEL_GEN(kMatMul) USE_JITKERNEL_GEN(kVMul) USE_JITKERNEL_GEN(kVAdd) USE_JITKERNEL_GEN(kVSub) diff --git a/paddle/fluid/operators/jit/gen/matmul.cc b/paddle/fluid/operators/jit/gen/matmul.cc new file mode 100644 index 0000000000000000000000000000000000000000..ae3858eab20aeb80553d8fcec4088a6632c9c17d --- /dev/null +++ b/paddle/fluid/operators/jit/gen/matmul.cc @@ -0,0 +1,128 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jit/gen/matmul.h" +#include // offsetof +#include + +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +void MatMulJitCode::genCode() { + preCode(); + int block, rest; + const auto groups = packed_groups(n_, k_, &block, &rest); + PADDLE_ENFORCE_GT(groups.front(), 0); + + const int block_len = sizeof(float) * block; + const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1; + const int w_reg_idx = x_reg_idx - 1; + // from packed mov(reg_ptr_wgt, ptr[param_attr + offsetof(matmul_attr_t, + // packed_weight)]); + mov(reg_ptr_wgt, param_y); + size_t z_offset = 0; + size_t wgt_offset = 0; + for (size_t g = 0; g < groups.size(); ++g) { + size_t x_offset = 0; + for (int k = 0; k < k_; ++k) { + vbroadcastss(zmm_t(x_reg_idx), ptr[param_x + x_offset]); + // clean + if (k == 0) { + for (int i = 0; i < groups[g]; ++i) { + vxorps(zmm_t(i), zmm_t(i), zmm_t(i)); + } + } + for (int i = 0; i < groups[g]; ++i) { + vmovups(zmm_t(w_reg_idx), ptr[reg_ptr_wgt + wgt_offset]); + vfmadd231ps(zmm_t(i), zmm_t(w_reg_idx), zmm_t(x_reg_idx)); + wgt_offset += block_len; + } + // last one, save + if (k == k_ - 1) { + for (int i = 0; i < groups[g]; ++i) { + // only rest save should be careful + if (rest != 0 && g == groups.size() - 1 && i == groups[g] - 1) { + break; + } + vmovups(ptr[param_z + z_offset + i * block_len], zmm_t(i)); + } + } + x_offset += sizeof(float); + } + z_offset += block_len * groups[g]; + } + + if (rest != 0) { + // below should refine with mask + int reg_idx = groups.back() - 1; + z_offset = (n_ - rest) * sizeof(float); + int inner_block = 8; + while (rest > 0) { + if (rest >= 8) { + inner_block = 8; + vmovups(ptr[param_z + z_offset], ymm_t(reg_idx)); + // shift zmm of inner_block, change reg_idx if update + } else if (rest >= 4) { + inner_block = 4; + vmovups(ptr[param_z + z_offset], xmm_t(reg_idx)); + } else if (rest >= 2) { + inner_block = 2; + vmovq(ptr[param_z + z_offset], xmm_t(reg_idx)); + } else { + inner_block = 1; + vmovss(ptr[param_z + z_offset], xmm_t(reg_idx)); + } + z_offset += inner_block * sizeof(float); + rest -= inner_block; + } + } + + postCode(); +} + +class MatMulCreator : public JitCodeCreator { + public: + bool UseMe(const matmul_attr_t& attr) const override { + return attr.m == 1 && platform::MayIUse(platform::avx512f) && + attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512; + } + size_t CodeSize(const matmul_attr_t& attr) const override { + int block = YMM_FLOAT_BLOCK; + if (platform::MayIUse(platform::avx512f)) { + block = ZMM_FLOAT_BLOCK; + } + return 96 + 4 * attr.k * (attr.n / block + 1) * 8; + } + std::unique_ptr CreateJitCode( + const matmul_attr_t& attr) const override { + PADDLE_ENFORCE_GT(attr.m, 0); + PADDLE_ENFORCE_GT(attr.n, 0); + PADDLE_ENFORCE_GT(attr.k, 0); + return make_unique(attr, CodeSize(attr)); + } +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace gen = paddle::operators::jit::gen; + +REGISTER_JITKERNEL_GEN(kMatMul, gen::MatMulCreator); diff --git a/paddle/fluid/operators/jit/gen/matmul.h b/paddle/fluid/operators/jit/gen/matmul.h new file mode 100644 index 0000000000000000000000000000000000000000..626baa8f738bf0395f3c7f1700610d0a9075879b --- /dev/null +++ b/paddle/fluid/operators/jit/gen/matmul.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include // for malloc and free +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +class MatMulJitCode : public JitCode { + public: + explicit MatMulJitCode(const matmul_attr_t& attr, + size_t code_size = 256 * 1024, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) { + PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet"); + this->genCode(); + } + + virtual const char* name() const { + std::string base = "MatMulJitCode"; + base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" + + std::to_string(k_); + return base.c_str(); + } + void genCode() override; + + private: + int m_, n_, k_; + + reg64_t param_x{abi_param1}; + reg64_t param_y{abi_param2}; + reg64_t param_z{abi_param3}; + reg64_t param_attr{abi_param4}; + reg64_t reg_tmp{rax}; + + reg64_t reg_ptr_wgt{r10}; +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc index 310da0c76f1ab251d788e54f2305f375f3fb4838..3cd5f6554bdc188ce9ea0c0b85c84d032c509600 100644 --- a/paddle/fluid/operators/jit/gen_base.cc +++ b/paddle/fluid/operators/jit/gen_base.cc @@ -16,6 +16,8 @@ #include #include #include +#include +#include "paddle/fluid/platform/cpu_info.h" DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); @@ -38,6 +40,35 @@ void GenBase::dumpCode(const unsigned char* code) const { } } +std::vector packed_groups(int n, int k, int* block_out, int* rest_out) { + int block; + int max_num_regs; + if (platform::MayIUse(platform::avx512f)) { + block = ZMM_FLOAT_BLOCK; + max_num_regs = 32; + } else { + block = YMM_FLOAT_BLOCK; + max_num_regs = 16; + } + // one for x, one for y, others for z + const int max_used_regs_for_n = max_num_regs - 2; + const int aligned_n = n % block == 0 ? n : (n / block + 1) * block; + const int num_block = aligned_n / block; + const int num_groups = num_block / max_used_regs_for_n; + std::vector groups(num_groups, max_used_regs_for_n); + int rest_num_regs = num_block % max_used_regs_for_n; + if (rest_num_regs != 0) { + groups.push_back(rest_num_regs); + } + if (block_out) { + *block_out = block; + } + if (rest_out) { + *rest_out = n % block; + } + return groups; +} + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h index 4af01a437670aa6a07d370ff23ed2abd369f69a3..d808a332472ae86240cb63356cb417123523366a 100644 --- a/paddle/fluid/operators/jit/gen_base.h +++ b/paddle/fluid/operators/jit/gen_base.h @@ -16,6 +16,7 @@ #include #include // for unique_ptr +#include #include "paddle/fluid/operators/jit/kernel_base.h" DECLARE_bool(dump_jitcode); @@ -67,6 +68,11 @@ class JitCodeCreator : public GenCreator { virtual std::unique_ptr CreateJitCode(const Attr& attr) const = 0; }; +// unify the method of packed groups +// output the packed groups which used in weights, the block size and rest size +std::vector packed_groups(int n, int k, int* block = nullptr, + int* rest = nullptr); + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index 4dac2f2460f72c7da63f48c82549b948cc253153..e7292fe2bd8031aa5bbff68e7c2305a238085bf1 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/operators/jit/helper.h" #include // tolower +#include +#include #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -91,6 +93,41 @@ KernelType to_kerneltype(const std::string& act) { return kNone; } +template <> +void pack_weights(const float* src, float* dst, int n, int k) { + int block, rest; + const auto groups = packed_groups(n, k, &block, &rest); + std::for_each(groups.begin(), groups.end(), [&](int i) { + PADDLE_ENFORCE_GT(i, 0, "each element of groups should be larger than 0."); + }); + int sum = std::accumulate(groups.begin(), groups.end(), 0); + std::memset(dst, 0, k * sum * block * sizeof(float)); + PADDLE_ENFORCE_GE(sum * block, n, + "The packed n should be equal to or larger than n"); + + const int block_len = sizeof(float) * block; + int n_offset = 0; + + for (size_t g = 0; g < groups.size(); ++g) { + const float* from = src + n_offset; + for (int j = 0; j < k; ++j) { + size_t copy_sz = groups[g] * block_len; + if (g == groups.size() - 1 && rest != 0) { + copy_sz = (groups[g] - 1) * block_len + rest * sizeof(float); + } + std::memcpy(dst, from + j * n, copy_sz); + dst += groups[g] * block; + } + n_offset += groups[g] * block; + } +} + +template +typename std::enable_if::value>::type pack_weights( + const T* src, T* dst, int n, int k) { + PADDLE_THROW("Only support pack with float type."); +} + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 7bdc45779b7d39d36db0d52ca9361943cdcdef3e..bba3a13619619b6de3f797a4efc4a0d09c3b281f 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -152,17 +152,28 @@ inline std::ostream& operator<<(std::ostream& os, const lstm_attr_t& attr) { << (attr.use_peephole ? "True" : "False") << "]"; return os; } + inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) { os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate) << "],act_cand[" << to_string(attr.act_cand) << "]"; return os; } + inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) { os << "height_size[" << attr.h << "],width_size[" << attr.w << "],pool_type[" << to_string(attr.type) << "]"; return os; } +inline std::ostream& operator<<(std::ostream& os, const matmul_attr_t& attr) { + os << "M[" << attr.m << "],N[" << attr.n << "],K[" << attr.k << "]"; + return os; +} + +// expose the method to pack matmul weight +template +void pack_weights(const T* src, T* dst, int n, int k); + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 42a58580f7b1e0832af57398ba9c29882b6cc6fb..4a8f61146a1921fa1d5f6b7e15af40cd45d31a22 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -145,11 +145,19 @@ struct SeqPoolTuples { typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*); }; +typedef struct matmul_attr_s { + int m, n, k; + void* packed_weight{nullptr}; + matmul_attr_s() = default; + explicit matmul_attr_s(int m_, int n_, int k_, void* packed_weight_ = nullptr) + : m(m_), n(n_), k(k_), packed_weight(packed_weight_) {} +} matmul_attr_t; + template struct MatMulTuples { typedef T data_type; - typedef int attr_type; - typedef void (*func_type)(const T*, const T*, T*, int, int, int); + typedef matmul_attr_t attr_type; + typedef void (*func_type)(const T*, const T*, T*, const matmul_attr_t*); }; template diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc index 61de38688664f83775c0c4e5aa6f7e06c3602ddb..1e4a8884e78c5d3c1748988f05ecf461a6f0eb94 100644 --- a/paddle/fluid/operators/jit/kernel_key.cc +++ b/paddle/fluid/operators/jit/kernel_key.cc @@ -49,6 +49,13 @@ size_t JitCodeKey(const seq_pool_attr_t& attr) { return (key << pool_type_shift) + static_cast(attr.type); } +template <> +size_t JitCodeKey(const matmul_attr_t& attr) { + size_t key = attr.m; + constexpr int shift = 21; + return (key << shift * 2) + ((static_cast(attr.n)) << shift) + attr.k; +} + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index 28a37198dae19a57509934ec784746bc23436e7a..c7d0215eda9d1e14fcad16da7b70f45824789266 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -25,17 +25,19 @@ namespace more { namespace mkl { template <> -void MatMul(const float* a, const float* b, float* c, int m, int n, - int k) { - platform::dynload::cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, - n, k, 1.f, a, k, b, n, 0.f, c, n); +void MatMul(const float* a, const float* b, float* c, + const matmul_attr_t* attr) { + platform::dynload::cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + attr->m, attr->n, attr->k, 1.f, a, attr->k, b, + attr->n, 0.f, c, attr->n); } template <> -void MatMul(const double* a, const double* b, double* c, int m, int n, - int k) { - platform::dynload::cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, - n, k, 1.0, a, k, b, n, 0.0, c, n); +void MatMul(const double* a, const double* b, double* c, + const matmul_attr_t* attr) { + platform::dynload::cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + attr->m, attr->n, attr->k, 1.0, a, attr->k, b, + attr->n, 0.0, c, attr->n); } template <> @@ -127,11 +129,6 @@ void ASum(const double* x, double* res, int n) { } // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 -template <> -bool MatMulKernel::UseMe(const int& d) const { - return platform::MayIUse(platform::avx); -} - template <> bool VMulKernel::UseMe(const int& d) const { return platform::MayIUse(platform::avx512f) && d > 512; @@ -177,6 +174,16 @@ bool SeqPoolKernel::UseMe(const seq_pool_attr_t& attr) const { return true; } +template <> +bool MatMulKernel::UseMe(const matmul_attr_t& attr) const { + return platform::MayIUse(platform::avx); +} + +template <> +bool MatMulKernel::UseMe(const matmul_attr_t& attr) const { + return true; +} + template <> bool SoftmaxKernel::UseMe(const int& d) const { // tuned on avx2 @@ -189,7 +196,6 @@ bool SoftmaxKernel::UseMe(const int& d) const { return true; \ } -AWALYS_USE_ME_WITH_DOUBLE(MatMul); AWALYS_USE_ME_WITH_DOUBLE(VMul); AWALYS_USE_ME_WITH_DOUBLE(VAdd); AWALYS_USE_ME_WITH_DOUBLE(VScal); diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index 6b95b9c872dc12cccaef0b0737edd760447a47d0..8130b87326f1887f232022ab30fa7bf42b0723e7 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -26,7 +26,7 @@ namespace more { namespace mkl { template -void MatMul(const T* a, const T* b, T* c, int m, int n, int k); +void MatMul(const T* a, const T* b, T* c, const matmul_attr_t* attr); template void VMul(const T* x, const T* y, T* z, int n); diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 5a074db7e0e8ab49dc281e1809edef23e6a25c42..0c4a985f8e8ece0a6169478fa3a9b111f5a6f3b4 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -363,17 +363,19 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { // A(M,K) * B(K,N) = C(M,N) template -void MatMul(const T* A, const T* B, T* C, int M, int N, int K) { +void MatMul(const T* A, const T* B, T* C, const matmul_attr_t* attr) { + int M = attr->m; + int N = attr->n; + int K = attr->k; for (int m = 0; m < M; ++m) { const T* pa = A + m * K; T* pc = C + m * N; for (int n = 0; n < N; ++n) { const T* pb = B + n; - T sum = static_cast(0); - for (int k = 0; k < K; ++k) { - sum += (pa[k] * pb[k * N]); + pc[n] = pa[0] * pb[0]; + for (int k = 1; k < K; ++k) { + pc[n] += pa[k] * pb[k * N]; } - *(pc + n) = sum; } } } diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index cc461552898fc68661ce548a520d65215d3572b4..237e588d35cc3b33658a830db34676967818aab6 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -22,7 +22,7 @@ #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/place.h" -static double acc = 1e-5; +DEFINE_double(acc, 1e-5, "Test accuracy threshold."); template void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), @@ -39,7 +39,7 @@ template void ExpectEQ(const T* target, const T* refer, int n) { if (std::is_floating_point::value) { for (int i = 0; i < n; ++i) { - EXPECT_NEAR(target[i], refer[i], acc); + EXPECT_NEAR(target[i], refer[i], FLAGS_acc); } } else { for (int i = 0; i < n; ++i) { @@ -272,21 +272,23 @@ struct TestFuncWithRefer, std::vector, std::vector, template struct TestFuncWithRefer, std::vector, std::vector, - std::vector, int, int, int> { + std::vector, + typename jit::MatMulTuples::attr_type> { void operator()(const typename jit::MatMulTuples::func_type tgt, const std::vector& a, const std::vector& b, - const std::vector& cref, int m, int n, int k) { + const std::vector& cref, + const typename jit::MatMulTuples::attr_type& attr) { EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(a.size(), static_cast(m * k)); - EXPECT_EQ(b.size(), static_cast(k * n)); - EXPECT_EQ(cref.size(), static_cast(m * n)); + EXPECT_EQ(a.size(), static_cast(attr.m * attr.k)); + EXPECT_EQ(b.size(), static_cast(attr.k * attr.n)); + EXPECT_EQ(cref.size(), static_cast(attr.m * attr.n)); std::vector c(cref.size()); const T* a_data = a.data(); const T* b_data = b.data(); const T* cref_data = cref.data(); T* c_data = c.data(); - tgt(a_data, b_data, c_data, m, n, k); - ExpectEQ(c_data, cref_data, m * n); + tgt(a_data, b_data, c_data, &attr); + ExpectEQ(c_data, cref_data, attr.m * attr.n); } }; @@ -383,8 +385,8 @@ void TestAXYNKernel() { template void TestXRNKernel() { VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - auto last_acc = acc; - acc = 1e-4; + auto last_acc = FLAGS_acc; + FLAGS_acc = 1e-4; for (int d : TestSizes()) { auto ref = jit::GetRefer>(); EXPECT_TRUE(ref != nullptr); @@ -395,7 +397,7 @@ void TestXRNKernel() { TestAllImpls, PlaceType, std::vector, T>(d, x, ref_res); } - acc = last_acc; + FLAGS_acc = last_acc; } template @@ -535,9 +537,10 @@ void TestSeqPoolKernel() { template void TestMatMulKernel() { VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - auto last_acc = acc; - // TODO(intel): this should be acc issue of MKL - acc = 1e-3; + auto last_acc = FLAGS_acc; + // TODO(intel): fix MKL acc issue + // https://github.com/PaddlePaddle/Paddle/issues/15447 + FLAGS_acc = 1e-3; for (int m : {1, 2, 3, 4}) { for (int n : {1, 2, 3, 4}) { for (int k : TestSizes()) { @@ -549,13 +552,14 @@ void TestMatMulKernel() { const T* a_data = a.data(); const T* b_data = b.data(); T* c_data = c.data(); - ref(a_data, b_data, c_data, m, n, k); + const jit::matmul_attr_t attr{m, n, k}; + ref(a_data, b_data, c_data, &attr); TestAllImpls, PlaceType, std::vector, - std::vector, std::vector>(k, a, b, c, m, n, k); + std::vector, std::vector>(attr, a, b, c, attr); } } } - acc = last_acc; + FLAGS_acc = last_acc; } template