From 802f362ac44e99a06e3c499893f3c8279a367f0d Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 7 Mar 2019 07:21:35 +0000 Subject: [PATCH] unify the kernelfuncs cache and add unit test test=develop --- paddle/fluid/operators/crf_decoding_op.h | 5 +- .../mkldnn/elementwise_mul_mkldnn_op.cc | 6 ++- .../fused/fused_embedding_seq_pool_op.h | 12 +++-- paddle/fluid/operators/fused/fusion_gru_op.cc | 49 +++++++++-------- .../fluid/operators/fused/fusion_lstm_op.cc | 54 ++++++++++--------- .../fused/fusion_repeated_fc_relu_op.cc | 10 ++-- .../fused/fusion_seqpool_concat_op.cc | 6 +-- .../fused/fusion_squared_mat_sub_op.cc | 32 ++++++----- paddle/fluid/operators/jit/CMakeLists.txt | 2 +- paddle/fluid/operators/jit/benchmark.cc | 2 +- paddle/fluid/operators/jit/helper.h | 34 ++++++++---- paddle/fluid/operators/jit/test.cc | 30 ++++++++--- paddle/fluid/operators/layer_norm_op.h | 6 +-- .../fluid/operators/math/sequence_pooling.cc | 6 +-- paddle/fluid/operators/optimizers/sgd_op.h | 10 ++-- 15 files changed, 158 insertions(+), 106 deletions(-) diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h index 72774a878d9..3d98790a4d4 100644 --- a/paddle/fluid/operators/crf_decoding_op.h +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -82,8 +82,9 @@ class CRFDecodingOpKernel : public framework::OpKernel { Tensor track; int* track_value = track.mutable_data(emission_dims, platform::CPUPlace()); - auto ker = jit::Get, - platform::CPUPlace>(tag_num); + auto ker = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(tag_num); ker(static_cast(seq_len), x, w, alpha_value, track_value, tag_num); T max_score = -std::numeric_limits::max(); int max_i = 0; diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index 04e8800bbc8..e37bbd28376 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -110,8 +110,10 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { constexpr int simd_width = 16; int C = c / simd_width; - auto multiply = jit::Get, - platform::CPUPlace>(0); + auto multiply = + jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(0); #pragma omp parallel for collapse(2) for (int ni = 0; ni < n; ni++) { for (int ci = 0; ci < C; ci++) { diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h index f13c0203860..fe43545e605 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -52,8 +52,10 @@ struct EmbeddingVSumFunctor { out_width, jit::SeqPoolType::kSum); for (size_t i = 0; i != ids_lod.size() - 1; ++i) { attr.index_height = ids_lod[i + 1] - ids_lod[i]; - auto emb_seqpool = jit::Get, - platform::CPUPlace>(attr); + auto emb_seqpool = + jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr); emb_seqpool(table, ids + ids_lod[i] * idx_width, output + i * out_width, &attr); } @@ -135,8 +137,10 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { T *d_table_data = d_table_value->mutable_data(context.GetPlace()); const T *d_output_data = d_output->data(); - auto vbroadcast = jit::Get, - platform::CPUPlace>(out_width); + auto vbroadcast = + jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(out_width); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { 
int64_t h = static_cast(lod[i + 1] - lod[i]); const T *src = d_output_data + i * out_width; diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 66acba49e5a..cd8a6a55d47 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -182,29 +182,32 @@ class FusionGRUKernel : public framework::OpKernel { const int total_T = x_dims[0]; \ const int D3 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - auto* h0 = ctx.Input("H0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* bias = ctx.Input("Bias"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - bool is_reverse = ctx.Attr("is_reverse"); \ - const int M = x_dims[1]; \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const jit::gru_attr_t attr( \ - D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ - jit::to_kerneltype(ctx.Attr("activation"))); \ - jit::gru_t one_step; \ - auto ComputeH1 = \ - jit::Get, platform::CPUPlace>(attr); \ - auto ComputeHtPart1 = \ - jit::Get, platform::CPUPlace>(attr); \ - auto ComputeHtPart2 = \ - jit::Get, platform::CPUPlace>(attr); \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - auto place = ctx.GetPlace(); \ +#define INIT_OTHER_DEFINES \ + auto* h0 = ctx.Input("H0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* bias = ctx.Input("Bias"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const jit::gru_attr_t attr( \ + D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ + jit::to_kerneltype(ctx.Attr("activation"))); \ + jit::gru_t one_step; \ + auto ComputeH1 = jit::KernelFuncs, \ + platform::CPUPlace>::Cache() \ + .At(attr); \ + auto ComputeHtPart1 = jit::KernelFuncs, \ + platform::CPUPlace>::Cache() \ + .At(attr); \ + auto ComputeHtPart2 = jit::KernelFuncs, \ + platform::CPUPlace>::Cache() \ + .At(attr); \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + auto place = ctx.GetPlace(); \ T* xx_data = xx->mutable_data(place) void SeqCompute(const framework::ExecutionContext& ctx) const { diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index b11b7c11bfe..d7d12df4bf9 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -235,32 +235,34 @@ class FuisonLSTMKernel : public framework::OpKernel { const int D = wh_dims[0]; \ const int D4 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - /* diagonal weight*/ \ - const T* wp_data = bias->data() + D4; \ - /* for peephole only*/ \ - T* checked_cell_data = nullptr; \ - auto place = ctx.GetPlace(); \ - if (use_peepholes) { \ - /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - auto* checked_cell = ctx.Output("CheckedCell"); \ - checked_cell_data = checked_cell->mutable_data(place); \ - } \ - const jit::lstm_attr_t attr( \ - D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ - jit::to_kerneltype(ctx.Attr("candidate_activation")), \ - jit::to_kerneltype(ctx.Attr("cell_activation")), \ - use_peepholes); \ - jit::lstm_t one_step; \ - one_step.wp = wp_data; \ - one_step.checked = checked_cell_data; \ - auto ComputeC1H1 = \ - jit::Get, platform::CPUPlace>(attr); \ - auto ComputeCtHt = \ - jit::Get, 
platform::CPUPlace>(attr) +#define INIT_OTHER_DEFINES \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wp_data = bias->data() + D4; \ + /* for peephole only*/ \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + auto* checked_cell = ctx.Output("CheckedCell"); \ + checked_cell_data = checked_cell->mutable_data(place); \ + } \ + const jit::lstm_attr_t attr( \ + D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ + jit::to_kerneltype(ctx.Attr("candidate_activation")), \ + jit::to_kerneltype(ctx.Attr("cell_activation")), \ + use_peepholes); \ + jit::lstm_t one_step; \ + one_step.wp = wp_data; \ + one_step.checked = checked_cell_data; \ + auto ComputeC1H1 = jit::KernelFuncs, \ + platform::CPUPlace>::Cache() \ + .At(attr); \ + auto ComputeCtHt = jit::KernelFuncs, \ + platform::CPUPlace>::Cache() \ + .At(attr) // Wh GEMM #define GEMM_WH_ADDON(bs, prev, out) \ diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc index 8ecdf2ed9d4..e057724b5a8 100644 --- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc @@ -81,10 +81,12 @@ void FusionRepeatedFCReluOpMaker::Make() { template static void fc_relu(const T* x, const T* w, const T* b, T* y, const jit::matmul_attr_t& attr) { - auto matmul = - jit::Get, platform::CPUPlace>(attr); - auto addbias_relu = - jit::Get, platform::CPUPlace>(attr.n); + auto matmul = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr); + auto addbias_relu = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr.n); matmul(x, w, y, &attr); T* dst = y; for (int i = 0; i < attr.m; ++i) { diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc index d48bdafe0aa..7aeeabc5128 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc @@ -97,9 +97,9 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel { } else if (pooltype == "SQRT") { attr.type = jit::SeqPoolType::kSqrt; } - auto seqpool = - jit::Get, platform::CPUPlace>( - attr); + auto seqpool = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr); size_t n = ins.size(); size_t dst_step_size = n * w; for (size_t i = 0; i < n; ++i) { diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc index 8493f4468fc..9382bf0ebb4 100644 --- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc +++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc @@ -93,20 +93,24 @@ class FusionSquaredMatSubKernel : public framework::OpKernel { attr.n = y_dims[1]; int o_numel = attr.m * attr.n; - auto vsquare_x = - jit::Get, platform::CPUPlace>(attr.m * - attr.k); - auto vsquare_y = - jit::Get, platform::CPUPlace>(attr.k * - attr.n); - auto vsquare_xy = - jit::Get, platform::CPUPlace>(o_numel); - auto vsub = - jit::Get, platform::CPUPlace>(o_numel); - auto vscal = - jit::Get, platform::CPUPlace>(o_numel); - auto matmul = - jit::Get, platform::CPUPlace>(attr); + auto vsquare_x = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr.m * attr.k); + auto vsquare_y = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr.k * attr.n); + auto vsquare_xy = 
jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(o_numel); + auto vsub = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(o_numel); + auto vscal = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(o_numel); + auto matmul = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr); const T* x_data = x->data(); const T* y_data = y->data(); diff --git a/paddle/fluid/operators/jit/CMakeLists.txt b/paddle/fluid/operators/jit/CMakeLists.txt index 35775d7ec9e..47d6c83f2ad 100644 --- a/paddle/fluid/operators/jit/CMakeLists.txt +++ b/paddle/fluid/operators/jit/CMakeLists.txt @@ -5,7 +5,7 @@ file(APPEND ${jit_file} "\#pragma once\n") file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/helper.h\"\n") file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/registry.h\"\n\n") -set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place) +set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place xxhash) file(GLOB jit_kernel_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") list(REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 3088280bb90..deb96ee6cd1 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -142,7 +142,7 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { } } // Test result from Get function - auto tgt = jit::Get(attr); + auto tgt = jit::KernelFuncs::Cache().At(attr); if (!tgt) { LOG(FATAL) << "Target can not be empty!"; } diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index d85c719c1c5..1af1add3ee2 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -14,6 +14,9 @@ #pragma once +extern "C" { +#include +} #include #include #include @@ -127,23 +130,36 @@ class KernelFuncs { return g_func_cache; } - bool Has(int key) const { return funcs_.find(key) != funcs_.end(); } - - void Insert(int key, typename KernelTuples::func_type func) { - funcs_.emplace(key, func); - } - - typename KernelTuples::func_type At(int key) { + // the exposed interface to use + typename KernelTuples::func_type At( + const typename KernelTuples::attr_type& attr) { + // XXH64: 13.8 GB/s + int64_t key = XXH64(&attr, sizeof(typename KernelTuples::attr_type), 0); if (Has(key)) { return funcs_.at(key); } - auto func = Get(key); + // If do not have this attr in cache, + // then could run some runtime benchmark of this attr and save the best one. + // Here just get the offline benchmarked best one. + auto func = Get(attr); Insert(key, func); return func; } + typename KernelTuples::func_type operator[]( + const typename KernelTuples::attr_type& attr) { + return At(attr); + } + + protected: + bool Has(int64_t key) const { return funcs_.find(key) != funcs_.end(); } + + void Insert(int64_t key, typename KernelTuples::func_type func) { + funcs_.emplace(key, func); + } + private: - std::unordered_map funcs_; + std::unordered_map funcs_; DISABLE_COPY_AND_ASSIGN(KernelFuncs); }; diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index cdec14dc438..18f8c09f143 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -462,7 +462,7 @@ void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... 
args) { } // test result from Get function // VLOG(10) << "Test Get function "; - auto tgt = jit::Get(attr); + auto tgt = jit::KernelFuncs::Cache().At(attr); test(tgt, args...); } @@ -845,7 +845,9 @@ void TestKernelNCHW16CMulNCTuples() { T* zjit_data = zjit.data(); constexpr int simd_width = ZMM_FLOAT_BLOCK; int C = c / simd_width; - auto tgt = jit::Get, PlaceType>(0); + auto tgt = + jit::KernelFuncs, PlaceType>::Cache().At( + 0); auto jitcode = jit::GetJitCode, PlaceType>(0); EXPECT_TRUE(tgt != nullptr); @@ -967,10 +969,10 @@ void TestKernelVBroadcastTuples() { } } -#define TEST_CPU_KERNEL(test_tuple, kernel_type) \ - TEST(JITKernel, kernel_type) { \ - TestKernel##test_tuple(); \ - TestKernel##test_tuple(); \ +#define TEST_CPU_KERNEL(test_tuple, kernel_type) \ + TEST(JITKernel, kernel_type) { \ + TestKernel##test_tuple(); \ + TestKernel##test_tuple(); \ } TEST_CPU_KERNEL(XYZNTuples, kVMul); @@ -1041,4 +1043,18 @@ TEST(JITKernel_key, gru) { EXPECT_TRUE(key2 == key3); EXPECT_TRUE(key3 != key4); } -// TODO(TJ): add more test about key and pool + +TEST(JITKernel, kernel_func) { + auto f1 = + jit::KernelFuncs, CPUPlace>::Cache() + .At(3); + auto f2 = jit::KernelFuncs, + CPUPlace>::Cache()[3]; + EXPECT_TRUE(f1 == f2); + + f1 = jit::KernelFuncs, CPUPlace>::Cache() + .At(3); + f2 = jit::KernelFuncs, CPUPlace>::Cache() + .At(4); + EXPECT_TRUE(f1 != f2); +} diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h index f564a103963..f0c3064d413 100644 --- a/paddle/fluid/operators/layer_norm_op.h +++ b/paddle/fluid/operators/layer_norm_op.h @@ -229,9 +229,9 @@ class LayerNormKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(scale->numel(), right); PADDLE_ENFORCE_EQ(bias->numel(), right); - auto ker = - jit::Get, platform::CPUPlace>( - right); + auto ker = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(right); ker(x.data(), out.data(), mean->data(), var->data(), scale->data(), bias->data(), static_cast(left), static_cast(epsilon), right); diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 2a47502614b..db103e5fab1 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -255,9 +255,9 @@ class SequencePoolFunctor { jit::seq_pool_attr_t attr( static_cast(input.numel() / input.dims()[0]), jit::SeqPoolType::kSum); - auto seqpool = - jit::Get, platform::CPUPlace>( - attr); + auto seqpool = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { attr.h = static_cast(lod[i + 1] - lod[i]); seqpool(src, dst, &attr); diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h index c9c9f530fe8..0425a3d1942 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.h +++ b/paddle/fluid/operators/optimizers/sgd_op.h @@ -47,8 +47,9 @@ class SGDOpKernel : public framework::OpKernel { int64_t rows_idx = 0; T *out_data = param_out->mutable_data(ctx.GetPlace()); - auto sgd = - jit::Get, platform::CPUPlace>(attr); + auto sgd = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr); sgd(lr, param_data, grad_data, &rows_idx, out_data, &attr); } else if (grad_var->IsType()) { // TODO(qijun): In Sparse SGD operator, in-place update is enforced. 
@@ -81,8 +82,9 @@ class SGDOpKernel : public framework::OpKernel<T> {
       attr.selected_rows_size = grad_rows.size();
       PADDLE_ENFORCE_EQ(attr.grad_width, attr.param_width);
-      auto sgd =
-          jit::Get<jit::kSgd, jit::SgdTuples<T>, platform::CPUPlace>(attr);
+      auto sgd = jit::KernelFuncs<jit::kSgd, jit::SgdTuples<T>,
+                                  platform::CPUPlace>::Cache()
+                     .At(attr);
       sgd(lr, param_data, grad_data, rows_data, out_data, &attr);
     } else {
       PADDLE_THROW("Unsupported Variable Type of Grad");
--
GitLab
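
Editor's note on the helper.h hunk above: the unified cache keys each kernel function by an XXH64 hash of its attribute, so repeated lookups with the same attribute return a memoized function instead of re-resolving it through jit::Get on every call. The standalone sketch below mirrors that mechanism under stated assumptions: KernelCache, ResolveVScal, and the FNV-1a hash are illustrative stand-ins for Paddle's KernelFuncs template, the offline-benchmarked resolver, and XXH64; none of them is the actual API.

// Standalone sketch of the caching pattern introduced in helper.h (assumptions
// noted in the text above; this is not Paddle's real KernelFuncs class).
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <unordered_map>

template <typename Attr, typename Func>
class KernelCache {
 public:
  // Thread-local singleton, mirroring the static Cache() accessor in the patch.
  static KernelCache& Cache() {
    static thread_local KernelCache cache;
    return cache;
  }

  // Key the function by a 64-bit hash of the attribute's raw bytes; on a miss,
  // resolve the kernel once and memoize it. The real code hashes with XXH64
  // and resolves via jit::Get, which returns the offline-benchmarked best one.
  Func At(const Attr& attr, Func (*resolve)(const Attr&)) {
    const int64_t key = HashBytes(&attr, sizeof(Attr));
    auto it = funcs_.find(key);
    if (it != funcs_.end()) {
      return it->second;
    }
    Func f = resolve(attr);
    funcs_.emplace(key, f);
    return f;
  }

 private:
  // FNV-1a over the attribute bytes; a placeholder for XXH64.
  static int64_t HashBytes(const void* data, size_t n) {
    const unsigned char* p = static_cast<const unsigned char*>(data);
    uint64_t h = 1469598103934665603ull;
    for (size_t i = 0; i < n; ++i) {
      h = (h ^ p[i]) * 1099511628211ull;
    }
    return static_cast<int64_t>(h);
  }

  std::unordered_map<int64_t, Func> funcs_;
};

// Toy "kernel": scale a float vector in place. A real resolver would choose
// among jitcode/intrinsic/mkl/reference implementations for the given size.
using VScalFunc = void (*)(float, float*, int);
void RefVScal(float a, float* x, int n) {
  for (int i = 0; i < n; ++i) x[i] *= a;
}
VScalFunc ResolveVScal(const int& /*d*/) { return RefVScal; }

int main() {
  float v[4] = {1.f, 2.f, 3.f, 4.f};
  // The same attribute yields the same cached function object, which is what
  // the new JITKernel kernel_func unit test asserts for the real cache.
  VScalFunc f1 = KernelCache<int, VScalFunc>::Cache().At(4, ResolveVScal);
  VScalFunc f2 = KernelCache<int, VScalFunc>::Cache().At(4, ResolveVScal);
  f1(2.f, v, 4);
  std::cout << (f1 == f2) << " " << v[0] << std::endl;  // prints "1 2"
  return 0;
}

Paddle's actual interface takes only the attribute (At(attr), or the operator[] added in this patch), since kernel resolution is built into the cache; the resolver parameter above exists only to keep the sketch self-contained.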