diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 212724a0c7fc06edf0599ddcaad33aba1fead879..6a37b5ca433a3baa1388dd4f720d782ca53e4e99 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -75,7 +75,8 @@ paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'outp paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)) paddle.fluid.layers.sequence_expand_as ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.sequence_unpad ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)) paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) @@ -84,6 +85,7 @@ paddle.fluid.layers.reduce_min ArgSpec(args=['input', 'dim', 'keep_dim', 'name'] paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.sequence_slice ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None)) paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)) paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 4576999c8ec756ee662ed91df198e1a96f9897fc..b212666637a5289c9c6cd3585655deaeed8afd4b 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -101,7 +101,7 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); + var->GetMutable>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 8e1f93c5ebd448903d70f9668539e077875836e4..3e9353f5cf67d8de62c5551f12ea786e49190549 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -27,8 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, // be created. VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index; Variable* g_feed_value = scope->Var(var_name); - auto& feed_inputs = - *(g_feed_value->GetMutable>()); + auto& feed_inputs = *(g_feed_value->GetMutable()); if (index >= feed_inputs.size()) { feed_inputs.resize(index + 1); } diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index ba10687d65cfbbac89cfc76879c8b202ebd03229..2840d503f1454271afb309efdd435225ab077dc0 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -37,7 +37,7 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); + var->GetMutable>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 9f930065324f13f5aa79c214e820fb6fc2f3a166..14fcde2fe3b1c3acfc0994e9cd37a784c57826d7 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -149,9 +149,17 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { platform::SetDeviceId(dev_id); #endif } - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(Type(), pool.Get(place)); - RunImpl(scope, place); + + // The profile has a process-wide mutex, results in serious performance issue + // in concurrency scenerio. Here use an `if` to fix this issue. + // Please not remove the `if`, ask @Superjomn if there are any concern. + if (platform::IsProfileEnabled()) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); + RunImpl(scope, place); + } else { + RunImpl(scope, place); + } VLOG(3) << place << " " << DebugStringEx(&scope); } diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 1d7a2eb5b38255531880fe3d2e5321024caf0c6b..69bcbc0e5891f95af4de8dfd49a25648ca920ab1 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -36,6 +36,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto size = src.numel() * SizeOfType(src.type()); if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } memory::Copy(boost::get(dst_place), dst_ptr, boost::get(src_place), src_ptr, size); } @@ -71,6 +76,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto stream = reinterpret_cast(ctx).stream(); if (platform::is_same_place(src_place, dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else { @@ -114,6 +124,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, auto dst_ptr = dst->mutable_data(dst_place, src.type()); auto size = src.numel() * SizeOfType(src.type()); if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data from " << src_place << " to " + << dst_place; + return; + } memory::Copy(boost::get(dst_place), dst_ptr, boost::get(src_place), src_ptr, size); } @@ -130,6 +145,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr); } else if (platform::is_gpu_place(src_place) && platform::is_gpu_place(dst_place)) { + if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) { + VLOG(3) << "Skip copy the same data from " << src_place << " to " + << dst_place; + return; + } auto src_gpu_place = boost::get(src_place); auto dst_gpu_place = boost::get(dst_place); memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index a1e5b967a86d10f3439db662af54bb82888027b9..793ccfc79fe56707f226477b9d50b1d972ab6a59 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -41,6 +41,11 @@ TEST(TensorCopy, Tensor) { EXPECT_EQ(src_ptr[i], dst_ptr[i]); } + TensorCopy(dst_tensor, *cpu_place, &dst_tensor); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); Tensor slice_tensor = src_tensor.Slice(1, 2); @@ -82,6 +87,15 @@ TEST(TensorCopy, Tensor) { EXPECT_EQ(src_ptr[i], dst_ptr[i]); } + // Copy the same tensor + TensorCopy(gpu_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + gpu_ctx.Wait(); + const int* dst_ptr_tmp = dst_tensor.data(); + EXPECT_NE(src_ptr, dst_ptr_tmp); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]); + } + Tensor slice_tensor = src_tensor.Slice(1, 2); // CPU Slice Tensor to GPU Tensor diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h index e33849ef502fb10b913e7e28cbd0abdb8b8ff9bb..9d3fb811191c207c75845ef8f8486e8beac7525a 100644 --- a/paddle/fluid/framework/var_desc.h +++ b/paddle/fluid/framework/var_desc.h @@ -59,6 +59,7 @@ class VarDesc { public: explicit VarDesc(const std::string &name) { desc_.set_name(name); + // TODO(paddle-dev): Why default to lodtensor. desc_.mutable_type()->set_type(proto::VarType::LOD_TENSOR); } diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 067e0c2b8389f88639fd9b95bd680702517efee1..873e1b20a584df3ba90cf5c1a62a3879bf98ce5c 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -38,8 +38,12 @@ class Variable { template T* GetMutable() { - if (!IsType()) { + if (!holder_) { holder_.reset(new PlaceholderImpl(new T())); + } else { + PADDLE_ENFORCE(IsType(), + "Variable must be type %s, the holding type is %s", + typeid(T).name(), holder_->Type().name()); } return static_cast(holder_->Ptr()); } diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc index c5c1d215f4a6affae0a3bdafacec40a2aee2ca19..003dcfd3dfe5ecfd563a686bb72b061aff602f73 100644 --- a/paddle/fluid/framework/variable_test.cc +++ b/paddle/fluid/framework/variable_test.cc @@ -33,9 +33,10 @@ TEST(Variable, GetMutable) { const Tensor& tt = v->Get(); EXPECT_EQ(1234, tt.content_); - std::string* s = v->GetMutable(); - *s = "hello"; - - const std::string& ss = v->Get(); - EXPECT_EQ("hello", ss); + try { + v->GetMutable(); + } catch (std::exception& e) { + return; + } + EXPECT_TRUE(false); } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index f9135ff9d7be16b7dfb08c044188c1d7a567da4e..3095dee0f0106b2408663cd32bb4fb310111eda4 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -340,6 +340,19 @@ bool AnalysisPredictor::LoadProgramDesc() { } return true; } + +AnalysisPredictor::~AnalysisPredictor() { +#if !defined(_WIN32) + if (FLAGS_profile) { + platform::DisableProfiler(platform::EventSortingKey::kTotal, + "./profile.log"); + } +#endif + if (sub_scope_) { + scope_->DeleteScope(sub_scope_); + } +} + std::unique_ptr AnalysisPredictor::Clone() { auto *x = new AnalysisPredictor(config_); x->Init(scope_, inference_program_); diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 0d01d7ac2b29ea6364b07af9bb3bdeb5ced6bd00..5a9f4d36959d4ee7ca16dec769d9d1283b8787cb 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -72,6 +72,7 @@ class AnalysisPredictor : public PaddlePredictor { template void GetFetchOne(const framework::LoDTensor &fetchs, PaddleTensor *output_data); + ~AnalysisPredictor(); private: contrib::AnalysisConfig config_; diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 031109398d8e21ad95f19f65fdd814e1782889e6..df3e3fcd9c75f03f4d9b0a7c12788f06bfdefd7f 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -300,7 +300,7 @@ op_library(flatten_op DEPS reshape_op) op_library(sequence_pad_op DEPS sequence_padding) op_library(unstack_op DEPS stack_op) op_library(fake_quantize_op DEPS memory) -op_library(fusion_lstm_op DEPS cpu_lstm_compute) +op_library(fusion_lstm_op DEPS jit_kernel) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(layer_norm_op DEPS cub) diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 2826b82117db113d4d8c10095e89f610ca895775..e04a68717b351ddb0be5a7e70aa9297e5eb0125f 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -70,6 +70,12 @@ class FillConstantOp : public framework::OperatorBase { } }; +class FillConstantOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override {} +}; + class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -102,4 +108,5 @@ Fill up a variable with specified constant value. namespace ops = paddle::operators; REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantInferShape, ops::FillConstantOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + ops::FillConstantOpVarTypeInference); diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc index ae1f6d8e489039667d861a69acabf2c632ef2061..067e6a3e7cccc1f15ebdd984f3a2441339a989ab 100644 --- a/paddle/fluid/operators/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -15,11 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/fusion_lstm_op.h" #include #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/cpu_lstm_compute.h" -#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/operators/math/jit_kernel.h" #include "paddle/fluid/operators/math/sequence2batch.h" -#include "paddle/fluid/platform/cpu_info.h" namespace paddle { namespace operators { @@ -219,121 +217,55 @@ This operator fuse the X into LSTM, more details can refer to LSTM op. template class FuisonLSTMKernel : public framework::OpKernel { public: -#define INIT_VEC_FUNC \ - std::function act_gate, act_cell, act_cand; \ - auto& act_gate_str = ctx.Attr("gate_activation"); \ - auto& act_cell_str = ctx.Attr("cell_activation"); \ - auto& act_cand_str = ctx.Attr("candidate_activation"); \ - if (platform::jit::MayIUse(platform::jit::avx)) { \ - math::VecActivations act_functor; \ - act_gate = act_functor(act_gate_str); \ - act_cell = act_functor(act_cell_str); \ - act_cand = act_functor(act_cand_str); \ - } else { \ - math::VecActivations act_functor; \ - act_gate = act_functor(act_gate_str); \ - act_cell = act_functor(act_cell_str); \ - act_cand = act_functor(act_cand_str); \ - } - -#define INIT_BASE_INPUT_OUTPUT \ - auto* x = ctx.Input("X"); \ - auto* h0 = ctx.Input("H0"); \ - auto* c0 = ctx.Input("C0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* wh = ctx.Input("WeightH"); \ - auto* bias = ctx.Input("Bias"); \ - auto* xx = ctx.Output("XX"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - auto* cell_out = ctx.Output("Cell"); \ - bool is_reverse = ctx.Attr("is_reverse"); \ - bool use_peepholes = ctx.Attr("use_peepholes"); - -#define INIT_BASE_SIZES \ - auto x_dims = x->dims(); /* T x M*/ \ - auto wh_dims = wh->dims(); /* D x 4D*/ \ - const int M = x_dims[1]; \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const int D3 = D * 3; \ - const int D4 = wh_dims[1]; - -#define INIT_BASE_INPUT_DATAS \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - /* diagonal weight*/ \ - const T* wc_data = bias->data() + D4; \ - /* for peephole only*/ \ - T* checked_cell_data = nullptr; \ - auto place = ctx.GetPlace(); \ - if (use_peepholes) { \ - /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - auto* checked_cell = ctx.Output("CheckedCell"); \ - checked_cell_data = checked_cell->mutable_data(place); \ - } - -/// Compute LSTM +#define INIT_BASE_DEFINES \ + using DeviceContext = paddle::platform::CPUDeviceContext; \ + auto* x = ctx.Input("X"); \ + auto* h0 = ctx.Input("H0"); \ + auto* c0 = ctx.Input("C0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* wh = ctx.Input("WeightH"); \ + auto* bias = ctx.Input("Bias"); \ + auto* xx = ctx.Output("XX"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + auto* cell_out = ctx.Output("Cell"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + bool use_peepholes = ctx.Attr("use_peepholes"); \ + auto x_dims = x->dims(); /* T x M*/ \ + auto wh_dims = wh->dims(); /* D x 4D*/ \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D4 = wh_dims[1] + +#define INIT_OTHER_DEFINES \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wp_data = bias->data() + D4; \ + /* for peephole only*/ \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + auto* checked_cell = ctx.Output("CheckedCell"); \ + checked_cell_data = checked_cell->mutable_data(place); \ + } \ + const auto& ker = \ + math::jitkernel::KernelPool::Instance() \ + .template Get, const std::string&, \ + const std::string&, const std::string&>( \ + ctx.Attr("gate_activation"), \ + ctx.Attr("candidate_activation"), \ + ctx.Attr("cell_activation"), D, use_peepholes) + +// Wh GEMM #define GEMM_WH_ADDON(bs, prev, out) \ blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast(1), prev, D, \ wh_data, D4, static_cast(1), out, D4) -#define GET_Ct(ct_1, gates, ct) \ - /* C_t = C_t-1 * fgated + cand_gated * igated*/ \ - act_cand(D, gates, gates); \ - blas.VMUL(D, gates, gates + D, gates + D); \ - blas.VMUL(D, ct_1, gates + D2, gates + D2); \ - blas.VADD(D, gates + D, gates + D2, ct) - -#define GET_Ht(ct, gates, ht) \ - /* H_t = act_cell(C_t) * ogated */ \ - act_cell(D, ct, gates + D2); \ - blas.VMUL(D, gates + D2, gates + D3, ht) - -#define GET_Ct_NOH0C0(gates, ct) \ - /* C_t = igated * cgated*/ \ - act_gate(D, gates + D, gates + D); \ - act_cand(D, gates, gates); \ - blas.VMUL(D, gates, gates + D, ct) - -#define COMPUTE_CtHt_NOH0C0(gates, ct, ht) \ - GET_Ct_NOH0C0(gates, ct); \ - act_gate(D, gates + D3, gates + D3); \ - GET_Ht(ct, gates, ht) - -#define COMPUTE_CtHt_PEEPHOLE_NOH0C0(gates, ct, ht) \ - GET_Ct_NOH0C0(gates, ct); \ - /* get outgated, put W_oc * C_t on igated */ \ - blas.VMUL(D, wc_data + D2, ct, gates + D); \ - blas.VADD(D, gates + D, gates + D3, gates + D3); \ - act_gate(D, gates + D3, gates + D3); \ - GET_Ht(ct, gates, ht) - -#define COMPUTE_CtHt(gates, ct_1, ct, ht) \ - act_gate(D3, gates + D, gates + D); \ - GET_Ct(ct_1, gates, ct); \ - GET_Ht(ct, gates, ht) - -#define COMPUTE_CtHt_PEEPHOLE(gates, ct_1, ct, ht) \ - /* get fgated and igated*/ \ - blas.VMUL(D, wc_data, ct_1, checked_cell_data); \ - blas.VMUL(D, wc_data + D, ct_1, checked_cell_data + D); \ - blas.VADD(D2, checked_cell_data, gates + D, gates + D); \ - act_gate(D2, gates + D, gates + D); \ - GET_Ct(ct_1, gates, ct); \ - /* get ogated*/ \ - blas.VMUL(D, wc_data + D2, ct, gates + D); \ - blas.VADD(D, gates + D, gates + D3, gates + D3); \ - act_gate(D, gates + D3, gates + D3); \ - GET_Ht(ct, gates, ht) - void SeqCompute(const framework::ExecutionContext& ctx) const { - using DeviceContext = paddle::platform::CPUDeviceContext; - INIT_BASE_INPUT_OUTPUT - INIT_BASE_SIZES - INIT_VEC_FUNC - INIT_BASE_INPUT_DATAS - + INIT_BASE_DEFINES; + INIT_OTHER_DEFINES; auto x_lod = x->lod(); const int total_T = x_dims[0]; const int N = x_lod[0].size() - 1; @@ -357,89 +289,47 @@ class FuisonLSTMKernel : public framework::OpKernel { gate_offset = -D; } -#define MOVE_ONE_STEP \ - prev_h_data = h_out_data; \ - prev_c_data = c_out_data; \ - xx_data = xx_data + xx_offset; \ - h_out_data = h_out_data + gate_offset; \ - c_out_data = c_out_data + gate_offset - -#define PROCESS_H0C0_DEFINES \ - int bid = is_reverse ? N - 1 - i : i; \ - int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; \ - const T* prev_c_data = nullptr; \ - const T* prev_h_data = nullptr; \ - int tstart = 0 - -#define PROCESS_H0C0_PEEPHOLE \ - PROCESS_H0C0_DEFINES; \ - if (h0_data) { \ - prev_h_data = h0_data + bid * D; \ - prev_c_data = c0_data + bid * D; \ - } else { \ - COMPUTE_CtHt_PEEPHOLE_NOH0C0(xx_data, c_out_data, h_out_data); \ - MOVE_ONE_STEP; \ - tstart = 1; \ - } - -#define PROCESS_H0C0 \ - PROCESS_H0C0_DEFINES; \ - if (h0_data) { \ - prev_h_data = h0_data + bid * D; \ - prev_c_data = c0_data + bid * D; \ - } else { \ - COMPUTE_CtHt_NOH0C0(xx_data, c_out_data, h_out_data); \ - MOVE_ONE_STEP; \ - tstart = 1; \ - } - - if (use_peepholes) { - for (int i = 0; i < N; ++i) { - PROCESS_H0C0_PEEPHOLE - for (int step = tstart; step < seq_len; ++step) { - GEMM_WH_ADDON(1, prev_h_data, xx_data); - COMPUTE_CtHt_PEEPHOLE(xx_data, prev_c_data, c_out_data, h_out_data); - MOVE_ONE_STEP; - } - } - } else { - // TODO(TJ): unly workaround, clean me - std::function compute_ctht; - if (platform::jit::MayIUse(platform::jit::avx) && - act_gate_str == "sigmoid" && act_cand_str == "tanh" && - act_cell_str == "tanh" && D == 8) { - compute_ctht = math::lstm_compute_ctht; + for (int i = 0; i < N; ++i) { + int bid = is_reverse ? N - 1 - i : i; + int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; + const T* prev_c_data = nullptr; + const T* prev_h_data = nullptr; + int tstart = 0; + if (h0_data) { + prev_h_data = h0_data + bid * D; + prev_c_data = c0_data + bid * D; } else { - compute_ctht = [&](T* gates, const T* ct_1, T* ct, T* ht) { - COMPUTE_CtHt(gates, ct_1, ct, ht); - }; + ker->ComputeC1H1(xx_data, c_out_data, h_out_data, wp_data); + tstart = 1; + // move one step + prev_h_data = h_out_data; + prev_c_data = c_out_data; + xx_data = xx_data + xx_offset; + h_out_data = h_out_data + gate_offset; + c_out_data = c_out_data + gate_offset; } - for (int i = 0; i < N; ++i) { - PROCESS_H0C0 - for (int step = tstart; step < seq_len; ++step) { - GEMM_WH_ADDON(1, prev_h_data, xx_data); - compute_ctht(xx_data, prev_c_data, c_out_data, h_out_data); - MOVE_ONE_STEP; - } + for (int step = tstart; step < seq_len; ++step) { + GEMM_WH_ADDON(1, prev_h_data, xx_data); + ker->ComputeCtHt(xx_data, prev_c_data, c_out_data, h_out_data, wp_data, + checked_cell_data); + // move one step + prev_h_data = h_out_data; + prev_c_data = c_out_data; + xx_data = xx_data + xx_offset; + h_out_data = h_out_data + gate_offset; + c_out_data = c_out_data + gate_offset; } } -#undef PROCESS_H0C0_DEFINES -#undef PROCESS_H0C0_PEEPHOLE -#undef PROCESS_H0C0 -#undef MOVE_ONE_STEP } void BatchCompute(const framework::ExecutionContext& ctx) const { - using DeviceContext = platform::CPUDeviceContext; - INIT_BASE_INPUT_OUTPUT - INIT_BASE_SIZES + INIT_BASE_DEFINES; if (x->lod()[0].size() == 2) { xx->Resize({x_dims[0], D4}); SeqCompute(ctx); return; } - INIT_VEC_FUNC - INIT_BASE_INPUT_DATAS + INIT_OTHER_DEFINES; auto* reordered_h0 = ctx.Output("ReorderedH0"); auto* reordered_c0 = ctx.Output("ReorderedC0"); @@ -487,8 +377,8 @@ class FuisonLSTMKernel : public framework::OpKernel { prev_c_data = reordered_c0_data; size_t sz = sizeof(T) * D; for (int i = 0; i < max_bs; ++i) { - std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz); - std::memcpy(reordered_c0_data, c0_data + seq_order[i] * D, sz); + blas.VCOPY(sz, h0_data + seq_order[i] * D, reordered_h0_data); + blas.VCOPY(sz, c0_data + seq_order[i] * D, reordered_c0_data); reordered_h0_data += D; reordered_c0_data += D; } @@ -498,13 +388,7 @@ class FuisonLSTMKernel : public framework::OpKernel { T* cur_h_out_data = batched_h_out_data; T* cur_c_out_data = batched_c_out_data; for (int i = 0; i < max_bs; ++i) { - GET_Ct_NOH0C0(cur_in_data, cur_c_out_data); - if (use_peepholes) { - blas.VMUL(D, wc_data + D2, cur_c_out_data, cur_in_data + D); - blas.VADD(D, cur_in_data + D, cur_in_data + D3, cur_in_data + D3); - } - act_gate(D, cur_in_data + D3, cur_in_data + D3); - GET_Ht(cur_c_out_data, cur_in_data, cur_h_out_data); + ker->ComputeC1H1(cur_in_data, cur_c_out_data, cur_h_out_data, wp_data); cur_in_data += D4; cur_c_out_data += D; cur_h_out_data += D; @@ -513,71 +397,37 @@ class FuisonLSTMKernel : public framework::OpKernel { prev_h_data = batched_h_out_data; prev_c_data = batched_c_out_data; } + + // compute kernel part const auto& batch_starts = batched_lod[0]; const int max_seq_len = batch_starts.size() - 1; const int offset = tstart * max_bs * D; batched_input_data = batched_input_data + offset * 4; batched_h_out_data = batched_h_out_data + offset; batched_c_out_data = batched_c_out_data + offset; - -#define DEFINE_CUR \ - T* cur_in_data = batched_input_data; \ - T* cur_prev_c_data = prev_c_data; \ - T* cur_c_out_data = batched_c_out_data; \ - T* cur_h_out_data = batched_h_out_data - -#define MOVE_ONE_BATCH \ - cur_in_data += D4; \ - cur_prev_c_data += D; \ - cur_c_out_data += D; \ - cur_h_out_data += D - -#define MOVE_ONE_STEP \ - prev_c_data = batched_c_out_data; \ - prev_h_data = batched_h_out_data; \ - batched_c_out_data = cur_c_out_data; \ - batched_h_out_data = cur_h_out_data; \ - batched_input_data = cur_in_data - - if (use_peepholes) { - for (int step = tstart; step < max_seq_len; ++step) { - const int cur_bs = batch_starts[step + 1] - batch_starts[step]; - GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data); - DEFINE_CUR; - for (int i = 0; i < cur_bs; ++i) { - COMPUTE_CtHt_PEEPHOLE(cur_in_data, cur_prev_c_data, cur_c_out_data, - cur_h_out_data); - MOVE_ONE_BATCH; - } - MOVE_ONE_STEP; - } - } else { - // TODO(TJ): unly workaround, clean me - std::function compute_ctht; - if (platform::jit::MayIUse(platform::jit::avx) && - act_gate_str == "sigmoid" && act_cand_str == "tanh" && - act_cell_str == "tanh" && D == 8) { - compute_ctht = math::lstm_compute_ctht; - } else { - compute_ctht = [&](T* gates, const T* ct_1, T* ct, T* ht) { - COMPUTE_CtHt(gates, ct_1, ct, ht); - }; - } - for (int step = tstart; step < max_seq_len; ++step) { - const int cur_bs = batch_starts[step + 1] - batch_starts[step]; - GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data); - DEFINE_CUR; - for (int i = 0; i < cur_bs; ++i) { - compute_ctht(cur_in_data, cur_prev_c_data, cur_c_out_data, - cur_h_out_data); - MOVE_ONE_BATCH; - } - MOVE_ONE_STEP; + for (int step = tstart; step < max_seq_len; ++step) { + const int cur_bs = batch_starts[step + 1] - batch_starts[step]; + GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data); + T* cur_in_data = batched_input_data; + T* cur_prev_c_data = prev_c_data; + T* cur_c_out_data = batched_c_out_data; + T* cur_h_out_data = batched_h_out_data; + for (int i = 0; i < cur_bs; ++i) { + ker->ComputeCtHt(cur_in_data, cur_prev_c_data, cur_c_out_data, + cur_h_out_data, wp_data, checked_cell_data); + // move one batch + cur_in_data += D4; + cur_prev_c_data += D; + cur_c_out_data += D; + cur_h_out_data += D; } + // move one step + prev_c_data = batched_c_out_data; + prev_h_data = batched_h_out_data; + batched_c_out_data = cur_c_out_data; + batched_h_out_data = cur_h_out_data; + batched_input_data = cur_in_data; } -#undef MOVE_ONE_STEP -#undef MOVE_ONE_BATCH -#undef DEFINE_CUR math::Batch2LoDTensorFunctor to_seq; batched_h_out->set_lod(batched_lod); @@ -594,18 +444,9 @@ class FuisonLSTMKernel : public framework::OpKernel { } } -#undef COMPUTE_CtHt_PEEPHOLE -#undef COMPUTE_CtHt -#undef GET_Ct_NOH0C0 -#undef COMPUTE_CtHt_NOH0C0 -#undef COMPUTE_CtHt_PEEPHOLE_NOH0C0 -#undef GET_Ht -#undef GET_Ct #undef GEMM_WH_ADDON -#undef INIT_BASE_INPUT_DATAS -#undef INIT_BASE_SIZES -#undef INIT_BASE_INPUT_OUTPUT -#undef INIT_VEC_FUNC +#undef INIT_OTHER_DEFINES +#undef INIT_BASE_DEFINES }; } // namespace operators diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index 248c7793560db99c0af06421bf74808422016061..7b42efd623b31a703bf51d2d157130b3120b42a4 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -60,7 +60,7 @@ class OverflowOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) 1-dim tensor, contains a bool scalar. The output " "tensor of overflow operator."); AddComment(string::Sprintf(R"DOC( -Overflow operator. +Overflow %s operator. $$Out = any(X)$$ @@ -69,6 +69,8 @@ Out = Inf if any X contains Inf, Out = Nan if any X contains Nan, Out = 0 if no Inf/Nan detected. If X contains both Inf/Nan, it will return the first indicator it meeted. + +%s )DOC", GetName(), GetComments())); } diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index b0276f4080b3989047f9d181e9d37a18dcfad5fa..7365bfeeb8edf09a8ad5e1cb2c61300e86bdf518 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -45,8 +45,6 @@ math_library(im2col) if (NOT WIN32) # windows do not support avx functions yet. math_library(gru_compute DEPS activation_functions math_function) math_library(lstm_compute DEPS activation_functions) -# TODO(TJ): ugly workaround, clean me -cc_library(cpu_lstm_compute SRCS cpu_lstm_compute.cc DEPS activation_functions cblas cpu_info) endif (NOT WIN32) cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) @@ -76,3 +74,7 @@ if(WITH_GPU) endif() cc_test(concat_test SRCS concat_test.cc DEPS concat) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) +cc_library(jit_kernel + SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_lstm.cc + DEPS cpu_info cblas activation_functions) +cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) diff --git a/paddle/fluid/operators/math/cpu_lstm_compute.cc b/paddle/fluid/operators/math/cpu_lstm_compute.cc deleted file mode 100644 index e96d1879331974e0873e13f171414bcfa8c45953..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/cpu_lstm_compute.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/cpu_lstm_compute.h" - -namespace paddle { -namespace operators { -namespace math { -#ifdef __AVX__ -template <> -void lstm_compute_ctht(float* gates, const float* ct_1, float* ct, - float* ht) { - namespace act = detail::forward::avx; - // gates: W_ch, W_ih, W_fh, W_oh - __m256 c, i, f, o; - c = _mm256_loadu_ps(gates); - i = _mm256_loadu_ps(gates + 8); - f = _mm256_loadu_ps(gates + 16); - o = _mm256_loadu_ps(gates + 24); - - /* C_t = C_t-1 * fgated + cand_gated * igated*/ - c = _mm256_mul_ps(act::Tanh(c), act::Sigmoid(i)); - i = _mm256_loadu_ps(ct_1); - f = _mm256_mul_ps(i, act::Sigmoid(f)); - f = _mm256_add_ps(c, f); - _mm256_storeu_ps(ct, f); - - /* H_t = act_cell(C_t) * ogated */ - o = _mm256_mul_ps(act::Tanh(f), act::Sigmoid(o)); - _mm256_storeu_ps(ht, o); -} -#endif -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/cpu_lstm_compute.h b/paddle/fluid/operators/math/cpu_lstm_compute.h deleted file mode 100644 index 169a9e4b47f54851ad436428416eca879b78e186..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/cpu_lstm_compute.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/operators/math/cpu_vec.h" -#include "paddle/fluid/platform/cpu_info.h" -#ifdef __AVX__ -#include -#endif - -namespace paddle { -namespace operators { -namespace math { - -// TODO(TJ): ugly workaround, clean me -template -void lstm_compute_ctht(T* gates, const T* ct_1, T* ct, T* ht) { - // gates: W_ch, W_ih, W_fh, W_oh - vec_sigmoid(24, gates + 8, gates + 8); - vec_tanh(8, gates, gates); - const T *i = gates + 8, *f = gates + 16, *o = gates + 24; - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - for (int d = 0; d < 8; ++d) { - // C_t = C_t-1 * fgated + cand_gated * igated - ct[d] = ct_1[d] * f[d] + gates[d] * i[d]; - // H_t = act_cell(C_t) * ogated - T tmp = ct[d] * 2; - tmp = static_cast(0) - ((tmp < min) ? min : ((tmp > max) ? max : tmp)); - vec_exp(1, &tmp, &tmp); - tmp = static_cast(2) / (static_cast(1) + tmp) - static_cast(1); - ht[d] = tmp * o[d]; - } -} - -#ifdef __AVX__ -namespace detail { -namespace forward { -namespace avx { -__m256 Sigmoid(const __m256 a); -__m256 Tanh(const __m256 a); - -} // namespace avx -} // namespace forward -} // namespace detail - -template <> -void lstm_compute_ctht(float* gates, const float* ct_1, float* ct, - float* ht); - -#endif - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 6a059968b79189458349e466079cc7a663a8e5ff..0aed253c80fc28560716cbcfa70f74ef9c84f9b6 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -125,10 +125,8 @@ inline void vec_scal(const int n, const float a, } template <> -inline void vec_scal(const int n, - const float a, - const float* x, - float* y) { +inline void vec_scal(const int n, const float a, + const float* x, float* y) { // TODO(TJ): enable me vec_scal(n, a, x, y); } @@ -181,10 +179,10 @@ inline void vec_bias_sub(const int n, const float a, } template <> -inline void vec_bias_sub(const int n, - const float a, - const float* x, - float* y) { +inline void vec_bias_sub(const int n, + const float a, + const float* x, + float* y) { // TODO(TJ): enable me vec_bias_sub(n, a, x, y); } @@ -242,7 +240,7 @@ inline void vec_cross(const int n, const float* x, } template <> -inline void vec_cross( +inline void vec_cross( const int n, const float* x, const float* y, const float* z, float* out) { // TODO(TJ): enable me vec_cross(n, x, y, z, out); @@ -296,10 +294,10 @@ inline void vec_add_bias(const int n, const float a, } template <> -inline void vec_add_bias(const int n, - const float a, - const float* x, - float* y) { +inline void vec_add_bias(const int n, + const float a, + const float* x, + float* y) { // TODO(TJ): enable me vec_add_bias(n, a, x, y); } @@ -390,9 +388,9 @@ inline void vec_sigmoid(const int n, const float* x, } template <> -inline void vec_sigmoid(const int n, - const float* x, - float* y) { +inline void vec_sigmoid(const int n, + const float* x, + float* y) { // TODO(TJ): enable me vec_sigmoid(n, x, y); } @@ -454,9 +452,8 @@ inline void vec_relu(const int n, const float* x, } template <> -inline void vec_relu(const int n, - const float* x, - float* y) { +inline void vec_relu(const int n, const float* x, + float* y) { // TODO(TJ): enable me vec_relu(n, x, y); } diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc index 3ce66f49ed8354c49e8af26ca6eb48fef654a40b..cd40f1b2f984126663a5711efac24fdf6d680b32 100644 --- a/paddle/fluid/operators/math/cpu_vec_test.cc +++ b/paddle/fluid/operators/math/cpu_vec_test.cc @@ -110,7 +110,7 @@ TEST(CpuVecTest, sigmoid) { TestAndBench(sz, vec_sigmoid, ref_sigmoid); TestAndBench(sz, vec_sigmoid, ref_sigmoid); TestAndBench(sz, vec_sigmoid, ref_sigmoid); - TestAndBench(sz, vec_sigmoid, + TestAndBench(sz, vec_sigmoid, ref_sigmoid); } TestAndBench(30, vec_sigmoid, ref_sigmoid); @@ -123,8 +123,7 @@ TEST(CpuVecTest, tanh) { TestAndBench(sz, vec_tanh, ref_tanh); TestAndBench(sz, vec_tanh, ref_tanh); TestAndBench(sz, vec_tanh, ref_tanh); - TestAndBench(sz, vec_tanh, - ref_tanh); + TestAndBench(sz, vec_tanh, ref_tanh); } TestAndBench(30, vec_tanh, ref_tanh); } @@ -136,8 +135,7 @@ TEST(CpuVecTest, relu) { TestAndBench(sz, vec_relu, ref_relu); TestAndBench(sz, vec_relu, ref_relu); TestAndBench(sz, vec_relu, ref_relu); - TestAndBench(sz, vec_relu, - ref_relu); + TestAndBench(sz, vec_relu, ref_relu); } TestAndBench(30, vec_relu, ref_relu); } @@ -170,7 +168,7 @@ TEST(CpuVecTest, inplace_sigmoid) { TestInplace(sz, vec_sigmoid, ref_sigmoid); TestInplace(sz, vec_sigmoid, ref_sigmoid); TestInplace(sz, vec_sigmoid, ref_sigmoid); - TestInplace(sz, vec_sigmoid, + TestInplace(sz, vec_sigmoid, ref_sigmoid); } TestInplace(30, vec_sigmoid, ref_sigmoid); @@ -183,8 +181,7 @@ TEST(CpuVecTest, inplace_tanh) { TestInplace(sz, vec_tanh, ref_tanh); TestInplace(sz, vec_tanh, ref_tanh); TestInplace(sz, vec_tanh, ref_tanh); - TestInplace(sz, vec_tanh, - ref_tanh); + TestInplace(sz, vec_tanh, ref_tanh); } TestInplace(30, vec_tanh, ref_tanh); } @@ -196,8 +193,7 @@ TEST(CpuVecTest, inplace_relu) { TestInplace(sz, vec_relu, ref_relu); TestInplace(sz, vec_relu, ref_relu); TestInplace(sz, vec_relu, ref_relu); - TestInplace(sz, vec_relu, - ref_relu); + TestInplace(sz, vec_relu, ref_relu); } TestInplace(30, vec_relu, ref_relu); } diff --git a/paddle/fluid/operators/math/jit_kernel.cc b/paddle/fluid/operators/math/jit_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..68b708b345334bc63b5e2e88c308d20ca6378e6b --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel.cc @@ -0,0 +1,41 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/jit_kernel.h" +#include +#include + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { + +namespace jit = platform::jit; + +KernelPool& KernelPool::Instance() { + static thread_local KernelPool g_jit_kernels; + return g_jit_kernels; +} + +std::shared_ptr KernelPool::Get(const std::string& key) const { + if (kers_.find(key) == kers_.end()) { + return nullptr; + } + return kers_.at(key); +} + +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..b4dfda6db76fd4231be0acd1f90c98a2d62134b8 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include // for shared_ptr +#include +#include +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/macros.h" + +// Note: Only support on CPU yet. +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 +#define AVX_FLOAT_BLOCK 8 +#define AVX2_FLOAT_BLOCK 8 +#define AVX512_FLOAT_BLOCK 16 + +typedef enum { kLT8, kEQ8, kGT8LT16, kEQ16, kGT16 } jit_block; + +class Kernel { + public: + Kernel() = default; + virtual ~Kernel() = default; + int num_{0}; + int end_{0}; + int rest_{0}; + DISABLE_COPY_AND_ASSIGN(Kernel); +}; + +class KernelPool { + public: + static KernelPool &Instance(); + + template + std::shared_ptr Get(ARGS... args); + + std::shared_ptr Get(const std::string &key) const; + + private: + KernelPool() = default; + std::unordered_map> kers_; + + DISABLE_COPY_AND_ASSIGN(KernelPool); +}; + +template +class VMulKernel : public Kernel { + public: + virtual void Compute(const T *x, const T *y, T *z) const = 0; +}; + +template +class VAddKernel : public Kernel { + public: + virtual void Compute(const T *x, const T *y, T *z) const = 0; +}; + +template +class VScalKernel : public Kernel { + public: + virtual void Compute(const T a, const T *x, T *y) const = 0; + virtual void Compute(const T a, T *x) const = 0; +}; + +template +class VAddBiasKernel : public Kernel { + public: + virtual void Compute(const T a, const T *x, T *y) const = 0; +}; + +template +class VActKernel : public Kernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class VReluKernel : public VActKernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class VIdentityKernel : public VActKernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class VExpKernel : public VActKernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class VSigmoidKernel : public VActKernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class VTanhKernel : public VActKernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class LSTMKernel : public Kernel { + public: + virtual void ComputeCtHt(T *gates, const T *ct_1, T *ct, T *ht, + /* below only used in peephole*/ + const T *wp_data = nullptr, + T *checked = nullptr) const = 0; + + // compute c1 and h1 without c0 or h0 + virtual void ComputeC1H1(T *gates, T *ct, T *ht, + /* below only used in peephole*/ + const T *wp_data = nullptr) const = 0; +}; + +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f9ea533fccdd34a5ccf061d89ffe92687d65933 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -0,0 +1,391 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/jit_kernel.h" +#include +#include "paddle/fluid/operators/math/jit_kernel_macro.h" +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +#ifdef __AVX__ +#include +#endif + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { + +namespace jit = platform::jit; + +/* VMUL JitKernel */ +template +class VMulKernelImpl : public VMulKernel { + public: + explicit VMulKernelImpl(int d) : VMulKernel() { this->num_ = d; } + void Compute(const T* x, const T* y, T* z) const override { + for (int i = 0; i < this->num_; ++i) { + z[i] = x[i] * y[i]; + } + } +}; + +#ifdef PADDLE_WITH_MKLML +#define MKL_FLOAT(isa, block) \ + template <> \ + void VMulKernelImpl::Compute( \ + const float* x, const float* y, float* z) const { \ + platform::dynload::vsMul(this->num_, x, y, z); \ + } + +#define MKL_DOUBLE(isa, block) \ + template <> \ + void VMulKernelImpl::Compute( \ + const double* x, const double* y, double* z) const { \ + platform::dynload::vdMul(this->num_, x, y, z); \ + } + +FOR_EACH_ISA(MKL_FLOAT, kGT16); +FOR_EACH_ISA_BLOCK(MKL_DOUBLE); +#endif + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VMulKernelImpl::Compute( \ + const float* x, const float* y, float* z) const { \ + __m256 tmpx, tmpy; \ + tmpx = _mm256_loadu_ps(x); \ + tmpy = _mm256_loadu_ps(y); \ + tmpx = _mm256_mul_ps(tmpx, tmpy); \ + _mm256_storeu_ps(z, tmpx); \ + } + +// avx > for > mkl +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +#endif +// TODO(TJ): eq16 test and complete avx512 +#undef INTRI8_FLOAT +#undef MKL_FLOAT +#undef MKL_DOUBLE + +/* VADD JitKernel */ +template +class VAddKernelImpl : public VAddKernel { + public: + explicit VAddKernelImpl(int d) : VAddKernel() { this->num_ = d; } + void Compute(const T* x, const T* y, T* z) const override { + for (int i = 0; i < this->num_; ++i) { + z[i] = x[i] + y[i]; + } + } +}; + +#ifdef PADDLE_WITH_MKLML +#define MKL_FLOAT(isa, block) \ + template <> \ + void VAddKernelImpl::Compute( \ + const float* x, const float* y, float* z) const { \ + platform::dynload::vsAdd(this->num_, x, y, z); \ + } + +#define MKL_DOUBLE(isa, block) \ + template <> \ + void VAddKernelImpl::Compute( \ + const double* x, const double* y, double* z) const { \ + platform::dynload::vdAdd(this->num_, x, y, z); \ + } + +FOR_EACH_ISA(MKL_FLOAT, kGT16); +FOR_EACH_ISA_BLOCK(MKL_DOUBLE); +#endif + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VAddKernelImpl::Compute( \ + const float* x, const float* y, float* z) const { \ + __m256 tmpx, tmpy; \ + tmpx = _mm256_loadu_ps(x); \ + tmpy = _mm256_loadu_ps(y); \ + tmpx = _mm256_add_ps(tmpx, tmpy); \ + _mm256_storeu_ps(z, tmpx); \ + } +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +#endif +// TODO(TJ): eq16 test and complete avx512 + +#undef INTRI8_FLOAT +#undef MKL_FLOAT +#undef MKL_DOUBLE + +/* VSCAL JitKernel */ +template +class VScalKernelImpl : public VScalKernel { + public: + explicit VScalKernelImpl(int d) : VScalKernel() { this->num_ = d; } + void Compute(const T a, const T* x, T* y) const override { + for (int i = 0; i < this->num_; ++i) { + y[i] = a * x[i]; + } + } + void Compute(const T a, T* x) const override { + for (int i = 0; i < this->num_; ++i) { + x[i] = a * x[i]; + } + } +}; + +#ifdef PADDLE_WITH_MKLML +#define MKL_FLOAT(isa, block) \ + template <> \ + void VScalKernelImpl::Compute(const float a, float* x) \ + const { \ + platform::dynload::cblas_sscal(this->num_, a, x, 1); \ + } + +#define MKL_DOUBLE(isa, block) \ + template <> \ + void VScalKernelImpl::Compute(const double a, double* x) \ + const { \ + platform::dynload::cblas_dscal(this->num_, a, x, 1); \ + } + +FOR_EACH_ISA(MKL_FLOAT, kGT16); +FOR_EACH_ISA_BLOCK(MKL_DOUBLE); +#endif + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VScalKernelImpl::Compute( \ + const float a, const float* x, float* y) const { \ + __m256 tmp; \ + __m256 scalar = _mm256_set1_ps(a); \ + tmp = _mm256_loadu_ps(x); \ + tmp = _mm256_mul_ps(tmp, scalar); \ + _mm256_storeu_ps(y, tmp); \ + } +#define INTRI8_INPLACE_FLOAT(isa) \ + template <> \ + void VScalKernelImpl::Compute(const float a, float* x) \ + const { \ + __m256 tmp; \ + __m256 scalar = _mm256_set1_ps(a); \ + tmp = _mm256_loadu_ps(x); \ + tmp = _mm256_mul_ps(tmp, scalar); \ + _mm256_storeu_ps(x, tmp); \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +INTRI8_INPLACE_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +INTRI8_INPLACE_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +INTRI8_INPLACE_FLOAT(jit::avx512f); +#endif +// TODO(TJ): eq16 test and complete avx512 + +#undef INTRI8_FLOAT +#undef INTRI8_INPLACE_FLOAT +#undef MKL_FLOAT +#undef MKL_DOUBLE + +/* VAddBias JitKernel */ +template +class VAddBiasKernelImpl : public VAddBiasKernel { + public: + explicit VAddBiasKernelImpl(int d) : VAddBiasKernel() { this->num_ = d; } + void Compute(const T a, const T* x, T* y) const override { + for (int i = 0; i < this->num_; ++i) { + y[i] = x[i] + a; + } + } +}; + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VAddBiasKernelImpl::Compute( \ + const float a, const float* x, float* y) const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + tmp = _mm256_add_ps(tmp, _mm256_set1_ps(a)); \ + _mm256_storeu_ps(y, tmp); \ + } + +#define INTRI16_FLOAT(isa) \ + template <> \ + void VAddBiasKernelImpl::Compute( \ + const float a, const float* x, float* y) const { \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + tmp0 = _mm256_add_ps(tmp0, _mm256_set1_ps(a)); \ + tmp1 = _mm256_add_ps(tmp1, _mm256_set1_ps(a)); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +INTRI16_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +INTRI16_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +INTRI16_FLOAT(jit::avx512f); +#endif +// TODO(TJ): eq16 test and complete avx512 + +#undef INTRI8_FLOAT +#undef INTRI16_FLOAT + +/* VRelu JitKernel */ +template +class VReluKernelImpl : public VReluKernel { + public: + explicit VReluKernelImpl(int d) : VReluKernel() { this->num_ = d; } + void Compute(const T* x, T* y) const override { + for (int i = 0; i < this->num_; ++i) { + y[i] = x[i] > 0 ? x[i] : 0; + } + } +}; + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VReluKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + tmp = _mm256_max_ps(tmp, _mm256_setzero_ps()); \ + _mm256_storeu_ps(y, tmp); \ + } + +#define INTRI16_FLOAT(isa) \ + template <> \ + void VReluKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 zeros = _mm256_setzero_ps(); \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + tmp0 = _mm256_max_ps(tmp0, zeros); \ + tmp1 = _mm256_max_ps(tmp1, zeros); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ + } + +#define INTRI_GT8LT16_FLOAT(isa) \ + template <> \ + VReluKernelImpl::VReluKernelImpl(int d) \ + : VReluKernel() { \ + this->num_ = d; \ + this->end_ = AVX_FLOAT_BLOCK; \ + this->rest_ = d - AVX_FLOAT_BLOCK; \ + } \ + template <> \ + void VReluKernelImpl::Compute(const float* x, \ + float* y) const { \ + __m256 zeros = _mm256_setzero_ps(); \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + this->rest_); \ + tmp0 = _mm256_max_ps(tmp0, zeros); \ + tmp1 = _mm256_max_ps(tmp1, zeros); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + this->rest_, tmp1); \ + } + +#define INTRI_GT16_FLOAT(isa) \ + template <> \ + VReluKernelImpl::VReluKernelImpl(int d) \ + : VReluKernel() { \ + this->num_ = d; \ + this->end_ = d - d % AVX_FLOAT_BLOCK; \ + this->rest_ = d - AVX_FLOAT_BLOCK; \ + } \ + template <> \ + void VReluKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 zeros = _mm256_setzero_ps(); \ + for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ + __m256 tmp = _mm256_loadu_ps(x + i); \ + tmp = _mm256_max_ps(tmp, zeros); \ + _mm256_storeu_ps(y + i, tmp); \ + } \ + __m256 tmp = _mm256_loadu_ps(x + this->rest_); \ + tmp = _mm256_max_ps(tmp, zeros); \ + _mm256_storeu_ps(y + this->rest_, tmp); \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +INTRI16_FLOAT(jit::avx); +INTRI_GT8LT16_FLOAT(jit::avx); +INTRI_GT16_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +INTRI16_FLOAT(jit::avx2); +INTRI_GT8LT16_FLOAT(jit::avx2); +INTRI_GT16_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +// TODO(TJ): refine avx512 +INTRI8_FLOAT(jit::avx512f); +INTRI16_FLOAT(jit::avx512f); +INTRI_GT8LT16_FLOAT(jit::avx512f); +INTRI_GT16_FLOAT(jit::avx512f); +#endif + +#undef INTRI8_FLOAT +#undef INTRI16_FLOAT +#undef INTRI_GT8LT16_FLOAT +#undef INTRI_GT16_FLOAT + +/* An empty JitKernel */ +template +class VIdentityKernelImpl : public VIdentityKernel { + public: + explicit VIdentityKernelImpl(int d) : VIdentityKernel() { this->num_ = d; } + void Compute(const T* x, T* y) const override {} +}; + +REGISTER_JITKERNEL(vmul, VMulKernel); +REGISTER_JITKERNEL(vadd, VAddKernel); +REGISTER_JITKERNEL(vscal, VScalKernel); +REGISTER_JITKERNEL(vaddb, VAddBiasKernel); +REGISTER_JITKERNEL(vrelu, VReluKernel); +REGISTER_JITKERNEL(videntity, VIdentityKernel); + +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc new file mode 100644 index 0000000000000000000000000000000000000000..b62e130c43743f542e2074868fc01598047d6b19 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -0,0 +1,400 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/jit_kernel.h" +#include // for exp +#include +#include "paddle/fluid/operators/math/jit_kernel_macro.h" +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +#ifdef __AVX__ +#include +#endif + +namespace paddle { +namespace operators { +namespace math { + +#ifdef __AVX__ +namespace detail { +__m256 Exp(__m256 a); +} // namespace detail +#endif + +namespace jitkernel { +namespace jit = platform::jit; + +/* VExp JitKernel */ +template +class VExpKernelImpl : public VExpKernel { + public: + explicit VExpKernelImpl(int d) : VExpKernel() { this->num_ = d; } + void Compute(const T* x, T* y) const override { + for (int i = 0; i < this->num_; ++i) { + y[i] = std::exp(x[i]); + } + } +}; + +#ifdef PADDLE_WITH_MKLML +#define MKL_FLOAT(isa, block) \ + template <> \ + void VExpKernelImpl::Compute(const float* x, float* y) \ + const { \ + platform::dynload::vsExp(this->num_, x, y); \ + } + +#define MKL_DOUBLE(isa, block) \ + template <> \ + void VExpKernelImpl::Compute(const double* x, double* y) \ + const { \ + platform::dynload::vdExp(this->num_, x, y); \ + } +FOR_EACH_ISA(MKL_FLOAT, kLT8); +FOR_EACH_ISA(MKL_FLOAT, kGT8LT16); +FOR_EACH_ISA(MKL_FLOAT, kGT16); +FOR_EACH_ISA_BLOCK(MKL_DOUBLE); +#endif + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VExpKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + _mm256_storeu_ps(y, detail::Exp(tmp)); \ + } + +#define INTRI16_FLOAT(isa) \ + template <> \ + void VExpKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + tmp0 = detail::Exp(tmp0); \ + tmp1 = detail::Exp(tmp1); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +INTRI16_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +INTRI16_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +INTRI16_FLOAT(jit::avx512f); +#endif +// TODO(TJ): eq16 test and complete avx512 + +#undef INTRI8_FLOAT +#undef INTRI16_FLOAT +#undef MKL_FLOAT +#undef MKL_DOUBLE + +REGISTER_JITKERNEL(vexp, VExpKernel); + +/* VSigmoid JitKernel */ +template +class VSigmoidKernelImpl : public VSigmoidKernel { + public: + explicit VSigmoidKernelImpl(int d) : VSigmoidKernel() { + this->num_ = d; + vexp_ = KernelPool::Instance().template Get>(d); + } + void Compute(const T* x, T* y) const override { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < this->num_; ++i) { + y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); + y[i] = static_cast(0) - y[i]; + } + vexp_->Compute(y, y); + for (int i = 0; i < this->num_; ++i) { + y[i] = static_cast(1) / (static_cast(1) + y[i]); + } + } + + private: + std::shared_ptr> vexp_; +}; + +#define INTRI_SIGMOID(tmp, min, max) \ + tmp = _mm256_max_ps(tmp, min); \ + tmp = _mm256_min_ps(tmp, max); \ + tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); \ + tmp = detail::Exp(tmp); \ + tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \ + tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp) + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VSigmoidKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ + __m256 tmp = _mm256_loadu_ps(x); \ + INTRI_SIGMOID(tmp, min, max); \ + _mm256_storeu_ps(y, tmp); \ + } + +#define INTRI16_FLOAT(isa) \ + template <> \ + void VSigmoidKernelImpl::Compute(const float* x, \ + float* y) const { \ + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + INTRI_SIGMOID(tmp0, min, max); \ + INTRI_SIGMOID(tmp1, min, max); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ + } + +#define INTRI_GT8LT16_FLOAT(isa) \ + template <> \ + VSigmoidKernelImpl::VSigmoidKernelImpl(int d) \ + : VSigmoidKernel() { \ + this->num_ = d; \ + this->end_ = AVX_FLOAT_BLOCK; \ + this->rest_ = d - this->end_; \ + vexp_ = \ + KernelPool::Instance().template Get>(this->rest_); \ + } \ + template <> \ + void VSigmoidKernelImpl::Compute(const float* x, \ + float* y) const { \ + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ + __m256 tmp = _mm256_loadu_ps(x); \ + INTRI_SIGMOID(tmp, min, max); \ + _mm256_storeu_ps(y, tmp); \ + const float min_ = SIGMOID_THRESHOLD_MIN; \ + const float max_ = SIGMOID_THRESHOLD_MAX; \ + for (int i = this->end_; i < this->num_; ++i) { \ + y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \ + y[i] = 0.f - y[i]; \ + } \ + vexp_->Compute(y + this->end_, y + this->end_); \ + for (int i = this->end_; i < this->num_; ++i) { \ + y[i] = 1.f / (1.f + y[i]); \ + } \ + } + +#define INTRI_GT16_FLOAT(isa) \ + template <> \ + VSigmoidKernelImpl::VSigmoidKernelImpl(int d) \ + : VSigmoidKernel() { \ + this->num_ = d; \ + this->rest_ = d % AVX_FLOAT_BLOCK; \ + this->end_ = d - this->rest_; \ + vexp_ = \ + KernelPool::Instance().template Get>(this->rest_); \ + } \ + template <> \ + void VSigmoidKernelImpl::Compute(const float* x, \ + float* y) const { \ + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ + for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ + __m256 tmp = _mm256_loadu_ps(x + i); \ + INTRI_SIGMOID(tmp, min, max); \ + _mm256_storeu_ps(y + i, tmp); \ + } \ + const float min_ = SIGMOID_THRESHOLD_MIN; \ + const float max_ = SIGMOID_THRESHOLD_MAX; \ + for (int i = this->end_; i < this->num_; ++i) { \ + y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \ + y[i] = 0.f - y[i]; \ + } \ + vexp_->Compute(y + this->end_, y + this->end_); \ + for (int i = this->end_; i < this->num_; ++i) { \ + y[i] = 1.f / (1.f + y[i]); \ + } \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +INTRI16_FLOAT(jit::avx); +INTRI_GT8LT16_FLOAT(jit::avx); +INTRI_GT16_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +INTRI16_FLOAT(jit::avx2); +// INTRI_GT8LT16_FLOAT(jit::avx2); +// INTRI_GT16_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +INTRI16_FLOAT(jit::avx512f); +// INTRI_GT8LT16_FLOAT(jit::avx512f); +// INTRI_GT16_FLOAT(jit::avx512f); +#endif + +#undef INTRI8_FLOAT +#undef INTRI16_FLOAT +#undef INTRI_GT8LT16_FLOAT +#undef INTRI_GT16_FLOAT +#undef INTRI_VSIGMOID + +REGISTER_JITKERNEL(vsigmoid, VSigmoidKernel); + +/* VTanh JitKernel */ +template +class VTanhKernelImpl : public VTanhKernel { + public: + explicit VTanhKernelImpl(int d) : VTanhKernel() { + this->num_ = d; + vscal_ = KernelPool::Instance().template Get>(d); + vsigmoid_ = KernelPool::Instance().template Get>(d); + vaddbias_ = KernelPool::Instance().template Get>(d); + } + void Compute(const T* x, T* y) const override { + vscal_->Compute(static_cast(2), x, y); + vsigmoid_->Compute(y, y); + vscal_->Compute(static_cast(2), y); + vaddbias_->Compute(static_cast(-1), y, y); + } + + private: + std::shared_ptr> vscal_; + std::shared_ptr> vsigmoid_; + std::shared_ptr> vaddbias_; +}; + +#define INTRI_VTANH(tmp) \ + tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), tmp); \ + tmp = _mm256_min_ps(tmp, _mm256_set1_ps(EXP_MAX_INPUT)); \ + tmp = detail::Exp(tmp); \ + tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \ + tmp = _mm256_div_ps(_mm256_set1_ps(2.0f), tmp); \ + tmp = _mm256_sub_ps(tmp, _mm256_set1_ps(1.0f)) + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VTanhKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + INTRI_VTANH(tmp); \ + _mm256_storeu_ps(y, tmp); \ + } + +#define INTRI16_FLOAT(isa) \ + template <> \ + void VTanhKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + INTRI_VTANH(tmp0); \ + INTRI_VTANH(tmp1); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ + } + +#define INTRI_GT8LT16_FLOAT(isa) \ + template <> \ + VTanhKernelImpl::VTanhKernelImpl(int d) \ + : VTanhKernel() { \ + this->num_ = d; \ + this->end_ = AVX_FLOAT_BLOCK; \ + this->rest_ = d - this->end_; \ + vscal_ = \ + KernelPool::Instance().template Get>(this->rest_); \ + vsigmoid_ = KernelPool::Instance().template Get>( \ + this->rest_); \ + vaddbias_ = KernelPool::Instance().template Get>( \ + this->rest_); \ + } \ + template <> \ + void VTanhKernelImpl::Compute(const float* x, \ + float* y) const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + INTRI_VTANH(tmp); \ + _mm256_storeu_ps(y, tmp); \ + x += AVX_FLOAT_BLOCK; \ + y += AVX_FLOAT_BLOCK; \ + vscal_->Compute(2.f, x, y); \ + vsigmoid_->Compute(y, y); \ + vscal_->Compute(2.f, y); \ + vaddbias_->Compute(-1.f, y, y); \ + } + +#define INTRI_GT16_FLOAT(isa) \ + template <> \ + VTanhKernelImpl::VTanhKernelImpl(int d) \ + : VTanhKernel() { \ + this->num_ = d; \ + this->rest_ = d % AVX_FLOAT_BLOCK; \ + this->end_ = d - this->rest_; \ + vscal_ = \ + KernelPool::Instance().template Get>(this->rest_); \ + vsigmoid_ = KernelPool::Instance().template Get>( \ + this->rest_); \ + vaddbias_ = KernelPool::Instance().template Get>( \ + this->rest_); \ + } \ + template <> \ + void VTanhKernelImpl::Compute(const float* x, float* y) \ + const { \ + for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ + __m256 tmp = _mm256_loadu_ps(x + i); \ + INTRI_VTANH(tmp); \ + _mm256_storeu_ps(y + i, tmp); \ + } \ + x += this->end_; \ + y += this->end_; \ + vscal_->Compute(2.f, x, y); \ + vsigmoid_->Compute(y, y); \ + vscal_->Compute(2.f, y); \ + vaddbias_->Compute(-1.f, y, y); \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +INTRI16_FLOAT(jit::avx); +INTRI_GT8LT16_FLOAT(jit::avx); +INTRI_GT16_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +INTRI16_FLOAT(jit::avx2); +// maybe use avx at gt8lt16 and gt16 +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +INTRI16_FLOAT(jit::avx512f); +// maybe use avx at gt8lt16 and gt16 +#endif + +#undef INTRI8_FLOAT +#undef INTRI16_FLOAT +#undef INTRI_GT8LT16_FLOAT +#undef INTRI_GT16_FLOAT +#undef INTRI_VTANH + +REGISTER_JITKERNEL(vtanh, VTanhKernel); + +#undef JITKERNEL_NEW_ACT_IMPL + +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_lstm.cc b/paddle/fluid/operators/math/jit_kernel_lstm.cc new file mode 100644 index 0000000000000000000000000000000000000000..42a2b96fd945c516f8c26ca51ecb452345a9a86f --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_lstm.cc @@ -0,0 +1,308 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/jit_kernel.h" +#include +#include "paddle/fluid/operators/math/jit_kernel_macro.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" + +#ifdef __AVX__ +#include +#endif + +namespace paddle { +namespace operators { +namespace math { +#ifdef __AVX__ +namespace detail { +__m256 Exp(__m256 a); +} // namespace detail +#endif + +namespace jitkernel { +namespace jit = platform::jit; + +#ifdef __AVX__ +typedef enum { kSigmoid, kRelu, kTanh, kIdentity } act_type; + +class AVXAct { + public: + virtual ~AVXAct() = default; + virtual __m256 Compute(__m256 x) const = 0; +}; + +template +class AVXActImpl : public AVXAct { + public: + __m256 Compute(__m256 x) const override { PADDLE_THROW("Unkown type!"); } +}; + +template <> +__m256 AVXActImpl::Compute(__m256 x) const { + __m256 ones = _mm256_set1_ps(1.0f); + x = _mm256_max_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MIN)); + x = _mm256_min_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MAX)); + x = _mm256_sub_ps(_mm256_set1_ps(0.0f), x); + x = detail::Exp(x); + x = _mm256_add_ps(ones, x); + return _mm256_div_ps(ones, x); +} + +template <> +__m256 AVXActImpl::Compute(__m256 x) const { + __m256 ones = _mm256_set1_ps(1.0f); + x = _mm256_mul_ps(_mm256_set1_ps(-2.0f), x); + x = _mm256_min_ps(x, _mm256_set1_ps(EXP_MAX_INPUT)); + x = detail::Exp(x); + x = _mm256_add_ps(ones, x); + x = _mm256_div_ps(_mm256_set1_ps(2.0f), x); + return _mm256_sub_ps(x, ones); +} + +template <> +__m256 AVXActImpl::Compute(__m256 x) const { + return _mm256_max_ps(x, _mm256_setzero_ps()); +} + +template <> +__m256 AVXActImpl::Compute(__m256 x) const { + return x; +} +#endif + +template +static std::shared_ptr> GetActKernel( + const std::string& type, int n) { + if (type == "sigmoid") { + return std::dynamic_pointer_cast>( + KernelPool::Instance().template Get>(n)); + } else if (type == "relu") { + return std::dynamic_pointer_cast>( + KernelPool::Instance().template Get>(n)); + } else if (type == "tanh") { + return std::dynamic_pointer_cast>( + KernelPool::Instance().template Get>(n)); + } else if (type == "identity" || type == "") { + return std::dynamic_pointer_cast>( + KernelPool::Instance().template Get>(n)); + } + PADDLE_THROW("Not support type: %s", type); + return nullptr; +} + +/* LSTM JitKernel */ +template +class LSTMKernelImpl : public LSTMKernel { + public: + explicit LSTMKernelImpl(const std::string& act_gate, + const std::string& act_cand, + const std::string& act_cell, int d) + : LSTMKernel() { + d_ = d; + d2_ = d * 2; + d3_ = d * 3; + act_gate_d3_ = GetActKernel(act_gate, d3_); + act_gate_d_ = GetActKernel(act_gate, d); + act_cand_d_ = GetActKernel(act_cand, d); + act_cell_d_ = GetActKernel(act_cell, d); + vmul_d_ = KernelPool::Instance().template Get>(d); + vadd_d_ = KernelPool::Instance().template Get>(d); +#ifdef __AVX__ + auto GetAVXAct = [&](const std::string& type) -> std::unique_ptr { + if (type == "sigmoid") { + return std::unique_ptr(new AVXActImpl()); + } else if (type == "relu") { + return std::unique_ptr(new AVXActImpl()); + } else if (type == "tanh") { + return std::unique_ptr(new AVXActImpl()); + } else if (type == "identity" || type == "") { + return std::unique_ptr(new AVXActImpl()); + } + PADDLE_THROW("Not support type: %s", type); + }; + avx_act_gate_ = GetAVXAct(act_gate); + avx_act_cand_ = GetAVXAct(act_cand); + avx_act_cell_ = GetAVXAct(act_cell); +#endif + } + + void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data, + T* checked) const override { + // gates: W_ch, W_ih, W_fh, W_oh + act_gate_d3_->Compute(gates + d_, gates + d_); + + /* C_t = C_t-1 * fgated + cand_gated * igated */ + act_cand_d_->Compute(gates, gates); + vmul_d_->Compute(gates, gates + d_, gates + d_); + vmul_d_->Compute(ct_1, gates + d2_, gates + d2_); + vadd_d_->Compute(gates + d_, gates + d2_, ct); + + /* H_t = act_cell(C_t) * ogated */ + act_cell_d_->Compute(ct, gates + d2_); + vmul_d_->Compute(gates + d2_, gates + d3_, ht); + } + void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { + /* C_t = igated * cgated*/ + act_gate_d_->Compute(gates + d_, gates + d_); + act_cand_d_->Compute(gates, gates); + vmul_d_->Compute(gates, gates + d_, ct); + /* H_t = act_cell(C_t) * ogated */ + act_gate_d_->Compute(gates + d3_, gates + d3_); + act_cell_d_->Compute(ct, gates + d2_); + vmul_d_->Compute(gates + d2_, gates + d3_, ht); + } + + private: + int d_, d2_, d3_; + std::shared_ptr> act_gate_d3_, act_gate_d_, act_cand_d_, + act_cell_d_; + std::shared_ptr> vmul_d_; + std::shared_ptr> vadd_d_; +#ifdef __AVX__ + std::unique_ptr avx_act_gate_, avx_act_cand_, avx_act_cell_; +#endif +}; + +#define INTRI8_FLOAT(isa) \ + template <> \ + void LSTMKernelImpl::ComputeCtHt( \ + float* gates, const float* ct_1, float* ct, float* ht, \ + const float* wp_data, float* checked) const { \ + /* gates: W_ch, W_ih, W_fh, W_oh */ \ + __m256 c, i, f, o; \ + c = _mm256_loadu_ps(gates); \ + i = _mm256_loadu_ps(gates + 8); \ + f = _mm256_loadu_ps(gates + 16); \ + o = _mm256_loadu_ps(gates + 24); \ + /* C_t = C_t-1 * fgated + cand_gated * igated*/ \ + c = _mm256_mul_ps(avx_act_cand_->Compute(c), avx_act_gate_->Compute(i)); \ + i = _mm256_loadu_ps(ct_1); \ + f = _mm256_mul_ps(i, avx_act_gate_->Compute(f)); \ + f = _mm256_add_ps(c, f); \ + _mm256_storeu_ps(ct, f); \ + /* H_t = act_cell(C_t) * ogated */ \ + o = _mm256_mul_ps(avx_act_cell_->Compute(f), avx_act_gate_->Compute(o)); \ + _mm256_storeu_ps(ht, o); \ + } + +// TODO(TJ): optimize keq16 + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +#endif + +/* Peephole JitKernel */ +template +class PeepholeKernelImpl : public LSTMKernel { + public: + explicit PeepholeKernelImpl(const std::string& act_gate, + const std::string& act_cand, + const std::string& act_cell, int d) + : LSTMKernel() { + d_ = d; + d2_ = d * 2; + d3_ = d * 3; + act_gate_d_ = GetActKernel(act_gate, d); + act_cand_d_ = GetActKernel(act_cand, d); + act_cell_d_ = GetActKernel(act_cell, d); + vmul_d_ = KernelPool::Instance().template Get>(d); + vadd_d_ = KernelPool::Instance().template Get>(d); + vadd_d2_ = KernelPool::Instance().template Get>(d2_); + act_gate_d2_ = GetActKernel(act_gate, d2_); + } + + void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data, + T* checked) const override { + /* get fgated and igated*/ + vmul_d_->Compute(wp_data, ct_1, checked); + vmul_d_->Compute(wp_data + d_, ct_1, checked + d_); + vadd_d2_->Compute(checked, gates + d_, gates + d_); + act_gate_d2_->Compute(gates + d_, gates + d_); + /* C_t = C_t-1 * fgated + cand_gated * igated*/ + act_cand_d_->Compute(gates, gates); + vmul_d_->Compute(gates, gates + d_, gates + d_); + vmul_d_->Compute(ct_1, gates + d2_, gates + d2_); + vadd_d_->Compute(gates + d_, gates + d2_, ct); + /* get ogated*/ + vmul_d_->Compute(wp_data + d2_, ct, gates + d_); + vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_); + act_gate_d_->Compute(gates + d3_, gates + d3_); + /* H_t = act_cell(C_t) * ogated */ + act_cell_d_->Compute(ct, gates + d2_); + vmul_d_->Compute(gates + d2_, gates + d3_, ht); + } + + void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { + /* C_t = igated * cgated*/ + act_gate_d_->Compute(gates + d_, gates + d_); + act_cand_d_->Compute(gates, gates); + vmul_d_->Compute(gates, gates + d_, ct); + /* get outgated, put W_oc * C_t on igated */ + vmul_d_->Compute(wp_data + d2_, ct, gates + d_); + vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_); + /* H_t = act_cell(C_t) * ogated */ + act_gate_d_->Compute(gates + d3_, gates + d3_); + act_cell_d_->Compute(ct, gates + d2_); + vmul_d_->Compute(gates + d2_, gates + d3_, ht); + } + + private: + int d_, d2_, d3_; + std::shared_ptr> act_gate_d2_, act_gate_d_, act_cand_d_, + act_cell_d_; + std::shared_ptr> vmul_d_; + std::shared_ptr> vadd_d_, vadd_d2_; +}; + +#define JITKERNEL_DECLARE_LSTM(ker_class, ker_dtype) \ + template <> \ + std::shared_ptr> \ + KernelPool::Get, const std::string&, \ + const std::string&, const std::string&, int, bool>( \ + const std::string& act_gate, const std::string& act_cand, \ + const std::string& act_cell, int d, bool use_peephole) + +#define JITKERNEL_KEY_LSTM(ker_key, dtype_key) \ + #ker_key #dtype_key + std::to_string(d) + act_gate + act_cand + act_cell + \ + (use_peephole ? "p" : "n") + +#define JITKERNEL_NEW_LSTM_IMPL(ker, dtype, isa, k) \ + if (use_peephole) { \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>( \ + act_gate, act_cand, act_cell, d)); \ + } else { \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>(act_gate, act_cand, \ + act_cell, d)); \ + } + +REGISTER_JITKERNEL_ARGS(lstm, LSTMKernel, JITKERNEL_DECLARE_LSTM, + JITKERNEL_KEY_LSTM, JITKERNEL_NEW_LSTM_IMPL); + +#undef INTRI8_FLOAT +#undef JITKERNEL_DECLARE_LSTM +#undef JITKERNEL_KEY_LSTM +#undef JITKERNEL_NEW_LSTM_IMPL +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_macro.h b/paddle/fluid/operators/math/jit_kernel_macro.h new file mode 100644 index 0000000000000000000000000000000000000000..d8e55f2673560ff6afa34376b73275b57a8ceea1 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_macro.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { + +namespace jit = platform::jit; + +#define SEARCH_BLOCK(macro_, ker, dtype, isa) \ + if (d < AVX_FLOAT_BLOCK) { \ + macro_(ker, dtype, isa, kLT8); \ + } else if (d == AVX_FLOAT_BLOCK) { \ + macro_(ker, dtype, isa, kEQ8); \ + } else if (d > AVX_FLOAT_BLOCK && d < AVX512_FLOAT_BLOCK) { \ + macro_(ker, dtype, isa, kGT8LT16); \ + } else if (d == AVX512_FLOAT_BLOCK) { \ + macro_(ker, dtype, isa, kEQ16); \ + } else { \ + macro_(ker, dtype, isa, kGT16); \ + } + +#define SEARCH_ISA_BLOCK(macro_, ker, dtype) \ + if (jit::MayIUse(jit::avx512f)) { \ + SEARCH_BLOCK(macro_, ker, dtype, jit::avx512f); \ + } else if (jit::MayIUse(jit::avx2)) { \ + SEARCH_BLOCK(macro_, ker, dtype, jit::avx2); \ + } else if (jit::MayIUse(jit::avx)) { \ + SEARCH_BLOCK(macro_, ker, dtype, jit::avx); \ + } else { \ + SEARCH_BLOCK(macro_, ker, dtype, jit::isa_any); \ + } + +#define JITKERNEL_DECLARE(ker_class, ker_dtype) \ + template <> \ + std::shared_ptr> \ + KernelPool::Get, int>(int d) + +#define JITKERNEL_KEY(ker_key, dtype_key) \ + #ker_key #dtype_key + std::to_string(d) + +#define JITKERNEL_NEW_IMPL(ker, dtype, isa, k) \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>(d)) + +#define JITKERNEL_WITH_DTYPE(ker_key, ker_class, ker_dtype, dtype_key, \ + marco_declare, macro_key, macro_impl) \ + marco_declare(ker_class, ker_dtype) { \ + std::string key = macro_key(ker_key, dtype_key); \ + if (kers_.find(key) == kers_.end()) { \ + std::shared_ptr> p; \ + SEARCH_ISA_BLOCK(macro_impl, ker_class, ker_dtype); \ + kers_.insert({key, std::dynamic_pointer_cast(p)}); \ + return p; \ + } \ + return std::dynamic_pointer_cast>( \ + kers_.at(key)); \ + } + +#define REGISTER_JITKERNEL(ker_key, ker_class) \ + JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, JITKERNEL_DECLARE, \ + JITKERNEL_KEY, JITKERNEL_NEW_IMPL); \ + JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, JITKERNEL_DECLARE, \ + JITKERNEL_KEY, JITKERNEL_NEW_IMPL) + +#define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_declare, macro_key, \ + macro_impl) \ + JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, marco_declare, macro_key, \ + macro_impl); \ + JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, marco_declare, \ + macro_key, macro_impl) + +#define FOR_EACH_ISA(macro_, block) \ + macro_(jit::avx512f, block); \ + macro_(jit::avx2, block); \ + macro_(jit::avx, block); \ + macro_(jit::isa_any, block) + +#define FOR_EACH_BLOCK(macro_, isa) \ + macro_(isa, kLT8); \ + macro_(isa, kEQ8); \ + macro_(isa, kGT8LT16); \ + macro_(isa, kEQ16); \ + macro_(isa, kGT16) + +#define FOR_EACH_ISA_BLOCK(macro_) \ + FOR_EACH_BLOCK(macro_, jit::avx512f); \ + FOR_EACH_BLOCK(macro_, jit::avx2); \ + FOR_EACH_BLOCK(macro_, jit::avx); \ + FOR_EACH_BLOCK(macro_, jit::isa_any) + +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..26590171bbeaa385ac09b04e5faf483924176598 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -0,0 +1,749 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/jit_kernel.h" +#include +#include // for exp +#include // for memcpy +#include +#include +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +#ifdef __AVX__ +#include +#endif + +constexpr int repeat = 20000; + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} + +template +void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), + const T upper = static_cast(20.f)) { + static unsigned int seed = 100; + std::mt19937 rng(seed++); + std::uniform_real_distribution uniform_dist(0, 1); + for (int i = 0; i < n; ++i) { + a[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); + } +} + +void vrelu_ref(const int n, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0.f ? x[i] : 0.f; + } +} + +#if defined __AVX__ || defined __AVX2__ +void vrelu_intri8(const int n, const float* x, float* y) { + __m256 tmp = _mm256_loadu_ps(x); + tmp = _mm256_max_ps(tmp, _mm256_setzero_ps()); + _mm256_storeu_ps(y, tmp); +} +#endif + +TEST(JitKernel, vrelu) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 256, 512}) { + std::vector x(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data(), -10.f, 1.f); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vrelu_ref(d, x_data, zref_data); + } + auto trefe = GetCurrentUS(); +#if defined __AVX__ || defined __AVX2__ + if (d == 8) { + auto si0 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vrelu_intri8(d, x_data, zref_data); + } + auto si1 = GetCurrentUS(); + VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; + } +#endif + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +void vaddbias_ref(const int n, const float a, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] + a; + } +} + +TEST(JitKernel, vaddbias) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 64, 100, 128, 256}) { + std::vector x(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data(), -2.f, 2.f); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const float a = 2.f; + const float* x_data = x.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vaddbias_ref(d, a, x_data, zref_data); + } + auto trefe = GetCurrentUS(); + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(a, x_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +void vexp_ref(const int n, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = std::exp(x[i]); + } +} + +#ifdef PADDLE_WITH_MKLML +void vexp_mkl(const int n, const float* x, float* y) { + paddle::platform::dynload::vsExp(n, x, y); +} +#endif + +TEST(JitKernel, vexp) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 128, 256}) { + std::vector x(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data(), -2.f, 2.f); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vexp_ref(d, x_data, zref_data); + } + auto trefe = GetCurrentUS(); + +#ifdef PADDLE_WITH_MKLML + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vexp_mkl(d, x_data, zref_data); + } + auto tmkle = GetCurrentUS(); +#endif + + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat +#ifdef PADDLE_WITH_MKLML + << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " +#else + << " us, " +#endif + << "tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +inline float _sigmoid(float x) { + const float min = SIGMOID_THRESHOLD_MIN; + const float max = SIGMOID_THRESHOLD_MAX; + float tmp = (x < min) ? min : ((x > max) ? max : x); + return 1.f / (1.f + std::exp(-tmp)); +} + +void vsigmoid_ref(const int n, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = _sigmoid(x[i]); + } +} + +void vsigmoid_better( + const std::shared_ptr< + const paddle::operators::math::jitkernel::VExpKernel>& vexp, + const int n, const float* x, float* y) { + const float min = SIGMOID_THRESHOLD_MIN; + const float max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); + y[i] = 0.f - y[i]; + } + vexp->Compute(y, y); + for (int i = 0; i < n; ++i) { + y[i] = 1.f / (1.f + y[i]); + } +} + +TEST(JitKernel, vsigmoid) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { + std::vector x(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data(), -2.f, 2.f); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const auto& vexp = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vsigmoid_better(vexp, d, x_data, zref_data); + } + auto tmkle = GetCurrentUS(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vsigmoid_ref(d, x_data, zref_data); + } + auto trefe = GetCurrentUS(); + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +inline float _tanh(float x) { return 2.f * _sigmoid(2.f * x) - 1.f; } + +void vtanh_ref(const int n, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = _tanh(x[i]); + } +} + +void vtanh_better( + const std::shared_ptr< + const paddle::operators::math::jitkernel::VScalKernel>& vscal, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VSigmoidKernel>& + vsigmoid, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VAddBiasKernel>& + vaddbias, + const int n, const float* x, float* y) { + vscal->Compute(2.f, x, y); + vsigmoid->Compute(y, y); + vscal->Compute(2.f, y); + vaddbias->Compute(-1.f, y, y); +} + +TEST(JitKernel, vtanh) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { + std::vector x(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data(), -2.f, 2.f); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const auto& vscal = + jit::KernelPool::Instance().template Get>(d); + const auto& vsigmoid = + jit::KernelPool::Instance().template Get>(d); + const auto& vaddbias = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vtanh_better(vscal, vsigmoid, vaddbias, d, x_data, zref_data); + } + auto tmkle = GetCurrentUS(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vtanh_ref(d, x_data, zref_data); + } + auto trefe = GetCurrentUS(); + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +void lstm_ctht_ref( + const std::shared_ptr< + const paddle::operators::math::jitkernel::VSigmoidKernel>& + vsigmoid_3d, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VTanhKernel>& vtanh_d, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VExpKernel>& vexp_1, + const int d, float* gates, const float* ct_1, float* ct, float* ht) { + vsigmoid_3d->Compute(gates + d, gates + d); + vtanh_d->Compute(gates, gates); + const float *i = gates + d, *f = gates + d * 2, *o = gates + d * 3; + const float min = SIGMOID_THRESHOLD_MIN; + const float max = SIGMOID_THRESHOLD_MAX; + for (int k = 0; k < d; ++k) { + // C_t = C_t-1 * fgated + cand_gated * igated + ct[k] = ct_1[k] * f[k] + gates[k] * i[k]; + // H_t = act_cell(C_t) * ogated + float tmp = ct[k] * 2; + tmp = 0.f - ((tmp < min) ? min : ((tmp > max) ? max : tmp)); + vexp_1->Compute(&tmp, &tmp); + tmp = 2.f / (1.f + tmp) - 1.f; + ht[k] = tmp * o[k]; + } +} + +void lstm_ctht_better( + const std::shared_ptr< + const paddle::operators::math::jitkernel::VSigmoidKernel>& + vsigmoid_3d, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VTanhKernel>& vtanh_d, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VMulKernel>& vmul_d, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VAddKernel>& vadd_d, + const int d, float* gates, const float* ct_1, float* ct, float* ht) { + int d2 = d * 2; + vsigmoid_3d->Compute(gates + d, gates + d); + vtanh_d->Compute(gates, gates); + vmul_d->Compute(gates, gates + d, gates + d); + vmul_d->Compute(ct_1, gates + d2, gates + d2); + vadd_d->Compute(gates + d, gates + d2, ct); + /* H_t = act_cell(C_t) * ogated */ + vtanh_d->Compute(ct, gates + d2); + vmul_d->Compute(gates + d2, gates + d * 3, ht); +} + +TEST(JitKernel, lstm) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 32, 64, 100}) { + int d4 = d * 4; + int d3 = d * 3; + std::vector x(d4), xref(d4); + std::vector ct_1(d), ct_tgt(d), ht_tgt(d); + std::vector ct_ref(d), ht_ref(d); + RandomVec(d4, x.data(), -2.f, 2.f); + RandomVec(d, ct_1.data(), -2.f, 2.f); + memcpy(xref.data(), x.data(), sizeof(float) * d4); + std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh"; + const auto& ker = + jit::KernelPool::Instance() + .template Get, const std::string&, + const std::string&, const std::string&>( + act_gate, act_cand, act_cell, d, false); + // below kernels are used to compute refer + const auto& vsigmoid_3d = + jit::KernelPool::Instance().template Get>( + d3); + const auto& vtanh_d = + jit::KernelPool::Instance().template Get>(d); + const auto& vexp_1 = + jit::KernelPool::Instance().template Get>(1); + const auto& vmul_d = + jit::KernelPool::Instance().template Get>(d); + const auto& vadd_d = + jit::KernelPool::Instance().template Get>(d); + + float* x_data = x.data(); + float* xref_data = xref.data(); + const float* ct_1_data = ct_1.data(); + float* ct_tgt_data = ct_tgt.data(); + float* ht_tgt_data = ht_tgt.data(); + float* ct_ref_data = ct_ref.data(); + float* ht_ref_data = ht_ref.data(); + // compute once to check correctness + lstm_ctht_ref(vsigmoid_3d, vtanh_d, vexp_1, d, xref_data, ct_1_data, + ct_ref_data, ht_ref_data); + ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data); + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ct_tgt_data[i], ct_ref_data[i], 1e-3); + EXPECT_NEAR(ht_tgt_data[i], ht_ref_data[i], 1e-3); + } + + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + lstm_ctht_better(vsigmoid_3d, vtanh_d, vmul_d, vadd_d, d, xref_data, + ct_1_data, ct_ref_data, ht_ref_data); + } + auto tmkle = GetCurrentUS(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + lstm_ctht_ref(vsigmoid_3d, vtanh_d, vexp_1, d, xref_data, ct_1_data, + ct_ref_data, ht_ref_data); + } + auto trefe = GetCurrentUS(); + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data); + } + auto ttgte = GetCurrentUS(); + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, better(jit) takes: " << (tmkle - tmkls) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; + } +} + +void vscal_ref(const int n, const float a, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = a * x[i]; + } +} +void vscal_inp_ref(const int n, const float a, float* x) { + for (int i = 0; i < n; ++i) { + x[i] = a * x[i]; + } +} +#if defined __AVX__ || defined __AVX2__ +void vscal_intri8(const int n, const float a, const float* x, float* y) { + __m256 tmp; + __m256 scalar = _mm256_set1_ps(a); + tmp = _mm256_loadu_ps(x); + tmp = _mm256_mul_ps(tmp, scalar); + _mm256_storeu_ps(y, tmp); +} +void vscal_inp_intri8(const int n, const float a, float* x) { + __m256 tmp; + __m256 scalar = _mm256_set1_ps(a); + tmp = _mm256_loadu_ps(x); + tmp = _mm256_mul_ps(tmp, scalar); + _mm256_storeu_ps(x, tmp); +} +#endif + +#ifdef PADDLE_WITH_MKLML +void vscal_inp_mkl(const int n, const float a, float* x) { + paddle::platform::dynload::cblas_sscal(n, a, x, 1); +} +#endif + +TEST(JitKernel, vscal) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 256, 512}) { + std::vector x(d), y(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data()); + std::memcpy(y.data(), x.data(), sizeof(float) * d); + float a = 2.f; + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + float* y_data = y.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vscal_ref(d, a, x_data, zref_data); + } + auto trefe = GetCurrentUS(); + auto trefs1 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vscal_inp_ref(d, a, y_data); + } + auto trefe1 = GetCurrentUS(); + +#ifdef PADDLE_WITH_MKLML + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vscal_inp_mkl(d, a, y_data); + } + auto tmkle = GetCurrentUS(); +#endif + +#if defined __AVX__ || defined __AVX2__ + if (d == 8) { + auto si0 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vscal_intri8(d, a, x_data, zref_data); + } + auto si1 = GetCurrentUS(); + auto si2 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vscal_inp_intri8(d, a, y_data); + } + auto si3 = GetCurrentUS(); + VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat + << " us, inplace: " << (si3 - si2) / repeat; + } +#endif + + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(a, x_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + auto ttgts1 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(a, y_data); + } + auto ttgte1 = GetCurrentUS(); + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, inplace takes: " << (trefe1 - trefs1) / repeat +#ifdef PADDLE_WITH_MKLML + << " us, mkl inplace takes: " << (tmkle - tmkls) / repeat << " us, " +#else + << " us, " +#endif + << "tgt takes: " << (ttgte - ttgts) / repeat + << "us, tgt inplace takes: " << (ttgte1 - ttgts1) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +void vmul_ref(const int n, const float* x, const float* y, float* z) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] * y[i]; + } +} + +#if defined __AVX__ || defined __AVX2__ +void vmul_intri8(const int n, const float* x, const float* y, float* z) { + __m256 tmpx, tmpy; + tmpx = _mm256_loadu_ps(x); + tmpy = _mm256_loadu_ps(y); + tmpx = _mm256_mul_ps(tmpx, tmpy); + _mm256_storeu_ps(z, tmpx); +} +#endif + +#ifdef PADDLE_WITH_MKLML +void vmul_mkl(const int n, const float* x, const float* y, float* z) { + paddle::platform::dynload::vsMul(n, x, y, z); +} +#endif + +TEST(JitKernel, vmul) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 256, 512}) { + std::vector x(d), y(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data()); + RandomVec(d, y.data()); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + const float* y_data = y.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vmul_ref(d, x_data, y_data, zref_data); + } + auto trefe = GetCurrentUS(); + +#ifdef PADDLE_WITH_MKLML + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vmul_mkl(d, x_data, y_data, zref_data); + } + auto tmkle = GetCurrentUS(); +#endif + +#if defined __AVX__ || defined __AVX2__ + if (d == 8) { + auto si0 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vmul_intri8(d, x_data, y_data, zref_data); + } + auto si1 = GetCurrentUS(); + VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; + } +#endif + + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, y_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat +#ifdef PADDLE_WITH_MKLML + << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " +#else + << " us, " +#endif + << "tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +void vadd_ref(const int n, const float* x, const float* y, float* z) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + } +} + +#if defined __AVX__ || defined __AVX2__ +void vadd_intri8(const int n, const float* x, const float* y, float* z) { + __m256 tmpx, tmpy; + tmpx = _mm256_loadu_ps(x); + tmpy = _mm256_loadu_ps(y); + tmpx = _mm256_add_ps(tmpx, tmpy); + _mm256_storeu_ps(z, tmpx); +} +#endif + +#ifdef PADDLE_WITH_MKLML +void vadd_mkl(const int n, const float* x, const float* y, float* z) { + paddle::platform::dynload::vsAdd(n, x, y, z); +} +#endif + +TEST(JitKernel, vadd) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 256, 512}) { + std::vector x(d), y(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data()); + RandomVec(d, y.data()); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + const float* y_data = y.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vadd_ref(d, x_data, y_data, zref_data); + } + auto trefe = GetCurrentUS(); + +#ifdef PADDLE_WITH_MKLML + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vadd_mkl(d, x_data, y_data, zref_data); + } + auto tmkle = GetCurrentUS(); +#endif + +#if defined __AVX__ || defined __AVX2__ + if (d == 8) { + auto si0 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vadd_intri8(d, x_data, y_data, zref_data); + } + auto si1 = GetCurrentUS(); + VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; + } +#endif + + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, y_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat +#ifdef PADDLE_WITH_MKLML + << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " +#else + << " us, " +#endif + << "tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +TEST(JitKernel, pool) { + namespace jit = paddle::operators::math::jitkernel; + const int frame_size = 4; + std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh"; + const auto& plstm1 = + jit::KernelPool::Instance() + .template Get, const std::string&, + const std::string&, const std::string&>( + act_gate, act_cand, act_cell, frame_size, false); + const auto& plstm2 = + jit::KernelPool::Instance() + .template Get, const std::string&, + const std::string&, const std::string&>( + act_gate, act_cand, act_cell, frame_size, false); + const auto& peephole = + jit::KernelPool::Instance() + .template Get, const std::string&, + const std::string&, const std::string&>( + act_gate, act_cand, act_cell, frame_size, true); + EXPECT_TRUE(plstm1 != peephole); + + const auto& pvmul_f = + jit::KernelPool::Instance().template Get>(4); + EXPECT_TRUE(std::dynamic_pointer_cast(plstm2) != + std::dynamic_pointer_cast(pvmul_f)); + + const auto& pvmul_d = + jit::KernelPool::Instance().template Get>(4); + EXPECT_TRUE(std::dynamic_pointer_cast(pvmul_f) != + std::dynamic_pointer_cast(pvmul_d)); + + const auto& pvmul_from_key = jit::KernelPool::Instance().Get("vmulf4"); + EXPECT_EQ(pvmul_f, pvmul_from_key); + const auto& pvmul_from_key2 = jit::KernelPool::Instance().Get("vmulf5"); + EXPECT_TRUE(pvmul_from_key2 == nullptr); +} diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc index 97c36a83fc5eff421725d05f66fca05f5169d1bb..ab25628d45699dbcfc1fc5792958bae9e42e72a3 100644 --- a/paddle/fluid/operators/parallel_do_op.cc +++ b/paddle/fluid/operators/parallel_do_op.cc @@ -397,6 +397,24 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase { } }; +class ParallelDoGradOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + framework::BlockDesc *sub_block = + boost::get(op_desc.GetAttr(kParallelBlock)); + for (auto &out_vars : op_desc.Outputs()) { + for (auto &out_var : out_vars.second) { + auto &var = block->FindRecursiveOrCreateVar(out_var); + auto sub_var = sub_block->FindRecursiveOrCreateVar(out_var); + if (sub_var.GetType() != var.GetType()) { + var.SetType(sub_var.GetType()); + } + } + } + } +}; + } // namespace operators } // namespace paddle @@ -404,4 +422,5 @@ REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp, paddle::operators::ParallelDoOpProtoMaker, paddle::operators::ParallelDoGradOpDescMaker); REGISTER_OPERATOR(parallel_do_grad, paddle::operators::ParallelDoGradOp, - paddle::operators::ParallelDoGradOpShapeInference); + paddle::operators::ParallelDoGradOpShapeInference, + paddle::operators::ParallelDoGradOpVarTypeInference); diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index d72f85f2c44db2fa887732cfc05e1376a6a79e4a..500d86fec33830fc2cfb0412f1f2c7780d08eb02 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -164,7 +164,7 @@ dimension value will be copied from Input(X) at runtime. Note that the index of [2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input. 3. Input(Shape) has a higher priority than Attr(shape) if it is provided, while -Attr(shape) still should be set correctly to gurantee shape inference in +Attr(shape) still should be set correctly to gurantee shape inference in compile-time. )DOC"); @@ -259,7 +259,6 @@ class Reshape2Op : public ReshapeOp { : ReshapeOp(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext *ctx) const override { - ReshapeOp::InferShape(ctx); PADDLE_ENFORCE(ctx->HasOutput("XShape"), "Output(XShape) of ReshapeOp should not be null."); const auto &x_dims = ctx->GetInputDim("X"); @@ -270,6 +269,8 @@ class Reshape2Op : public ReshapeOp { } ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims)); ctx->ShareLoD("X", /*->*/ "XShape"); + + ReshapeOp::InferShape(ctx); } }; diff --git a/paddle/fluid/operators/sequence_concat_op.cc b/paddle/fluid/operators/sequence_concat_op.cc index 397a3182953e3f1afaeadeff6d53a4f22fb95d26..3234b60861da3d0c6a8434eb11fd0488a95e171f 100644 --- a/paddle/fluid/operators/sequence_concat_op.cc +++ b/paddle/fluid/operators/sequence_concat_op.cc @@ -90,11 +90,13 @@ REGISTER_OPERATOR(sequence_concat, paddle::framework::OperatorWithKernel, paddle::framework::DefaultGradOpDescMaker); template using Kernel = op::SeqConcatKernel; -REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel, Kernel); +REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel, Kernel, + Kernel); + REGISTER_OPERATOR(sequence_concat_grad, paddle::framework::OperatorWithKernel, op::SeqConcatGradShapeInferer); template using GradKernel = op::SeqConcatGradKernel; REGISTER_OP_CPU_KERNEL(sequence_concat_grad, GradKernel, - GradKernel); + GradKernel, GradKernel); diff --git a/paddle/fluid/operators/sequence_unpad_op.cc b/paddle/fluid/operators/sequence_unpad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f3a0762b9a4d3e080d5d6d10b249e0bd81980b95 --- /dev/null +++ b/paddle/fluid/operators/sequence_unpad_op.cc @@ -0,0 +1,153 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_unpad_op.h" + +namespace paddle { +namespace operators { + +class SequenceUnpadOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceUnpadOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Length"), + "Input(Length) of SequenceUnpadOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceUnpadOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "The rank of Input(X) can't be less than 2."); + + auto len_dims = ctx->GetInputDim("Length"); + PADDLE_ENFORCE(len_dims.size() == 2 && len_dims[1] == 1, + "The shape of Input(Length) should be [batch_size, 1]."); + PADDLE_ENFORCE( + len_dims[0] == x_dims[0], + "Input(X) and Input(Length) should have the same first dimension."); + + int64_t out_dim_0 = -1; + if (ctx->IsRuntime()) { + out_dim_0 = x_dims[0] * x_dims[1]; + } + + std::vector out_dims_vec{out_dim_0}; + if (x_dims.size() == 2) { + out_dims_vec.push_back(1); + } else { + for (size_t i = 2; i < x_dims.size(); ++i) { + out_dims_vec.push_back(x_dims[i]); + } + } + ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class SequenceUnpadOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(LoDTensor, default LoDTensor) Input tensor which " + "contains the padded sequences with equal length."); + AddInput("Length", + "(LoDTensor) The input tensor which specifies the actual ength of " + "sequences after unpadding."); + AddOutput( + "Out", + "(LoDTensor) The output tensor which contains unpadded sequences."); + AddComment(R"DOC( + Sequence Unpad Operator + + This operator removes the padding data in the input sequences and convert + them into sequences with actual length as output, identitied by lod + information. + + Example: + + Given input tensor Input(X): + X.data = [[ 1.0, 2.0, 3.0, 4.0, 5.0], + [ 6.0, 7.0, 8.0, 9.0, 10.0], + [11.0, 12.0, 13.0, 14.0, 15.0]], +` + in which there are 3 sequences padded to length 5, and the acutal length + specified by Input(Length): + + Length.data = [[2], [3], [4]], + + after unpadding, Output(Out) will be: + + Out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]] + Out.lod = [[0, 2, 5, 9]] + + )DOC"); + } +}; + +class SequenceUnpadGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceUnpadGradOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequenceUnpadGradOp should not be null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sequence_unpad, ops::SequenceUnpadOp, + ops::SequenceUnpadOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(sequence_unpad_grad, ops::SequenceUnpadGradOp); +REGISTER_OP_CPU_KERNEL( + sequence_unpad, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel); +REGISTER_OP_CPU_KERNEL( + sequence_unpad_grad, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel); diff --git a/paddle/fluid/operators/sequence_unpad_op.cu b/paddle/fluid/operators/sequence_unpad_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..75248372237ec2cb23122f6b16e64f6ce750ebf9 --- /dev/null +++ b/paddle/fluid/operators/sequence_unpad_op.cu @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_unpad_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_unpad, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_unpad_grad, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel); diff --git a/paddle/fluid/operators/sequence_unpad_op.h b/paddle/fluid/operators/sequence_unpad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ebe3118b985bdfd41ca55e8c572047aa87502ff4 --- /dev/null +++ b/paddle/fluid/operators/sequence_unpad_op.h @@ -0,0 +1,104 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence_padding.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +template +class SequenceUnpadOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x_t = ctx.Input("X"); + auto* len_t = ctx.Input("Length"); + auto* out_t = ctx.Output("Out"); + out_t->mutable_data(ctx.GetPlace()); + + const int64_t* seq_len_ptr = nullptr; + if (platform::is_gpu_place(ctx.GetPlace())) { + LoDTensor seq_len_cpu; + seq_len_cpu.Resize(len_t->dims()); + seq_len_ptr = seq_len_cpu.mutable_data(platform::CPUPlace()); + framework::TensorCopy(*len_t, platform::CPUPlace(), + ctx.template device_context(), + &seq_len_cpu); + } else { + seq_len_ptr = len_t->data(); + } + + size_t batch_size = x_t->dims()[0]; + std::vector out_lod0(batch_size + 1, 0); + for (size_t i = 0; i < batch_size; ++i) { + out_lod0[i + 1] = out_lod0[i] + seq_len_ptr[i]; + } + + framework::LoD out_lod; + out_lod.push_back(out_lod0); + out_t->set_lod(out_lod); + + std::vector out_dims_vec{static_cast(out_lod0.back())}; + if (x_t->dims().size() == 2) { + out_dims_vec.push_back(1); + } else { + for (size_t i = 2; i < x_t->dims().size(); ++i) { + out_dims_vec.push_back(x_t->dims()[i]); + } + } + out_t->Resize(framework::make_ddim(out_dims_vec)); + + int64_t padded_length = x_t->dims()[1]; + math::UnpaddingLoDTensorFunctor()( + ctx.template device_context(), *x_t, out_t, + padded_length, 0, false, math::kBatchLengthWidth); + } +}; + +template +class SequenceUnpadGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* d_x = ctx.Output(framework::GradVarName("X")); + if (d_x) { + const auto* d_out = ctx.Input(framework::GradVarName("Out")); + const auto* x_t = ctx.Input("X"); + d_x->mutable_data(ctx.GetPlace()); + + int padded_length = x_t->dims()[1]; + + LoDTensor zero_pads; + zero_pads.Resize({1, 1}); + zero_pads.mutable_data(ctx.GetPlace()); + math::SetConstant set_zero; + auto& dev_ctx = ctx.template device_context(); + set_zero(dev_ctx, &zero_pads, static_cast(0)); + + math::PaddingLoDTensorFunctor()( + ctx.template device_context(), *d_out, d_x, zero_pads, + padded_length, 0, false, math::kBatchLengthWidth); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index 2880c09263f10e9c624e11b77188171f48d9db28..b5f472d20f40fa182a4aa55ff384b0954e4ba9e3 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -128,7 +128,7 @@ bool MayIUse(const cpu_isa_t cpu_isa) { return cpu.has(Cpu::tAVX); case avx2: return cpu.has(Cpu::tAVX2); - case avx512_common: + case avx512f: return cpu.has(Cpu::tAVX512F); case avx512_core: return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) && diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index 30c8fbcfce92a8b06a175ddf198cde572f72b2a4..6810a1651a14cdb2080af846b21cad242b70bf35 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -43,7 +43,7 @@ typedef enum { sse42, avx, avx2, - avx512_common, + avx512f, avx512_core, avx512_core_vnni, avx512_mic, diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 4c99f4be321160caf0ee2f89a655bdfb933408e3..ab91ca5345047f3053eb8771e6a265d2a3011f85 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -116,7 +116,7 @@ void InitDevices(bool init_p2p, const std::vector devices) { platform::SetNumThreads(FLAGS_paddle_num_threads); #endif - if (platform::jit::MayIUse(platform::jit::avx512_common)) { + if (platform::jit::MayIUse(platform::jit::avx512f)) { #ifndef __AVX512F__ LOG(WARNING) << "AVX512F is available, Please re-compile on local machine"; #endif diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 612f3bc0e7940663f84a55b2c4395a7b5119d5bb..a35147da90e87af85308431fd7dbe965bb1fd1d7 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -370,8 +370,8 @@ void ParseEvents(const std::vector>& events, std::vector> merged_events_list; if (merge_thread) { std::vector merged_events; - for (int i = 0; i < events.size(); ++i) { - for (int j = 0; j < events[i].size(); ++j) { + for (size_t i = 0; i < events.size(); ++i) { + for (size_t j = 0; j < events[i].size(); ++j) { merged_events.push_back(events[i][j]); } } diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 43aa4a9e7c4ffe96a8f1346cd1bc3e4f309ba309..224781e6596e2b2bc6d3084f4780a33ed2903118 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -56,6 +56,7 @@ __all__ = [ 'sequence_expand', 'sequence_expand_as', 'sequence_pad', + 'sequence_unpad', 'lstm_unit', 'reduce_sum', 'reduce_mean', @@ -64,6 +65,7 @@ __all__ = [ 'reduce_prod', 'sequence_first_step', 'sequence_last_step', + 'sequence_slice', 'dropout', 'split', 'ctc_greedy_decoder', @@ -1902,6 +1904,76 @@ def sequence_last_step(input): return sequence_pool(input=input, pool_type="last") +def sequence_slice(input, offset, length, name=None): + """ + **Sequence Slice Layer** + + The layer crops a subsequence from given sequence with given start + offset and subsequence length. + + It only supports sequence data (LoDTensor with lod_level equal to 1). + + .. code-block:: text + + - Case: + + Given the input Variable **input**: + + input.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]], + input.lod = [[3, 2]], + input.dims = (5, 2), + + with offset.data = [[0], [1]] and length.data = [[2], [1]], + + the output Variable will be + + out.data = [[a1, a2], [b1, b2], [e1, e2]], + out.lod = [[2, 1]], + out.dims = (3, 2). + + NOTE: The first dimension size of **input**, **offset** and **length** + should be equal. The **offset** should start from 0. + + Args: + input(Variable): The input Variable which consists of the complete + sequences. + offset(Variable): The offset to slice each sequence. + length(Variable): The length of each subsequence. + name(str|None): A name for this layer(optional). If set None, the + layer will be named automatically. + + Returns: + Variable: The output subsequences. + + Examples: + + .. code-block:: python + + import numpy as np + seqs = fluid.layers.data(name='x', shape=[10, 5], + dtype='float32', lod_level=1) + offset = fluid.layers.assign(input=np.array([[0, 1]]).astype("int32")) + length = fluid.layers.assign(input=np.array([[2, 1]]).astype("int32")) + subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset, + length=length) + """ + helper = LayerHelper("sequence_slice", **locals()) + dtype = helper.input_dtype() + out = helper.create_tmp_variable(dtype) + + offset.stop_gradient = True + length.stop_gradient = True + + helper.append_op( + type="sequence_slice", + inputs={"X": input, + "Offset": offset, + "Length": length}, + outputs={"Out": out}) + + return out + + @templatedoc() def pool2d(input, pool_size=-1, @@ -2793,7 +2865,7 @@ def sequence_expand_as(x, y, name=None): @templatedoc() -def sequence_pad(x, pad_value, maxlen=None): +def sequence_pad(x, pad_value, maxlen=None, name=None): """ ${comment} @@ -2807,7 +2879,9 @@ def sequence_pad(x, pad_value, maxlen=None): None or any positive int. When it is None, all sequences will be padded up to the length of the longest one among them; when it a certain positive value, it must be greater than the length of the - longest original sequence." + longest original sequence. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Returns: Variable: The padded sequence batch and the original lengths before @@ -2844,6 +2918,66 @@ def sequence_pad(x, pad_value, maxlen=None): return out, length +def sequence_unpad(x, length, name=None): + """ + **Sequence Unpad Layer** + + This layer removes the padding data in the input sequences and convert + them into sequences with actual length as output, identitied by lod + information. + + .. code-block:: text + + Example: + + Given input Variable **x**: + x.data = [[ 1.0, 2.0, 3.0, 4.0, 5.0], + [ 6.0, 7.0, 8.0, 9.0, 10.0], + [11.0, 12.0, 13.0, 14.0, 15.0]], + + in which there are 3 sequences padded to length 5, and the acutal length + specified by input Variable **length**: + + length.data = [[2], [3], [4]], + + after unpadding, the output Variable will be: + + out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]] + out.lod = [[2, 3, 4]] + + Args: + x(Variable): Input Variable which contains the padded sequences with + equal length. + length(Variable): The Variable that specifies the actual ength of + sequences after unpadding. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The Variable contains the unpadded sequences. + + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[10, 5], dtype='float32') + len = fluid.layers.data(name='length', shape=[1], dtype='int64') + out = fluid.layers.sequence_unpad(x=x, length=len) + """ + + helper = LayerHelper('sequence_unpad', input=x, **locals()) + dtype = helper.input_dtype() + out = helper.create_tmp_variable(dtype) + + length.stop_gradient = True + + helper.append_op( + type='sequence_unpad', + inputs={'X': x, + 'Length': length}, + outputs={'Out': out}) + return out + + def beam_search(pre_ids, pre_scores, ids, diff --git a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py index 6456d1b53a129db04ace7ff4413a3d76e922ccde..fac5e037a46715d146e354825f09ee8ccc4f3d70 100644 --- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py @@ -81,7 +81,10 @@ def get_optimizer(): return optimizer -def train_network(batch_size, is_distributed=False, is_sparse=False): +def train_network(batch_size, + is_distributed=False, + is_sparse=False, + is_self_contained_lr=False): # query q = fluid.layers.data( name="query_ids", shape=[1], dtype="int64", lod_level=1) @@ -93,7 +96,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False): param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.01), name="__emb__", - learning_rate=emb_lr), + learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__"), is_sparse=is_sparse) ## vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') @@ -119,7 +124,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False): param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.01), name="__emb__", - learning_rate=emb_lr), + learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__"), is_sparse=is_sparse) ## vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') @@ -144,7 +151,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False): param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.01), name="__emb__", - learning_rate=emb_lr), + learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__"), is_sparse=is_sparse) ## vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') @@ -220,7 +229,10 @@ class TestDistSimnetBow2x2(TestDistRunnerBase): def get_model(self, batch_size=2): # Train program avg_cost, acc, predict = \ - train_network(batch_size, bool(int(os.environ["IS_DISTRIBUTED"])), bool(int(os.environ["IS_SPARSE"]))) + train_network(batch_size, + bool(int(os.environ["IS_DISTRIBUTED"])), + bool(int(os.environ["IS_SPARSE"])), + bool(int(os.environ["IS_SELF_CONTAINED_LR"]))) inference_program = fluid.default_main_program().clone() diff --git a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py index e971f29db42a7c1a2394505a8ece3d2fd6b347e9..11095f23591edc41a82962149a52096fa17cfb93 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py @@ -25,7 +25,11 @@ class TestDistSimnetBowDense2x2(TestDistBase): self._enforce_place = "CPU" def test_simnet_bow(self): - need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'} + need_envs = { + "IS_DISTRIBUTED": '0', + "IS_SPARSE": '0', + 'IS_SELF_CONTAINED_LR': '1' + } self.check_with_place( "dist_simnet_bow.py", delta=1e-5, @@ -39,7 +43,11 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase): self._enforce_place = "CPU" def test_simnet_bow(self): - need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'} + need_envs = { + "IS_DISTRIBUTED": '0', + "IS_SPARSE": '0', + 'IS_SELF_CONTAINED_LR': '1' + } self.check_with_place( "dist_simnet_bow.py", delta=100, @@ -53,7 +61,11 @@ class TestDistSimnetBowSparse2x2(TestDistBase): self._enforce_place = "CPU" def test_simnet_bow(self): - need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} + need_envs = { + "IS_DISTRIBUTED": '0', + "IS_SPARSE": '1', + 'IS_SELF_CONTAINED_LR': '1' + } self.check_with_place( "dist_simnet_bow.py", delta=1e-5, @@ -67,7 +79,11 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase): self._enforce_place = "CPU" def test_simnet_bow(self): - need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} + need_envs = { + "IS_DISTRIBUTED": '0', + "IS_SPARSE": '1', + 'IS_SELF_CONTAINED_LR': '1' + } self.check_with_place( "dist_simnet_bow.py", delta=100, @@ -75,5 +91,59 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase): need_envs=need_envs) +class TestDistSimnetBow2x2LookupTableSync(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._enforce_place = "CPU" + + def test_simnet_bow(self): + need_envs = { + "IS_DISTRIBUTED": '1', + "IS_SPARSE": '1', + 'IS_SELF_CONTAINED_LR': '1' + } + self.check_with_place( + "dist_simnet_bow.py", + delta=1e-5, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSimnetBow2x2LookupTableAsync(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._enforce_place = "CPU" + + def test_simnet_bow(self): + need_envs = { + "IS_DISTRIBUTED": '1', + "IS_SPARSE": '1', + 'IS_SELF_CONTAINED_LR': '1' + } + self.check_with_place( + "dist_simnet_bow.py", + delta=100, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSimnetBow2x2LookupTableNotContainLRSync(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._enforce_place = "CPU" + + def test_simnet_bow(self): + need_envs = { + "IS_DISTRIBUTED": '1', + "IS_SPARSE": '1', + 'IS_SELF_CONTAINED_LR': '0' + } + self.check_with_place( + "dist_simnet_bow.py", + delta=1e-5, + check_error_log=False, + need_envs=need_envs) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 1d8d0b55f0c5d7cffa01a100847bdf48b6d7023d..dc70477ebe1cfbffd207ebb4bbf9d9f39893d79e 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -194,6 +194,14 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(layers.sequence_expand(x=x, y=y, ref_level=1)) print(str(program)) + def test_sequence_unpad(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[10, 5], dtype='float32') + length = layers.data(name='length', shape=[1], dtype='int64') + self.assertIsNotNone(layers.sequence_unpad(x=x, length=length)) + print(str(program)) + def test_lstm_unit(self): program = Program() with program_guard(program): @@ -406,6 +414,19 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(out) print(str(program)) + def test_sequence_slice(self): + program = Program() + with program_guard(program): + import numpy as np + seqs = layers.data( + name='x', shape=[10, 5], dtype='float32', lod_level=1) + offset = layers.assign(input=np.array([[0, 1]]).astype('int32')) + length = layers.assign(input=np.array([[2, 1]]).astype('int32')) + out = layers.sequence_slice( + input=seqs, offset=offset, length=length) + self.assertIsNotNone(out) + print(str(program)) + def test_lod_reset(self): program = Program() with program_guard(program): diff --git a/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py b/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py new file mode 100644 index 0000000000000000000000000000000000000000..673b0ea180464b8b8f6f5c6e76d5c5c80f347d25 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py @@ -0,0 +1,75 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import six +import numpy as np +from op_test import OpTest + + +class TestSequenceUnpadOp(OpTest): + def init(self): + self.length = [2, 3, 4] + self.x_shape = (3, 5) + self.dtype = "float32" + + def compute(self): + assert len(self.length) == self.x_shape[0] + x = np.random.random(self.x_shape).astype(self.dtype) + out_lod = [self.length] + + out = x[0, 0:self.length[0]] + for i in six.moves.xrange(1, x.shape[0]): + out = np.append(out, x[i, 0:self.length[i]], axis=0) + + out_shape = (sum(self.length), ) + if len(self.x_shape) == 2: + out_shape = out_shape + (1, ) + else: + out_shape = out_shape + self.x_shape[2:] + + self.inputs = { + 'X': x, + 'Length': np.array(self.length).astype('int64').reshape(-1, 1) + } + self.outputs = {'Out': (out.reshape(out_shape), out_lod)} + + def setUp(self): + self.op_type = 'sequence_unpad' + self.init() + self.compute() + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestSequenceUnpadOp2(TestSequenceUnpadOp): + def init(self): + self.length = [2, 3, 4] + self.x_shape = (3, 5, 4, 3) + self.dtype = "float32" + + +class TestSequenceUnpadOp3(TestSequenceUnpadOp): + def init(self): + self.length = [5, 2, 3, 4] + self.x_shape = (4, 5, 3, 3, 6) + self.dtype = "float64" + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 91db85b8ec6a32fee3b7aa8ab76429a4a197fcc3..2192139f8d5950286691a77333dd8ec35505b033 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -1119,6 +1119,7 @@ to transpile() call.") def _split_table_grad_and_add_send_vars(self, program, pserver_endpoints): # 2. add split_ids_op and send_op to send gradient to pservers + # there should only be one table_name all_ops = program.global_block().ops table_grad_name = grad_var_name(self.table_name) @@ -1143,7 +1144,7 @@ to transpile() call.") if self.sync_mode else [] }, attrs={ - "sync_mode": self.sync_mode, + "sync_mode": not self.sync_mode, "epmap": pserver_endpoints, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, OP_ROLE_VAR_ATTR_NAME: [ @@ -1189,7 +1190,15 @@ to transpile() call.") def _create_table_optimize_block(self, pserver_index, pserver_program, pre_block_idx, grad_to_block_id): # STEP: create table optimize block + table_opt_block = pserver_program._create_block(pre_block_idx) # create table param and grad var in pserver program + # create table optimize block in pserver program + table_opt_op = [ + op for op in self.optimize_ops + if 'Param' in op.input_names and op.input("Param")[0] == + self.table_name + ][0] + origin_param_var = self.origin_program.global_block().vars[ self.table_name] @@ -1205,19 +1214,16 @@ to transpile() call.") dtype=origin_param_var.dtype, type=core.VarDesc.VarType.SELECTED_ROWS, persistable=True) + # parameter must be selected rows param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS) grad_var = pserver_program.global_block()._clone_variable( self.origin_program.global_block().vars[grad_var_name( self.table_name)]) - # create table optimize block in pserver program - table_opt_op = [ - op for op in self.optimize_ops - if 'Param' in op.input_names and op.input("Param")[0] == - self.table_name - ][0] - table_opt_block = pserver_program._create_block(pre_block_idx) + lr_var = pserver_program.global_block()._clone_variable( + self.origin_program.global_block().vars[table_opt_op.input( + "LearningRate")[0]]) if self.sync_mode: # create grad vars in pserver program @@ -1249,8 +1255,6 @@ to transpile() call.") grad_var = pserver_program.global_block()._rename_var( origin_grad_name, splited_grad_name) - lr_var = pserver_program.global_block().vars[table_opt_op.input( - "LearningRate")[0]] inputs = { "Param": [param_var], "Grad": [grad_var],