提交 9131a356 编写于 作者: T tensor-tang

replace the lstm compute with jitkernel

test=develop
上级 b55c2476
...@@ -299,7 +299,7 @@ op_library(flatten_op DEPS reshape_op) ...@@ -299,7 +299,7 @@ op_library(flatten_op DEPS reshape_op)
op_library(sequence_pad_op DEPS sequence_padding) op_library(sequence_pad_op DEPS sequence_padding)
op_library(unstack_op DEPS stack_op) op_library(unstack_op DEPS stack_op)
op_library(fake_quantize_op DEPS memory) op_library(fake_quantize_op DEPS memory)
op_library(fusion_lstm_op DEPS cpu_lstm_compute) op_library(fusion_lstm_op DEPS jit_kernel)
if (WITH_GPU) if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(conv_op DEPS vol2col depthwise_conv im2col)
op_library(layer_norm_op DEPS cub) op_library(layer_norm_op DEPS cub)
......
...@@ -15,9 +15,9 @@ limitations under the License. */ ...@@ -15,9 +15,9 @@ limitations under the License. */
#include "paddle/fluid/operators/fusion_lstm_op.h" #include "paddle/fluid/operators/fusion_lstm_op.h"
#include <string> #include <string>
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_lstm_compute.h"
#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/fc_compute.h" #include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
...@@ -309,11 +309,6 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -309,11 +309,6 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
act_gate(D, gates + D3, gates + D3); \ act_gate(D, gates + D3, gates + D3); \
GET_Ht(ct, gates, ht) GET_Ht(ct, gates, ht)
#define COMPUTE_CtHt(gates, ct_1, ct, ht) \
act_gate(D3, gates + D, gates + D); \
GET_Ct(ct_1, gates, ct); \
GET_Ht(ct, gates, ht)
#define COMPUTE_CtHt_PEEPHOLE(gates, ct_1, ct, ht) \ #define COMPUTE_CtHt_PEEPHOLE(gates, ct_1, ct, ht) \
/* get fgated and igated*/ \ /* get fgated and igated*/ \
blas.VMUL(D, wc_data, ct_1, checked_cell_data); \ blas.VMUL(D, wc_data, ct_1, checked_cell_data); \
...@@ -403,22 +398,18 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -403,22 +398,18 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
} }
} }
} else { } else {
// TODO(TJ): unly workaround, clean me const auto& ker =
std::function<void(T*, const T*, T*, T*)> compute_ctht; math::jitkernel::KernelPool::Instance()
if (platform::jit::MayIUse(platform::jit::avx) && .template Get<math::jitkernel::LSTMKernel<T>, int,
act_gate_str == "sigmoid" && act_cand_str == "tanh" && const std::string&, const std::string&,
act_cell_str == "tanh" && D == 8) { const std::string&>(D, act_gate_str, act_cand_str,
compute_ctht = math::lstm_compute_ctht<T>; act_cell_str);
} else {
compute_ctht = [&](T* gates, const T* ct_1, T* ct, T* ht) {
COMPUTE_CtHt(gates, ct_1, ct, ht);
};
}
for (int i = 0; i < N; ++i) { for (int i = 0; i < N; ++i) {
PROCESS_H0C0 PROCESS_H0C0
for (int step = tstart; step < seq_len; ++step) { for (int step = tstart; step < seq_len; ++step) {
GEMM_WH_ADDON(1, prev_h_data, xx_data); GEMM_WH_ADDON(1, prev_h_data, xx_data);
compute_ctht(xx_data, prev_c_data, c_out_data, h_out_data); ker->ComputeCtHt(xx_data, prev_c_data, c_out_data, h_out_data);
MOVE_ONE_STEP; MOVE_ONE_STEP;
} }
} }
...@@ -552,23 +543,19 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -552,23 +543,19 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
MOVE_ONE_STEP; MOVE_ONE_STEP;
} }
} else { } else {
// TODO(TJ): unly workaround, clean me const auto& ker =
std::function<void(T*, const T*, T*, T*)> compute_ctht; math::jitkernel::KernelPool::Instance()
if (platform::jit::MayIUse(platform::jit::avx) && .template Get<math::jitkernel::LSTMKernel<T>, int,
act_gate_str == "sigmoid" && act_cand_str == "tanh" && const std::string&, const std::string&,
act_cell_str == "tanh" && D == 8) { const std::string&>(D, act_gate_str, act_cand_str,
compute_ctht = math::lstm_compute_ctht<T>; act_cell_str);
} else {
compute_ctht = [&](T* gates, const T* ct_1, T* ct, T* ht) {
COMPUTE_CtHt(gates, ct_1, ct, ht);
};
}
for (int step = tstart; step < max_seq_len; ++step) { for (int step = tstart; step < max_seq_len; ++step) {
const int cur_bs = batch_starts[step + 1] - batch_starts[step]; const int cur_bs = batch_starts[step + 1] - batch_starts[step];
GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data); GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
DEFINE_CUR; DEFINE_CUR;
for (int i = 0; i < cur_bs; ++i) { for (int i = 0; i < cur_bs; ++i) {
compute_ctht(cur_in_data, cur_prev_c_data, cur_c_out_data, ker->ComputeCtHt(cur_in_data, cur_prev_c_data, cur_c_out_data,
cur_h_out_data); cur_h_out_data);
MOVE_ONE_BATCH; MOVE_ONE_BATCH;
} }
...@@ -595,7 +582,6 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -595,7 +582,6 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
} }
#undef COMPUTE_CtHt_PEEPHOLE #undef COMPUTE_CtHt_PEEPHOLE
#undef COMPUTE_CtHt
#undef GET_Ct_NOH0C0 #undef GET_Ct_NOH0C0
#undef COMPUTE_CtHt_NOH0C0 #undef COMPUTE_CtHt_NOH0C0
#undef COMPUTE_CtHt_PEEPHOLE_NOH0C0 #undef COMPUTE_CtHt_PEEPHOLE_NOH0C0
......
...@@ -45,8 +45,6 @@ math_library(im2col) ...@@ -45,8 +45,6 @@ math_library(im2col)
if (NOT WIN32) # windows do not support avx functions yet. if (NOT WIN32) # windows do not support avx functions yet.
math_library(gru_compute DEPS activation_functions math_function) math_library(gru_compute DEPS activation_functions math_function)
math_library(lstm_compute DEPS activation_functions) math_library(lstm_compute DEPS activation_functions)
# TODO(TJ): ugly workaround, clean me
cc_library(cpu_lstm_compute SRCS cpu_lstm_compute.cc DEPS activation_functions cblas cpu_info)
endif (NOT WIN32) endif (NOT WIN32)
cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
...@@ -76,7 +74,7 @@ if(WITH_GPU) ...@@ -76,7 +74,7 @@ if(WITH_GPU)
endif() endif()
cc_test(concat_test SRCS concat_test.cc DEPS concat) cc_test(concat_test SRCS concat_test.cc DEPS concat)
cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
cc_library(jit_kernel_exp SRCS jit_kernel_exp.cc DEPS cpu_info cblas activation_functions) cc_library(jit_kernel
cc_library(jit_kernel_lstm SRCS jit_kernel_lstm.cc DEPS cpu_info cblas activation_functions) SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_lstm.cc
cc_library(jit_kernel SRCS jit_kernel.cc jit_kernel_blas.cc DEPS cpu_info cblas jit_kernel_exp jit_kernel_lstm) DEPS cpu_info cblas activation_functions)
cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/cpu_lstm_compute.h"
namespace paddle {
namespace operators {
namespace math {
#ifdef __AVX__
template <>
void lstm_compute_ctht<float>(float* gates, const float* ct_1, float* ct,
float* ht) {
namespace act = detail::forward::avx;
// gates: W_ch, W_ih, W_fh, W_oh
__m256 c, i, f, o;
c = _mm256_loadu_ps(gates);
i = _mm256_loadu_ps(gates + 8);
f = _mm256_loadu_ps(gates + 16);
o = _mm256_loadu_ps(gates + 24);
/* C_t = C_t-1 * fgated + cand_gated * igated*/
c = _mm256_mul_ps(act::Tanh(c), act::Sigmoid(i));
i = _mm256_loadu_ps(ct_1);
f = _mm256_mul_ps(i, act::Sigmoid(f));
f = _mm256_add_ps(c, f);
_mm256_storeu_ps(ct, f);
/* H_t = act_cell(C_t) * ogated */
o = _mm256_mul_ps(act::Tanh(f), act::Sigmoid(o));
_mm256_storeu_ps(ht, o);
}
#endif
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/cpu_info.h"
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace paddle {
namespace operators {
namespace math {
// TODO(TJ): ugly workaround, clean me
template <typename T>
void lstm_compute_ctht(T* gates, const T* ct_1, T* ct, T* ht) {
// gates: W_ch, W_ih, W_fh, W_oh
vec_sigmoid<T, platform::jit::avx>(24, gates + 8, gates + 8);
vec_tanh<T, platform::jit::avx>(8, gates, gates);
const T *i = gates + 8, *f = gates + 16, *o = gates + 24;
const T min = SIGMOID_THRESHOLD_MIN;
const T max = SIGMOID_THRESHOLD_MAX;
for (int d = 0; d < 8; ++d) {
// C_t = C_t-1 * fgated + cand_gated * igated
ct[d] = ct_1[d] * f[d] + gates[d] * i[d];
// H_t = act_cell(C_t) * ogated
T tmp = ct[d] * 2;
tmp = static_cast<T>(0) - ((tmp < min) ? min : ((tmp > max) ? max : tmp));
vec_exp<T>(1, &tmp, &tmp);
tmp = static_cast<T>(2) / (static_cast<T>(1) + tmp) - static_cast<T>(1);
ht[d] = tmp * o[d];
}
}
#ifdef __AVX__
namespace detail {
namespace forward {
namespace avx {
__m256 Sigmoid(const __m256 a);
__m256 Tanh(const __m256 a);
} // namespace avx
} // namespace forward
} // namespace detail
template <>
void lstm_compute_ctht<float>(float* gates, const float* ct_1, float* ct,
float* ht);
#endif
} // namespace math
} // namespace operators
} // namespace paddle
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册