提交 d0662bc9 编写于 作者: G guosheng

Merge branch 'develop' of https://github.com/PaddlePaddle/paddle into fix-transpose-doc

test=develop
...@@ -127,6 +127,9 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING ...@@ -127,6 +127,9 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING
"A path setting fluid shared and static libraries") "A path setting fluid shared and static libraries")
set(FLUID_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_install_dir" CACHE STRING
"A path setting fluid inference shared and static libraries")
if (WITH_C_API AND WITH_PYTHON) if (WITH_C_API AND WITH_PYTHON)
message(WARNING "It is suggest not embedded a python interpreter in Paddle " message(WARNING "It is suggest not embedded a python interpreter in Paddle "
"when using C-API. It will give an unpredictable behavior when using a " "when using C-API. It will give an unpredictable behavior when using a "
......
...@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle. ...@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
### Latest PaddlePaddle Release: [Fluid 0.15.0](https://github.com/PaddlePaddle/Paddle/tree/v0.15.0) ### Latest PaddlePaddle Release: [Fluid 1.0.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.0.0)
### Install Latest Stable Release: ### Install Latest Stable Release:
``` ```
# Linux CPU # Linux CPU
...@@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==0.15.0.post85 ...@@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==0.15.0.post85
## Installation ## Installation
It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/install/install_doc.html) on our website. It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) on our website.
## Documentation ## Documentation
We provide [English](http://paddlepaddle.org/documentation/docs/en/0.15.0/getstarted/index_en.html) and We provide [English](http://paddlepaddle.org/documentation/docs/en/1.0.0/getstarted/index_en.html) and
[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/index.html) documentation. [Chinese](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) documentation.
- [Deep Learning 101](https://github.com/PaddlePaddle/book) - [Deep Learning 101](https://github.com/PaddlePaddle/book)
You might want to start from this online interactive book that can run in a Jupyter Notebook. You might want to start from this online interactive book that can run in a Jupyter Notebook.
- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/user_guides/howto/training/cluster_howto.html) - [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.0/user_guides/howto/training/cluster_howto.html)
You can run distributed training jobs on MPI clusters. You can run distributed training jobs on MPI clusters.
- [Python API](http://paddlepaddle.org/documentation/api/zh/0.15.0/fluid.html) - [Python API](http://paddlepaddle.org/documentation/api/zh/1.0/fluid.html)
Our new API enables much shorter programs. Our new API enables much shorter programs.
- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/advanced_usage/development/contribute_to_paddle.html) - [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.0/advanced_usage/development/contribute_to_paddle.html)
We appreciate your contributions! We appreciate your contributions!
......
文件模式从 100644 更改为 100755
...@@ -150,16 +150,16 @@ if (WITH_ANAKIN AND WITH_MKL) ...@@ -150,16 +150,16 @@ if (WITH_ANAKIN AND WITH_MKL)
SRCS SRCS
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
${ANAKIN_INSTALL_DIR} # anakin release ${ANAKIN_INSTALL_DIR} # anakin release
DSTS ${dst_dir}/inference/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin) DSTS ${FLUID_INSTALL_DIR}/third_party/install/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin)
list(APPEND inference_deps anakin_inference_lib) list(APPEND inference_deps anakin_inference_lib)
endif() endif()
set(module "inference") set(module "inference")
copy(inference_lib DEPS ${inference_deps} copy(inference_lib DEPS ${inference_deps}
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci ${src_dir}/${module}/api/paddle_inference_api.h
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
) )
set(module "platform") set(module "platform")
...@@ -188,18 +188,38 @@ copy(cmake_cache ...@@ -188,18 +188,38 @@ copy(cmake_cache
# This command generates a complete fluid library for both train and inference # This command generates a complete fluid library for both train and inference
add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep}) add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep})
# Following commands generate a inference-only fluid library
# third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR}
copy(third_party DEPS fluid_lib_dist
SRCS ${FLUID_INSTALL_DIR}/third_party ${FLUID_INSTALL_DIR}/CMakeCache.txt
DSTS ${FLUID_INFERENCE_INSTALL_DIR} ${FLUID_INFERENCE_INSTALL_DIR}
)
# only need libpaddle_fluid.so/a and paddle_inference_api.h for inference-only library
copy(inference_api_lib DEPS fluid_lib_dist
SRCS ${FLUID_INSTALL_DIR}/paddle/fluid/inference/libpaddle_fluid.*
${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_inference_api.h
DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include
)
add_custom_target(inference_lib_dist DEPENDS third_party inference_api_lib)
# paddle fluid version # paddle fluid version
execute_process( function(version version_file)
execute_process(
COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1 COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
OUTPUT_VARIABLE PADDLE_GIT_COMMIT) OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
set(version_file ${FLUID_INSTALL_DIR}/version.txt) file(WRITE ${version_file}
file(WRITE ${version_file}
"GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
"WITH_MKL: ${WITH_MKL}\n" "WITH_MKL: ${WITH_MKL}\n"
"WITH_MKLDNN: ${WITH_MKLDNN}\n"
"WITH_GPU: ${WITH_GPU}\n") "WITH_GPU: ${WITH_GPU}\n")
if(WITH_GPU) if(WITH_GPU)
file(APPEND ${version_file} file(APPEND ${version_file}
"CUDA version: ${CUDA_VERSION}\n" "CUDA version: ${CUDA_VERSION}\n"
"CUDNN version: v${CUDNN_MAJOR_VERSION}\n") "CUDNN version: v${CUDNN_MAJOR_VERSION}\n")
endif() endif()
endfunction()
version(${FLUID_INSTALL_DIR}/version.txt)
version(${FLUID_INFERENCE_INSTALL_DIR}/version.txt)
...@@ -85,6 +85,7 @@ paddle.fluid.layers.reduce_min ArgSpec(args=['input', 'dim', 'keep_dim', 'name'] ...@@ -85,6 +85,7 @@ paddle.fluid.layers.reduce_min ArgSpec(args=['input', 'dim', 'keep_dim', 'name']
paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.sequence_slice ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None)) paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)) paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None))
paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,))
......
...@@ -101,7 +101,7 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) { ...@@ -101,7 +101,7 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
} else if (var_type == proto::VarType::FETCH_LIST) { } else if (var_type == proto::VarType::FETCH_LIST) {
var->GetMutable<FeedFetchList>(); var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::STEP_SCOPES) { } else if (var_type == proto::VarType::STEP_SCOPES) {
var->GetMutable<std::vector<framework::Scope>>(); var->GetMutable<std::vector<framework::Scope*>>();
} else if (var_type == proto::VarType::LOD_RANK_TABLE) { } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
var->GetMutable<LoDRankTable>(); var->GetMutable<LoDRankTable>();
} else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
......
...@@ -27,8 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, ...@@ -27,8 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
// be created. // be created.
VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index; VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
Variable* g_feed_value = scope->Var(var_name); Variable* g_feed_value = scope->Var(var_name);
auto& feed_inputs = auto& feed_inputs = *(g_feed_value->GetMutable<FeedFetchList>());
*(g_feed_value->GetMutable<std::vector<paddle::framework::LoDTensor>>());
if (index >= feed_inputs.size()) { if (index >= feed_inputs.size()) {
feed_inputs.resize(index + 1); feed_inputs.resize(index + 1);
} }
......
...@@ -37,7 +37,7 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { ...@@ -37,7 +37,7 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
} else if (var_type == proto::VarType::FETCH_LIST) { } else if (var_type == proto::VarType::FETCH_LIST) {
var->GetMutable<FeedFetchList>(); var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::STEP_SCOPES) { } else if (var_type == proto::VarType::STEP_SCOPES) {
var->GetMutable<std::vector<framework::Scope>>(); var->GetMutable<std::vector<framework::Scope *>>();
} else if (var_type == proto::VarType::LOD_RANK_TABLE) { } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
var->GetMutable<LoDRankTable>(); var->GetMutable<LoDRankTable>();
} else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
......
...@@ -100,16 +100,6 @@ class OpDesc { ...@@ -100,16 +100,6 @@ class OpDesc {
std::vector<std::string> InputNames() const { return MapKeys(inputs_); } std::vector<std::string> InputNames() const { return MapKeys(inputs_); }
std::vector<std::string> OutputNames() const { return MapKeys(outputs_); } std::vector<std::string> OutputNames() const { return MapKeys(outputs_); }
void SetInputMap(const VariableNameMap &input) {
this->inputs_ = input;
this->need_update_ = true;
}
void SetOutputMap(const VariableNameMap &output) {
this->outputs_ = output;
this->need_update_ = true;
}
const VariableNameMap &Inputs() const { return inputs_; } const VariableNameMap &Inputs() const { return inputs_; }
const VariableNameMap &Outputs() const { return outputs_; } const VariableNameMap &Outputs() const { return outputs_; }
......
...@@ -149,9 +149,17 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { ...@@ -149,9 +149,17 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
platform::SetDeviceId(dev_id); platform::SetDeviceId(dev_id);
#endif #endif
} }
// The profile has a process-wide mutex, results in serious performance issue
// in concurrency scenerio. Here use an `if` to fix this issue.
// Please not remove the `if`, ask @Superjomn if there are any concern.
if (platform::IsProfileEnabled()) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place)); platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place); RunImpl(scope, place);
} else {
RunImpl(scope, place);
}
VLOG(3) << place << " " << DebugStringEx(&scope); VLOG(3) << place << " " << DebugStringEx(&scope);
} }
......
...@@ -59,6 +59,7 @@ class VarDesc { ...@@ -59,6 +59,7 @@ class VarDesc {
public: public:
explicit VarDesc(const std::string &name) { explicit VarDesc(const std::string &name) {
desc_.set_name(name); desc_.set_name(name);
// TODO(paddle-dev): Why default to lodtensor.
desc_.mutable_type()->set_type(proto::VarType::LOD_TENSOR); desc_.mutable_type()->set_type(proto::VarType::LOD_TENSOR);
} }
......
...@@ -38,8 +38,12 @@ class Variable { ...@@ -38,8 +38,12 @@ class Variable {
template <typename T> template <typename T>
T* GetMutable() { T* GetMutable() {
if (!IsType<T>()) { if (!holder_) {
holder_.reset(new PlaceholderImpl<T>(new T())); holder_.reset(new PlaceholderImpl<T>(new T()));
} else {
PADDLE_ENFORCE(IsType<T>(),
"Variable must be type %s, the holding type is %s",
typeid(T).name(), holder_->Type().name());
} }
return static_cast<T*>(holder_->Ptr()); return static_cast<T*>(holder_->Ptr());
} }
......
...@@ -33,9 +33,10 @@ TEST(Variable, GetMutable) { ...@@ -33,9 +33,10 @@ TEST(Variable, GetMutable) {
const Tensor& tt = v->Get<Tensor>(); const Tensor& tt = v->Get<Tensor>();
EXPECT_EQ(1234, tt.content_); EXPECT_EQ(1234, tt.content_);
std::string* s = v->GetMutable<std::string>(); try {
*s = "hello"; v->GetMutable<std::string>();
} catch (std::exception& e) {
const std::string& ss = v->Get<std::string>(); return;
EXPECT_EQ("hello", ss); }
EXPECT_TRUE(false);
} }
...@@ -51,9 +51,7 @@ void TestWord2vecPrediction(const std::string& model_path) { ...@@ -51,9 +51,7 @@ void TestWord2vecPrediction(const std::string& model_path) {
config.model_dir = model_path; config.model_dir = model_path;
config.use_gpu = false; config.use_gpu = false;
config.device = 0; config.device = 0;
auto predictor = auto predictor = ::paddle::CreatePaddlePredictor<NativeConfig>(config);
::paddle::CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
config);
// One single batch // One single batch
......
...@@ -340,6 +340,19 @@ bool AnalysisPredictor::LoadProgramDesc() { ...@@ -340,6 +340,19 @@ bool AnalysisPredictor::LoadProgramDesc() {
} }
return true; return true;
} }
AnalysisPredictor::~AnalysisPredictor() {
#if !defined(_WIN32)
if (FLAGS_profile) {
platform::DisableProfiler(platform::EventSortingKey::kTotal,
"./profile.log");
}
#endif
if (sub_scope_) {
scope_->DeleteScope(sub_scope_);
}
}
std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() { std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
auto *x = new AnalysisPredictor(config_); auto *x = new AnalysisPredictor(config_);
x->Init(scope_, inference_program_); x->Init(scope_, inference_program_);
......
...@@ -72,6 +72,7 @@ class AnalysisPredictor : public PaddlePredictor { ...@@ -72,6 +72,7 @@ class AnalysisPredictor : public PaddlePredictor {
template <typename T> template <typename T>
void GetFetchOne(const framework::LoDTensor &fetchs, void GetFetchOne(const framework::LoDTensor &fetchs,
PaddleTensor *output_data); PaddleTensor *output_data);
~AnalysisPredictor();
private: private:
contrib::AnalysisConfig config_; contrib::AnalysisConfig config_;
......
...@@ -27,9 +27,7 @@ TEST(AnalysisPredictor, ZeroCopy) { ...@@ -27,9 +27,7 @@ TEST(AnalysisPredictor, ZeroCopy) {
config.model_dir = FLAGS_dirname + "/word2vec.inference.model"; config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
config.use_feed_fetch_ops = false; config.use_feed_fetch_ops = false;
auto predictor = auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
auto w0 = predictor->GetInputTensor("firstw"); auto w0 = predictor->GetInputTensor("firstw");
auto w1 = predictor->GetInputTensor("secondw"); auto w1 = predictor->GetInputTensor("secondw");
......
...@@ -41,11 +41,8 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) { ...@@ -41,11 +41,8 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
config1.device = 0; config1.device = 0;
config1.max_batch_size = 10; config1.max_batch_size = 10;
auto predictor0 = auto predictor0 = CreatePaddlePredictor<NativeConfig>(config0);
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0); auto predictor1 = CreatePaddlePredictor<MixedRTConfig>(config1);
auto predictor1 =
CreatePaddlePredictor<MixedRTConfig,
PaddleEngineKind::kAutoMixedTensorRT>(config1);
for (int batch_id = 0; batch_id < 1; batch_id++) { for (int batch_id = 0; batch_id < 1; batch_id++) {
//# 2. Prepare input. //# 2. Prepare input.
......
...@@ -77,7 +77,7 @@ endif(NOT WIN32) ...@@ -77,7 +77,7 @@ endif(NOT WIN32)
link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
link_directories("${PADDLE_LIB}/third_party/install/glog/lib") link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
link_directories("${PADDLE_LIB}/paddle/fluid/inference") link_directories("${PADDLE_LIB}/paddle/lib")
add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
...@@ -97,10 +97,10 @@ endif() ...@@ -97,10 +97,10 @@ endif()
# Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a # Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a
if(WITH_STATIC_LIB) if(WITH_STATIC_LIB)
set(DEPS set(DEPS
${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
else() else()
set(DEPS set(DEPS
${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
endif() endif()
if (NOT WIN32) if (NOT WIN32)
......
...@@ -5,12 +5,13 @@ TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode ...@@ -5,12 +5,13 @@ TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode
DATA_DIR=$4 # dataset DATA_DIR=$4 # dataset
TENSORRT_INCLUDE_DIR=$5 # TensorRT header file dir, defalut to /usr/local/TensorRT/include TENSORRT_INCLUDE_DIR=$5 # TensorRT header file dir, defalut to /usr/local/TensorRT/include
TENSORRT_LIB_DIR=$6 # TensorRT lib file dir, default to /usr/local/TensorRT/lib TENSORRT_LIB_DIR=$6 # TensorRT lib file dir, default to /usr/local/TensorRT/lib
inference_install_dir=${PADDLE_ROOT}/build/fluid_inference_install_dir
cd `dirname $0` cd `dirname $0`
current_dir=`pwd` current_dir=`pwd`
if [ $2 == ON ]; then if [ $2 == ON ]; then
# You can export yourself if move the install path # You can export yourself if move the install path
MKL_LIB=${PADDLE_ROOT}/build/fluid_install_dir/third_party/install/mklml/lib MKL_LIB=${inference_install_dir}/third_party/install/mklml/lib
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${MKL_LIB} export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${MKL_LIB}
fi fi
if [ $3 == ON ]; then if [ $3 == ON ]; then
...@@ -55,7 +56,7 @@ cd build ...@@ -55,7 +56,7 @@ cd build
for WITH_STATIC_LIB in ON OFF; do for WITH_STATIC_LIB in ON OFF; do
# -----simple_on_word2vec----- # -----simple_on_word2vec-----
rm -rf * rm -rf *
cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \ cmake .. -DPADDLE_LIB=${inference_install_dir} \
-DWITH_MKL=$TURN_ON_MKL \ -DWITH_MKL=$TURN_ON_MKL \
-DDEMO_NAME=simple_on_word2vec \ -DDEMO_NAME=simple_on_word2vec \
-DWITH_GPU=$TEST_GPU_CPU \ -DWITH_GPU=$TEST_GPU_CPU \
...@@ -75,7 +76,7 @@ for WITH_STATIC_LIB in ON OFF; do ...@@ -75,7 +76,7 @@ for WITH_STATIC_LIB in ON OFF; do
fi fi
# ---------vis_demo--------- # ---------vis_demo---------
rm -rf * rm -rf *
cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \ cmake .. -DPADDLE_LIB=${inference_install_dir} \
-DWITH_MKL=$TURN_ON_MKL \ -DWITH_MKL=$TURN_ON_MKL \
-DDEMO_NAME=vis_demo \ -DDEMO_NAME=vis_demo \
-DWITH_GPU=$TEST_GPU_CPU \ -DWITH_GPU=$TEST_GPU_CPU \
...@@ -98,7 +99,7 @@ for WITH_STATIC_LIB in ON OFF; do ...@@ -98,7 +99,7 @@ for WITH_STATIC_LIB in ON OFF; do
# --------tensorrt mobilenet------ # --------tensorrt mobilenet------
if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then
rm -rf * rm -rf *
cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \ cmake .. -DPADDLE_LIB=${inference_install_dir} \
-DWITH_MKL=$TURN_ON_MKL \ -DWITH_MKL=$TURN_ON_MKL \
-DDEMO_NAME=trt_mobilenet_demo \ -DDEMO_NAME=trt_mobilenet_demo \
-DWITH_GPU=$TEST_GPU_CPU \ -DWITH_GPU=$TEST_GPU_CPU \
......
...@@ -23,7 +23,7 @@ limitations under the License. */ ...@@ -23,7 +23,7 @@ limitations under the License. */
#include <memory> #include <memory>
#include <thread> //NOLINT #include <thread> //NOLINT
#include "paddle/fluid/inference/paddle_inference_api.h" #include "paddle/include/paddle_inference_api.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
DEFINE_bool(use_gpu, false, "Whether use gpu."); DEFINE_bool(use_gpu, false, "Whether use gpu.");
...@@ -42,8 +42,7 @@ void Main(bool use_gpu) { ...@@ -42,8 +42,7 @@ void Main(bool use_gpu) {
config.use_gpu = use_gpu; config.use_gpu = use_gpu;
config.fraction_of_gpu_memory = 0.15; config.fraction_of_gpu_memory = 0.15;
config.device = 0; config.device = 0;
auto predictor = auto predictor = CreatePaddlePredictor<NativeConfig>(config);
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
for (int batch_id = 0; batch_id < 3; batch_id++) { for (int batch_id = 0; batch_id < 3; batch_id++) {
//# 2. Prepare input. //# 2. Prepare input.
...@@ -85,8 +84,7 @@ void MainThreads(int num_threads, bool use_gpu) { ...@@ -85,8 +84,7 @@ void MainThreads(int num_threads, bool use_gpu) {
config.use_gpu = use_gpu; config.use_gpu = use_gpu;
config.fraction_of_gpu_memory = 0.15; config.fraction_of_gpu_memory = 0.15;
config.device = 0; config.device = 0;
auto main_predictor = auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
std::vector<std::thread> threads; std::vector<std::thread> threads;
for (int tid = 0; tid < num_threads; ++tid) { for (int tid = 0; tid < num_threads; ++tid) {
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include <glog/logging.h> // use glog instead of CHECK to avoid importing other paddle header files. #include <glog/logging.h> // use glog instead of CHECK to avoid importing other paddle header files.
#include "paddle/fluid/inference/demo_ci/utils.h" #include "utils.h" // NOLINT
DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_gpu_memory_to_use);
DEFINE_string(modeldir, "", "Directory of the inference model."); DEFINE_string(modeldir, "", "Directory of the inference model.");
......
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
#include <iostream> #include <iostream>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/inference/paddle_inference_api.h" #include "paddle/include/paddle_inference_api.h"
namespace paddle { namespace paddle {
namespace demo { namespace demo {
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include <glog/logging.h> // use glog instead of CHECK to avoid importing other paddle header files. #include <glog/logging.h> // use glog instead of CHECK to avoid importing other paddle header files.
#include "paddle/fluid/inference/demo_ci/utils.h" #include "utils.h" // NOLINT
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_gpu_memory_to_use);
...@@ -34,12 +34,13 @@ DEFINE_bool(use_gpu, false, "Whether use gpu."); ...@@ -34,12 +34,13 @@ DEFINE_bool(use_gpu, false, "Whether use gpu.");
namespace paddle { namespace paddle {
namespace demo { namespace demo {
using contrib::AnalysisConfig;
/* /*
* Use the native fluid engine to inference the demo. * Use the native and analysis fluid engine to inference the demo.
*/ */
void Main(bool use_gpu) { void Main(bool use_gpu) {
std::unique_ptr<PaddlePredictor> predictor; std::unique_ptr<PaddlePredictor> predictor, analysis_predictor;
NativeConfig config; AnalysisConfig config;
config.param_file = FLAGS_modeldir + "/__params__"; config.param_file = FLAGS_modeldir + "/__params__";
config.prog_file = FLAGS_modeldir + "/__model__"; config.prog_file = FLAGS_modeldir + "/__model__";
config.use_gpu = use_gpu; config.use_gpu = use_gpu;
...@@ -49,8 +50,8 @@ void Main(bool use_gpu) { ...@@ -49,8 +50,8 @@ void Main(bool use_gpu) {
} }
VLOG(3) << "init predictor"; VLOG(3) << "init predictor";
predictor = predictor = CreatePaddlePredictor<NativeConfig>(config);
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
VLOG(3) << "begin to process data"; VLOG(3) << "begin to process data";
// Just a single batch of data. // Just a single batch of data.
...@@ -68,7 +69,7 @@ void Main(bool use_gpu) { ...@@ -68,7 +69,7 @@ void Main(bool use_gpu) {
input.dtype = PaddleDType::FLOAT32; input.dtype = PaddleDType::FLOAT32;
VLOG(3) << "run executor"; VLOG(3) << "run executor";
std::vector<PaddleTensor> output; std::vector<PaddleTensor> output, analysis_output;
predictor->Run({input}, &output, 1); predictor->Run({input}, &output, 1);
VLOG(3) << "output.size " << output.size(); VLOG(3) << "output.size " << output.size();
...@@ -77,6 +78,10 @@ void Main(bool use_gpu) { ...@@ -77,6 +78,10 @@ void Main(bool use_gpu) {
// compare with reference result // compare with reference result
CheckOutput(FLAGS_refer, tensor); CheckOutput(FLAGS_refer, tensor);
// the analysis_output has some diff with native_output,
// TODO(luotao): add CheckOutput for analysis_output later.
analysis_predictor->Run({input}, &analysis_output, 1);
} }
} // namespace demo } // namespace demo
......
...@@ -308,18 +308,13 @@ TEST(Analyzer_rnn1, ZeroCopy) { ...@@ -308,18 +308,13 @@ TEST(Analyzer_rnn1, ZeroCopy) {
PaddlePlace place; PaddlePlace place;
int output_size{0}; int output_size{0};
auto predictor = auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
config.use_feed_fetch_ops = true; config.use_feed_fetch_ops = true;
auto native_predictor = auto native_predictor = CreatePaddlePredictor<NativeConfig>(config);
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
config.use_feed_fetch_ops = true; // the analysis predictor needs feed/fetch. config.use_feed_fetch_ops = true; // the analysis predictor needs feed/fetch.
auto analysis_predictor = auto analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
#define NEW_TENSOR(name__) \ #define NEW_TENSOR(name__) \
auto name__##_tensor = predictor->GetInputTensor(#name__); auto name__##_tensor = predictor->GetInputTensor(#name__);
......
...@@ -77,11 +77,9 @@ void CompareResult(const std::vector<PaddleTensor> &outputs, ...@@ -77,11 +77,9 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
std::unique_ptr<PaddlePredictor> CreateTestPredictor( std::unique_ptr<PaddlePredictor> CreateTestPredictor(
const AnalysisConfig &config, bool use_analysis = true) { const AnalysisConfig &config, bool use_analysis = true) {
if (use_analysis) { if (use_analysis) {
return CreatePaddlePredictor<contrib::AnalysisConfig, return CreatePaddlePredictor<contrib::AnalysisConfig>(config);
PaddleEngineKind::kAnalysis>(config);
} else { } else {
return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>( return CreatePaddlePredictor<NativeConfig>(config);
config);
} }
} }
......
...@@ -51,11 +51,8 @@ void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) { ...@@ -51,11 +51,8 @@ void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
config1.model_dir = model_dirname; config1.model_dir = model_dirname;
config1.max_batch_size = batch_size; config1.max_batch_size = batch_size;
auto predictor0 = auto predictor0 = CreatePaddlePredictor<NativeConfig>(config0);
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0); auto predictor1 = CreatePaddlePredictor<MixedRTConfig>(config1);
auto predictor1 =
CreatePaddlePredictor<MixedRTConfig,
PaddleEngineKind::kAutoMixedTensorRT>(config1);
// Prepare inputs // Prepare inputs
int height = 224; int height = 224;
int width = 224; int width = 224;
......
...@@ -300,7 +300,7 @@ op_library(flatten_op DEPS reshape_op) ...@@ -300,7 +300,7 @@ op_library(flatten_op DEPS reshape_op)
op_library(sequence_pad_op DEPS sequence_padding) op_library(sequence_pad_op DEPS sequence_padding)
op_library(unstack_op DEPS stack_op) op_library(unstack_op DEPS stack_op)
op_library(fake_quantize_op DEPS memory) op_library(fake_quantize_op DEPS memory)
op_library(fusion_lstm_op DEPS cpu_lstm_compute) op_library(fusion_lstm_op DEPS jit_kernel)
if (WITH_GPU) if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(conv_op DEPS vol2col depthwise_conv im2col)
op_library(layer_norm_op DEPS cub) op_library(layer_norm_op DEPS cub)
......
...@@ -20,7 +20,7 @@ if(WITH_GRPC) ...@@ -20,7 +20,7 @@ if(WITH_GRPC)
DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
cc_test(rpc_server_test SRCS rpc_server_test.cc cc_test(rpc_server_test SRCS rpc_server_test.cc
DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL)
cc_test(varhandle_test SRCS varhandle_test.cc) cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler)
return() return()
endif() endif()
......
...@@ -73,10 +73,11 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, ...@@ -73,10 +73,11 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
const framework::Scope* p_scope = &scope; const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val); const auto ch = GetChannel(ep_val);
SendProcessor* s = new SendProcessor(ch); SendProcessor* s = new SendProcessor(ch);
VarHandlePtr h(new VarHandle(ep, "Send", var_name_val, p_ctx, p_scope)); const std::string method = "SendRPC";
VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
s->Prepare(h, time_out); s->Prepare(h, time_out);
framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] { framework::AsyncIO([var_name_val, p_scope, p_ctx, s, method, h, this] {
auto* var = p_scope->FindVar(var_name_val); auto* var = p_scope->FindVar(var_name_val);
::grpc::ByteBuffer req; ::grpc::ByteBuffer req;
...@@ -87,10 +88,16 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, ...@@ -87,10 +88,16 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
// stub context // stub context
s->response_call_back_ = nullptr; s->response_call_back_ = nullptr;
platform::RecordEvent record_event(method, p_ctx);
auto call = s->stub_g_.PrepareUnaryCall( auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_); s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
call->StartCall(); call->StartCall();
call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
if (UNLIKELY(platform::IsProfileEnabled())) {
h->Wait();
}
}); });
req_count_++; req_count_++;
...@@ -122,10 +129,11 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, ...@@ -122,10 +129,11 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
const framework::Scope* p_scope = &scope; const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val); const auto ch = GetChannel(ep_val);
GetProcessor* s = new GetProcessor(ch); GetProcessor* s = new GetProcessor(ch);
VarHandlePtr h(new VarHandle(ep, "Get", var_name_val, p_ctx, p_scope)); const std::string method = "GetRPC";
VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
s->Prepare(h, time_out); s->Prepare(h, time_out);
framework::AsyncIO([var_name_val, s, this] { framework::AsyncIO([var_name_val, s, method, p_ctx, h, this] {
// prepare input // prepare input
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(var_name_val); req.set_varname(var_name_val);
...@@ -137,10 +145,16 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, ...@@ -137,10 +145,16 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
// stub context // stub context
s->response_call_back_ = ProcGetResponse; s->response_call_back_ = ProcGetResponse;
platform::RecordEvent record_event(method, p_ctx);
auto call = s->stub_g_.PrepareUnaryCall( auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_); s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
call->StartCall(); call->StartCall();
call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
if (UNLIKELY(platform::IsProfileEnabled())) {
h->Wait();
}
}); });
req_count_++; req_count_++;
...@@ -161,12 +175,14 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, ...@@ -161,12 +175,14 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
const framework::Scope* p_scope = &scope; const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val); const auto ch = GetChannel(ep_val);
GetProcessor* s = new GetProcessor(ch); GetProcessor* s = new GetProcessor(ch);
VarHandlePtr h(
new VarHandle(ep, "Prefetch", out_var_name_val, p_ctx, p_scope)); const std::string method = "PrefetchRPC";
VarHandlePtr h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope));
s->Prepare(h, time_out); s->Prepare(h, time_out);
framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
s, this] { s, method, h, this] {
auto* var = p_scope->FindVar(in_var_name_val); auto* var = p_scope->FindVar(in_var_name_val);
::grpc::ByteBuffer req; ::grpc::ByteBuffer req;
...@@ -177,11 +193,17 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, ...@@ -177,11 +193,17 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
// stub context // stub context
s->response_call_back_ = ProcGetResponse; s->response_call_back_ = ProcGetResponse;
platform::RecordEvent record_event(method, p_ctx);
auto call = s->stub_g_.PrepareUnaryCall( auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req,
&cq_); &cq_);
call->StartCall(); call->StartCall();
call->Finish(&s->reply_, &s->status_, static_cast<void*>(s)); call->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
if (UNLIKELY(platform::IsProfileEnabled())) {
h->Wait();
}
}); });
req_count_++; req_count_++;
...@@ -193,15 +215,24 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, ...@@ -193,15 +215,24 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
const auto ch = GetChannel(ep); const auto ch = GetChannel(ep);
BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
VarHandlePtr h(new VarHandle(ep, "BatchBarrier", BATCH_BARRIER_MESSAGE, const std::string method = "BatchBarrierRPC";
nullptr, nullptr)); VarHandlePtr h(
new VarHandle(ep, method, BATCH_BARRIER_MESSAGE, nullptr, nullptr));
s->Prepare(h, time_out); s->Prepare(h, time_out);
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(BATCH_BARRIER_MESSAGE); req.set_varname(BATCH_BARRIER_MESSAGE);
platform::RecordEvent record_event(method, nullptr);
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++; req_count_++;
if (UNLIKELY(platform::IsProfileEnabled())) {
h->Wait();
}
return h; return h;
} }
...@@ -209,15 +240,24 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, ...@@ -209,15 +240,24 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
int64_t time_out) { int64_t time_out) {
const auto ch = GetChannel(ep); const auto ch = GetChannel(ep);
FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); FetchBarrierProcessor* s = new FetchBarrierProcessor(ch);
VarHandlePtr h(new VarHandle(ep, "FetchBarrier", FETCH_BARRIER_MESSAGE, const std::string method = "FetchBarrierRPC";
nullptr, nullptr)); VarHandlePtr h(
new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
s->Prepare(h, time_out); s->Prepare(h, time_out);
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(FETCH_BARRIER_MESSAGE); req.set_varname(FETCH_BARRIER_MESSAGE);
platform::RecordEvent record_event(method, nullptr);
auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++; req_count_++;
if (UNLIKELY(platform::IsProfileEnabled())) {
h->Wait();
}
return h; return h;
} }
...@@ -226,15 +266,23 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, ...@@ -226,15 +266,23 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep,
const auto ch = GetChannel(ep); const auto ch = GetChannel(ep);
BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
VarHandlePtr h( const std::string method = "SendCompleteRPC";
new VarHandle(ep, "SendComplete", COMPLETE_MESSAGE, nullptr, nullptr)); VarHandlePtr h(new VarHandle(ep, method, COMPLETE_MESSAGE, nullptr, nullptr));
s->Prepare(h, time_out); s->Prepare(h, time_out);
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(COMPLETE_MESSAGE); req.set_varname(COMPLETE_MESSAGE);
platform::RecordEvent record_event(method, nullptr);
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++; req_count_++;
if (UNLIKELY(platform::IsProfileEnabled())) {
h->Wait();
}
return h; return h;
} }
...@@ -244,17 +292,27 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, ...@@ -244,17 +292,27 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep,
const auto ch = GetChannel(ep); const auto ch = GetChannel(ep);
CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch); CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch);
VarHandlePtr h(new VarHandle(ep, "CheckPointNotify", CHECKPOINT_SAVE_MESSAGE,
nullptr, nullptr)); const std::string method = "CheckPointNotifyRPC";
VarHandlePtr h(
new VarHandle(ep, method, CHECKPOINT_SAVE_MESSAGE, nullptr, nullptr));
s->Prepare(h, time_out); s->Prepare(h, time_out);
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(CHECKPOINT_SAVE_MESSAGE); req.set_varname(CHECKPOINT_SAVE_MESSAGE);
req.set_out_varname(dir); req.set_out_varname(dir);
platform::RecordEvent record_event(method, nullptr);
auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++; req_count_++;
if (UNLIKELY(platform::IsProfileEnabled())) {
h->Wait();
}
return h; return h;
} }
...@@ -273,6 +331,7 @@ void GRPCClient::Proceed() { ...@@ -273,6 +331,7 @@ void GRPCClient::Proceed() {
BaseProcessor* c = static_cast<BaseProcessor*>(tag); BaseProcessor* c = static_cast<BaseProcessor*>(tag);
GPR_ASSERT(ok); GPR_ASSERT(ok);
PADDLE_ENFORCE(c); PADDLE_ENFORCE(c);
if (c->status_.ok()) { if (c->status_.ok()) {
VLOG(3) << c->GetVarHandlePtr()->String() << " process"; VLOG(3) << c->GetVarHandlePtr()->String() << " process";
c->Process(); c->Process();
......
...@@ -36,6 +36,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -36,6 +36,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg, ::grpc::ByteBuffer* msg,
const std::string& out_name) { const std::string& out_name) {
platform::RecordEvent record_event("serial", &ctx);
// Default DestroyCallback does nothing, When using GPU // Default DestroyCallback does nothing, When using GPU
// the CPU buffer need to be freed. // the CPU buffer need to be freed.
DestroyCallback destroy_callback = [](void* backing) {}; DestroyCallback destroy_callback = [](void* backing) {};
...@@ -147,6 +148,7 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, ...@@ -147,6 +148,7 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope* scope, const framework::Scope* scope,
framework::Variable** var) { framework::Variable** var) {
platform::RecordEvent record_event("deserial", &ctx);
operators::distributed::GRPCVariableResponse resp(scope, &ctx); operators::distributed::GRPCVariableResponse resp(scope, &ctx);
PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!"); PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
*var = resp.GetVar(); *var = resp.GetVar();
......
...@@ -70,6 +70,12 @@ class FillConstantOp : public framework::OperatorBase { ...@@ -70,6 +70,12 @@ class FillConstantOp : public framework::OperatorBase {
} }
}; };
class FillConstantOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {}
};
class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
...@@ -102,4 +108,5 @@ Fill up a variable with specified constant value. ...@@ -102,4 +108,5 @@ Fill up a variable with specified constant value.
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, REGISTER_OPERATOR(fill_constant, ops::FillConstantOp,
ops::FillConstantInferShape, ops::FillConstantOpMaker, ops::FillConstantInferShape, ops::FillConstantOpMaker,
paddle::framework::EmptyGradOpMaker); paddle::framework::EmptyGradOpMaker,
ops::FillConstantOpVarTypeInference);
...@@ -15,11 +15,9 @@ limitations under the License. */ ...@@ -15,11 +15,9 @@ limitations under the License. */
#include "paddle/fluid/operators/fusion_lstm_op.h" #include "paddle/fluid/operators/fusion_lstm_op.h"
#include <string> #include <string>
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_lstm_compute.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/fc_compute.h" #include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -219,24 +217,8 @@ This operator fuse the X into LSTM, more details can refer to LSTM op. ...@@ -219,24 +217,8 @@ This operator fuse the X into LSTM, more details can refer to LSTM op.
template <typename T> template <typename T>
class FuisonLSTMKernel : public framework::OpKernel<T> { class FuisonLSTMKernel : public framework::OpKernel<T> {
public: public:
#define INIT_VEC_FUNC \ #define INIT_BASE_DEFINES \
std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand; \ using DeviceContext = paddle::platform::CPUDeviceContext; \
auto& act_gate_str = ctx.Attr<std::string>("gate_activation"); \
auto& act_cell_str = ctx.Attr<std::string>("cell_activation"); \
auto& act_cand_str = ctx.Attr<std::string>("candidate_activation"); \
if (platform::jit::MayIUse(platform::jit::avx)) { \
math::VecActivations<T, platform::jit::avx> act_functor; \
act_gate = act_functor(act_gate_str); \
act_cell = act_functor(act_cell_str); \
act_cand = act_functor(act_cand_str); \
} else { \
math::VecActivations<T, platform::jit::isa_any> act_functor; \
act_gate = act_functor(act_gate_str); \
act_cell = act_functor(act_cell_str); \
act_cand = act_functor(act_cand_str); \
}
#define INIT_BASE_INPUT_OUTPUT \
auto* x = ctx.Input<LoDTensor>("X"); \ auto* x = ctx.Input<LoDTensor>("X"); \
auto* h0 = ctx.Input<Tensor>("H0"); \ auto* h0 = ctx.Input<Tensor>("H0"); \
auto* c0 = ctx.Input<Tensor>("C0"); \ auto* c0 = ctx.Input<Tensor>("C0"); \
...@@ -247,23 +229,19 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -247,23 +229,19 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \ auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
auto* cell_out = ctx.Output<LoDTensor>("Cell"); \ auto* cell_out = ctx.Output<LoDTensor>("Cell"); \
bool is_reverse = ctx.Attr<bool>("is_reverse"); \ bool is_reverse = ctx.Attr<bool>("is_reverse"); \
bool use_peepholes = ctx.Attr<bool>("use_peepholes"); bool use_peepholes = ctx.Attr<bool>("use_peepholes"); \
#define INIT_BASE_SIZES \
auto x_dims = x->dims(); /* T x M*/ \ auto x_dims = x->dims(); /* T x M*/ \
auto wh_dims = wh->dims(); /* D x 4D*/ \ auto wh_dims = wh->dims(); /* D x 4D*/ \
const int M = x_dims[1]; \ const int M = x_dims[1]; \
const int D = wh_dims[0]; \ const int D = wh_dims[0]; \
const int D2 = D * 2; \ const int D4 = wh_dims[1]
const int D3 = D * 3; \
const int D4 = wh_dims[1];
#define INIT_BASE_INPUT_DATAS \ #define INIT_OTHER_DEFINES \
const T* x_data = x->data<T>(); \ const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \ const T* wx_data = wx->data<T>(); \
const T* wh_data = wh->data<T>(); \ const T* wh_data = wh->data<T>(); \
/* diagonal weight*/ \ /* diagonal weight*/ \
const T* wc_data = bias->data<T>() + D4; \ const T* wp_data = bias->data<T>() + D4; \
/* for peephole only*/ \ /* for peephole only*/ \
T* checked_cell_data = nullptr; \ T* checked_cell_data = nullptr; \
auto place = ctx.GetPlace(); \ auto place = ctx.GetPlace(); \
...@@ -271,69 +249,23 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -271,69 +249,23 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
/* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \
auto* checked_cell = ctx.Output<Tensor>("CheckedCell"); \ auto* checked_cell = ctx.Output<Tensor>("CheckedCell"); \
checked_cell_data = checked_cell->mutable_data<T>(place); \ checked_cell_data = checked_cell->mutable_data<T>(place); \
} } \
const auto& ker = \
/// Compute LSTM math::jitkernel::KernelPool::Instance() \
.template Get<math::jitkernel::LSTMKernel<T>, const std::string&, \
const std::string&, const std::string&>( \
ctx.Attr<std::string>("gate_activation"), \
ctx.Attr<std::string>("candidate_activation"), \
ctx.Attr<std::string>("cell_activation"), D, use_peepholes)
// Wh GEMM
#define GEMM_WH_ADDON(bs, prev, out) \ #define GEMM_WH_ADDON(bs, prev, out) \
blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast<T>(1), prev, D, \ blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast<T>(1), prev, D, \
wh_data, D4, static_cast<T>(1), out, D4) wh_data, D4, static_cast<T>(1), out, D4)
#define GET_Ct(ct_1, gates, ct) \
/* C_t = C_t-1 * fgated + cand_gated * igated*/ \
act_cand(D, gates, gates); \
blas.VMUL(D, gates, gates + D, gates + D); \
blas.VMUL(D, ct_1, gates + D2, gates + D2); \
blas.VADD(D, gates + D, gates + D2, ct)
#define GET_Ht(ct, gates, ht) \
/* H_t = act_cell(C_t) * ogated */ \
act_cell(D, ct, gates + D2); \
blas.VMUL(D, gates + D2, gates + D3, ht)
#define GET_Ct_NOH0C0(gates, ct) \
/* C_t = igated * cgated*/ \
act_gate(D, gates + D, gates + D); \
act_cand(D, gates, gates); \
blas.VMUL(D, gates, gates + D, ct)
#define COMPUTE_CtHt_NOH0C0(gates, ct, ht) \
GET_Ct_NOH0C0(gates, ct); \
act_gate(D, gates + D3, gates + D3); \
GET_Ht(ct, gates, ht)
#define COMPUTE_CtHt_PEEPHOLE_NOH0C0(gates, ct, ht) \
GET_Ct_NOH0C0(gates, ct); \
/* get outgated, put W_oc * C_t on igated */ \
blas.VMUL(D, wc_data + D2, ct, gates + D); \
blas.VADD(D, gates + D, gates + D3, gates + D3); \
act_gate(D, gates + D3, gates + D3); \
GET_Ht(ct, gates, ht)
#define COMPUTE_CtHt(gates, ct_1, ct, ht) \
act_gate(D3, gates + D, gates + D); \
GET_Ct(ct_1, gates, ct); \
GET_Ht(ct, gates, ht)
#define COMPUTE_CtHt_PEEPHOLE(gates, ct_1, ct, ht) \
/* get fgated and igated*/ \
blas.VMUL(D, wc_data, ct_1, checked_cell_data); \
blas.VMUL(D, wc_data + D, ct_1, checked_cell_data + D); \
blas.VADD(D2, checked_cell_data, gates + D, gates + D); \
act_gate(D2, gates + D, gates + D); \
GET_Ct(ct_1, gates, ct); \
/* get ogated*/ \
blas.VMUL(D, wc_data + D2, ct, gates + D); \
blas.VADD(D, gates + D, gates + D3, gates + D3); \
act_gate(D, gates + D3, gates + D3); \
GET_Ht(ct, gates, ht)
void SeqCompute(const framework::ExecutionContext& ctx) const { void SeqCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = paddle::platform::CPUDeviceContext; INIT_BASE_DEFINES;
INIT_BASE_INPUT_OUTPUT INIT_OTHER_DEFINES;
INIT_BASE_SIZES
INIT_VEC_FUNC
INIT_BASE_INPUT_DATAS
auto x_lod = x->lod(); auto x_lod = x->lod();
const int total_T = x_dims[0]; const int total_T = x_dims[0];
const int N = x_lod[0].size() - 1; const int N = x_lod[0].size() - 1;
...@@ -357,89 +289,47 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -357,89 +289,47 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
gate_offset = -D; gate_offset = -D;
} }
#define MOVE_ONE_STEP \
prev_h_data = h_out_data; \
prev_c_data = c_out_data; \
xx_data = xx_data + xx_offset; \
h_out_data = h_out_data + gate_offset; \
c_out_data = c_out_data + gate_offset
#define PROCESS_H0C0_DEFINES \
int bid = is_reverse ? N - 1 - i : i; \
int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; \
const T* prev_c_data = nullptr; \
const T* prev_h_data = nullptr; \
int tstart = 0
#define PROCESS_H0C0_PEEPHOLE \
PROCESS_H0C0_DEFINES; \
if (h0_data) { \
prev_h_data = h0_data + bid * D; \
prev_c_data = c0_data + bid * D; \
} else { \
COMPUTE_CtHt_PEEPHOLE_NOH0C0(xx_data, c_out_data, h_out_data); \
MOVE_ONE_STEP; \
tstart = 1; \
}
#define PROCESS_H0C0 \
PROCESS_H0C0_DEFINES; \
if (h0_data) { \
prev_h_data = h0_data + bid * D; \
prev_c_data = c0_data + bid * D; \
} else { \
COMPUTE_CtHt_NOH0C0(xx_data, c_out_data, h_out_data); \
MOVE_ONE_STEP; \
tstart = 1; \
}
if (use_peepholes) {
for (int i = 0; i < N; ++i) { for (int i = 0; i < N; ++i) {
PROCESS_H0C0_PEEPHOLE int bid = is_reverse ? N - 1 - i : i;
for (int step = tstart; step < seq_len; ++step) { int seq_len = x_lod[0][bid + 1] - x_lod[0][bid];
GEMM_WH_ADDON(1, prev_h_data, xx_data); const T* prev_c_data = nullptr;
COMPUTE_CtHt_PEEPHOLE(xx_data, prev_c_data, c_out_data, h_out_data); const T* prev_h_data = nullptr;
MOVE_ONE_STEP; int tstart = 0;
} if (h0_data) {
} prev_h_data = h0_data + bid * D;
} else { prev_c_data = c0_data + bid * D;
// TODO(TJ): unly workaround, clean me
std::function<void(T*, const T*, T*, T*)> compute_ctht;
if (platform::jit::MayIUse(platform::jit::avx) &&
act_gate_str == "sigmoid" && act_cand_str == "tanh" &&
act_cell_str == "tanh" && D == 8) {
compute_ctht = math::lstm_compute_ctht<T>;
} else { } else {
compute_ctht = [&](T* gates, const T* ct_1, T* ct, T* ht) { ker->ComputeC1H1(xx_data, c_out_data, h_out_data, wp_data);
COMPUTE_CtHt(gates, ct_1, ct, ht); tstart = 1;
}; // move one step
prev_h_data = h_out_data;
prev_c_data = c_out_data;
xx_data = xx_data + xx_offset;
h_out_data = h_out_data + gate_offset;
c_out_data = c_out_data + gate_offset;
} }
for (int i = 0; i < N; ++i) {
PROCESS_H0C0
for (int step = tstart; step < seq_len; ++step) { for (int step = tstart; step < seq_len; ++step) {
GEMM_WH_ADDON(1, prev_h_data, xx_data); GEMM_WH_ADDON(1, prev_h_data, xx_data);
compute_ctht(xx_data, prev_c_data, c_out_data, h_out_data); ker->ComputeCtHt(xx_data, prev_c_data, c_out_data, h_out_data, wp_data,
MOVE_ONE_STEP; checked_cell_data);
// move one step
prev_h_data = h_out_data;
prev_c_data = c_out_data;
xx_data = xx_data + xx_offset;
h_out_data = h_out_data + gate_offset;
c_out_data = c_out_data + gate_offset;
} }
} }
} }
#undef PROCESS_H0C0_DEFINES
#undef PROCESS_H0C0_PEEPHOLE
#undef PROCESS_H0C0
#undef MOVE_ONE_STEP
}
void BatchCompute(const framework::ExecutionContext& ctx) const { void BatchCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = platform::CPUDeviceContext; INIT_BASE_DEFINES;
INIT_BASE_INPUT_OUTPUT
INIT_BASE_SIZES
if (x->lod()[0].size() == 2) { if (x->lod()[0].size() == 2) {
xx->Resize({x_dims[0], D4}); xx->Resize({x_dims[0], D4});
SeqCompute(ctx); SeqCompute(ctx);
return; return;
} }
INIT_VEC_FUNC INIT_OTHER_DEFINES;
INIT_BASE_INPUT_DATAS
auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0"); auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
auto* reordered_c0 = ctx.Output<Tensor>("ReorderedC0"); auto* reordered_c0 = ctx.Output<Tensor>("ReorderedC0");
...@@ -487,8 +377,8 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -487,8 +377,8 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
prev_c_data = reordered_c0_data; prev_c_data = reordered_c0_data;
size_t sz = sizeof(T) * D; size_t sz = sizeof(T) * D;
for (int i = 0; i < max_bs; ++i) { for (int i = 0; i < max_bs; ++i) {
std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz); blas.VCOPY(sz, h0_data + seq_order[i] * D, reordered_h0_data);
std::memcpy(reordered_c0_data, c0_data + seq_order[i] * D, sz); blas.VCOPY(sz, c0_data + seq_order[i] * D, reordered_c0_data);
reordered_h0_data += D; reordered_h0_data += D;
reordered_c0_data += D; reordered_c0_data += D;
} }
...@@ -498,13 +388,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -498,13 +388,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
T* cur_h_out_data = batched_h_out_data; T* cur_h_out_data = batched_h_out_data;
T* cur_c_out_data = batched_c_out_data; T* cur_c_out_data = batched_c_out_data;
for (int i = 0; i < max_bs; ++i) { for (int i = 0; i < max_bs; ++i) {
GET_Ct_NOH0C0(cur_in_data, cur_c_out_data); ker->ComputeC1H1(cur_in_data, cur_c_out_data, cur_h_out_data, wp_data);
if (use_peepholes) {
blas.VMUL(D, wc_data + D2, cur_c_out_data, cur_in_data + D);
blas.VADD(D, cur_in_data + D, cur_in_data + D3, cur_in_data + D3);
}
act_gate(D, cur_in_data + D3, cur_in_data + D3);
GET_Ht(cur_c_out_data, cur_in_data, cur_h_out_data);
cur_in_data += D4; cur_in_data += D4;
cur_c_out_data += D; cur_c_out_data += D;
cur_h_out_data += D; cur_h_out_data += D;
...@@ -513,71 +397,37 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -513,71 +397,37 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
prev_h_data = batched_h_out_data; prev_h_data = batched_h_out_data;
prev_c_data = batched_c_out_data; prev_c_data = batched_c_out_data;
} }
// compute kernel part
const auto& batch_starts = batched_lod[0]; const auto& batch_starts = batched_lod[0];
const int max_seq_len = batch_starts.size() - 1; const int max_seq_len = batch_starts.size() - 1;
const int offset = tstart * max_bs * D; const int offset = tstart * max_bs * D;
batched_input_data = batched_input_data + offset * 4; batched_input_data = batched_input_data + offset * 4;
batched_h_out_data = batched_h_out_data + offset; batched_h_out_data = batched_h_out_data + offset;
batched_c_out_data = batched_c_out_data + offset; batched_c_out_data = batched_c_out_data + offset;
#define DEFINE_CUR \
T* cur_in_data = batched_input_data; \
T* cur_prev_c_data = prev_c_data; \
T* cur_c_out_data = batched_c_out_data; \
T* cur_h_out_data = batched_h_out_data
#define MOVE_ONE_BATCH \
cur_in_data += D4; \
cur_prev_c_data += D; \
cur_c_out_data += D; \
cur_h_out_data += D
#define MOVE_ONE_STEP \
prev_c_data = batched_c_out_data; \
prev_h_data = batched_h_out_data; \
batched_c_out_data = cur_c_out_data; \
batched_h_out_data = cur_h_out_data; \
batched_input_data = cur_in_data
if (use_peepholes) {
for (int step = tstart; step < max_seq_len; ++step) {
const int cur_bs = batch_starts[step + 1] - batch_starts[step];
GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
DEFINE_CUR;
for (int i = 0; i < cur_bs; ++i) {
COMPUTE_CtHt_PEEPHOLE(cur_in_data, cur_prev_c_data, cur_c_out_data,
cur_h_out_data);
MOVE_ONE_BATCH;
}
MOVE_ONE_STEP;
}
} else {
// TODO(TJ): unly workaround, clean me
std::function<void(T*, const T*, T*, T*)> compute_ctht;
if (platform::jit::MayIUse(platform::jit::avx) &&
act_gate_str == "sigmoid" && act_cand_str == "tanh" &&
act_cell_str == "tanh" && D == 8) {
compute_ctht = math::lstm_compute_ctht<T>;
} else {
compute_ctht = [&](T* gates, const T* ct_1, T* ct, T* ht) {
COMPUTE_CtHt(gates, ct_1, ct, ht);
};
}
for (int step = tstart; step < max_seq_len; ++step) { for (int step = tstart; step < max_seq_len; ++step) {
const int cur_bs = batch_starts[step + 1] - batch_starts[step]; const int cur_bs = batch_starts[step + 1] - batch_starts[step];
GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data); GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
DEFINE_CUR; T* cur_in_data = batched_input_data;
T* cur_prev_c_data = prev_c_data;
T* cur_c_out_data = batched_c_out_data;
T* cur_h_out_data = batched_h_out_data;
for (int i = 0; i < cur_bs; ++i) { for (int i = 0; i < cur_bs; ++i) {
compute_ctht(cur_in_data, cur_prev_c_data, cur_c_out_data, ker->ComputeCtHt(cur_in_data, cur_prev_c_data, cur_c_out_data,
cur_h_out_data); cur_h_out_data, wp_data, checked_cell_data);
MOVE_ONE_BATCH; // move one batch
} cur_in_data += D4;
MOVE_ONE_STEP; cur_prev_c_data += D;
cur_c_out_data += D;
cur_h_out_data += D;
} }
// move one step
prev_c_data = batched_c_out_data;
prev_h_data = batched_h_out_data;
batched_c_out_data = cur_c_out_data;
batched_h_out_data = cur_h_out_data;
batched_input_data = cur_in_data;
} }
#undef MOVE_ONE_STEP
#undef MOVE_ONE_BATCH
#undef DEFINE_CUR
math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq; math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
batched_h_out->set_lod(batched_lod); batched_h_out->set_lod(batched_lod);
...@@ -594,18 +444,9 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -594,18 +444,9 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
} }
} }
#undef COMPUTE_CtHt_PEEPHOLE
#undef COMPUTE_CtHt
#undef GET_Ct_NOH0C0
#undef COMPUTE_CtHt_NOH0C0
#undef COMPUTE_CtHt_PEEPHOLE_NOH0C0
#undef GET_Ht
#undef GET_Ct
#undef GEMM_WH_ADDON #undef GEMM_WH_ADDON
#undef INIT_BASE_INPUT_DATAS #undef INIT_OTHER_DEFINES
#undef INIT_BASE_SIZES #undef INIT_BASE_DEFINES
#undef INIT_BASE_INPUT_OUTPUT
#undef INIT_VEC_FUNC
}; };
} // namespace operators } // namespace operators
......
...@@ -136,9 +136,9 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> { ...@@ -136,9 +136,9 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
// since infershape can not get lod info // since infershape can not get lod info
PADDLE_ENFORCE_EQ(ref_lod.size(), 1UL, "Only support input lod size is 1."); PADDLE_ENFORCE_EQ(ref_lod.size(), 1UL, "Only support input lod size is 1.");
PADDLE_ENFORCE_EQ(in1_lod.size(), 1UL, "Only support input lod size is 1."); PADDLE_ENFORCE_EQ(in1_lod.size(), 1UL, "Only support input lod size is 1.");
PADDLE_ENFORCE_EQ(in1_lod[0].size() - 1, N, PADDLE_ENFORCE_EQ(static_cast<int>(in1_lod[0].size() - 1), N,
"Batch size of all inputs should be equal."); "Batch size of all inputs should be equal.");
PADDLE_ENFORCE_EQ(in1_lod[0][N], N, PADDLE_ENFORCE_EQ(static_cast<int>(in1_lod[0][N]), N,
"Seq_length of other inputs should be 1."); "Seq_length of other inputs should be 1.");
PADDLE_ENFORCE_EQ(in1_dims[0], N, "input height should be batch size."); PADDLE_ENFORCE_EQ(in1_dims[0], N, "input height should be batch size.");
for (size_t i = 2; i < ins.size(); ++i) { for (size_t i = 2; i < ins.size(); ++i) {
......
...@@ -66,7 +66,7 @@ static void ParallelExecuteBlocks( ...@@ -66,7 +66,7 @@ static void ParallelExecuteBlocks(
<< "pointer: " << prepared[run_block].get(); << "pointer: " << prepared[run_block].get();
executor->RunPreparedContext(prepared[run_block].get(), scope); executor->RunPreparedContext(prepared[run_block].get(), scope);
} catch (const std::exception &e) { } catch (const std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what(); LOG(FATAL) << "run sub program:" << idx << " error " << e.what();
} }
})); }));
} }
......
...@@ -45,8 +45,6 @@ math_library(im2col) ...@@ -45,8 +45,6 @@ math_library(im2col)
if (NOT WIN32) # windows do not support avx functions yet. if (NOT WIN32) # windows do not support avx functions yet.
math_library(gru_compute DEPS activation_functions math_function) math_library(gru_compute DEPS activation_functions math_function)
math_library(lstm_compute DEPS activation_functions) math_library(lstm_compute DEPS activation_functions)
# TODO(TJ): ugly workaround, clean me
cc_library(cpu_lstm_compute SRCS cpu_lstm_compute.cc DEPS activation_functions cblas cpu_info)
endif (NOT WIN32) endif (NOT WIN32)
cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
...@@ -76,3 +74,7 @@ if(WITH_GPU) ...@@ -76,3 +74,7 @@ if(WITH_GPU)
endif() endif()
cc_test(concat_test SRCS concat_test.cc DEPS concat) cc_test(concat_test SRCS concat_test.cc DEPS concat)
cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
cc_library(jit_kernel
SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_lstm.cc
DEPS cpu_info cblas activation_functions)
cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/cpu_info.h"
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace paddle {
namespace operators {
namespace math {
// TODO(TJ): ugly workaround, clean me
template <typename T>
void lstm_compute_ctht(T* gates, const T* ct_1, T* ct, T* ht) {
// gates: W_ch, W_ih, W_fh, W_oh
vec_sigmoid<T, platform::jit::avx>(24, gates + 8, gates + 8);
vec_tanh<T, platform::jit::avx>(8, gates, gates);
const T *i = gates + 8, *f = gates + 16, *o = gates + 24;
const T min = SIGMOID_THRESHOLD_MIN;
const T max = SIGMOID_THRESHOLD_MAX;
for (int d = 0; d < 8; ++d) {
// C_t = C_t-1 * fgated + cand_gated * igated
ct[d] = ct_1[d] * f[d] + gates[d] * i[d];
// H_t = act_cell(C_t) * ogated
T tmp = ct[d] * 2;
tmp = static_cast<T>(0) - ((tmp < min) ? min : ((tmp > max) ? max : tmp));
vec_exp<T>(1, &tmp, &tmp);
tmp = static_cast<T>(2) / (static_cast<T>(1) + tmp) - static_cast<T>(1);
ht[d] = tmp * o[d];
}
}
#ifdef __AVX__
namespace detail {
namespace forward {
namespace avx {
__m256 Sigmoid(const __m256 a);
__m256 Tanh(const __m256 a);
} // namespace avx
} // namespace forward
} // namespace detail
template <>
void lstm_compute_ctht<float>(float* gates, const float* ct_1, float* ct,
float* ht);
#endif
} // namespace math
} // namespace operators
} // namespace paddle
...@@ -125,10 +125,8 @@ inline void vec_scal<float, platform::jit::avx2>(const int n, const float a, ...@@ -125,10 +125,8 @@ inline void vec_scal<float, platform::jit::avx2>(const int n, const float a,
} }
template <> template <>
inline void vec_scal<float, platform::jit::avx512_common>(const int n, inline void vec_scal<float, platform::jit::avx512f>(const int n, const float a,
const float a, const float* x, float* y) {
const float* x,
float* y) {
// TODO(TJ): enable me // TODO(TJ): enable me
vec_scal<float, platform::jit::avx2>(n, a, x, y); vec_scal<float, platform::jit::avx2>(n, a, x, y);
} }
...@@ -181,7 +179,7 @@ inline void vec_bias_sub<float, platform::jit::avx2>(const int n, const float a, ...@@ -181,7 +179,7 @@ inline void vec_bias_sub<float, platform::jit::avx2>(const int n, const float a,
} }
template <> template <>
inline void vec_bias_sub<float, platform::jit::avx512_common>(const int n, inline void vec_bias_sub<float, platform::jit::avx512f>(const int n,
const float a, const float a,
const float* x, const float* x,
float* y) { float* y) {
...@@ -242,7 +240,7 @@ inline void vec_cross<float, platform::jit::avx2>(const int n, const float* x, ...@@ -242,7 +240,7 @@ inline void vec_cross<float, platform::jit::avx2>(const int n, const float* x,
} }
template <> template <>
inline void vec_cross<float, platform::jit::avx512_common>( inline void vec_cross<float, platform::jit::avx512f>(
const int n, const float* x, const float* y, const float* z, float* out) { const int n, const float* x, const float* y, const float* z, float* out) {
// TODO(TJ): enable me // TODO(TJ): enable me
vec_cross<float, platform::jit::avx>(n, x, y, z, out); vec_cross<float, platform::jit::avx>(n, x, y, z, out);
...@@ -296,7 +294,7 @@ inline void vec_add_bias<float, platform::jit::avx2>(const int n, const float a, ...@@ -296,7 +294,7 @@ inline void vec_add_bias<float, platform::jit::avx2>(const int n, const float a,
} }
template <> template <>
inline void vec_add_bias<float, platform::jit::avx512_common>(const int n, inline void vec_add_bias<float, platform::jit::avx512f>(const int n,
const float a, const float a,
const float* x, const float* x,
float* y) { float* y) {
...@@ -390,7 +388,7 @@ inline void vec_sigmoid<float, platform::jit::avx2>(const int n, const float* x, ...@@ -390,7 +388,7 @@ inline void vec_sigmoid<float, platform::jit::avx2>(const int n, const float* x,
} }
template <> template <>
inline void vec_sigmoid<float, platform::jit::avx512_common>(const int n, inline void vec_sigmoid<float, platform::jit::avx512f>(const int n,
const float* x, const float* x,
float* y) { float* y) {
// TODO(TJ): enable me // TODO(TJ): enable me
...@@ -454,8 +452,7 @@ inline void vec_relu<float, platform::jit::avx2>(const int n, const float* x, ...@@ -454,8 +452,7 @@ inline void vec_relu<float, platform::jit::avx2>(const int n, const float* x,
} }
template <> template <>
inline void vec_relu<float, platform::jit::avx512_common>(const int n, inline void vec_relu<float, platform::jit::avx512f>(const int n, const float* x,
const float* x,
float* y) { float* y) {
// TODO(TJ): enable me // TODO(TJ): enable me
vec_relu<float, platform::jit::avx2>(n, x, y); vec_relu<float, platform::jit::avx2>(n, x, y);
......
...@@ -110,7 +110,7 @@ TEST(CpuVecTest, sigmoid) { ...@@ -110,7 +110,7 @@ TEST(CpuVecTest, sigmoid) {
TestAndBench<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>); TestAndBench<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
TestAndBench<float>(sz, vec_sigmoid<float, jit::avx>, ref_sigmoid<float>); TestAndBench<float>(sz, vec_sigmoid<float, jit::avx>, ref_sigmoid<float>);
TestAndBench<float>(sz, vec_sigmoid<float, jit::avx2>, ref_sigmoid<float>); TestAndBench<float>(sz, vec_sigmoid<float, jit::avx2>, ref_sigmoid<float>);
TestAndBench<float>(sz, vec_sigmoid<float, jit::avx512_common>, TestAndBench<float>(sz, vec_sigmoid<float, jit::avx512f>,
ref_sigmoid<float>); ref_sigmoid<float>);
} }
TestAndBench<double>(30, vec_sigmoid<double>, ref_sigmoid<double>); TestAndBench<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
...@@ -123,8 +123,7 @@ TEST(CpuVecTest, tanh) { ...@@ -123,8 +123,7 @@ TEST(CpuVecTest, tanh) {
TestAndBench<float>(sz, vec_tanh<float>, ref_tanh<float>); TestAndBench<float>(sz, vec_tanh<float>, ref_tanh<float>);
TestAndBench<float>(sz, vec_tanh<float, jit::avx>, ref_tanh<float>); TestAndBench<float>(sz, vec_tanh<float, jit::avx>, ref_tanh<float>);
TestAndBench<float>(sz, vec_tanh<float, jit::avx2>, ref_tanh<float>); TestAndBench<float>(sz, vec_tanh<float, jit::avx2>, ref_tanh<float>);
TestAndBench<float>(sz, vec_tanh<float, jit::avx512_common>, TestAndBench<float>(sz, vec_tanh<float, jit::avx512f>, ref_tanh<float>);
ref_tanh<float>);
} }
TestAndBench<double>(30, vec_tanh<double>, ref_tanh<double>); TestAndBench<double>(30, vec_tanh<double>, ref_tanh<double>);
} }
...@@ -136,8 +135,7 @@ TEST(CpuVecTest, relu) { ...@@ -136,8 +135,7 @@ TEST(CpuVecTest, relu) {
TestAndBench<float>(sz, vec_relu<float>, ref_relu<float>); TestAndBench<float>(sz, vec_relu<float>, ref_relu<float>);
TestAndBench<float>(sz, vec_relu<float, jit::avx>, ref_relu<float>); TestAndBench<float>(sz, vec_relu<float, jit::avx>, ref_relu<float>);
TestAndBench<float>(sz, vec_relu<float, jit::avx2>, ref_relu<float>); TestAndBench<float>(sz, vec_relu<float, jit::avx2>, ref_relu<float>);
TestAndBench<float>(sz, vec_relu<float, jit::avx512_common>, TestAndBench<float>(sz, vec_relu<float, jit::avx512f>, ref_relu<float>);
ref_relu<float>);
} }
TestAndBench<double>(30, vec_relu<double>, ref_relu<double>); TestAndBench<double>(30, vec_relu<double>, ref_relu<double>);
} }
...@@ -170,7 +168,7 @@ TEST(CpuVecTest, inplace_sigmoid) { ...@@ -170,7 +168,7 @@ TEST(CpuVecTest, inplace_sigmoid) {
TestInplace<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>); TestInplace<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
TestInplace<float>(sz, vec_sigmoid<float, jit::avx>, ref_sigmoid<float>); TestInplace<float>(sz, vec_sigmoid<float, jit::avx>, ref_sigmoid<float>);
TestInplace<float>(sz, vec_sigmoid<float, jit::avx2>, ref_sigmoid<float>); TestInplace<float>(sz, vec_sigmoid<float, jit::avx2>, ref_sigmoid<float>);
TestInplace<float>(sz, vec_sigmoid<float, jit::avx512_common>, TestInplace<float>(sz, vec_sigmoid<float, jit::avx512f>,
ref_sigmoid<float>); ref_sigmoid<float>);
} }
TestInplace<double>(30, vec_sigmoid<double>, ref_sigmoid<double>); TestInplace<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
...@@ -183,8 +181,7 @@ TEST(CpuVecTest, inplace_tanh) { ...@@ -183,8 +181,7 @@ TEST(CpuVecTest, inplace_tanh) {
TestInplace<float>(sz, vec_tanh<float>, ref_tanh<float>); TestInplace<float>(sz, vec_tanh<float>, ref_tanh<float>);
TestInplace<float>(sz, vec_tanh<float, jit::avx>, ref_tanh<float>); TestInplace<float>(sz, vec_tanh<float, jit::avx>, ref_tanh<float>);
TestInplace<float>(sz, vec_tanh<float, jit::avx2>, ref_tanh<float>); TestInplace<float>(sz, vec_tanh<float, jit::avx2>, ref_tanh<float>);
TestInplace<float>(sz, vec_tanh<float, jit::avx512_common>, TestInplace<float>(sz, vec_tanh<float, jit::avx512f>, ref_tanh<float>);
ref_tanh<float>);
} }
TestInplace<double>(30, vec_tanh<double>, ref_tanh<double>); TestInplace<double>(30, vec_tanh<double>, ref_tanh<double>);
} }
...@@ -196,8 +193,7 @@ TEST(CpuVecTest, inplace_relu) { ...@@ -196,8 +193,7 @@ TEST(CpuVecTest, inplace_relu) {
TestInplace<float>(sz, vec_relu<float>, ref_relu<float>); TestInplace<float>(sz, vec_relu<float>, ref_relu<float>);
TestInplace<float>(sz, vec_relu<float, jit::avx>, ref_relu<float>); TestInplace<float>(sz, vec_relu<float, jit::avx>, ref_relu<float>);
TestInplace<float>(sz, vec_relu<float, jit::avx2>, ref_relu<float>); TestInplace<float>(sz, vec_relu<float, jit::avx2>, ref_relu<float>);
TestInplace<float>(sz, vec_relu<float, jit::avx512_common>, TestInplace<float>(sz, vec_relu<float, jit::avx512f>, ref_relu<float>);
ref_relu<float>);
} }
TestInplace<double>(30, vec_relu<double>, ref_relu<double>); TestInplace<double>(30, vec_relu<double>, ref_relu<double>);
} }
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/cpu_lstm_compute.h" #include "paddle/fluid/operators/math/jit_kernel.h"
#include <iostream>
#include <string>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
#ifdef __AVX__ namespace jitkernel {
template <>
void lstm_compute_ctht<float>(float* gates, const float* ct_1, float* ct, namespace jit = platform::jit;
float* ht) {
namespace act = detail::forward::avx; KernelPool& KernelPool::Instance() {
// gates: W_ch, W_ih, W_fh, W_oh static thread_local KernelPool g_jit_kernels;
__m256 c, i, f, o; return g_jit_kernels;
c = _mm256_loadu_ps(gates); }
i = _mm256_loadu_ps(gates + 8);
f = _mm256_loadu_ps(gates + 16); std::shared_ptr<const Kernel> KernelPool::Get(const std::string& key) const {
o = _mm256_loadu_ps(gates + 24); if (kers_.find(key) == kers_.end()) {
return nullptr;
/* C_t = C_t-1 * fgated + cand_gated * igated*/ }
c = _mm256_mul_ps(act::Tanh(c), act::Sigmoid(i)); return kers_.at(key);
i = _mm256_loadu_ps(ct_1);
f = _mm256_mul_ps(i, act::Sigmoid(f));
f = _mm256_add_ps(c, f);
_mm256_storeu_ps(ct, f);
/* H_t = act_cell(C_t) * ogated */
o = _mm256_mul_ps(act::Tanh(f), act::Sigmoid(o));
_mm256_storeu_ps(ht, o);
} }
#endif
} // namespace jitkernel
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include <memory> // for shared_ptr
#include <string>
#include <unordered_map>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/macros.h"
// Note: Only support on CPU yet.
namespace paddle {
namespace operators {
namespace math {
namespace jitkernel {
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
#define AVX_FLOAT_BLOCK 8
#define AVX2_FLOAT_BLOCK 8
#define AVX512_FLOAT_BLOCK 16
typedef enum { kLT8, kEQ8, kGT8LT16, kEQ16, kGT16 } jit_block;
class Kernel {
public:
Kernel() = default;
virtual ~Kernel() = default;
int num_{0};
int end_{0};
int rest_{0};
DISABLE_COPY_AND_ASSIGN(Kernel);
};
class KernelPool {
public:
static KernelPool &Instance();
template <typename Ker, typename... ARGS>
std::shared_ptr<const Ker> Get(ARGS... args);
std::shared_ptr<const Kernel> Get(const std::string &key) const;
private:
KernelPool() = default;
std::unordered_map<std::string, std::shared_ptr<const Kernel>> kers_;
DISABLE_COPY_AND_ASSIGN(KernelPool);
};
template <typename T>
class VMulKernel : public Kernel {
public:
virtual void Compute(const T *x, const T *y, T *z) const = 0;
};
template <typename T>
class VAddKernel : public Kernel {
public:
virtual void Compute(const T *x, const T *y, T *z) const = 0;
};
template <typename T>
class VScalKernel : public Kernel {
public:
virtual void Compute(const T a, const T *x, T *y) const = 0;
virtual void Compute(const T a, T *x) const = 0;
};
template <typename T>
class VAddBiasKernel : public Kernel {
public:
virtual void Compute(const T a, const T *x, T *y) const = 0;
};
template <typename T>
class VActKernel : public Kernel {
public:
virtual void Compute(const T *x, T *y) const = 0;
};
template <typename T>
class VReluKernel : public VActKernel<T> {
public:
virtual void Compute(const T *x, T *y) const = 0;
};
template <typename T>
class VIdentityKernel : public VActKernel<T> {
public:
virtual void Compute(const T *x, T *y) const = 0;
};
template <typename T>
class VExpKernel : public VActKernel<T> {
public:
virtual void Compute(const T *x, T *y) const = 0;
};
template <typename T>
class VSigmoidKernel : public VActKernel<T> {
public:
virtual void Compute(const T *x, T *y) const = 0;
};
template <typename T>
class VTanhKernel : public VActKernel<T> {
public:
virtual void Compute(const T *x, T *y) const = 0;
};
template <typename T>
class LSTMKernel : public Kernel {
public:
virtual void ComputeCtHt(T *gates, const T *ct_1, T *ct, T *ht,
/* below only used in peephole*/
const T *wp_data = nullptr,
T *checked = nullptr) const = 0;
// compute c1 and h1 without c0 or h0
virtual void ComputeC1H1(T *gates, T *ct, T *ht,
/* below only used in peephole*/
const T *wp_data = nullptr) const = 0;
};
} // namespace jitkernel
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/jit_kernel.h"
#include <string>
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace paddle {
namespace operators {
namespace math {
namespace jitkernel {
namespace jit = platform::jit;
/* VMUL JitKernel */
template <typename T, platform::jit::cpu_isa_t isa, jit_block>
class VMulKernelImpl : public VMulKernel<T> {
public:
explicit VMulKernelImpl(int d) : VMulKernel<T>() { this->num_ = d; }
void Compute(const T* x, const T* y, T* z) const override {
for (int i = 0; i < this->num_; ++i) {
z[i] = x[i] * y[i];
}
}
};
#ifdef PADDLE_WITH_MKLML
#define MKL_FLOAT(isa, block) \
template <> \
void VMulKernelImpl<float, isa, block>::Compute( \
const float* x, const float* y, float* z) const { \
platform::dynload::vsMul(this->num_, x, y, z); \
}
#define MKL_DOUBLE(isa, block) \
template <> \
void VMulKernelImpl<double, isa, block>::Compute( \
const double* x, const double* y, double* z) const { \
platform::dynload::vdMul(this->num_, x, y, z); \
}
FOR_EACH_ISA(MKL_FLOAT, kGT16);
FOR_EACH_ISA_BLOCK(MKL_DOUBLE);
#endif
#define INTRI8_FLOAT(isa) \
template <> \
void VMulKernelImpl<float, isa, kEQ8>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 tmpx, tmpy; \
tmpx = _mm256_loadu_ps(x); \
tmpy = _mm256_loadu_ps(y); \
tmpx = _mm256_mul_ps(tmpx, tmpy); \
_mm256_storeu_ps(z, tmpx); \
}
// avx > for > mkl
#ifdef __AVX__
INTRI8_FLOAT(jit::avx);
#endif
#ifdef __AVX2__
INTRI8_FLOAT(jit::avx2);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT(jit::avx512f);
#endif
// TODO(TJ): eq16 test and complete avx512
#undef INTRI8_FLOAT
#undef MKL_FLOAT
#undef MKL_DOUBLE
/* VADD JitKernel */
template <typename T, platform::jit::cpu_isa_t isa, jit_block>
class VAddKernelImpl : public VAddKernel<T> {
public:
explicit VAddKernelImpl(int d) : VAddKernel<T>() { this->num_ = d; }
void Compute(const T* x, const T* y, T* z) const override {
for (int i = 0; i < this->num_; ++i) {
z[i] = x[i] + y[i];
}
}
};
#ifdef PADDLE_WITH_MKLML
#define MKL_FLOAT(isa, block) \
template <> \
void VAddKernelImpl<float, isa, block>::Compute( \
const float* x, const float* y, float* z) const { \
platform::dynload::vsAdd(this->num_, x, y, z); \
}
#define MKL_DOUBLE(isa, block) \
template <> \
void VAddKernelImpl<double, isa, block>::Compute( \
const double* x, const double* y, double* z) const { \
platform::dynload::vdAdd(this->num_, x, y, z); \
}
FOR_EACH_ISA(MKL_FLOAT, kGT16);
FOR_EACH_ISA_BLOCK(MKL_DOUBLE);
#endif
#define INTRI8_FLOAT(isa) \
template <> \
void VAddKernelImpl<float, isa, kEQ8>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 tmpx, tmpy; \
tmpx = _mm256_loadu_ps(x); \
tmpy = _mm256_loadu_ps(y); \
tmpx = _mm256_add_ps(tmpx, tmpy); \
_mm256_storeu_ps(z, tmpx); \
}
#ifdef __AVX__
INTRI8_FLOAT(jit::avx);
#endif
#ifdef __AVX2__
INTRI8_FLOAT(jit::avx2);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT(jit::avx512f);
#endif
// TODO(TJ): eq16 test and complete avx512
#undef INTRI8_FLOAT
#undef MKL_FLOAT
#undef MKL_DOUBLE
/* VSCAL JitKernel */
template <typename T, platform::jit::cpu_isa_t isa, jit_block>
class VScalKernelImpl : public VScalKernel<T> {
public:
explicit VScalKernelImpl(int d) : VScalKernel<T>() { this->num_ = d; }
void Compute(const T a, const T* x, T* y) const override {
for (int i = 0; i < this->num_; ++i) {
y[i] = a * x[i];
}
}
void Compute(const T a, T* x) const override {
for (int i = 0; i < this->num_; ++i) {
x[i] = a * x[i];
}
}
};
#ifdef PADDLE_WITH_MKLML
#define MKL_FLOAT(isa, block) \
template <> \
void VScalKernelImpl<float, isa, block>::Compute(const float a, float* x) \
const { \
platform::dynload::cblas_sscal(this->num_, a, x, 1); \
}
#define MKL_DOUBLE(isa, block) \
template <> \
void VScalKernelImpl<double, isa, block>::Compute(const double a, double* x) \
const { \
platform::dynload::cblas_dscal(this->num_, a, x, 1); \
}
FOR_EACH_ISA(MKL_FLOAT, kGT16);
FOR_EACH_ISA_BLOCK(MKL_DOUBLE);
#endif
#define INTRI8_FLOAT(isa) \
template <> \
void VScalKernelImpl<float, isa, kEQ8>::Compute( \
const float a, const float* x, float* y) const { \
__m256 tmp; \
__m256 scalar = _mm256_set1_ps(a); \
tmp = _mm256_loadu_ps(x); \
tmp = _mm256_mul_ps(tmp, scalar); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI8_INPLACE_FLOAT(isa) \
template <> \
void VScalKernelImpl<float, isa, kEQ8>::Compute(const float a, float* x) \
const { \
__m256 tmp; \
__m256 scalar = _mm256_set1_ps(a); \
tmp = _mm256_loadu_ps(x); \
tmp = _mm256_mul_ps(tmp, scalar); \
_mm256_storeu_ps(x, tmp); \
}
#ifdef __AVX__
INTRI8_FLOAT(jit::avx);
INTRI8_INPLACE_FLOAT(jit::avx);
#endif
#ifdef __AVX2__
INTRI8_FLOAT(jit::avx2);
INTRI8_INPLACE_FLOAT(jit::avx2);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT(jit::avx512f);
INTRI8_INPLACE_FLOAT(jit::avx512f);
#endif
// TODO(TJ): eq16 test and complete avx512
#undef INTRI8_FLOAT
#undef INTRI8_INPLACE_FLOAT
#undef MKL_FLOAT
#undef MKL_DOUBLE
/* VAddBias JitKernel */
template <typename T, platform::jit::cpu_isa_t isa, jit_block>
class VAddBiasKernelImpl : public VAddBiasKernel<T> {
public:
explicit VAddBiasKernelImpl(int d) : VAddBiasKernel<T>() { this->num_ = d; }
void Compute(const T a, const T* x, T* y) const override {
for (int i = 0; i < this->num_; ++i) {
y[i] = x[i] + a;
}
}
};
#define INTRI8_FLOAT(isa) \
template <> \
void VAddBiasKernelImpl<float, isa, kEQ8>::Compute( \
const float a, const float* x, float* y) const { \
__m256 tmp = _mm256_loadu_ps(x); \
tmp = _mm256_add_ps(tmp, _mm256_set1_ps(a)); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VAddBiasKernelImpl<float, isa, kEQ16>::Compute( \
const float a, const float* x, float* y) const { \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
tmp0 = _mm256_add_ps(tmp0, _mm256_set1_ps(a)); \
tmp1 = _mm256_add_ps(tmp1, _mm256_set1_ps(a)); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
}
#ifdef __AVX__
INTRI8_FLOAT(jit::avx);
INTRI16_FLOAT(jit::avx);
#endif
#ifdef __AVX2__
INTRI8_FLOAT(jit::avx2);
INTRI16_FLOAT(jit::avx2);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT(jit::avx512f);
INTRI16_FLOAT(jit::avx512f);
#endif
// TODO(TJ): eq16 test and complete avx512
#undef INTRI8_FLOAT
#undef INTRI16_FLOAT
/* VRelu JitKernel */
template <typename T, platform::jit::cpu_isa_t isa, jit_block>
class VReluKernelImpl : public VReluKernel<T> {
public:
explicit VReluKernelImpl(int d) : VReluKernel<T>() { this->num_ = d; }
void Compute(const T* x, T* y) const override {
for (int i = 0; i < this->num_; ++i) {
y[i] = x[i] > 0 ? x[i] : 0;
}
}
};
#define INTRI8_FLOAT(isa) \
template <> \
void VReluKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \
const { \
__m256 tmp = _mm256_loadu_ps(x); \
tmp = _mm256_max_ps(tmp, _mm256_setzero_ps()); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VReluKernelImpl<float, isa, kEQ16>::Compute(const float* x, float* y) \
const { \
__m256 zeros = _mm256_setzero_ps(); \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
tmp0 = _mm256_max_ps(tmp0, zeros); \
tmp1 = _mm256_max_ps(tmp1, zeros); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
}
#define INTRI_GT8LT16_FLOAT(isa) \
template <> \
VReluKernelImpl<float, isa, kGT8LT16>::VReluKernelImpl(int d) \
: VReluKernel<float>() { \
this->num_ = d; \
this->end_ = AVX_FLOAT_BLOCK; \
this->rest_ = d - AVX_FLOAT_BLOCK; \
} \
template <> \
void VReluKernelImpl<float, isa, kGT8LT16>::Compute(const float* x, \
float* y) const { \
__m256 zeros = _mm256_setzero_ps(); \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + this->rest_); \
tmp0 = _mm256_max_ps(tmp0, zeros); \
tmp1 = _mm256_max_ps(tmp1, zeros); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + this->rest_, tmp1); \
}
#define INTRI_GT16_FLOAT(isa) \
template <> \
VReluKernelImpl<float, isa, kGT16>::VReluKernelImpl(int d) \
: VReluKernel<float>() { \
this->num_ = d; \
this->end_ = d - d % AVX_FLOAT_BLOCK; \
this->rest_ = d - AVX_FLOAT_BLOCK; \
} \
template <> \
void VReluKernelImpl<float, isa, kGT16>::Compute(const float* x, float* y) \
const { \
__m256 zeros = _mm256_setzero_ps(); \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_max_ps(tmp, zeros); \
_mm256_storeu_ps(y + i, tmp); \
} \
__m256 tmp = _mm256_loadu_ps(x + this->rest_); \
tmp = _mm256_max_ps(tmp, zeros); \
_mm256_storeu_ps(y + this->rest_, tmp); \
}
#ifdef __AVX__
INTRI8_FLOAT(jit::avx);
INTRI16_FLOAT(jit::avx);
INTRI_GT8LT16_FLOAT(jit::avx);
INTRI_GT16_FLOAT(jit::avx);
#endif
#ifdef __AVX2__
INTRI8_FLOAT(jit::avx2);
INTRI16_FLOAT(jit::avx2);
INTRI_GT8LT16_FLOAT(jit::avx2);
INTRI_GT16_FLOAT(jit::avx2);
#endif
#ifdef __AVX512F__
// TODO(TJ): refine avx512
INTRI8_FLOAT(jit::avx512f);
INTRI16_FLOAT(jit::avx512f);
INTRI_GT8LT16_FLOAT(jit::avx512f);
INTRI_GT16_FLOAT(jit::avx512f);
#endif
#undef INTRI8_FLOAT
#undef INTRI16_FLOAT
#undef INTRI_GT8LT16_FLOAT
#undef INTRI_GT16_FLOAT
/* An empty JitKernel */
template <typename T, platform::jit::cpu_isa_t isa, jit_block>
class VIdentityKernelImpl : public VIdentityKernel<T> {
public:
explicit VIdentityKernelImpl(int d) : VIdentityKernel<T>() { this->num_ = d; }
void Compute(const T* x, T* y) const override {}
};
REGISTER_JITKERNEL(vmul, VMulKernel);
REGISTER_JITKERNEL(vadd, VAddKernel);
REGISTER_JITKERNEL(vscal, VScalKernel);
REGISTER_JITKERNEL(vaddb, VAddBiasKernel);
REGISTER_JITKERNEL(vrelu, VReluKernel);
REGISTER_JITKERNEL(videntity, VIdentityKernel);
} // namespace jitkernel
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/jit_kernel.h"
#include <cmath> // for exp
#include <string>
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace paddle {
namespace operators {
namespace math {
#ifdef __AVX__
namespace detail {
__m256 Exp(__m256 a);
} // namespace detail
#endif
namespace jitkernel {
namespace jit = platform::jit;
/* VExp JitKernel */
template <typename T, jit::cpu_isa_t isa, jit_block>
class VExpKernelImpl : public VExpKernel<T> {
public:
explicit VExpKernelImpl(int d) : VExpKernel<T>() { this->num_ = d; }
void Compute(const T* x, T* y) const override {
for (int i = 0; i < this->num_; ++i) {
y[i] = std::exp(x[i]);
}
}
};
#ifdef PADDLE_WITH_MKLML
#define MKL_FLOAT(isa, block) \
template <> \
void VExpKernelImpl<float, isa, block>::Compute(const float* x, float* y) \
const { \
platform::dynload::vsExp(this->num_, x, y); \
}
#define MKL_DOUBLE(isa, block) \
template <> \
void VExpKernelImpl<double, isa, block>::Compute(const double* x, double* y) \
const { \
platform::dynload::vdExp(this->num_, x, y); \
}
FOR_EACH_ISA(MKL_FLOAT, kLT8);
FOR_EACH_ISA(MKL_FLOAT, kGT8LT16);
FOR_EACH_ISA(MKL_FLOAT, kGT16);
FOR_EACH_ISA_BLOCK(MKL_DOUBLE);
#endif
#define INTRI8_FLOAT(isa) \
template <> \
void VExpKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \
const { \
__m256 tmp = _mm256_loadu_ps(x); \
_mm256_storeu_ps(y, detail::Exp(tmp)); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VExpKernelImpl<float, isa, kEQ16>::Compute(const float* x, float* y) \
const { \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
tmp0 = detail::Exp(tmp0); \
tmp1 = detail::Exp(tmp1); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
}
#ifdef __AVX__
INTRI8_FLOAT(jit::avx);
INTRI16_FLOAT(jit::avx);
#endif
#ifdef __AVX2__
INTRI8_FLOAT(jit::avx2);
INTRI16_FLOAT(jit::avx2);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT(jit::avx512f);
INTRI16_FLOAT(jit::avx512f);
#endif
// TODO(TJ): eq16 test and complete avx512
#undef INTRI8_FLOAT
#undef INTRI16_FLOAT
#undef MKL_FLOAT
#undef MKL_DOUBLE
REGISTER_JITKERNEL(vexp, VExpKernel);
/* VSigmoid JitKernel */
template <typename T, jit::cpu_isa_t isa, jit_block>
class VSigmoidKernelImpl : public VSigmoidKernel<T> {
public:
explicit VSigmoidKernelImpl(int d) : VSigmoidKernel<T>() {
this->num_ = d;
vexp_ = KernelPool::Instance().template Get<VExpKernel<T>>(d);
}
void Compute(const T* x, T* y) const override {
const T min = SIGMOID_THRESHOLD_MIN;
const T max = SIGMOID_THRESHOLD_MAX;
for (int i = 0; i < this->num_; ++i) {
y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
y[i] = static_cast<T>(0) - y[i];
}
vexp_->Compute(y, y);
for (int i = 0; i < this->num_; ++i) {
y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
}
}
private:
std::shared_ptr<const VExpKernel<T>> vexp_;
};
#define INTRI_SIGMOID(tmp, min, max) \
tmp = _mm256_max_ps(tmp, min); \
tmp = _mm256_min_ps(tmp, max); \
tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); \
tmp = detail::Exp(tmp); \
tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \
tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp)
#define INTRI8_FLOAT(isa) \
template <> \
void VSigmoidKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \
const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_SIGMOID(tmp, min, max); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VSigmoidKernelImpl<float, isa, kEQ16>::Compute(const float* x, \
float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
INTRI_SIGMOID(tmp0, min, max); \
INTRI_SIGMOID(tmp1, min, max); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
}
#define INTRI_GT8LT16_FLOAT(isa) \
template <> \
VSigmoidKernelImpl<float, isa, kGT8LT16>::VSigmoidKernelImpl(int d) \
: VSigmoidKernel<float>() { \
this->num_ = d; \
this->end_ = AVX_FLOAT_BLOCK; \
this->rest_ = d - this->end_; \
vexp_ = \
KernelPool::Instance().template Get<VExpKernel<float>>(this->rest_); \
} \
template <> \
void VSigmoidKernelImpl<float, isa, kGT8LT16>::Compute(const float* x, \
float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_SIGMOID(tmp, min, max); \
_mm256_storeu_ps(y, tmp); \
const float min_ = SIGMOID_THRESHOLD_MIN; \
const float max_ = SIGMOID_THRESHOLD_MAX; \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \
} \
vexp_->Compute(y + this->end_, y + this->end_); \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = 1.f / (1.f + y[i]); \
} \
}
#define INTRI_GT16_FLOAT(isa) \
template <> \
VSigmoidKernelImpl<float, isa, kGT16>::VSigmoidKernelImpl(int d) \
: VSigmoidKernel<float>() { \
this->num_ = d; \
this->rest_ = d % AVX_FLOAT_BLOCK; \
this->end_ = d - this->rest_; \
vexp_ = \
KernelPool::Instance().template Get<VExpKernel<float>>(this->rest_); \
} \
template <> \
void VSigmoidKernelImpl<float, isa, kGT16>::Compute(const float* x, \
float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
INTRI_SIGMOID(tmp, min, max); \
_mm256_storeu_ps(y + i, tmp); \
} \
const float min_ = SIGMOID_THRESHOLD_MIN; \
const float max_ = SIGMOID_THRESHOLD_MAX; \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \
} \
vexp_->Compute(y + this->end_, y + this->end_); \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = 1.f / (1.f + y[i]); \
} \
}
#ifdef __AVX__
INTRI8_FLOAT(jit::avx);
INTRI16_FLOAT(jit::avx);
INTRI_GT8LT16_FLOAT(jit::avx);
INTRI_GT16_FLOAT(jit::avx);
#endif
#ifdef __AVX2__
INTRI8_FLOAT(jit::avx2);
INTRI16_FLOAT(jit::avx2);
// INTRI_GT8LT16_FLOAT(jit::avx2);
// INTRI_GT16_FLOAT(jit::avx2);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT(jit::avx512f);
INTRI16_FLOAT(jit::avx512f);
// INTRI_GT8LT16_FLOAT(jit::avx512f);
// INTRI_GT16_FLOAT(jit::avx512f);
#endif
#undef INTRI8_FLOAT
#undef INTRI16_FLOAT
#undef INTRI_GT8LT16_FLOAT
#undef INTRI_GT16_FLOAT
#undef INTRI_VSIGMOID
REGISTER_JITKERNEL(vsigmoid, VSigmoidKernel);
/* VTanh JitKernel */
template <typename T, jit::cpu_isa_t isa, jit_block>
class VTanhKernelImpl : public VTanhKernel<T> {
public:
explicit VTanhKernelImpl(int d) : VTanhKernel<T>() {
this->num_ = d;
vscal_ = KernelPool::Instance().template Get<VScalKernel<T>>(d);
vsigmoid_ = KernelPool::Instance().template Get<VSigmoidKernel<T>>(d);
vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<T>>(d);
}
void Compute(const T* x, T* y) const override {
vscal_->Compute(static_cast<T>(2), x, y);
vsigmoid_->Compute(y, y);
vscal_->Compute(static_cast<T>(2), y);
vaddbias_->Compute(static_cast<T>(-1), y, y);
}
private:
std::shared_ptr<const VScalKernel<T>> vscal_;
std::shared_ptr<const VSigmoidKernel<T>> vsigmoid_;
std::shared_ptr<const VAddBiasKernel<T>> vaddbias_;
};
#define INTRI_VTANH(tmp) \
tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), tmp); \
tmp = _mm256_min_ps(tmp, _mm256_set1_ps(EXP_MAX_INPUT)); \
tmp = detail::Exp(tmp); \
tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \
tmp = _mm256_div_ps(_mm256_set1_ps(2.0f), tmp); \
tmp = _mm256_sub_ps(tmp, _mm256_set1_ps(1.0f))
#define INTRI8_FLOAT(isa) \
template <> \
void VTanhKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \
const { \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_VTANH(tmp); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VTanhKernelImpl<float, isa, kEQ16>::Compute(const float* x, float* y) \
const { \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
INTRI_VTANH(tmp0); \
INTRI_VTANH(tmp1); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
}
#define INTRI_GT8LT16_FLOAT(isa) \
template <> \
VTanhKernelImpl<float, isa, kGT8LT16>::VTanhKernelImpl(int d) \
: VTanhKernel<float>() { \
this->num_ = d; \
this->end_ = AVX_FLOAT_BLOCK; \
this->rest_ = d - this->end_; \
vscal_ = \
KernelPool::Instance().template Get<VScalKernel<float>>(this->rest_); \
vsigmoid_ = KernelPool::Instance().template Get<VSigmoidKernel<float>>( \
this->rest_); \
vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<float>>( \
this->rest_); \
} \
template <> \
void VTanhKernelImpl<float, isa, kGT8LT16>::Compute(const float* x, \
float* y) const { \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_VTANH(tmp); \
_mm256_storeu_ps(y, tmp); \
x += AVX_FLOAT_BLOCK; \
y += AVX_FLOAT_BLOCK; \
vscal_->Compute(2.f, x, y); \
vsigmoid_->Compute(y, y); \
vscal_->Compute(2.f, y); \
vaddbias_->Compute(-1.f, y, y); \
}
#define INTRI_GT16_FLOAT(isa) \
template <> \
VTanhKernelImpl<float, isa, kGT16>::VTanhKernelImpl(int d) \
: VTanhKernel<float>() { \
this->num_ = d; \
this->rest_ = d % AVX_FLOAT_BLOCK; \
this->end_ = d - this->rest_; \
vscal_ = \
KernelPool::Instance().template Get<VScalKernel<float>>(this->rest_); \
vsigmoid_ = KernelPool::Instance().template Get<VSigmoidKernel<float>>( \
this->rest_); \
vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<float>>( \
this->rest_); \
} \
template <> \
void VTanhKernelImpl<float, isa, kGT16>::Compute(const float* x, float* y) \
const { \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
INTRI_VTANH(tmp); \
_mm256_storeu_ps(y + i, tmp); \
} \
x += this->end_; \
y += this->end_; \
vscal_->Compute(2.f, x, y); \
vsigmoid_->Compute(y, y); \
vscal_->Compute(2.f, y); \
vaddbias_->Compute(-1.f, y, y); \
}
#ifdef __AVX__
INTRI8_FLOAT(jit::avx);
INTRI16_FLOAT(jit::avx);
INTRI_GT8LT16_FLOAT(jit::avx);
INTRI_GT16_FLOAT(jit::avx);
#endif
#ifdef __AVX2__
INTRI8_FLOAT(jit::avx2);
INTRI16_FLOAT(jit::avx2);
// maybe use avx at gt8lt16 and gt16
#endif
#ifdef __AVX512F__
INTRI8_FLOAT(jit::avx512f);
INTRI16_FLOAT(jit::avx512f);
// maybe use avx at gt8lt16 and gt16
#endif
#undef INTRI8_FLOAT
#undef INTRI16_FLOAT
#undef INTRI_GT8LT16_FLOAT
#undef INTRI_GT16_FLOAT
#undef INTRI_VTANH
REGISTER_JITKERNEL(vtanh, VTanhKernel);
#undef JITKERNEL_NEW_ACT_IMPL
} // namespace jitkernel
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/jit_kernel.h"
#include <string>
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace paddle {
namespace operators {
namespace math {
#ifdef __AVX__
namespace detail {
__m256 Exp(__m256 a);
} // namespace detail
#endif
namespace jitkernel {
namespace jit = platform::jit;
#ifdef __AVX__
typedef enum { kSigmoid, kRelu, kTanh, kIdentity } act_type;
class AVXAct {
public:
virtual ~AVXAct() = default;
virtual __m256 Compute(__m256 x) const = 0;
};
template <act_type type>
class AVXActImpl : public AVXAct {
public:
__m256 Compute(__m256 x) const override { PADDLE_THROW("Unkown type!"); }
};
template <>
__m256 AVXActImpl<kSigmoid>::Compute(__m256 x) const {
__m256 ones = _mm256_set1_ps(1.0f);
x = _mm256_max_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MIN));
x = _mm256_min_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MAX));
x = _mm256_sub_ps(_mm256_set1_ps(0.0f), x);
x = detail::Exp(x);
x = _mm256_add_ps(ones, x);
return _mm256_div_ps(ones, x);
}
template <>
__m256 AVXActImpl<kTanh>::Compute(__m256 x) const {
__m256 ones = _mm256_set1_ps(1.0f);
x = _mm256_mul_ps(_mm256_set1_ps(-2.0f), x);
x = _mm256_min_ps(x, _mm256_set1_ps(EXP_MAX_INPUT));
x = detail::Exp(x);
x = _mm256_add_ps(ones, x);
x = _mm256_div_ps(_mm256_set1_ps(2.0f), x);
return _mm256_sub_ps(x, ones);
}
template <>
__m256 AVXActImpl<kRelu>::Compute(__m256 x) const {
return _mm256_max_ps(x, _mm256_setzero_ps());
}
template <>
__m256 AVXActImpl<kIdentity>::Compute(__m256 x) const {
return x;
}
#endif
template <typename T>
static std::shared_ptr<const VActKernel<T>> GetActKernel(
const std::string& type, int n) {
if (type == "sigmoid") {
return std::dynamic_pointer_cast<const VActKernel<T>>(
KernelPool::Instance().template Get<VSigmoidKernel<T>>(n));
} else if (type == "relu") {
return std::dynamic_pointer_cast<const VActKernel<T>>(
KernelPool::Instance().template Get<VReluKernel<T>>(n));
} else if (type == "tanh") {
return std::dynamic_pointer_cast<const VActKernel<T>>(
KernelPool::Instance().template Get<VTanhKernel<T>>(n));
} else if (type == "identity" || type == "") {
return std::dynamic_pointer_cast<const VActKernel<T>>(
KernelPool::Instance().template Get<VIdentityKernel<T>>(n));
}
PADDLE_THROW("Not support type: %s", type);
return nullptr;
}
/* LSTM JitKernel */
template <typename T, jit::cpu_isa_t isa, jit_block>
class LSTMKernelImpl : public LSTMKernel<T> {
public:
explicit LSTMKernelImpl(const std::string& act_gate,
const std::string& act_cand,
const std::string& act_cell, int d)
: LSTMKernel<T>() {
d_ = d;
d2_ = d * 2;
d3_ = d * 3;
act_gate_d3_ = GetActKernel<T>(act_gate, d3_);
act_gate_d_ = GetActKernel<T>(act_gate, d);
act_cand_d_ = GetActKernel<T>(act_cand, d);
act_cell_d_ = GetActKernel<T>(act_cell, d);
vmul_d_ = KernelPool::Instance().template Get<VMulKernel<T>>(d);
vadd_d_ = KernelPool::Instance().template Get<VAddKernel<T>>(d);
#ifdef __AVX__
auto GetAVXAct = [&](const std::string& type) -> std::unique_ptr<AVXAct> {
if (type == "sigmoid") {
return std::unique_ptr<AVXAct>(new AVXActImpl<kSigmoid>());
} else if (type == "relu") {
return std::unique_ptr<AVXAct>(new AVXActImpl<kRelu>());
} else if (type == "tanh") {
return std::unique_ptr<AVXAct>(new AVXActImpl<kTanh>());
} else if (type == "identity" || type == "") {
return std::unique_ptr<AVXAct>(new AVXActImpl<kIdentity>());
}
PADDLE_THROW("Not support type: %s", type);
};
avx_act_gate_ = GetAVXAct(act_gate);
avx_act_cand_ = GetAVXAct(act_cand);
avx_act_cell_ = GetAVXAct(act_cell);
#endif
}
void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data,
T* checked) const override {
// gates: W_ch, W_ih, W_fh, W_oh
act_gate_d3_->Compute(gates + d_, gates + d_);
/* C_t = C_t-1 * fgated + cand_gated * igated */
act_cand_d_->Compute(gates, gates);
vmul_d_->Compute(gates, gates + d_, gates + d_);
vmul_d_->Compute(ct_1, gates + d2_, gates + d2_);
vadd_d_->Compute(gates + d_, gates + d2_, ct);
/* H_t = act_cell(C_t) * ogated */
act_cell_d_->Compute(ct, gates + d2_);
vmul_d_->Compute(gates + d2_, gates + d3_, ht);
}
void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override {
/* C_t = igated * cgated*/
act_gate_d_->Compute(gates + d_, gates + d_);
act_cand_d_->Compute(gates, gates);
vmul_d_->Compute(gates, gates + d_, ct);
/* H_t = act_cell(C_t) * ogated */
act_gate_d_->Compute(gates + d3_, gates + d3_);
act_cell_d_->Compute(ct, gates + d2_);
vmul_d_->Compute(gates + d2_, gates + d3_, ht);
}
private:
int d_, d2_, d3_;
std::shared_ptr<const VActKernel<T>> act_gate_d3_, act_gate_d_, act_cand_d_,
act_cell_d_;
std::shared_ptr<const VMulKernel<T>> vmul_d_;
std::shared_ptr<const VAddKernel<T>> vadd_d_;
#ifdef __AVX__
std::unique_ptr<const AVXAct> avx_act_gate_, avx_act_cand_, avx_act_cell_;
#endif
};
#define INTRI8_FLOAT(isa) \
template <> \
void LSTMKernelImpl<float, isa, kEQ8>::ComputeCtHt( \
float* gates, const float* ct_1, float* ct, float* ht, \
const float* wp_data, float* checked) const { \
/* gates: W_ch, W_ih, W_fh, W_oh */ \
__m256 c, i, f, o; \
c = _mm256_loadu_ps(gates); \
i = _mm256_loadu_ps(gates + 8); \
f = _mm256_loadu_ps(gates + 16); \
o = _mm256_loadu_ps(gates + 24); \
/* C_t = C_t-1 * fgated + cand_gated * igated*/ \
c = _mm256_mul_ps(avx_act_cand_->Compute(c), avx_act_gate_->Compute(i)); \
i = _mm256_loadu_ps(ct_1); \
f = _mm256_mul_ps(i, avx_act_gate_->Compute(f)); \
f = _mm256_add_ps(c, f); \
_mm256_storeu_ps(ct, f); \
/* H_t = act_cell(C_t) * ogated */ \
o = _mm256_mul_ps(avx_act_cell_->Compute(f), avx_act_gate_->Compute(o)); \
_mm256_storeu_ps(ht, o); \
}
// TODO(TJ): optimize keq16
#ifdef __AVX__
INTRI8_FLOAT(jit::avx);
#endif
#ifdef __AVX2__
INTRI8_FLOAT(jit::avx2);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT(jit::avx512f);
#endif
/* Peephole JitKernel */
template <typename T, jit::cpu_isa_t isa, jit_block>
class PeepholeKernelImpl : public LSTMKernel<T> {
public:
explicit PeepholeKernelImpl(const std::string& act_gate,
const std::string& act_cand,
const std::string& act_cell, int d)
: LSTMKernel<T>() {
d_ = d;
d2_ = d * 2;
d3_ = d * 3;
act_gate_d_ = GetActKernel<T>(act_gate, d);
act_cand_d_ = GetActKernel<T>(act_cand, d);
act_cell_d_ = GetActKernel<T>(act_cell, d);
vmul_d_ = KernelPool::Instance().template Get<VMulKernel<T>>(d);
vadd_d_ = KernelPool::Instance().template Get<VAddKernel<T>>(d);
vadd_d2_ = KernelPool::Instance().template Get<VAddKernel<T>>(d2_);
act_gate_d2_ = GetActKernel<T>(act_gate, d2_);
}
void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data,
T* checked) const override {
/* get fgated and igated*/
vmul_d_->Compute(wp_data, ct_1, checked);
vmul_d_->Compute(wp_data + d_, ct_1, checked + d_);
vadd_d2_->Compute(checked, gates + d_, gates + d_);
act_gate_d2_->Compute(gates + d_, gates + d_);
/* C_t = C_t-1 * fgated + cand_gated * igated*/
act_cand_d_->Compute(gates, gates);
vmul_d_->Compute(gates, gates + d_, gates + d_);
vmul_d_->Compute(ct_1, gates + d2_, gates + d2_);
vadd_d_->Compute(gates + d_, gates + d2_, ct);
/* get ogated*/
vmul_d_->Compute(wp_data + d2_, ct, gates + d_);
vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_);
act_gate_d_->Compute(gates + d3_, gates + d3_);
/* H_t = act_cell(C_t) * ogated */
act_cell_d_->Compute(ct, gates + d2_);
vmul_d_->Compute(gates + d2_, gates + d3_, ht);
}
void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override {
/* C_t = igated * cgated*/
act_gate_d_->Compute(gates + d_, gates + d_);
act_cand_d_->Compute(gates, gates);
vmul_d_->Compute(gates, gates + d_, ct);
/* get outgated, put W_oc * C_t on igated */
vmul_d_->Compute(wp_data + d2_, ct, gates + d_);
vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_);
/* H_t = act_cell(C_t) * ogated */
act_gate_d_->Compute(gates + d3_, gates + d3_);
act_cell_d_->Compute(ct, gates + d2_);
vmul_d_->Compute(gates + d2_, gates + d3_, ht);
}
private:
int d_, d2_, d3_;
std::shared_ptr<const VActKernel<T>> act_gate_d2_, act_gate_d_, act_cand_d_,
act_cell_d_;
std::shared_ptr<const VMulKernel<T>> vmul_d_;
std::shared_ptr<const VAddKernel<T>> vadd_d_, vadd_d2_;
};
#define JITKERNEL_DECLARE_LSTM(ker_class, ker_dtype) \
template <> \
std::shared_ptr<const LSTMKernel<ker_dtype>> \
KernelPool::Get<LSTMKernel<ker_dtype>, const std::string&, \
const std::string&, const std::string&, int, bool>( \
const std::string& act_gate, const std::string& act_cand, \
const std::string& act_cell, int d, bool use_peephole)
#define JITKERNEL_KEY_LSTM(ker_key, dtype_key) \
#ker_key #dtype_key + std::to_string(d) + act_gate + act_cand + act_cell + \
(use_peephole ? "p" : "n")
#define JITKERNEL_NEW_LSTM_IMPL(ker, dtype, isa, k) \
if (use_peephole) { \
p = std::dynamic_pointer_cast<ker<dtype>>( \
std::make_shared<PeepholeKernelImpl<dtype, isa, k>>( \
act_gate, act_cand, act_cell, d)); \
} else { \
p = std::dynamic_pointer_cast<ker<dtype>>( \
std::make_shared<ker##Impl<dtype, isa, k>>(act_gate, act_cand, \
act_cell, d)); \
}
REGISTER_JITKERNEL_ARGS(lstm, LSTMKernel, JITKERNEL_DECLARE_LSTM,
JITKERNEL_KEY_LSTM, JITKERNEL_NEW_LSTM_IMPL);
#undef INTRI8_FLOAT
#undef JITKERNEL_DECLARE_LSTM
#undef JITKERNEL_KEY_LSTM
#undef JITKERNEL_NEW_LSTM_IMPL
} // namespace jitkernel
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
namespace math {
namespace jitkernel {
namespace jit = platform::jit;
#define SEARCH_BLOCK(macro_, ker, dtype, isa) \
if (d < AVX_FLOAT_BLOCK) { \
macro_(ker, dtype, isa, kLT8); \
} else if (d == AVX_FLOAT_BLOCK) { \
macro_(ker, dtype, isa, kEQ8); \
} else if (d > AVX_FLOAT_BLOCK && d < AVX512_FLOAT_BLOCK) { \
macro_(ker, dtype, isa, kGT8LT16); \
} else if (d == AVX512_FLOAT_BLOCK) { \
macro_(ker, dtype, isa, kEQ16); \
} else { \
macro_(ker, dtype, isa, kGT16); \
}
#define SEARCH_ISA_BLOCK(macro_, ker, dtype) \
if (jit::MayIUse(jit::avx512f)) { \
SEARCH_BLOCK(macro_, ker, dtype, jit::avx512f); \
} else if (jit::MayIUse(jit::avx2)) { \
SEARCH_BLOCK(macro_, ker, dtype, jit::avx2); \
} else if (jit::MayIUse(jit::avx)) { \
SEARCH_BLOCK(macro_, ker, dtype, jit::avx); \
} else { \
SEARCH_BLOCK(macro_, ker, dtype, jit::isa_any); \
}
#define JITKERNEL_DECLARE(ker_class, ker_dtype) \
template <> \
std::shared_ptr<const ker_class<ker_dtype>> \
KernelPool::Get<ker_class<ker_dtype>, int>(int d)
#define JITKERNEL_KEY(ker_key, dtype_key) \
#ker_key #dtype_key + std::to_string(d)
#define JITKERNEL_NEW_IMPL(ker, dtype, isa, k) \
p = std::dynamic_pointer_cast<ker<dtype>>( \
std::make_shared<ker##Impl<dtype, isa, k>>(d))
#define JITKERNEL_WITH_DTYPE(ker_key, ker_class, ker_dtype, dtype_key, \
marco_declare, macro_key, macro_impl) \
marco_declare(ker_class, ker_dtype) { \
std::string key = macro_key(ker_key, dtype_key); \
if (kers_.find(key) == kers_.end()) { \
std::shared_ptr<ker_class<ker_dtype>> p; \
SEARCH_ISA_BLOCK(macro_impl, ker_class, ker_dtype); \
kers_.insert({key, std::dynamic_pointer_cast<Kernel>(p)}); \
return p; \
} \
return std::dynamic_pointer_cast<const ker_class<ker_dtype>>( \
kers_.at(key)); \
}
#define REGISTER_JITKERNEL(ker_key, ker_class) \
JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, JITKERNEL_DECLARE, \
JITKERNEL_KEY, JITKERNEL_NEW_IMPL); \
JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, JITKERNEL_DECLARE, \
JITKERNEL_KEY, JITKERNEL_NEW_IMPL)
#define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_declare, macro_key, \
macro_impl) \
JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, marco_declare, macro_key, \
macro_impl); \
JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, marco_declare, \
macro_key, macro_impl)
#define FOR_EACH_ISA(macro_, block) \
macro_(jit::avx512f, block); \
macro_(jit::avx2, block); \
macro_(jit::avx, block); \
macro_(jit::isa_any, block)
#define FOR_EACH_BLOCK(macro_, isa) \
macro_(isa, kLT8); \
macro_(isa, kEQ8); \
macro_(isa, kGT8LT16); \
macro_(isa, kEQ16); \
macro_(isa, kGT16)
#define FOR_EACH_ISA_BLOCK(macro_) \
FOR_EACH_BLOCK(macro_, jit::avx512f); \
FOR_EACH_BLOCK(macro_, jit::avx2); \
FOR_EACH_BLOCK(macro_, jit::avx); \
FOR_EACH_BLOCK(macro_, jit::isa_any)
} // namespace jitkernel
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/jit_kernel.h"
#include <sys/time.h>
#include <cmath> // for exp
#include <cstring> // for memcpy
#include <random>
#include <string>
#include <vector>
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#ifdef __AVX__
#include <immintrin.h>
#endif
constexpr int repeat = 20000;
inline double GetCurrentUS() {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
}
template <typename T>
void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
const T upper = static_cast<T>(20.f)) {
static unsigned int seed = 100;
std::mt19937 rng(seed++);
std::uniform_real_distribution<double> uniform_dist(0, 1);
for (int i = 0; i < n; ++i) {
a[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
}
}
void vrelu_ref(const int n, const float* x, float* y) {
for (int i = 0; i < n; ++i) {
y[i] = x[i] > 0.f ? x[i] : 0.f;
}
}
#if defined __AVX__ || defined __AVX2__
void vrelu_intri8(const int n, const float* x, float* y) {
__m256 tmp = _mm256_loadu_ps(x);
tmp = _mm256_max_ps(tmp, _mm256_setzero_ps());
_mm256_storeu_ps(y, tmp);
}
#endif
TEST(JitKernel, vrelu) {
namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 256, 512}) {
std::vector<float> x(d);
std::vector<float> zref(d), ztgt(d);
RandomVec<float>(d, x.data(), -10.f, 1.f);
const auto& ker =
jit::KernelPool::Instance().template Get<jit::VReluKernel<float>>(d);
const float* x_data = x.data();
float* ztgt_data = ztgt.data();
float* zref_data = zref.data();
auto trefs = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
vrelu_ref(d, x_data, zref_data);
}
auto trefe = GetCurrentUS();
#if defined __AVX__ || defined __AVX2__
if (d == 8) {
auto si0 = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
vrelu_intri8(d, x_data, zref_data);
}
auto si1 = GetCurrentUS();
VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
}
#endif
auto ttgts = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
ker->Compute(x_data, ztgt_data);
}
auto ttgte = GetCurrentUS();
VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
<< " us, tgt takes: " << (ttgte - ttgts) / repeat;
for (int i = 0; i < d; ++i) {
EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
}
}
}
void vaddbias_ref(const int n, const float a, const float* x, float* y) {
for (int i = 0; i < n; ++i) {
y[i] = x[i] + a;
}
}
TEST(JitKernel, vaddbias) {
namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 64, 100, 128, 256}) {
std::vector<float> x(d);
std::vector<float> zref(d), ztgt(d);
RandomVec<float>(d, x.data(), -2.f, 2.f);
const auto& ker =
jit::KernelPool::Instance().template Get<jit::VAddBiasKernel<float>>(d);
const float a = 2.f;
const float* x_data = x.data();
float* ztgt_data = ztgt.data();
float* zref_data = zref.data();
auto trefs = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
vaddbias_ref(d, a, x_data, zref_data);
}
auto trefe = GetCurrentUS();
auto ttgts = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
ker->Compute(a, x_data, ztgt_data);
}
auto ttgte = GetCurrentUS();
VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
<< " us, tgt takes: " << (ttgte - ttgts) / repeat;
for (int i = 0; i < d; ++i) {
EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
}
}
}
void vexp_ref(const int n, const float* x, float* y) {
for (int i = 0; i < n; ++i) {
y[i] = std::exp(x[i]);
}
}
#ifdef PADDLE_WITH_MKLML
void vexp_mkl(const int n, const float* x, float* y) {
paddle::platform::dynload::vsExp(n, x, y);
}
#endif
TEST(JitKernel, vexp) {
namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 128, 256}) {
std::vector<float> x(d);
std::vector<float> zref(d), ztgt(d);
RandomVec<float>(d, x.data(), -2.f, 2.f);
const auto& ker =
jit::KernelPool::Instance().template Get<jit::VExpKernel<float>>(d);
const float* x_data = x.data();
float* ztgt_data = ztgt.data();
float* zref_data = zref.data();
auto trefs = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
vexp_ref(d, x_data, zref_data);
}
auto trefe = GetCurrentUS();
#ifdef PADDLE_WITH_MKLML
auto tmkls = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
vexp_mkl(d, x_data, zref_data);
}
auto tmkle = GetCurrentUS();
#endif
auto ttgts = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
ker->Compute(x_data, ztgt_data);
}
auto ttgte = GetCurrentUS();
VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
#ifdef PADDLE_WITH_MKLML
<< " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
#else
<< " us, "
#endif
<< "tgt takes: " << (ttgte - ttgts) / repeat;
for (int i = 0; i < d; ++i) {
EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
}
}
}
inline float _sigmoid(float x) {
const float min = SIGMOID_THRESHOLD_MIN;
const float max = SIGMOID_THRESHOLD_MAX;
float tmp = (x < min) ? min : ((x > max) ? max : x);
return 1.f / (1.f + std::exp(-tmp));
}
void vsigmoid_ref(const int n, const float* x, float* y) {
for (int i = 0; i < n; ++i) {
y[i] = _sigmoid(x[i]);
}
}
void vsigmoid_better(
const std::shared_ptr<
const paddle::operators::math::jitkernel::VExpKernel<float>>& vexp,
const int n, const float* x, float* y) {
const float min = SIGMOID_THRESHOLD_MIN;
const float max = SIGMOID_THRESHOLD_MAX;
for (int i = 0; i < n; ++i) {
y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
y[i] = 0.f - y[i];
}
vexp->Compute(y, y);
for (int i = 0; i < n; ++i) {
y[i] = 1.f / (1.f + y[i]);
}
}
TEST(JitKernel, vsigmoid) {
namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) {
std::vector<float> x(d);
std::vector<float> zref(d), ztgt(d);
RandomVec<float>(d, x.data(), -2.f, 2.f);
const auto& ker =
jit::KernelPool::Instance().template Get<jit::VSigmoidKernel<float>>(d);
const auto& vexp =
jit::KernelPool::Instance().template Get<jit::VExpKernel<float>>(d);
const float* x_data = x.data();
float* ztgt_data = ztgt.data();
float* zref_data = zref.data();
auto tmkls = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
vsigmoid_better(vexp, d, x_data, zref_data);
}
auto tmkle = GetCurrentUS();
auto trefs = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
vsigmoid_ref(d, x_data, zref_data);
}
auto trefe = GetCurrentUS();
auto ttgts = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
ker->Compute(x_data, ztgt_data);
}
auto ttgte = GetCurrentUS();
VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
<< " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat
<< " us, tgt takes: " << (ttgte - ttgts) / repeat;
for (int i = 0; i < d; ++i) {
EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
}
}
}
inline float _tanh(float x) { return 2.f * _sigmoid(2.f * x) - 1.f; }
void vtanh_ref(const int n, const float* x, float* y) {
for (int i = 0; i < n; ++i) {
y[i] = _tanh(x[i]);
}
}
void vtanh_better(
const std::shared_ptr<
const paddle::operators::math::jitkernel::VScalKernel<float>>& vscal,
const std::shared_ptr<
const paddle::operators::math::jitkernel::VSigmoidKernel<float>>&
vsigmoid,
const std::shared_ptr<
const paddle::operators::math::jitkernel::VAddBiasKernel<float>>&
vaddbias,
const int n, const float* x, float* y) {
vscal->Compute(2.f, x, y);
vsigmoid->Compute(y, y);
vscal->Compute(2.f, y);
vaddbias->Compute(-1.f, y, y);
}
TEST(JitKernel, vtanh) {
namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) {
std::vector<float> x(d);
std::vector<float> zref(d), ztgt(d);
RandomVec<float>(d, x.data(), -2.f, 2.f);
const auto& ker =
jit::KernelPool::Instance().template Get<jit::VTanhKernel<float>>(d);
const auto& vscal =
jit::KernelPool::Instance().template Get<jit::VScalKernel<float>>(d);
const auto& vsigmoid =
jit::KernelPool::Instance().template Get<jit::VSigmoidKernel<float>>(d);
const auto& vaddbias =
jit::KernelPool::Instance().template Get<jit::VAddBiasKernel<float>>(d);
const float* x_data = x.data();
float* ztgt_data = ztgt.data();
float* zref_data = zref.data();
auto tmkls = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
vtanh_better(vscal, vsigmoid, vaddbias, d, x_data, zref_data);
}
auto tmkle = GetCurrentUS();
auto trefs = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
vtanh_ref(d, x_data, zref_data);
}
auto trefe = GetCurrentUS();
auto ttgts = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
ker->Compute(x_data, ztgt_data);
}
auto ttgte = GetCurrentUS();
VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
<< " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat
<< " us, tgt takes: " << (ttgte - ttgts) / repeat;
for (int i = 0; i < d; ++i) {
EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
}
}
}
void lstm_ctht_ref(
const std::shared_ptr<
const paddle::operators::math::jitkernel::VSigmoidKernel<float>>&
vsigmoid_3d,
const std::shared_ptr<
const paddle::operators::math::jitkernel::VTanhKernel<float>>& vtanh_d,
const std::shared_ptr<
const paddle::operators::math::jitkernel::VExpKernel<float>>& vexp_1,
const int d, float* gates, const float* ct_1, float* ct, float* ht) {
vsigmoid_3d->Compute(gates + d, gates + d);
vtanh_d->Compute(gates, gates);
const float *i = gates + d, *f = gates + d * 2, *o = gates + d * 3;
const float min = SIGMOID_THRESHOLD_MIN;
const float max = SIGMOID_THRESHOLD_MAX;
for (int k = 0; k < d; ++k) {
// C_t = C_t-1 * fgated + cand_gated * igated
ct[k] = ct_1[k] * f[k] + gates[k] * i[k];
// H_t = act_cell(C_t) * ogated
float tmp = ct[k] * 2;
tmp = 0.f - ((tmp < min) ? min : ((tmp > max) ? max : tmp));
vexp_1->Compute(&tmp, &tmp);
tmp = 2.f / (1.f + tmp) - 1.f;
ht[k] = tmp * o[k];
}
}
void lstm_ctht_better(
const std::shared_ptr<
const paddle::operators::math::jitkernel::VSigmoidKernel<float>>&
vsigmoid_3d,
const std::shared_ptr<
const paddle::operators::math::jitkernel::VTanhKernel<float>>& vtanh_d,
const std::shared_ptr<
const paddle::operators::math::jitkernel::VMulKernel<float>>& vmul_d,
const std::shared_ptr<
const paddle::operators::math::jitkernel::VAddKernel<float>>& vadd_d,
const int d, float* gates, const float* ct_1, float* ct, float* ht) {
int d2 = d * 2;
vsigmoid_3d->Compute(gates + d, gates + d);
vtanh_d->Compute(gates, gates);
vmul_d->Compute(gates, gates + d, gates + d);
vmul_d->Compute(ct_1, gates + d2, gates + d2);
vadd_d->Compute(gates + d, gates + d2, ct);
/* H_t = act_cell(C_t) * ogated */
vtanh_d->Compute(ct, gates + d2);
vmul_d->Compute(gates + d2, gates + d * 3, ht);
}
TEST(JitKernel, lstm) {
namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 32, 64, 100}) {
int d4 = d * 4;
int d3 = d * 3;
std::vector<float> x(d4), xref(d4);
std::vector<float> ct_1(d), ct_tgt(d), ht_tgt(d);
std::vector<float> ct_ref(d), ht_ref(d);
RandomVec<float>(d4, x.data(), -2.f, 2.f);
RandomVec<float>(d, ct_1.data(), -2.f, 2.f);
memcpy(xref.data(), x.data(), sizeof(float) * d4);
std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh";
const auto& ker =
jit::KernelPool::Instance()
.template Get<jit::LSTMKernel<float>, const std::string&,
const std::string&, const std::string&>(
act_gate, act_cand, act_cell, d, false);
// below kernels are used to compute refer
const auto& vsigmoid_3d =
jit::KernelPool::Instance().template Get<jit::VSigmoidKernel<float>>(
d3);
const auto& vtanh_d =
jit::KernelPool::Instance().template Get<jit::VTanhKernel<float>>(d);
const auto& vexp_1 =
jit::KernelPool::Instance().template Get<jit::VExpKernel<float>>(1);
const auto& vmul_d =
jit::KernelPool::Instance().template Get<jit::VMulKernel<float>>(d);
const auto& vadd_d =
jit::KernelPool::Instance().template Get<jit::VAddKernel<float>>(d);
float* x_data = x.data();
float* xref_data = xref.data();
const float* ct_1_data = ct_1.data();
float* ct_tgt_data = ct_tgt.data();
float* ht_tgt_data = ht_tgt.data();
float* ct_ref_data = ct_ref.data();
float* ht_ref_data = ht_ref.data();
// compute once to check correctness
lstm_ctht_ref(vsigmoid_3d, vtanh_d, vexp_1, d, xref_data, ct_1_data,
ct_ref_data, ht_ref_data);
ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data);
for (int i = 0; i < d; ++i) {
EXPECT_NEAR(ct_tgt_data[i], ct_ref_data[i], 1e-3);
EXPECT_NEAR(ht_tgt_data[i], ht_ref_data[i], 1e-3);
}
auto tmkls = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
lstm_ctht_better(vsigmoid_3d, vtanh_d, vmul_d, vadd_d, d, xref_data,
ct_1_data, ct_ref_data, ht_ref_data);
}
auto tmkle = GetCurrentUS();
auto trefs = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
lstm_ctht_ref(vsigmoid_3d, vtanh_d, vexp_1, d, xref_data, ct_1_data,
ct_ref_data, ht_ref_data);
}
auto trefe = GetCurrentUS();
auto ttgts = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data);
}
auto ttgte = GetCurrentUS();
VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
<< " us, better(jit) takes: " << (tmkle - tmkls) / repeat
<< " us, tgt takes: " << (ttgte - ttgts) / repeat;
}
}
void vscal_ref(const int n, const float a, const float* x, float* y) {
for (int i = 0; i < n; ++i) {
y[i] = a * x[i];
}
}
void vscal_inp_ref(const int n, const float a, float* x) {
for (int i = 0; i < n; ++i) {
x[i] = a * x[i];
}
}
#if defined __AVX__ || defined __AVX2__
void vscal_intri8(const int n, const float a, const float* x, float* y) {
__m256 tmp;
__m256 scalar = _mm256_set1_ps(a);
tmp = _mm256_loadu_ps(x);
tmp = _mm256_mul_ps(tmp, scalar);
_mm256_storeu_ps(y, tmp);
}
void vscal_inp_intri8(const int n, const float a, float* x) {
__m256 tmp;
__m256 scalar = _mm256_set1_ps(a);
tmp = _mm256_loadu_ps(x);
tmp = _mm256_mul_ps(tmp, scalar);
_mm256_storeu_ps(x, tmp);
}
#endif
#ifdef PADDLE_WITH_MKLML
void vscal_inp_mkl(const int n, const float a, float* x) {
paddle::platform::dynload::cblas_sscal(n, a, x, 1);
}
#endif
TEST(JitKernel, vscal) {
namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 256, 512}) {
std::vector<float> x(d), y(d);
std::vector<float> zref(d), ztgt(d);
RandomVec<float>(d, x.data());
std::memcpy(y.data(), x.data(), sizeof(float) * d);
float a = 2.f;
const auto& ker =
jit::KernelPool::Instance().template Get<jit::VScalKernel<float>>(d);
const float* x_data = x.data();
float* y_data = y.data();
float* ztgt_data = ztgt.data();
float* zref_data = zref.data();
auto trefs = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
vscal_ref(d, a, x_data, zref_data);
}
auto trefe = GetCurrentUS();
auto trefs1 = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
vscal_inp_ref(d, a, y_data);
}
auto trefe1 = GetCurrentUS();
#ifdef PADDLE_WITH_MKLML
auto tmkls = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
vscal_inp_mkl(d, a, y_data);
}
auto tmkle = GetCurrentUS();
#endif
#if defined __AVX__ || defined __AVX2__
if (d == 8) {
auto si0 = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
vscal_intri8(d, a, x_data, zref_data);
}
auto si1 = GetCurrentUS();
auto si2 = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
vscal_inp_intri8(d, a, y_data);
}
auto si3 = GetCurrentUS();
VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat
<< " us, inplace: " << (si3 - si2) / repeat;
}
#endif
auto ttgts = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
ker->Compute(a, x_data, ztgt_data);
}
auto ttgte = GetCurrentUS();
auto ttgts1 = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
ker->Compute(a, y_data);
}
auto ttgte1 = GetCurrentUS();
VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
<< " us, inplace takes: " << (trefe1 - trefs1) / repeat
#ifdef PADDLE_WITH_MKLML
<< " us, mkl inplace takes: " << (tmkle - tmkls) / repeat << " us, "
#else
<< " us, "
#endif
<< "tgt takes: " << (ttgte - ttgts) / repeat
<< "us, tgt inplace takes: " << (ttgte1 - ttgts1) / repeat;
for (int i = 0; i < d; ++i) {
EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
}
}
}
void vmul_ref(const int n, const float* x, const float* y, float* z) {
for (int i = 0; i < n; ++i) {
z[i] = x[i] * y[i];
}
}
#if defined __AVX__ || defined __AVX2__
void vmul_intri8(const int n, const float* x, const float* y, float* z) {
__m256 tmpx, tmpy;
tmpx = _mm256_loadu_ps(x);
tmpy = _mm256_loadu_ps(y);
tmpx = _mm256_mul_ps(tmpx, tmpy);
_mm256_storeu_ps(z, tmpx);
}
#endif
#ifdef PADDLE_WITH_MKLML
void vmul_mkl(const int n, const float* x, const float* y, float* z) {
paddle::platform::dynload::vsMul(n, x, y, z);
}
#endif
TEST(JitKernel, vmul) {
namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 256, 512}) {
std::vector<float> x(d), y(d);
std::vector<float> zref(d), ztgt(d);
RandomVec<float>(d, x.data());
RandomVec<float>(d, y.data());
const auto& ker =
jit::KernelPool::Instance().template Get<jit::VMulKernel<float>>(d);
const float* x_data = x.data();
const float* y_data = y.data();
float* ztgt_data = ztgt.data();
float* zref_data = zref.data();
auto trefs = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
vmul_ref(d, x_data, y_data, zref_data);
}
auto trefe = GetCurrentUS();
#ifdef PADDLE_WITH_MKLML
auto tmkls = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
vmul_mkl(d, x_data, y_data, zref_data);
}
auto tmkle = GetCurrentUS();
#endif
#if defined __AVX__ || defined __AVX2__
if (d == 8) {
auto si0 = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
vmul_intri8(d, x_data, y_data, zref_data);
}
auto si1 = GetCurrentUS();
VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
}
#endif
auto ttgts = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
ker->Compute(x_data, y_data, ztgt_data);
}
auto ttgte = GetCurrentUS();
VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
#ifdef PADDLE_WITH_MKLML
<< " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
#else
<< " us, "
#endif
<< "tgt takes: " << (ttgte - ttgts) / repeat;
for (int i = 0; i < d; ++i) {
EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
}
}
}
void vadd_ref(const int n, const float* x, const float* y, float* z) {
for (int i = 0; i < n; ++i) {
z[i] = x[i] + y[i];
}
}
#if defined __AVX__ || defined __AVX2__
void vadd_intri8(const int n, const float* x, const float* y, float* z) {
__m256 tmpx, tmpy;
tmpx = _mm256_loadu_ps(x);
tmpy = _mm256_loadu_ps(y);
tmpx = _mm256_add_ps(tmpx, tmpy);
_mm256_storeu_ps(z, tmpx);
}
#endif
#ifdef PADDLE_WITH_MKLML
void vadd_mkl(const int n, const float* x, const float* y, float* z) {
paddle::platform::dynload::vsAdd(n, x, y, z);
}
#endif
TEST(JitKernel, vadd) {
namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 256, 512}) {
std::vector<float> x(d), y(d);
std::vector<float> zref(d), ztgt(d);
RandomVec<float>(d, x.data());
RandomVec<float>(d, y.data());
const auto& ker =
jit::KernelPool::Instance().template Get<jit::VAddKernel<float>>(d);
const float* x_data = x.data();
const float* y_data = y.data();
float* ztgt_data = ztgt.data();
float* zref_data = zref.data();
auto trefs = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
vadd_ref(d, x_data, y_data, zref_data);
}
auto trefe = GetCurrentUS();
#ifdef PADDLE_WITH_MKLML
auto tmkls = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
vadd_mkl(d, x_data, y_data, zref_data);
}
auto tmkle = GetCurrentUS();
#endif
#if defined __AVX__ || defined __AVX2__
if (d == 8) {
auto si0 = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
vadd_intri8(d, x_data, y_data, zref_data);
}
auto si1 = GetCurrentUS();
VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
}
#endif
auto ttgts = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
ker->Compute(x_data, y_data, ztgt_data);
}
auto ttgte = GetCurrentUS();
VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
#ifdef PADDLE_WITH_MKLML
<< " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
#else
<< " us, "
#endif
<< "tgt takes: " << (ttgte - ttgts) / repeat;
for (int i = 0; i < d; ++i) {
EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
}
}
}
TEST(JitKernel, pool) {
namespace jit = paddle::operators::math::jitkernel;
const int frame_size = 4;
std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh";
const auto& plstm1 =
jit::KernelPool::Instance()
.template Get<jit::LSTMKernel<float>, const std::string&,
const std::string&, const std::string&>(
act_gate, act_cand, act_cell, frame_size, false);
const auto& plstm2 =
jit::KernelPool::Instance()
.template Get<jit::LSTMKernel<float>, const std::string&,
const std::string&, const std::string&>(
act_gate, act_cand, act_cell, frame_size, false);
const auto& peephole =
jit::KernelPool::Instance()
.template Get<jit::LSTMKernel<float>, const std::string&,
const std::string&, const std::string&>(
act_gate, act_cand, act_cell, frame_size, true);
EXPECT_TRUE(plstm1 != peephole);
const auto& pvmul_f =
jit::KernelPool::Instance().template Get<jit::VMulKernel<float>>(4);
EXPECT_TRUE(std::dynamic_pointer_cast<const jit::Kernel>(plstm2) !=
std::dynamic_pointer_cast<const jit::Kernel>(pvmul_f));
const auto& pvmul_d =
jit::KernelPool::Instance().template Get<jit::VMulKernel<double>>(4);
EXPECT_TRUE(std::dynamic_pointer_cast<const jit::Kernel>(pvmul_f) !=
std::dynamic_pointer_cast<const jit::Kernel>(pvmul_d));
const auto& pvmul_from_key = jit::KernelPool::Instance().Get("vmulf4");
EXPECT_EQ(pvmul_f, pvmul_from_key);
const auto& pvmul_from_key2 = jit::KernelPool::Instance().Get("vmulf5");
EXPECT_TRUE(pvmul_from_key2 == nullptr);
}
...@@ -24,7 +24,7 @@ class MomentumOp : public framework::OperatorWithKernel { ...@@ -24,7 +24,7 @@ class MomentumOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
protected: protected:
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"), PADDLE_ENFORCE(ctx->HasInput("Param"),
"Input(param) of Momentum should not be null."); "Input(param) of Momentum should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grad"), PADDLE_ENFORCE(ctx->HasInput("Grad"),
...@@ -45,12 +45,15 @@ class MomentumOp : public framework::OperatorWithKernel { ...@@ -45,12 +45,15 @@ class MomentumOp : public framework::OperatorWithKernel {
"Output(VelocityOut) of Momentum should not be null."); "Output(VelocityOut) of Momentum should not be null.");
auto param_dim = ctx->GetInputDim("Param"); auto param_dim = ctx->GetInputDim("Param");
if (ctx->GetInputsVarType("Grad")[0] ==
framework::proto::VarType::LOD_TENSOR) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
param_dim, ctx->GetInputDim("Grad"), param_dim, ctx->GetInputDim("Grad"),
"Param and Grad input of MomentumOp should have the same dimension."); "Param and Grad input of MomentumOp should have the same dimension.");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
param_dim, ctx->GetInputDim("Velocity"), param_dim, ctx->GetInputDim("Velocity"),
"Param and Velocity of MomentumOp should have the same dimension."); "Param and Velocity of MomentumOp should have the same dimension.");
}
PADDLE_ENFORCE_EQ(framework::product(ctx->GetInputDim("LearningRate")), 1, PADDLE_ENFORCE_EQ(framework::product(ctx->GetInputDim("LearningRate")), 1,
"Learning_rate should be a scalar"); "Learning_rate should be a scalar");
...@@ -58,13 +61,34 @@ class MomentumOp : public framework::OperatorWithKernel { ...@@ -58,13 +61,34 @@ class MomentumOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("VelocityOut", param_dim); ctx->SetOutputDim("VelocityOut", param_dim);
} }
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext& ctx) const override {
auto input_data_type = auto input_data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
framework::ToDataType(ctx.Input<Tensor>("Param")->type());
return framework::OpKernelType(input_data_type, ctx.GetPlace()); return framework::OpKernelType(input_data_type, ctx.GetPlace());
} }
}; };
class MomentumOpInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto input_var = op_desc.Input("Param")[0];
for (auto& out_var : op_desc.Output("ParamOut")) {
if (block->FindRecursiveOrCreateVar(input_var).GetType() ==
framework::proto::VarType::SELECTED_ROWS) {
block->FindRecursiveOrCreateVar(out_var).SetType(
framework::proto::VarType::SELECTED_ROWS);
} else if (block->FindRecursiveOrCreateVar(input_var).GetType() ==
framework::proto::VarType::LOD_TENSOR) {
block->FindRecursiveOrCreateVar(out_var).SetType(
framework::proto::VarType::LOD_TENSOR);
} else {
PADDLE_THROW(
"Only support LodTensor and SelectedRows, Unexpected Input Type.");
}
}
}
};
class MomentumOpMaker : public framework::OpProtoAndCheckerMaker { class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
...@@ -115,6 +139,9 @@ $$ ...@@ -115,6 +139,9 @@ $$
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(momentum, ops::MomentumOp, ops::MomentumOpMaker); REGISTER_OPERATOR(momentum, ops::MomentumOp, ops::MomentumOpMaker,
REGISTER_OP_CPU_KERNEL(momentum, ops::MomentumOpKernel<float>, paddle::framework::EmptyGradOpMaker,
ops::MomentumOpKernel<double>); ops::MomentumOpInferVarType);
REGISTER_OP_CPU_KERNEL(
momentum, ops::MomentumOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::MomentumOpKernel<paddle::platform::CPUDeviceContext, double>);
...@@ -15,76 +15,7 @@ limitations under the License. */ ...@@ -15,76 +15,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/momentum_op.h" #include "paddle/fluid/operators/momentum_op.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void MomentumKernel(const T* p, const T* g, const T* v,
const T* learning_rate, const T mu,
const int64_t num, bool use_nesterov, T* p_out,
T* v_out) {
T lr = learning_rate[0];
if (use_nesterov) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
i += blockDim.x * gridDim.x) {
T g_val = g[i];
T v_new = v[i] * mu + g_val;
v_out[i] = v_new;
p_out[i] = p[i] - (g_val + v_new * mu) * lr;
}
} else {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
i += blockDim.x * gridDim.x) {
T v_new = v[i] * mu + g[i];
v_out[i] = v_new;
p_out[i] = p[i] - lr * v_new;
}
}
}
template <typename T>
class MomentumOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
const auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Grad").front(), grad_var->Type().name());
auto param_out = ctx.Output<framework::Tensor>("ParamOut");
auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
auto param = ctx.Input<framework::Tensor>("Param");
auto velocity = ctx.Input<framework::Tensor>("Velocity");
auto grad = ctx.Input<framework::Tensor>("Grad");
auto learning_rate = ctx.Input<framework::Tensor>("LearningRate");
T* p_out = param_out->mutable_data<T>(ctx.GetPlace());
T* v_out = velocity_out->mutable_data<T>(ctx.GetPlace());
T mu = static_cast<T>(ctx.Attr<float>("mu"));
bool use_nesterov = ctx.Attr<bool>("use_nesterov");
auto* p = param->data<T>();
auto* v = velocity->data<T>();
auto* g = grad->data<T>();
auto* lr = learning_rate->data<T>();
int block = 512;
int grid = (param->numel() + block - 1) / block;
MomentumKernel<T><<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
p, g, v, lr, mu, param->numel(), use_nesterov, p_out, v_out);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(momentum, ops::MomentumOpCUDAKernel<float>, REGISTER_OP_CUDA_KERNEL(
ops::MomentumOpCUDAKernel<double>); momentum, ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, double>);
...@@ -13,35 +13,48 @@ See the License for the specific language governing permissions and ...@@ -13,35 +13,48 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/algorithm.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T> using framework::Tensor;
class MomentumOpKernel : public framework::OpKernel<T> { using framework::SelectedRows;
public: struct NoNesterov;
void Compute(const framework::ExecutionContext& ctx) const override { struct UseNesterov;
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
auto param_out = ctx.Output<framework::Tensor>("ParamOut");
auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
auto param = ctx.Input<framework::Tensor>("Param");
auto velocity = ctx.Input<framework::Tensor>("Velocity");
auto grad = ctx.Input<framework::Tensor>("Grad");
auto learning_rate = ctx.Input<framework::Tensor>("LearningRate");
param_out->mutable_data<T>(ctx.GetPlace()); template <typename T>
velocity_out->mutable_data<T>(ctx.GetPlace()); class CPUDenseMomentumFunctor {
private:
const Tensor* param;
const Tensor* grad;
const Tensor* velocity;
const Tensor* learning_rate;
const T mu;
const T use_nesterov;
Tensor* param_out;
Tensor* velocity_out;
T mu = static_cast<T>(ctx.Attr<float>("mu")); public:
bool use_nesterov = ctx.Attr<bool>("use_nesterov"); CPUDenseMomentumFunctor(const Tensor* param, const Tensor* grad,
const Tensor* velocity, const Tensor* learning_rate,
const T mu, const bool use_nesterov,
Tensor* param_out, Tensor* velocity_out)
: param(param),
grad(grad),
velocity(velocity),
learning_rate(learning_rate),
mu(mu),
use_nesterov(use_nesterov),
param_out(param_out),
velocity_out(velocity_out) {}
inline void operator()() {
auto p_out = framework::EigenVector<T>::Flatten(*param_out); auto p_out = framework::EigenVector<T>::Flatten(*param_out);
auto v_out = framework::EigenVector<T>::Flatten(*velocity_out); auto v_out = framework::EigenVector<T>::Flatten(*velocity_out);
...@@ -59,5 +72,283 @@ class MomentumOpKernel : public framework::OpKernel<T> { ...@@ -59,5 +72,283 @@ class MomentumOpKernel : public framework::OpKernel<T> {
} }
}; };
template <typename T, typename UpdateMethod>
class DenseMomentumFunctor;
// NOTE(dzh) for performance.
// avoid if/else in inside kernel, implement GPU UseNesterov/NoNesterov as two
// functor.
template <typename T>
class DenseMomentumFunctor<T, UseNesterov> {
private:
const T* p_;
const T* g_;
const T* v_;
const T* lr_;
const T mu_;
const int64_t num_;
T* p_out_;
T* v_out_;
public:
DenseMomentumFunctor(const T* p, const T* g, const T* v,
const T* learning_rate, const T mu, const int64_t num,
T* p_out, T* v_out)
: p_(p),
g_(g),
v_(v),
lr_(learning_rate),
mu_(mu),
num_(num),
p_out_(p_out),
v_out_(v_out) {}
inline HOSTDEVICE void operator()(size_t i) const {
// put memory access in register
const T p = p_[i];
const T g = g_[i];
const T lr = lr_[0];
const T v = v_[i];
T v_out = v * mu_ + g;
T p_out = p - (g + v_out * mu_) * lr;
// write reigster to memory
v_out_[i] = v_out;
p_out_[i] = p_out;
}
};
template <typename T>
class DenseMomentumFunctor<T, NoNesterov> {
private:
const T* p_;
const T* g_;
const T* v_;
const T* lr_;
const T mu_;
const int64_t num_;
T* p_out_;
T* v_out_;
public:
DenseMomentumFunctor(const T* p, const T* g, const T* v,
const T* learning_rate, const T mu, const int64_t num,
T* p_out, T* v_out)
: p_(p),
g_(g),
v_(v),
lr_(learning_rate),
mu_(mu),
num_(num),
p_out_(p_out),
v_out_(v_out) {}
inline HOSTDEVICE void operator()(size_t i) const {
// put memory access in register
const T p = p_[i];
const T g = g_[i];
const T lr = lr_[0];
const T v = v_[i];
T v_out = v * mu_ + g;
T p_out = p - lr * v_out;
// write reigster to memory
v_out_[i] = v_out;
p_out_[i] = p_out;
}
};
template <typename T, typename UpdateMethod>
class SparseMomentumFunctor;
template <typename T>
class SparseMomentumFunctor<T, UseNesterov> {
private:
const T* p_;
const T* g_;
const T* v_;
const T* lr_;
const T mu_;
const int64_t* rows_;
const int64_t row_numel_;
const int64_t row_height_;
T* p_out_;
T* v_out_;
public:
SparseMomentumFunctor(const T* p, const T* g, const T* v, const T* lr,
const T mu, const int64_t* rows, int64_t row_numel,
int64_t row_height, T* p_out, T* v_out)
: p_(p),
g_(g),
v_(v),
lr_(lr),
mu_(mu),
rows_(rows),
row_numel_(row_numel),
row_height_(row_height),
p_out_(p_out),
v_out_(v_out) {}
inline HOSTDEVICE void operator()(size_t i) {
auto row_idx =
math::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_);
T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0;
// put memory access in register
const T p = p_[i];
const T lr = lr_[0];
const T v = v_[i];
T v_out = v * mu_ + g;
T p_out = p - (g + v_out * mu_) * lr;
// write reigster to memory
v_out_[i] = v_out;
p_out_[i] = p_out;
}
};
template <typename T>
class SparseMomentumFunctor<T, NoNesterov> {
private:
const T* p_;
const T* g_;
const T* v_;
const T* lr_;
const T mu_;
const int64_t* rows_;
const int64_t row_numel_;
const int64_t row_height_;
T* p_out_;
T* v_out_;
public:
SparseMomentumFunctor(const T* p, const T* g, const T* v, const T* lr,
const T mu, const int64_t* rows, int64_t row_numel,
int64_t row_height, T* p_out, T* v_out)
: p_(p),
g_(g),
v_(v),
lr_(lr),
mu_(mu),
rows_(rows),
row_numel_(row_numel),
row_height_(row_height),
p_out_(p_out),
v_out_(v_out) {}
inline HOSTDEVICE void operator()(size_t i) {
auto row_idx =
math::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_);
T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0;
// put memory access in register
const T p = p_[i];
const T lr = lr_[0];
const T v = v_[i];
T v_out = v * mu_ + g;
T p_out = p - v_out * lr;
// write reigster to memory
v_out_[i] = v_out;
p_out_[i] = p_out;
}
};
template <typename DeviceContext, typename T>
class MomentumOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
T mu = static_cast<T>(ctx.Attr<float>("mu"));
bool use_nesterov = ctx.Attr<bool>("use_nesterov");
auto learning_rate = ctx.Input<framework::Tensor>("LearningRate");
auto param = ctx.Input<framework::Tensor>("Param");
auto param_out = ctx.Output<framework::Tensor>("ParamOut");
auto* velocity = ctx.Input<framework::Tensor>("Velocity");
auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
param_out->mutable_data<T>(ctx.GetPlace());
velocity_out->mutable_data<T>(ctx.GetPlace());
auto* grad_var = ctx.InputVar("Grad");
if (grad_var->IsType<framework::LoDTensor>()) {
auto grad = ctx.Input<framework::Tensor>("Grad");
if (platform::is_cpu_place(ctx.GetPlace())) {
CPUDenseMomentumFunctor<T> functor(param, grad, velocity, learning_rate,
mu, use_nesterov, param_out,
velocity_out);
functor();
} else if (platform::is_gpu_place(ctx.GetPlace())) {
platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(ctx.device_context()),
param->numel());
if (use_nesterov) {
DenseMomentumFunctor<T, UseNesterov> functor(
param->data<T>(), grad->data<T>(), velocity->data<T>(),
learning_rate->data<T>(), mu, param->numel(),
param_out->mutable_data<T>(ctx.GetPlace()),
velocity_out->mutable_data<T>(ctx.GetPlace()));
for_range(functor);
} else {
DenseMomentumFunctor<T, NoNesterov> functor(
param->data<T>(), grad->data<T>(), velocity->data<T>(),
learning_rate->data<T>(), mu, param->numel(),
param_out->mutable_data<T>(ctx.GetPlace()),
velocity_out->mutable_data<T>(ctx.GetPlace()));
for_range(functor);
}
}
} else if (grad_var->IsType<framework::SelectedRows>()) {
// sparse update embedding with selectedrows
auto grad = ctx.Input<framework::SelectedRows>("Grad");
// sparse update maybe empty.
if (grad->rows().size() == 0) {
VLOG(3) << "Grad SelectedRows contains no data!";
return;
}
auto* merged_grad = const_cast<framework::Scope&>(ctx.scope())
.Var()
->GetMutable<framework::SelectedRows>();
math::scatter::MergeAdd<DeviceContext, T> merge_func;
merge_func(ctx.template device_context<DeviceContext>(), *grad,
merged_grad);
const int64_t* rows = nullptr;
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) {
rows = merged_grad->rows().CUDAData(ctx.GetPlace());
} else {
#endif
rows = merged_grad->rows().data();
#ifdef PADDLE_WITH_CUDA
}
#endif
int64_t row_numel =
merged_grad->value().numel() / merged_grad->rows().size();
platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(ctx.device_context()),
param->numel());
if (use_nesterov) {
SparseMomentumFunctor<T, UseNesterov> functor(
param->data<T>(), merged_grad->value().data<T>(),
velocity->data<T>(), learning_rate->data<T>(), mu, rows, row_numel,
static_cast<int64_t>(merged_grad->rows().size()),
param_out->mutable_data<T>(ctx.GetPlace()),
velocity_out->mutable_data<T>(ctx.GetPlace()));
for_range(functor);
} else {
SparseMomentumFunctor<T, NoNesterov> functor(
param->data<T>(), merged_grad->value().data<T>(),
velocity->data<T>(), learning_rate->data<T>(), mu, rows, row_numel,
static_cast<int64_t>(merged_grad->rows().size()),
param_out->mutable_data<T>(ctx.GetPlace()),
velocity_out->mutable_data<T>(ctx.GetPlace()));
for_range(functor);
}
} else {
PADDLE_THROW(
string::Sprintf("MomentumOp only supports LoDTensor or SelectedRows "
"gradient, but the received Variable Type is %s",
grad_var->Type().name()));
}
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -397,6 +397,24 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase { ...@@ -397,6 +397,24 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
} }
}; };
class ParallelDoGradOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
framework::BlockDesc *sub_block =
boost::get<framework::BlockDesc *>(op_desc.GetAttr(kParallelBlock));
for (auto &out_vars : op_desc.Outputs()) {
for (auto &out_var : out_vars.second) {
auto &var = block->FindRecursiveOrCreateVar(out_var);
auto sub_var = sub_block->FindRecursiveOrCreateVar(out_var);
if (sub_var.GetType() != var.GetType()) {
var.SetType(sub_var.GetType());
}
}
}
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -404,4 +422,5 @@ REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp, ...@@ -404,4 +422,5 @@ REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp,
paddle::operators::ParallelDoOpProtoMaker, paddle::operators::ParallelDoOpProtoMaker,
paddle::operators::ParallelDoGradOpDescMaker); paddle::operators::ParallelDoGradOpDescMaker);
REGISTER_OPERATOR(parallel_do_grad, paddle::operators::ParallelDoGradOp, REGISTER_OPERATOR(parallel_do_grad, paddle::operators::ParallelDoGradOp,
paddle::operators::ParallelDoGradOpShapeInference); paddle::operators::ParallelDoGradOpShapeInference,
paddle::operators::ParallelDoGradOpVarTypeInference);
...@@ -43,17 +43,31 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -43,17 +43,31 @@ class SumKernel : public framework::OpKernel<T> {
out->mutable_data<T>(context.GetPlace()); out->mutable_data<T>(context.GetPlace());
} }
auto result = EigenVector<T>::Flatten(*out); auto result = EigenVector<T>::Flatten(*out);
auto &place =
*context.template device_context<DeviceContext>().eigen_device();
int start = in_place ? 1 : 0;
if (!in_place) { if (!in_place) {
if ((in_num >= 2) && in_vars[0]->IsType<framework::LoDTensor>() &&
in_vars[1]->IsType<framework::LoDTensor>()) {
auto &in_0 = in_vars[0]->Get<framework::LoDTensor>();
auto &in_1 = in_vars[1]->Get<framework::LoDTensor>();
if (in_0.numel() && in_1.numel()) {
auto in_0_e = EigenVector<T>::Flatten(in_0);
auto in_1_e = EigenVector<T>::Flatten(in_1);
result.device(place) = in_0_e + in_1_e;
start = 2;
}
}
if (start != 2) {
math::SetConstant<DeviceContext, T> constant_functor; math::SetConstant<DeviceContext, T> constant_functor;
constant_functor(context.template device_context<DeviceContext>(), out, constant_functor(context.template device_context<DeviceContext>(),
0.0); out, 0.0);
}
} }
math::SelectedRowsAddToTensor<DeviceContext, T> functor; math::SelectedRowsAddToTensor<DeviceContext, T> functor;
auto &place =
*context.template device_context<DeviceContext>().eigen_device();
// If in_place, just skip the first tensor // If in_place, just skip the first tensor
for (size_t i = in_place ? 1 : 0; i < in_num; i++) { for (size_t i = start; i < in_num; i++) {
if (in_vars[i]->IsType<framework::LoDTensor>()) { if (in_vars[i]->IsType<framework::LoDTensor>()) {
auto &in_t = in_vars[i]->Get<framework::LoDTensor>(); auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
if (in_t.numel() == 0) { if (in_t.numel() == 0) {
......
...@@ -128,7 +128,7 @@ bool MayIUse(const cpu_isa_t cpu_isa) { ...@@ -128,7 +128,7 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
return cpu.has(Cpu::tAVX); return cpu.has(Cpu::tAVX);
case avx2: case avx2:
return cpu.has(Cpu::tAVX2); return cpu.has(Cpu::tAVX2);
case avx512_common: case avx512f:
return cpu.has(Cpu::tAVX512F); return cpu.has(Cpu::tAVX512F);
case avx512_core: case avx512_core:
return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) && return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) &&
......
...@@ -43,7 +43,7 @@ typedef enum { ...@@ -43,7 +43,7 @@ typedef enum {
sse42, sse42,
avx, avx,
avx2, avx2,
avx512_common, avx512f,
avx512_core, avx512_core,
avx512_core_vnni, avx512_core_vnni,
avx512_mic, avx512_mic,
......
...@@ -116,7 +116,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) { ...@@ -116,7 +116,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
platform::SetNumThreads(FLAGS_paddle_num_threads); platform::SetNumThreads(FLAGS_paddle_num_threads);
#endif #endif
if (platform::jit::MayIUse(platform::jit::avx512_common)) { if (platform::jit::MayIUse(platform::jit::avx512f)) {
#ifndef __AVX512F__ #ifndef __AVX512F__
LOG(WARNING) << "AVX512F is available, Please re-compile on local machine"; LOG(WARNING) << "AVX512F is available, Please re-compile on local machine";
#endif #endif
......
...@@ -370,8 +370,8 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -370,8 +370,8 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
std::vector<std::vector<Event>> merged_events_list; std::vector<std::vector<Event>> merged_events_list;
if (merge_thread) { if (merge_thread) {
std::vector<Event> merged_events; std::vector<Event> merged_events;
for (int i = 0; i < events.size(); ++i) { for (size_t i = 0; i < events.size(); ++i) {
for (int j = 0; j < events[i].size(); ++j) { for (size_t j = 0; j < events[i].size(); ++j) {
merged_events.push_back(events[i][j]); merged_events.push_back(events[i][j]);
} }
} }
......
...@@ -71,6 +71,7 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx); ...@@ -71,6 +71,7 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
#if !defined(_WIN32) #if !defined(_WIN32)
struct RecordEvent { struct RecordEvent {
// dev_ctx can be set to nullptr if device is cpu.
RecordEvent(const std::string& name, const DeviceContext* dev_ctx); RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
~RecordEvent(); ~RecordEvent();
......
...@@ -390,7 +390,9 @@ function run_mac_test() { ...@@ -390,7 +390,9 @@ function run_mac_test() {
Running unit tests ... Running unit tests ...
======================================== ========================================
EOF EOF
#remove proxy here to fix dist error on mac
export http_proxy=
export https_proxy=
# TODO: jiabin need to refine this part when these tests fixed on mac # TODO: jiabin need to refine this part when these tests fixed on mac
ctest --output-on-failure -j $1 ctest --output-on-failure -j $1
# make install should also be test when unittest # make install should also be test when unittest
...@@ -659,6 +661,7 @@ function gen_fluid_lib() { ...@@ -659,6 +661,7 @@ function gen_fluid_lib() {
EOF EOF
cmake .. -DWITH_DISTRIBUTE=OFF cmake .. -DWITH_DISTRIBUTE=OFF
make -j `nproc` fluid_lib_dist make -j `nproc` fluid_lib_dist
make -j `nproc` inference_lib_dist
fi fi
} }
...@@ -672,6 +675,8 @@ EOF ...@@ -672,6 +675,8 @@ EOF
cd ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build
cp -r fluid_install_dir fluid cp -r fluid_install_dir fluid
tar -czf fluid.tgz fluid tar -czf fluid.tgz fluid
cp -r fluid_inference_install_dir fluid_inference
tar -czf fluid_inference.tgz fluid_inference
fi fi
} }
...@@ -683,7 +688,9 @@ function test_fluid_lib() { ...@@ -683,7 +688,9 @@ function test_fluid_lib() {
======================================== ========================================
EOF EOF
cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci
./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF} ${INFERENCE_DEMO_INSTALL_DIR} ${TENSORRT_INCLUDE_DIR:-/usr/local/TensorRT/include} ${TENSORRT_LIB_DIR:-/usr/local/TensorRT/lib} ./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF} ${INFERENCE_DEMO_INSTALL_DIR} \
${TENSORRT_INCLUDE_DIR:-/usr/local/TensorRT/include} \
${TENSORRT_LIB_DIR:-/usr/local/TensorRT/lib}
./clean.sh ./clean.sh
fi fi
} }
......
...@@ -1522,13 +1522,17 @@ class Program(object): ...@@ -1522,13 +1522,17 @@ class Program(object):
>>> with program.lr_schedule_guard(): >>> with program.lr_schedule_guard():
>>> lr = lr * decay >>> lr = lr * decay
""" """
tmp_role = self._current_role
tmp_var = self._op_role_var
OpRole = core.op_proto_and_checker_maker.OpRole OpRole = core.op_proto_and_checker_maker.OpRole
self._current_role = OpRole.LRSched self._current_role = OpRole.LRSched
# TODO(typhoonzero): how to set target learning rate var # TODO(typhoonzero): how to set target learning rate var
self._op_role_var = [] self._op_role_var = []
yield yield
self._op_role_var = [] self._op_role_var = tmp_var
self._current_role = OpRole.Forward self._current_role = tmp_role
def __str__(self): def __str__(self):
""" """
......
...@@ -65,6 +65,7 @@ __all__ = [ ...@@ -65,6 +65,7 @@ __all__ = [
'reduce_prod', 'reduce_prod',
'sequence_first_step', 'sequence_first_step',
'sequence_last_step', 'sequence_last_step',
'sequence_slice',
'dropout', 'dropout',
'split', 'split',
'ctc_greedy_decoder', 'ctc_greedy_decoder',
...@@ -1903,6 +1904,76 @@ def sequence_last_step(input): ...@@ -1903,6 +1904,76 @@ def sequence_last_step(input):
return sequence_pool(input=input, pool_type="last") return sequence_pool(input=input, pool_type="last")
def sequence_slice(input, offset, length, name=None):
"""
**Sequence Slice Layer**
The layer crops a subsequence from given sequence with given start
offset and subsequence length.
It only supports sequence data (LoDTensor with lod_level equal to 1).
.. code-block:: text
- Case:
Given the input Variable **input**:
input.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]],
input.lod = [[3, 2]],
input.dims = (5, 2),
with offset.data = [[0], [1]] and length.data = [[2], [1]],
the output Variable will be
out.data = [[a1, a2], [b1, b2], [e1, e2]],
out.lod = [[2, 1]],
out.dims = (3, 2).
NOTE: The first dimension size of **input**, **offset** and **length**
should be equal. The **offset** should start from 0.
Args:
input(Variable): The input Variable which consists of the complete
sequences.
offset(Variable): The offset to slice each sequence.
length(Variable): The length of each subsequence.
name(str|None): A name for this layer(optional). If set None, the
layer will be named automatically.
Returns:
Variable: The output subsequences.
Examples:
.. code-block:: python
import numpy as np
seqs = fluid.layers.data(name='x', shape=[10, 5],
dtype='float32', lod_level=1)
offset = fluid.layers.assign(input=np.array([[0, 1]]).astype("int32"))
length = fluid.layers.assign(input=np.array([[2, 1]]).astype("int32"))
subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset,
length=length)
"""
helper = LayerHelper("sequence_slice", **locals())
dtype = helper.input_dtype()
out = helper.create_tmp_variable(dtype)
offset.stop_gradient = True
length.stop_gradient = True
helper.append_op(
type="sequence_slice",
inputs={"X": input,
"Offset": offset,
"Length": length},
outputs={"Out": out})
return out
@templatedoc() @templatedoc()
def pool2d(input, def pool2d(input,
pool_size=-1, pool_size=-1,
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
from __future__ import print_function from __future__ import print_function
import re import re
from collections import defaultdict from collections import defaultdict
from paddle.fluid.framework import Program, Variable, name_scope from paddle.fluid.framework import Program, Variable, name_scope, default_main_program
from . import framework from . import framework
from . import layers from . import layers
from .backward import append_backward from .backward import append_backward
...@@ -111,6 +111,7 @@ class Optimizer(object): ...@@ -111,6 +111,7 @@ class Optimizer(object):
if param_lr == 1.0: if param_lr == 1.0:
return self._global_learning_rate() return self._global_learning_rate()
else: else:
with default_main_program()._lr_schedule_guard():
return self._global_learning_rate() * param_lr return self._global_learning_rate() * param_lr
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
......
...@@ -81,7 +81,10 @@ def get_optimizer(): ...@@ -81,7 +81,10 @@ def get_optimizer():
return optimizer return optimizer
def train_network(batch_size, is_distributed=False, is_sparse=False): def train_network(batch_size,
is_distributed=False,
is_sparse=False,
is_self_contained_lr=False):
# query # query
q = fluid.layers.data( q = fluid.layers.data(
name="query_ids", shape=[1], dtype="int64", lod_level=1) name="query_ids", shape=[1], dtype="int64", lod_level=1)
...@@ -93,7 +96,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False): ...@@ -93,7 +96,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False):
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01), initializer=fluid.initializer.Constant(value=0.01),
name="__emb__", name="__emb__",
learning_rate=emb_lr), learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__emb__"),
is_sparse=is_sparse) is_sparse=is_sparse)
## vsum ## vsum
q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
...@@ -119,7 +124,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False): ...@@ -119,7 +124,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False):
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01), initializer=fluid.initializer.Constant(value=0.01),
name="__emb__", name="__emb__",
learning_rate=emb_lr), learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__emb__"),
is_sparse=is_sparse) is_sparse=is_sparse)
## vsum ## vsum
pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
...@@ -144,7 +151,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False): ...@@ -144,7 +151,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False):
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01), initializer=fluid.initializer.Constant(value=0.01),
name="__emb__", name="__emb__",
learning_rate=emb_lr), learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__emb__"),
is_sparse=is_sparse) is_sparse=is_sparse)
## vsum ## vsum
nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
...@@ -220,7 +229,10 @@ class TestDistSimnetBow2x2(TestDistRunnerBase): ...@@ -220,7 +229,10 @@ class TestDistSimnetBow2x2(TestDistRunnerBase):
def get_model(self, batch_size=2): def get_model(self, batch_size=2):
# Train program # Train program
avg_cost, acc, predict = \ avg_cost, acc, predict = \
train_network(batch_size, bool(int(os.environ["IS_DISTRIBUTED"])), bool(int(os.environ["IS_SPARSE"]))) train_network(batch_size,
bool(int(os.environ["IS_DISTRIBUTED"])),
bool(int(os.environ["IS_SPARSE"])),
bool(int(os.environ["IS_SELF_CONTAINED_LR"])))
inference_program = fluid.default_main_program().clone() inference_program = fluid.default_main_program().clone()
......
...@@ -25,7 +25,11 @@ class TestDistSimnetBowDense2x2(TestDistBase): ...@@ -25,7 +25,11 @@ class TestDistSimnetBowDense2x2(TestDistBase):
self._enforce_place = "CPU" self._enforce_place = "CPU"
def test_simnet_bow(self): def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'} need_envs = {
"IS_DISTRIBUTED": '0',
"IS_SPARSE": '0',
'IS_SELF_CONTAINED_LR': '1'
}
self.check_with_place( self.check_with_place(
"dist_simnet_bow.py", "dist_simnet_bow.py",
delta=1e-5, delta=1e-5,
...@@ -39,7 +43,11 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase): ...@@ -39,7 +43,11 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase):
self._enforce_place = "CPU" self._enforce_place = "CPU"
def test_simnet_bow(self): def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'} need_envs = {
"IS_DISTRIBUTED": '0',
"IS_SPARSE": '0',
'IS_SELF_CONTAINED_LR': '1'
}
self.check_with_place( self.check_with_place(
"dist_simnet_bow.py", "dist_simnet_bow.py",
delta=100, delta=100,
...@@ -53,7 +61,11 @@ class TestDistSimnetBowSparse2x2(TestDistBase): ...@@ -53,7 +61,11 @@ class TestDistSimnetBowSparse2x2(TestDistBase):
self._enforce_place = "CPU" self._enforce_place = "CPU"
def test_simnet_bow(self): def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} need_envs = {
"IS_DISTRIBUTED": '0',
"IS_SPARSE": '1',
'IS_SELF_CONTAINED_LR': '1'
}
self.check_with_place( self.check_with_place(
"dist_simnet_bow.py", "dist_simnet_bow.py",
delta=1e-5, delta=1e-5,
...@@ -67,7 +79,11 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase): ...@@ -67,7 +79,11 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase):
self._enforce_place = "CPU" self._enforce_place = "CPU"
def test_simnet_bow(self): def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} need_envs = {
"IS_DISTRIBUTED": '0',
"IS_SPARSE": '1',
'IS_SELF_CONTAINED_LR': '1'
}
self.check_with_place( self.check_with_place(
"dist_simnet_bow.py", "dist_simnet_bow.py",
delta=100, delta=100,
...@@ -75,5 +91,61 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase): ...@@ -75,5 +91,61 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase):
need_envs=need_envs) need_envs=need_envs)
# FIXME(tangwei): Learningrate variable is not created on pserver.
"""
class TestDistSimnetBow2x2LookupTableSync(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._enforce_place = "CPU"
def test_simnet_bow(self):
need_envs = {
"IS_DISTRIBUTED": '1',
"IS_SPARSE": '1',
'IS_SELF_CONTAINED_LR': '1'
}
self.check_with_place(
"dist_simnet_bow.py",
delta=1e-5,
check_error_log=True,
need_envs=need_envs)
class TestDistSimnetBow2x2LookupTableAsync(TestDistBase):
def _setup_config(self):
self._sync_mode = False
self._enforce_place = "CPU"
def test_simnet_bow(self):
need_envs = {
"IS_DISTRIBUTED": '1',
"IS_SPARSE": '1',
'IS_SELF_CONTAINED_LR': '1'
}
self.check_with_place(
"dist_simnet_bow.py",
delta=100,
check_error_log=False,
need_envs=need_envs)
class TestDistSimnetBow2x2LookupTableNotContainLRSync(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._enforce_place = "CPU"
def test_simnet_bow(self):
need_envs = {
"IS_DISTRIBUTED": '1',
"IS_SPARSE": '1',
'IS_SELF_CONTAINED_LR': '0'
}
self.check_with_place(
"dist_simnet_bow.py",
delta=1e-5,
check_error_log=False,
need_envs=need_envs)
"""
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -414,6 +414,19 @@ class TestBook(unittest.TestCase): ...@@ -414,6 +414,19 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(out) self.assertIsNotNone(out)
print(str(program)) print(str(program))
def test_sequence_slice(self):
program = Program()
with program_guard(program):
import numpy as np
seqs = layers.data(
name='x', shape=[10, 5], dtype='float32', lod_level=1)
offset = layers.assign(input=np.array([[0, 1]]).astype('int32'))
length = layers.assign(input=np.array([[2, 1]]).astype('int32'))
out = layers.sequence_slice(
input=seqs, offset=offset, length=length)
self.assertIsNotNone(out)
print(str(program))
def test_lod_reset(self): def test_lod_reset(self):
program = Program() program = Program()
with program_guard(program): with program_guard(program):
......
...@@ -16,6 +16,8 @@ from __future__ import print_function ...@@ -16,6 +16,8 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
import paddle.fluid.core as core
from paddle.fluid.op import Operator
from op_test import OpTest from op_test import OpTest
...@@ -88,5 +90,97 @@ class TestMomentumOp2(OpTest): ...@@ -88,5 +90,97 @@ class TestMomentumOp2(OpTest):
self.check_output() self.check_output()
class TestSparseMomentumOp(unittest.TestCase):
def setUp(self):
self.use_nesterov = False
def check_with_place(self, place):
self.init_kernel()
scope = core.Scope()
# create and initialize Grad Variable
height = 10
rows = [0, 4, 7]
row_numel = 12
mu = 1.0
use_nesterov = self.use_nesterov
# create and initialize Param Variable
param = scope.var('Param').get_tensor()
param_array = np.full((height, row_numel), 5.0).astype("float32")
param.set(param_array, place)
param_out = scope.var("ParamOut").get_tensor()
param_out_array = np.full((height, row_numel), 0.0).astype("float32")
param_out.set(param_out_array, place)
grad_selected_rows = scope.var('Grad').get_selected_rows()
grad_selected_rows.set_height(height)
grad_selected_rows.set_rows(rows)
grad_np_array = np.ones((len(rows), row_numel)).astype("float32")
grad_np_array[0, 0] = 2.0
grad_np_array[2, 8] = 4.0
grad_tensor = grad_selected_rows.get_tensor()
grad_tensor.set(grad_np_array, place)
velocity = scope.var('Velocity').get_tensor()
velocity_np_array = np.ones((height, row_numel)).astype("float32")
velocity.set(velocity_np_array, place)
velocity_out = scope.var('VelocityOut').get_tensor()
velocity_out_np_array = np.full((height, row_numel),
0.0).astype("float32")
velocity_out.set(velocity_out_np_array, place)
# create and initialize LeraningRate Variable
lr = scope.var('LearningRate').get_tensor()
lr_array = np.full((1), 2.0).astype("float32")
lr.set(lr_array, place)
# create and run operator
op = Operator(
"momentum",
Param='Param',
Grad='Grad',
Velocity='Velocity',
ParamOut='ParamOut',
VelocityOut='VelocityOut',
LearningRate='LearningRate',
mu=mu,
use_nesterov=use_nesterov)
op.run(scope, place)
# get and compare result
param_out_np_array = np.array(param_out)
velocity_out_np_array = np.array(velocity_out)
# TODO(dzh): add a more suitable general numpy interface
# for sparse update.
_grad_np_array = np.full((height, row_numel), 0.0).astype("float32")
for i in range(len(rows)):
_grad_np_array[rows[i]] = grad_np_array[i]
_velocity_out = mu * velocity_np_array + _grad_np_array
_param = param_array
if use_nesterov:
_param_out = _param - (_grad_np_array + _velocity_out * mu
) * lr_array
else:
_param_out = _param - lr_array * _velocity_out
self.assertTrue((_velocity_out == velocity_out_np_array).all())
self.assertTrue((_param_out == param_out_np_array).all())
def init_kernel(self):
pass
def test_sparse_momentum(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self.check_with_place(place)
class TestSparseMomentumOp2(TestSparseMomentumOp):
def init_kernel(self):
self.use_nesterov = True
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -1119,6 +1119,7 @@ to transpile() call.") ...@@ -1119,6 +1119,7 @@ to transpile() call.")
def _split_table_grad_and_add_send_vars(self, program, pserver_endpoints): def _split_table_grad_and_add_send_vars(self, program, pserver_endpoints):
# 2. add split_ids_op and send_op to send gradient to pservers # 2. add split_ids_op and send_op to send gradient to pservers
# there should only be one table_name # there should only be one table_name
all_ops = program.global_block().ops all_ops = program.global_block().ops
table_grad_name = grad_var_name(self.table_name) table_grad_name = grad_var_name(self.table_name)
...@@ -1143,7 +1144,7 @@ to transpile() call.") ...@@ -1143,7 +1144,7 @@ to transpile() call.")
if self.sync_mode else [] if self.sync_mode else []
}, },
attrs={ attrs={
"sync_mode": self.sync_mode, "sync_mode": not self.sync_mode,
"epmap": pserver_endpoints, "epmap": pserver_endpoints,
RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
OP_ROLE_VAR_ATTR_NAME: [ OP_ROLE_VAR_ATTR_NAME: [
...@@ -1189,7 +1190,15 @@ to transpile() call.") ...@@ -1189,7 +1190,15 @@ to transpile() call.")
def _create_table_optimize_block(self, pserver_index, pserver_program, def _create_table_optimize_block(self, pserver_index, pserver_program,
pre_block_idx, grad_to_block_id): pre_block_idx, grad_to_block_id):
# STEP: create table optimize block # STEP: create table optimize block
table_opt_block = pserver_program._create_block(pre_block_idx)
# create table param and grad var in pserver program # create table param and grad var in pserver program
# create table optimize block in pserver program
table_opt_op = [
op for op in self.optimize_ops
if 'Param' in op.input_names and op.input("Param")[0] ==
self.table_name
][0]
origin_param_var = self.origin_program.global_block().vars[ origin_param_var = self.origin_program.global_block().vars[
self.table_name] self.table_name]
...@@ -1205,19 +1214,16 @@ to transpile() call.") ...@@ -1205,19 +1214,16 @@ to transpile() call.")
dtype=origin_param_var.dtype, dtype=origin_param_var.dtype,
type=core.VarDesc.VarType.SELECTED_ROWS, type=core.VarDesc.VarType.SELECTED_ROWS,
persistable=True) persistable=True)
# parameter must be selected rows # parameter must be selected rows
param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS) param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
grad_var = pserver_program.global_block()._clone_variable( grad_var = pserver_program.global_block()._clone_variable(
self.origin_program.global_block().vars[grad_var_name( self.origin_program.global_block().vars[grad_var_name(
self.table_name)]) self.table_name)])
# create table optimize block in pserver program lr_var = pserver_program.global_block()._clone_variable(
table_opt_op = [ self.origin_program.global_block().vars[table_opt_op.input(
op for op in self.optimize_ops "LearningRate")[0]])
if 'Param' in op.input_names and op.input("Param")[0] ==
self.table_name
][0]
table_opt_block = pserver_program._create_block(pre_block_idx)
if self.sync_mode: if self.sync_mode:
# create grad vars in pserver program # create grad vars in pserver program
...@@ -1249,8 +1255,6 @@ to transpile() call.") ...@@ -1249,8 +1255,6 @@ to transpile() call.")
grad_var = pserver_program.global_block()._rename_var( grad_var = pserver_program.global_block()._rename_var(
origin_grad_name, splited_grad_name) origin_grad_name, splited_grad_name)
lr_var = pserver_program.global_block().vars[table_opt_op.input(
"LearningRate")[0]]
inputs = { inputs = {
"Param": [param_var], "Param": [param_var],
"Grad": [grad_var], "Grad": [grad_var],
......
...@@ -12,4 +12,5 @@ ...@@ -12,4 +12,5 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
__all__ = ['dump_config'] from plot import Ploter
__all__ = ['dump_config', 'Ploter']
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
class PlotData(object):
def __init__(self):
self.step = []
self.value = []
def append(self, step, value):
self.step.append(step)
self.value.append(value)
def reset(self):
self.step = []
self.value = []
class Ploter(object):
"""
Plot input data in a 2D graph
Args:
title: assign the title of input data.
step: x_axis of the data.
value: y_axis of the data.
"""
def __init__(self, *args):
self.__args__ = args
self.__plot_data__ = {}
for title in args:
self.__plot_data__[title] = PlotData()
# demo in notebooks will use Ploter to plot figure, but when we convert
# the ipydb to py file for testing, the import of matplotlib will make the
# script crash. So we can use `export DISABLE_PLOT=True` to disable import
# these libs
self.__disable_plot__ = os.environ.get("DISABLE_PLOT")
if not self.__plot_is_disabled__():
import matplotlib.pyplot as plt
from IPython import display
self.plt = plt
self.display = display
def __plot_is_disabled__(self):
return self.__disable_plot__ == "True"
def append(self, title, step, value):
"""
Feed data
Args:
title: assign the group data to this subtitle.
step: the x_axis of data.
value: the y_axis of data.
Examples:
.. code-block:: python
plot_curve = Ploter("Curve 1","Curve 2")
plot_curve.append(title="Curve 1",step=1,value=1)
"""
assert isinstance(title, basestring)
assert self.__plot_data__.has_key(title)
data = self.__plot_data__[title]
assert isinstance(data, PlotData)
data.append(step, value)
def plot(self, path=None):
"""
Plot data in a 2D graph
Args:
path: store the figure to this file path. Defaul None.
Examples:
.. code-block:: python
plot_curve = Ploter()
plot_cure.plot()
"""
if self.__plot_is_disabled__():
return
titles = []
for title in self.__args__:
data = self.__plot_data__[title]
assert isinstance(data, PlotData)
if len(data.step) > 0:
titles.append(title)
self.plt.plot(data.step, data.value)
self.plt.legend(titles, loc='upper left')
if path is None:
self.display.clear_output(wait=True)
self.display.display(self.plt.gcf())
else:
self.plt.savefig(path)
self.plt.gcf().clear()
def reset(self):
for key in self.__plot_data__:
data = self.__plot_data__[key]
assert isinstance(data, PlotData)
data.reset()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册