diff --git a/cmake/configure.cmake b/cmake/configure.cmake index fc984f5e560ef30c342e108c50648e3e83e8b62d..3cf242826438a6008b7e52cd87b1ace02de46f89 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -31,6 +31,11 @@ elseif(SSE3_FOUND) set(SIMD_FLAG ${SSE3_FLAG}) endif() +if (SSE3_FOUND) + # TODO: Runtime detection should be used here. + add_definitions(-DPADDLE_WITH_SSE3) +endif() + if(WIN32) # windows header option for all targets. add_definitions(-D_XKEYCHECK_H) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 5df2ee680c506a65ddc6da39b24b4bc2eb82a006..9e295e2e688a126425e789716943dfb166a168d3 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -196,7 +196,7 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) -cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) +cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry denormal device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc operator garbage_collector) if(WITH_DISTRIBUTE) diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index 943997be2e12b7a2218008dc020e8212d53232ab..c70cc8ed037ccf1c13fc519bb08419b985e46eb6 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -24,6 +24,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/denormal.h" #include 
"paddle/fluid/string/pretty_log.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -47,6 +48,7 @@ void NaiveExecutor::Run() { #ifdef PADDLE_WITH_MKLDNN platform::AttachPointerHashToMKLDNNKey(this, place_); #endif + platform::ScopedFlushDenormal flush; for (auto &op : ops_) { VLOG(4) << std::this_thread::get_id() << " run " << op->DebugStringEx(scope_) << " on scope " << scope_; diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h index be5fe1d64f9e7fbba207878c9ca6740109a41b7c..d740d9ee9523cf6c7e7ad1c341c3a8f32e8a42ad 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.h +++ b/paddle/fluid/inference/api/details/reset_tensor_array.h @@ -38,8 +38,14 @@ struct TensorArrayBatchCleaner { constexpr auto kTensorId = framework::VarTypeTrait::kId; constexpr auto kLoDTensorId = framework::VarTypeTrait::kId; + constexpr auto kSelectedRowsId = + framework::VarTypeTrait::kId; + constexpr auto kFetchListId = + framework::VarTypeTrait::kId; valid_types_.insert(kTensorId); valid_types_.insert(kLoDTensorId); + valid_types_.insert(kSelectedRowsId); + valid_types_.insert(kFetchListId); } // Collect the variables that are not Tensor or LoDTensor, and reset them to a // bool(trick), because some of them are containers, and some operators just diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index ef827fd74903afd007c864307e942749e3eb0bd1..6ae1f52ec03d255181e6d616b0723d9a5b45fa0b 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -32,6 +32,7 @@ if (WITH_PYTHON) endif() cc_library(flags SRCS flags.cc DEPS gflags) +cc_library(denormal SRCS denormal.cc DEPS) cc_library(errors SRCS errors.cc DEPS error_codes_proto) cc_test(errors_test SRCS errors_test.cc DEPS errors enforce) diff --git a/paddle/fluid/platform/denormal.cc b/paddle/fluid/platform/denormal.cc new file mode 100644 index 
0000000000000000000000000000000000000000..02c69dae9cc27bfdaff7f69833556f590c7bb987
--- /dev/null
+++ b/paddle/fluid/platform/denormal.cc
@@ -0,0 +1,90 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/platform/denormal.h"
+#include <tuple>
+#include <utility>
+
+// Refer to https://github.com/tensorflow/tensorflow/pull/17141
+
+// If we're on gcc 4.8 or older, there's a known bug that prevents the use of
+// intrinsics when the architecture is not defined in the flags. See
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=57202
+#if !defined(__SSE3__) && !defined(__clang__) && \
+    (defined(__GNUC__) && (__GNUC__ < 4) ||      \
+     ((__GNUC__ == 4) && (__GNUC_MINOR__ < 9)))
+#define GCC_WITHOUT_INTRINSICS
+#endif
+
+#if !defined(GCC_WITHOUT_INTRINSICS) && !defined(PADDLE_WITH_ARM)
+#define DENORM_USE_INTRINSICS
+#endif
+
+#ifdef DENORM_USE_INTRINSICS
+#include <pmmintrin.h>
+#endif
+
+namespace paddle {
+namespace platform {
+
+// Applies the requested flush-to-zero (FTZ) and denormals-are-zero (DAZ) modes
+// through the SSE intrinsic macros. Compiles to a no-op unless both
+// DENORM_USE_INTRINSICS and PADDLE_WITH_SSE3 are defined.
+static void SetDenormalState(bool flush_zero_mode, bool denormals_zero_mode) {
+#ifdef DENORM_USE_INTRINSICS
+#ifdef PADDLE_WITH_SSE3
+  // Intel's C and Fortran compilers enable the denormals-are-zero (DAZ) and
+  // flush-to-zero (FTZ) flags for SSE by default for optimization levels higher
+  // than -O0.
+  // AArch32 NEON (SIMD) FPU always uses a flush-to-zero mode.
+  // Refer to https://en.wikipedia.org/wiki/Denormal_number
+  // and https://software.intel.com/sites/landingpage/IntrinsicsGuide/
+  _MM_SET_FLUSH_ZERO_MODE(flush_zero_mode ? _MM_FLUSH_ZERO_ON
+                                          : _MM_FLUSH_ZERO_OFF);
+  _MM_SET_DENORMALS_ZERO_MODE(denormals_zero_mode ? _MM_DENORMALS_ZERO_ON
+                                                  : _MM_DENORMALS_ZERO_OFF);
+#endif
+#endif
+}
+
+// Returns the current {flush_zero_mode, denormals_zero_mode} pair as reported
+// by the SSE intrinsic macros, or {false, false} when they are not compiled in.
+static std::pair<bool, bool> GetDenormalState() {
+#ifdef DENORM_USE_INTRINSICS
+#ifdef PADDLE_WITH_SSE3
+  bool flush_zero_mode = _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON;
+  bool denormals_zero_mode =
+      _MM_GET_DENORMALS_ZERO_MODE() == _MM_DENORMALS_ZERO_ON;
+  return {flush_zero_mode, denormals_zero_mode};
+#endif
+#endif
+  return {false, false};
+}
+
+// Records the state in effect at construction time.
+ScopedRestoreFlushDenormalState::ScopedRestoreFlushDenormalState() {
+  std::tie(flush_zero_mode_, denormals_zero_mode_) = GetDenormalState();
+}
+
+// Re-applies the state recorded by the constructor.
+ScopedRestoreFlushDenormalState::~ScopedRestoreFlushDenormalState() {
+  SetDenormalState(flush_zero_mode_, denormals_zero_mode_);
+}
+
+// Turns on both FTZ and DAZ. The restore_ member is constructed before this
+// body runs, so the previous state is recorded first and re-applied when this
+// object is destroyed.
+ScopedFlushDenormal::ScopedFlushDenormal() { SetDenormalState(true, true); }
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/denormal.h b/paddle/fluid/platform/denormal.h
new file mode 100644
index 0000000000000000000000000000000000000000..e703040f39baee50334da87425b43bc87b756c60
--- /dev/null
+++ b/paddle/fluid/platform/denormal.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/platform/macros.h"
+
+namespace paddle {
+namespace platform {
+
+// Saves the flush/denormal state on construction; restores it on destruction.
+class ScopedRestoreFlushDenormalState {
+ public:
+  ScopedRestoreFlushDenormalState();
+  ~ScopedRestoreFlushDenormalState();
+
+ private:
+  bool flush_zero_mode_;      // Flush-to-zero mode captured at construction.
+  bool denormals_zero_mode_;  // Denormals-are-zero mode captured likewise.
+  DISABLE_COPY_AND_ASSIGN(ScopedRestoreFlushDenormalState);
+};
+
+class ScopedFlushDenormal {
+ public:
+  ScopedFlushDenormal();  // Enables flush-to-zero and denormals-are-zero.
+
+ private:
+  ScopedRestoreFlushDenormalState restore_;  // Restores prior state on exit.
+  DISABLE_COPY_AND_ASSIGN(ScopedFlushDenormal);
+};
+}  // namespace platform
+}  // namespace paddle