[cherry-pick] #26920 , #22924 (#29948)

bea300dd · 石晓伟 · GitHub · 160b3477 · bea300dd · bea300dd
7 changed files
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -31,6 +31,11 @@ elseif(SSE3_FOUND)
    set(SIMD_FLAG ${SSE3_FLAG})
 endif()
+if (SSE3_FOUND)
+    # Make SSE3 availability visible to the C++ sources as the
+    # PADDLE_WITH_SSE3 preprocessor macro.
+    # TODO: Runtime detection should be used here.
+    add_definitions(-DPADDLE_WITH_SSE3)
+endif()
 if(WIN32)
  # windows header option for all targets.
  add_definitions(-D_XKEYCHECK_H)

--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -196,7 +196,7 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
 cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor)
-cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
+cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry denormal device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
 cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc operator garbage_collector)
 if(WITH_DISTRIBUTE)

--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -24,6 +24,7 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/variable_helper.h"
+#include "paddle/fluid/platform/denormal.h"
 #include "paddle/fluid/string/pretty_log.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
@@ -47,6 +48,7 @@ void NaiveExecutor::Run() {
 #ifdef PADDLE_WITH_MKLDNN
  platform::AttachPointerHashToMKLDNNKey(this, place_);
 #endif
+  platform::ScopedFlushDenormal flush;
  for (auto &op : ops_) {
    VLOG(4) << std::this_thread::get_id() << " run "
            << op->DebugStringEx(scope_) << " on scope " << scope_;

--- a/paddle/fluid/inference/api/details/reset_tensor_array.h
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.h
@@ -38,8 +38,14 @@ struct TensorArrayBatchCleaner {
    constexpr auto kTensorId = framework::VarTypeTrait<framework::Tensor>::kId;
    constexpr auto kLoDTensorId =
        framework::VarTypeTrait<framework::LoDTensor>::kId;
+    constexpr auto kSelectedRowsId =
+        framework::VarTypeTrait<framework::SelectedRows>::kId;
+    constexpr auto kFetchListId =
+        framework::VarTypeTrait<framework::FetchList>::kId;
    valid_types_.insert(kTensorId);
    valid_types_.insert(kLoDTensorId);
+    valid_types_.insert(kSelectedRowsId);
+    valid_types_.insert(kFetchListId);
  }
  // Collect the variables that are not Tensor or LoDTensor, and reset them to a
  // bool(trick), because some of them are containers, and some operators just

--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -32,6 +32,7 @@ if (WITH_PYTHON)
 endif()
 cc_library(flags SRCS flags.cc DEPS gflags)
+cc_library(denormal SRCS denormal.cc DEPS)
 cc_library(errors SRCS errors.cc DEPS error_codes_proto)
 cc_test(errors_test SRCS errors_test.cc DEPS errors enforce)

--- a/paddle/fluid/platform/denormal.cc
+++ b/paddle/fluid/platform/denormal.cc
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/platform/denormal.h"
+#include <tuple>
+#include <utility>
+// Refer to https://github.com/tensorflow/tensorflow/pull/17141
+// If we're on gcc 4.8 or older, there's a known bug that prevents the use of
+// intrinsics when the architecture is not defined in the flags. See
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=57202
+// GCC_WITHOUT_INTRINSICS marks GCC <= 4.8 builds that are not compiled for
+// SSE3 (no __SSE3__); on those the intrinsics header is unusable (GCC bug
+// 57202). The version test is grouped explicitly so it reads as
+// "GCC and (major < 4 or (major == 4 and minor < 9))" without relying on
+// &&/|| precedence.
+#if !defined(__SSE3__) && !defined(__clang__) && defined(__GNUC__) && \
+    ((__GNUC__ < 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ < 9)))
+#define GCC_WITHOUT_INTRINSICS
+#endif
+// DENORM_USE_INTRINSICS guards both the <pmmintrin.h> include and the code
+// that calls into it. The intrinsics are only ever used when PADDLE_WITH_SSE3
+// is defined, so the header is only included in that case; this keeps targets
+// that are neither x86 nor ARM (and therefore have no <pmmintrin.h>) building.
+#if defined(PADDLE_WITH_SSE3) && !defined(GCC_WITHOUT_INTRINSICS) && \
+    !defined(PADDLE_WITH_ARM)
+#define DENORM_USE_INTRINSICS
+#endif
+#ifdef DENORM_USE_INTRINSICS
+#include <pmmintrin.h>
+#endif
+namespace paddle {
+namespace platform {
+// Switches the flush-to-zero (FTZ) and denormals-are-zero (DAZ) modes on or
+// off through the SSE intrinsics macros.
+//   flush_zero_mode:     true selects _MM_FLUSH_ZERO_ON, false ..._OFF.
+//   denormals_zero_mode: true selects _MM_DENORMALS_ZERO_ON, false ..._OFF.
+// The body is compiled out (the call does nothing) unless both
+// DENORM_USE_INTRINSICS and PADDLE_WITH_SSE3 are defined.
+static void SetDenormalState(bool flush_zero_mode, bool denormals_zero_mode) {
+#if defined(DENORM_USE_INTRINSICS) && defined(PADDLE_WITH_SSE3)
+  // Background: Intel's C and Fortran compilers already turn on DAZ and FTZ
+  // for SSE at optimization levels above -O0, and the AArch32 NEON (SIMD) FPU
+  // always runs in a flush-to-zero mode.
+  // See https://en.wikipedia.org/wiki/Denormal_number
+  // and https://software.intel.com/sites/landingpage/IntrinsicsGuide/
+  const auto ftz_setting =
+      flush_zero_mode ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
+  const auto daz_setting =
+      denormals_zero_mode ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
+  _MM_SET_FLUSH_ZERO_MODE(ftz_setting);
+  _MM_SET_DENORMALS_ZERO_MODE(daz_setting);
+#endif
+}
+// Reports the current denormal handling as {flush-to-zero on, denormals-are-
+// zero on}. When the intrinsics are not compiled in (DENORM_USE_INTRINSICS or
+// PADDLE_WITH_SSE3 undefined) the result is always {false, false}.
+static std::pair<bool, bool> GetDenormalState() {
+#if defined(DENORM_USE_INTRINSICS) && defined(PADDLE_WITH_SSE3)
+  const bool ftz_on = (_MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON);
+  const bool daz_on =
+      (_MM_GET_DENORMALS_ZERO_MODE() == _MM_DENORMALS_ZERO_ON);
+  return std::make_pair(ftz_on, daz_on);
+#else
+  return std::make_pair(false, false);
+#endif
+}
+// Records the FTZ/DAZ modes reported by GetDenormalState() at construction
+// time; the destructor later writes these values back.
+ScopedRestoreFlushDenormalState::ScopedRestoreFlushDenormalState() {
+  const std::pair<bool, bool> saved_state = GetDenormalState();
+  flush_zero_mode_ = saved_state.first;
+  denormals_zero_mode_ = saved_state.second;
+}
+// Re-applies the FTZ/DAZ modes that were captured by the constructor.
+ScopedRestoreFlushDenormalState::~ScopedRestoreFlushDenormalState() {
+  SetDenormalState(flush_zero_mode_, denormals_zero_mode_);
+}
+// Requests both flush-to-zero and denormals-are-zero. The restore_ member is
+// constructed before this body runs, so it has already captured the previous
+// modes by the time they are overwritten here.
+ScopedFlushDenormal::ScopedFlushDenormal() { SetDenormalState(true, true); }
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/fluid/platform/denormal.h
+++ b/paddle/fluid/platform/denormal.h
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/platform/macros.h"
+namespace paddle {
+namespace platform {
+// Used to restore the initial value at the end of the scope.
+// The constructor records the flush-to-zero / denormals-are-zero modes that
+// are in effect at that moment, and the destructor writes the recorded modes
+// back.
+class ScopedRestoreFlushDenormalState {
+ public:
+  // Captures the current modes.
+  ScopedRestoreFlushDenormalState();
+  // Re-applies the captured modes.
+  ~ScopedRestoreFlushDenormalState();
+ private:
+  bool flush_zero_mode_;      // flush-to-zero mode seen at construction
+  bool denormals_zero_mode_;  // denormals-are-zero mode seen at construction
+  DISABLE_COPY_AND_ASSIGN(ScopedRestoreFlushDenormalState);
+};
+// Scope guard that requests flush-to-zero and denormals-are-zero for its
+// lifetime. The previous modes are captured by restore_ and written back when
+// this object is destroyed.
+class ScopedFlushDenormal {
+ public:
+  // Turns both modes on (after restore_ has captured the previous ones).
+  ScopedFlushDenormal();
+ private:
+  // Holds the modes in effect before this guard was created; its destructor
+  // reinstates them.
+  ScopedRestoreFlushDenormalState restore_;
+  DISABLE_COPY_AND_ASSIGN(ScopedFlushDenormal);
+};
+}  // namespace platform
+}  // namespace paddle