Unverified commit cbe64cc1, authored by JingZhuangzhuang, committed by GitHub

Support inference compilation in training package (#46008)

* Merge python lib
* Update third_party.cmake
* Update CMakeLists.txt
Parent 9718791c
......@@ -249,7 +249,7 @@ option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
option(ON_INFER "Turn on inference optimization and inference-lib generation"
-OFF)
+ON)
################################ Internal Configurations #######################################
option(WITH_NV_JETSON "Compile PaddlePaddle with NV JETSON" OFF)
option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools"
......
......@@ -236,7 +236,7 @@ endif()
if(WIN32
OR APPLE
OR NOT WITH_GPU
-OR ON_INFER)
+OR (ON_INFER AND NOT WITH_PYTHON))
set(WITH_DGC OFF)
endif()
......
......@@ -21,17 +21,8 @@
#include "paddle/phi/core/utils/rw_lock.h"
-// When in inference scenario, the scopes will not be written by two threads in
-// a mean time, but a scope may be read by multiple threads concurrently, and
-// the mutex will cause serious performance issue.
-// So the mutex is disabled when `ON_INFER`.
-#ifdef PADDLE_ON_INFERENCE
-#define SCOPE_VARS_READER_LOCK
-#define SCOPE_VARS_WRITER_LOCK
-#else
#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_);
#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_);
-#endif
namespace paddle {
namespace framework {
......
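For context: the two #define lines kept above are RAII reader/writer lock guards, and after this commit they are always compiled in rather than stripped under ON_INFER. Below is a minimal self-contained sketch of the same reader/writer pattern, using std::shared_mutex as a stand-in for phi::RWLock; ToyScope and everything in it are illustrative, not the actual Paddle source.

    #include <map>
    #include <memory>
    #include <mutex>
    #include <shared_mutex>
    #include <string>

    struct Variable {};  // stand-in for paddle::framework::Variable

    class ToyScope {
     public:
      // Read path: many threads may hold the reader lock concurrently,
      // mirroring what SCOPE_VARS_READER_LOCK provides.
      Variable* FindVarLocally(const std::string& name) const {
        std::shared_lock<std::shared_mutex> lock(vars_lock_);
        auto it = vars_.find(name);
        return it != vars_.end() ? it->second.get() : nullptr;
      }
      // Write path: exclusive access, mirroring SCOPE_VARS_WRITER_LOCK.
      Variable* Var(const std::string& name) {
        std::unique_lock<std::shared_mutex> lock(vars_lock_);
        auto& slot = vars_[name];
        if (!slot) slot = std::make_unique<Variable>();
        return slot.get();
      }
     private:
      std::map<std::string, std::unique_ptr<Variable>> vars_;
      mutable std::shared_mutex vars_lock_;
    };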
......@@ -24,17 +24,8 @@
#include "paddle/fluid/platform/event.h"
#include "paddle/phi/core/utils/rw_lock.h"
-// When in inference scenario, the scopes will not be written by two threads in
-// a mean time, but a scope may be read by multiple threads concurrently, and
-// the mutex will cause serious performance issue.
-// So the mutex is disabled when `ON_INFER`.
-#ifdef PADDLE_ON_INFERENCE
-#define SCOPE_VARS_READER_LOCK
-#define SCOPE_VARS_WRITER_LOCK
-#else
#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_);
#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_);
-#endif
namespace paddle {
namespace framework {
......
......@@ -25,21 +25,10 @@ PADDLE_DEFINE_EXPORTED_bool(
"Delete local scope eagerly. It will reduce GPU memory usage but "
"slow down the destruction of variables.(around 1% performance harm)");
-// When in inference scenario, the scopes will not be written by two threads in
-// a mean time, but a scope may be read by multiple threads concurrently, and
-// the mutex will cause serious performance issue.
-// So the mutex is disabled when `ON_INFER`.
-#ifdef PADDLE_ON_INFERENCE
-#define SCOPE_KIDS_READER_LOCK
-#define SCOPE_KIDS_WRITER_LOCK
-#define SCOPE_VARS_READER_LOCK
-#define SCOPE_VARS_WRITER_LOCK
-#else
#define SCOPE_KIDS_READER_LOCK phi::AutoRDLock auto_lock(&kids_lock_);
#define SCOPE_KIDS_WRITER_LOCK phi::AutoWRLock auto_lock(&kids_lock_);
#define SCOPE_VARS_READER_LOCK phi::AutoRDLock auto_lock(&vars_lock_);
#define SCOPE_VARS_WRITER_LOCK phi::AutoWRLock auto_lock(&vars_lock_);
-#endif
namespace paddle {
namespace framework {
......
......@@ -179,12 +179,9 @@ class Scope : public ScopeBase {
DISABLE_COPY_AND_ASSIGN(Scope);
-#ifndef PADDLE_ON_INFERENCE
private:
mutable phi::RWLock kids_lock_;
mutable phi::RWLock vars_lock_;
-#endif
};
// Generate some debug string about the inherience structure of scope, quite
......
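The kids_lock_/vars_lock_ members above are now unconditional. AutoRDLock/AutoWRLock, used by the macros earlier in this diff, are RAII guards over phi::RWLock; here is a simplified sketch of the pattern, assuming RWLock wraps pthread_rwlock_t as paddle/phi/core/utils/rw_lock.h does on Linux. The stub is illustrative, not the exact source.

    #include <pthread.h>

    // Illustrative stub of phi::RWLock.
    struct RWLock {
      RWLock() { pthread_rwlock_init(&lock_, nullptr); }
      ~RWLock() { pthread_rwlock_destroy(&lock_); }
      void RDLock() { pthread_rwlock_rdlock(&lock_); }
      void WRLock() { pthread_rwlock_wrlock(&lock_); }
      void UNLock() { pthread_rwlock_unlock(&lock_); }
      pthread_rwlock_t lock_;
    };

    // RAII reader guard shaped like phi::AutoRDLock: the lock is released
    // when the guard goes out of scope, even on early return.
    class AutoRDLock {
     public:
      explicit AutoRDLock(RWLock* lock) : lock_(lock) { lock_->RDLock(); }
      ~AutoRDLock() { lock_->UNLock(); }
      AutoRDLock(const AutoRDLock&) = delete;
      AutoRDLock& operator=(const AutoRDLock&) = delete;
     private:
      RWLock* lock_;
    };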
......@@ -388,11 +388,7 @@ PADDLE_DEFINE_EXPORTED_int32(
* enable garbage collection strategy when training large networks.
*/
// Disable gc by default when inference library is built
-#ifdef PADDLE_ON_INFERENCE
-static const double kDefaultEagerDeleteTensorGB = -1;
-#else
static const double kDefaultEagerDeleteTensorGB = 0;
-#endif
PADDLE_DEFINE_EXPORTED_double(
eager_delete_tensor_gb,
......@@ -663,11 +659,7 @@ PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run");
* If FLAGS_call_stack_level == 2, the python stack, c++ stack, and error
* message summary will be shown.
*/
-#ifdef PADDLE_ON_INFERENCE
-static const int32_t kDefaultCallStackLevel = 2;
-#else
static const int32_t kDefaultCallStackLevel = 1;
-#endif
PADDLE_DEFINE_EXPORTED_int32(
call_stack_level,
......
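The defaults kept above (eager_delete_tensor_gb = 0, call_stack_level = 1) are now unconditional instead of switching on PADDLE_ON_INFERENCE. As a hedged illustration of how a leveled flag like call_stack_level is typically consumed: FormatError and its arguments below are hypothetical, not Paddle's actual error formatter, and only the level-2 behavior is documented in the comment above.

    #include <cstdint>
    #include <string>

    extern int32_t FLAGS_call_stack_level;  // exported by PADDLE_DEFINE_EXPORTED_int32

    // Hypothetical consumer: higher levels prepend more context to the error.
    std::string FormatError(const std::string& summary,
                            const std::string& cpp_stack,
                            const std::string& python_stack) {
      std::string msg;
      if (FLAGS_call_stack_level >= 2) msg += cpp_stack;     // level 2 adds the c++ stack
      if (FLAGS_call_stack_level >= 1) msg += python_stack;  // assumed level-1 behavior
      msg += summary;                                        // summary is always shown
      return msg;
    }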
......@@ -140,7 +140,7 @@ if(WITH_CUSTOM_DEVICE)
set(PYBIND_DEPS ${PYBIND_DEPS} custom_device_common_op_registry)
endif()
-if(NOT ON_INFER)
+if(WITH_PYTHON)
set(PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer)
if(WITH_NCCL OR WITH_RCCL)
set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl)
......
......@@ -2547,7 +2547,7 @@ All parameter, weight, gradient are variables in Paddle.
BindCompatible(&m);
BindDataset(&m);
BindGenerator(&m);
-#ifndef PADDLE_ON_INFERENCE
+#ifndef PADDLE_NO_PYTHON
BindDistributed(&m);
#endif
#ifdef PADDLE_WITH_ASCEND
......
......@@ -799,7 +799,7 @@ py_test_modules(
# it is found that windows CI will run all the training unittests with the ON_INFER option turned on,
# which will not appear in other CIs. The calculation behavior of some ops in inference mode is
# inconsistent with that in non-inference mode.
-if(NOT ON_INFER)
+if(WITH_PYTHON)
py_test_modules(test_parallel_executor_seresnext_base_cpu MODULES
test_parallel_executor_seresnext_base_cpu)
py_test_modules(test_parallel_executor_seresnext_with_reduce_cpu MODULES
......
......@@ -34,7 +34,7 @@ set(TEST_EAGER_OPS
list(REMOVE_ITEM TEST_OPS test_lac)
# NOTE(Aurelius84): In case of Windows CI, if open ON_INFER, RWLOCK of Scope
# will be removed and will cause some random failed in multi-thread.
-if(NOT ON_INFER)
+if(WITH_PYTHON)
py_test_modules(test_lac MODULES test_lac ENVS FLAGS_enable_eager_mode=1)
set_tests_properties(test_lac PROPERTIES TIMEOUT 120)
endif()
......