[CHERRY-PICK2.1]Remove paddle_custom_op dynamic libraries, and link to...

[CHERRY-PICK2.1]Remove paddle_custom_op dynamic libraries, and link to FLUID_CORE on windows (#32583) (#32769) * Remove paddle_custom_op dynamic libraries, change link to FLUID_CORE on windows, and check copy_to * fix CI

[CHERRY-PICK2.1]Remove paddle_custom_op dynamic libraries, and link to...
[CHERRY-PICK2.1]Remove paddle_custom_op dynamic libraries, and link to FLUID_CORE on windows (#32583) (#32769) * Remove paddle_custom_op dynamic libraries, change link to FLUID_CORE on windows, and check copy_to * fix CI
3ba8c48a · Zhou Wei · GitHub · 70e0e3d5 · 3ba8c48a · 3ba8c48a
13 changed files
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -369,36 +369,3 @@ cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES})
 if(WITH_TESTING AND TEST selected_rows_test)
  set_tests_properties(selected_rows_test PROPERTIES TIMEOUT 120)
 endif()
-##### 2.0 New custom op extension mechanism related #####
-# if not deps `layer`, will cause: undefined symbol: _ZN6paddle10imperative7VarBase9name_set_
-if (WIN32)
-  set(PADDLE_CUSTOM_OP_MODULES custom_tensor op_meta_info custom_operator layer)
-  set(PADDLE_CUSTOM_OP_SRCS
-      ${CMAKE_CURRENT_SOURCE_DIR}/custom_operator.cc
-      ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_tensor.cc
-      ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_op_meta_info.cc
-      ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc)
-  set(PADDLE_CUSTOM_OP_SRCS ${PADDLE_CUSTOM_OP_SRCS} PARENT_SCOPE)
-  cc_library(paddle_custom_op_shared
-      SHARED SRCS ${PADDLE_CUSTOM_OP_SRCS} DEPS ${PADDLE_CUSTOM_OP_MODULES})
-  get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
-  set_target_properties(paddle_custom_op_shared PROPERTIES OUTPUT_NAME paddle_custom_op)
-  target_link_libraries(paddle_custom_op_shared ${os_dependency_modules})
-  if("${CMAKE_GENERATOR}" STREQUAL "Ninja")
-    set(paddle_custom_op_lib_path ${CMAKE_CURRENT_BINARY_DIR})
-  else()
-    set(paddle_custom_op_lib_path ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE})
-  endif()
-  set(PADDLE_CUSTOM_OP_IMPORT_LIB
-      ${paddle_custom_op_lib_path}/paddle_custom_op.lib
-      CACHE INTERNAL "Paddle custom op import lib")
-  set(PADDLE_CUSTOM_OP_SHARED_LIB
-      ${paddle_custom_op_lib_path}/paddle_custom_op.dll
-      CACHE INTERNAL "Paddle custom op dll")
-endif()
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -54,14 +54,14 @@ wmic process where name="python.exe" call terminate 2>NUL
 rem ------initialize common variable------
 if not defined GENERATOR set GENERATOR="Visual Studio 15 2017 Win64"
 if not defined BRANCH set BRANCH=develop
-if not defined WITH_TENSORRT set WITH_TENSORRT=ON 
+if not defined WITH_TENSORRT set WITH_TENSORRT=ON
 if not defined TENSORRT_ROOT set TENSORRT_ROOT=D:/TensorRT
 if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=Auto
 if not defined WITH_GPU set WITH_GPU=ON
 if not defined WITH_MKL set WITH_MKL=ON
 if not defined WITH_AVX set WITH_AVX=ON
 if not defined WITH_TESTING set WITH_TESTING=ON
-if not defined MSVC_STATIC_CRT set MSVC_STATIC_CRT=OFF
+if not defined MSVC_STATIC_CRT set MSVC_STATIC_CRT=ON
 if not defined WITH_PYTHON set WITH_PYTHON=ON
 if not defined ON_INFER set ON_INFER=ON
 if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON
@@ -75,6 +75,7 @@ if not defined LOG_LEVEL set LOG_LEVEL=normal
 if not defined PRECISION_TEST set PRECISION_TEST=OFF
 if not defined NIGHTLY_MODE set PRECISION_TEST=OFF
 if not defined retry_times set retry_times=2
+if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37
 rem -------set cache build directory-----------
 rmdir build\python /s/q
@@ -83,9 +84,6 @@ rmdir build\paddle_inference_install_dir /s/q
 rmdir build\paddle_inference_c_install_dir /s/q
 del build\CMakeCache.txt
-: set CI_SKIP_CPP_TEST if only *.py changed
-git diff --name-only %BRANCH% | findstr /V "\.py" || set CI_SKIP_CPP_TEST=ON
 if "%WITH_CACHE%"=="OFF" (
    rmdir build /s/q
    goto :mkbuild
@@ -135,58 +133,6 @@ dir .
 dir %cache_dir%
 dir paddle\fluid\pybind\Release
-rem ------initialize the python environment------
-if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37
-set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe
-set PATH=%PYTHON_ROOT%;%PYTHON_ROOT%\Scripts;%PATH%
-rem ToDo: virtual environment can't be deleted safely, some process not exit when task is canceled
-rem Now use system python environment temporarily
-rem %PYTHON_EXECUTABLE% -m pip install virtualenv
-rem %PYTHON_EXECUTABLE% -m virtualenv paddle_winci
-rem call paddle_winci\Scripts\activate.bat
-rem ------pre install python requirement----------
-where python
-where pip
-pip install wheel --user
-pip install -r %work_dir%\python\requirements.txt --user
-if %ERRORLEVEL% NEQ 0 (
-    echo pip install requirements.txt failed!
-    exit /b 7
-)
-rem ------pre install clcache and init config----------
-rem pip install clcache --user
-pip uninstall -y clcache
-:: set USE_CLCACHE to enable clcache
-rem set USE_CLCACHE=1
-:: In some scenarios, CLCACHE_HARDLINK can save one file copy.
-rem set CLCACHE_HARDLINK=1
-:: If it takes more than 1000s to obtain the right to use the cache, an error will be reported
-rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000
-:: set maximum cache size to 20G
-rem clcache.exe -M 21474836480
-:: install ninja if GENERATOR is Ninja
-if %GENERATOR% == "Ninja" (
-    pip install ninja
-    if %errorlevel% NEQ 0 (
-        echo pip install ninja failed!
-        exit /b 7
-    )
-)
-rem ------show summary of current environment----------
-cmake --version
-if "%WITH_GPU%"=="ON" (
-    nvcc --version
-    nvidia-smi
-)
-::python %work_dir%\tools\summary_env.py
-::%cache_dir%\tools\busybox64.exe bash %work_dir%\tools\get_cpu_info.sh
 goto :CASE_%1
 echo "Usage: paddle_build.bat [OPTION]"
@@ -266,8 +212,10 @@ rem "Other configurations are added here"
 rem :CASE_wincheck_others
 rem call ...
 rem ---------------------------------------------------------------------------------------------
 :cmake
+@ECHO OFF
 echo    ========================================
 echo    Step 1. Cmake ...
 echo    ========================================
@@ -281,12 +229,52 @@ set PATH=C:\Program Files (x86)\Windows Kits\10\bin\10.0.17763.0\x64;%PATH%
 for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%#
 set start=%start:~4,10%
-@ECHO ON
+if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2
-if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0
 set PATH=%TENSORRT_ROOT:/=\%\lib;%CUDA_TOOLKIT_ROOT_DIR%\bin;%CUDA_TOOLKIT_ROOT_DIR%\libnvvp;%PATH%
-rem ------set third_party cache dir------
+rem install ninja if GENERATOR is Ninja; the exit code is tested with "if errorlevel 1" because a percent-expanded errorlevel inside a parenthesized block is substituted when the block is parsed, before pip runs
+if %GENERATOR% == "Ninja" (
+    pip install ninja
+    if errorlevel 1 (
+        echo pip install ninja failed!
+        exit /b 7
+    )
+)
+rem ------show summary of current GPU environment----------
+cmake --version
+if "%WITH_GPU%"=="ON" (
+    nvcc --version
+    nvidia-smi
+)
+rem ------initialize the python environment (requirements are installed when WITH_PYTHON is ON)------
+set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe
+set PATH=%PYTHON_ROOT%;%PYTHON_ROOT%\Scripts;%PATH%
+if "%WITH_PYTHON%" == "ON" (
+    where python
+    where pip
+    pip install wheel --user
+    pip install -r %work_dir%\python\requirements.txt --user
+    if errorlevel 1 (
+        echo pip install requirements.txt failed!
+        exit /b 7
+    )
+)
+rem ------pre install clcache and init config----------
+rem pip install clcache --user
+pip uninstall -y clcache
+:: set USE_CLCACHE to enable clcache
+rem set USE_CLCACHE=1
+:: In some scenarios, CLCACHE_HARDLINK can save one file copy.
+rem set CLCACHE_HARDLINK=1
+:: If it takes more than 1000s to obtain the right to use the cache, an error will be reported
+rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000
+:: set maximum cache size to 20G
+rem clcache.exe -M 21474836480
+rem ------set third_party cache dir------
 : clear third party cache every once in a while
 for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%#
 set day_now=%datetime:~6,2%
@@ -500,6 +488,10 @@ echo    ========================================
 echo    Step 4. Running unit tests ...
 echo    ========================================
+: set CI_SKIP_CPP_TEST if only *.py changed
+git diff --name-only %BRANCH% | findstr /V "\.py" || set CI_SKIP_CPP_TEST=ON
 pip install -r %work_dir%\python\unittest_py\requirements.txt --user
 if %ERRORLEVEL% NEQ 0 (
    echo pip install unittest requirements.txt failed!

--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -43,9 +43,20 @@ set(FLUID_DST_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/)
 IF(WIN32)
    # Python would use the .pyd by default under Windows series platform
    set(FLUID_CORE ${FLUID_DST_DIR}/${FLUID_CORE_NAME}.pyd)
-    set(FLUID_NOAVX_CORE ${FLUID_DST_DIR}/core_noavx.pyd)
+    set(FLUID_CORE_LIB ${FLUID_DST_DIR}/${FLUID_CORE_NAME}.lib)  # import library copied next to the .pyd
+    add_custom_command(OUTPUT ${FLUID_CORE} ${FLUID_CORE_LIB}  # declare both copied files as outputs of this rule
+      COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
+      COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_LINKER_FILE:paddle_pybind> ${FLUID_CORE_LIB}
+      DEPENDS paddle_pybind VERBATIM)
+    set(FLUID_NOAVX_CORE ${FLUID_DST_DIR}/core_noavx.pyd)
 ELSE()
    set(FLUID_CORE ${FLUID_DST_DIR}/${FLUID_CORE_NAME}.so)
+    add_custom_command(OUTPUT ${FLUID_CORE}  # copy the built pybind module to its Python package location
+        COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
+        DEPENDS paddle_pybind VERBATIM)
    set(FLUID_NOAVX_CORE ${FLUID_DST_DIR}/core_noavx.so)
 ENDIF()
@@ -68,9 +79,6 @@ if(HAS_NOAVX_CORE AND EXISTS "${NOAVX_CORE_FILE}")
  list(APPEND FLUID_CORE_DEPS ${FLUID_NOAVX_CORE})
 endif()
-add_custom_command(OUTPUT ${FLUID_CORE}
-        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
-        DEPENDS paddle_pybind)
 add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE_DEPS})
 IF(WIN32)
@@ -84,6 +92,7 @@ ELSE(WIN32)
    COMMAND touch stub.cc
    COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python
    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
+    COMMENT "Packing whl packages------>>>"
    DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES})
 ENDIF()

--- a/python/paddle/check_import_scipy.py
+++ b/python/paddle/check_import_scipy.py
@@ -24,6 +24,6 @@ def check_import_scipy(OsName):
            if 'DLL load failed' in print_info:
                raise ImportError(
                    print_info +
-                    "\nplease download visual C++ Redistributable for vs 2015, https://www.microsoft.com/en-us/download/details.aspx?id=48145"
+                    "\nplease download Visual C++ Redistributable from https://support.microsoft.com/en-us/topic/the-latest-supported-visual-c-downloads-2647da03-1eea-4433-9aff-95f26a218cc0"
                )
    return
--- a/python/paddle/fluid/core.py
+++ b/python/paddle/fluid/core.py
@@ -37,7 +37,10 @@ if os.path.exists(current_path + os.sep + 'core_noavx.' + core_suffix):
 try:
    if os.name == 'nt':
        third_lib_path = current_path + os.sep + '..' + os.sep + 'libs'
-        os.environ['path'] = third_lib_path + ';' + os.environ['path']
+        # Will load shared library from 'path' on windows
+        os.environ[
+            'path'] = current_path + ';' + third_lib_path + ';' + os.environ[
+                'path']
        sys.path.insert(0, third_lib_path)
        # Note: from python3.8, PATH will not take effect
        # https://github.com/python/cpython/pull/12302
@@ -298,7 +301,7 @@ if avx_supported():
                "WARNING: AVX is supported on local machine, but you have installed "
                "paddlepaddle without avx core. Hence, no_avx core which has worse "
                "preformance will be imported.\nYou could reinstall paddlepaddle by "
-                "'python -m pip install -U paddlepaddle-gpu[==version]' or rebuild "
+                "'python -m pip install --force-reinstall paddlepaddle-gpu[==version]' or rebuild "
                "paddlepaddle WITH_AVX=ON to get better performance.\n"
                "The original error is: %s\n" % cpt.get_exception_message(e))
            load_noavx = True
@@ -350,12 +353,19 @@ if load_noavx:
            sys.stderr.write(
                'Error: Can not import noavx core while this file exists: ' +
                current_path + os.sep + 'core_noavx.' + core_suffix + '\n')
+        elif avx_supported():
+            sys.stderr.write(
+                "Error: AVX is support on your machine, but you have installed "
+                "paddlepaddle without avx core, you should reinstall paddlepaddle by "
+                "'python -m pip install --force-reinstall paddlepaddle-gpu[==version]\n"
+            )
        else:
            sys.stderr.write(
                "Error: AVX is not support on your machine, but you have installed "
-                "paddlepaddle with avx core, you should reinstall paddlepaddle by "
+                "paddlepaddle without no_avx core, you should reinstall paddlepaddle by "
-                "'python -m pip install -U paddlepaddle-gpu[==version] -f "
+                "'python -m pip install --force-reinstall paddlepaddle-gpu[==version] -f "
-                "https://paddlepaddle.org.cn/whl/stable_noavx.html'\n")
+                "https://paddlepaddle.org.cn/whl/mkl/stable/noavx.html or "
+                "https://paddlepaddle.org.cn/whl/openblas/stable/noavx.html\n")
        raise e

--- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt
+++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt
-# New custom OP can support Windows/Linux now
+# New custom OP can support Windows/Linux/Mac now
-if(WITH_GPU OR APPLE) 
+if(WITH_GPU OR APPLE)
-    # GPU custom op tests: compile both .cc and .cu file
    py_test(test_custom_relu_op_setup SRCS test_custom_relu_op_setup.py)
    py_test(test_custom_relu_op_jit SRCS test_custom_relu_op_jit.py)
    py_test(test_custom_relu_model SRCS test_custom_relu_model.py)

--- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu
+++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu
@@ -45,8 +45,12 @@ std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x) {
  int grid = (numel + block - 1) / block;
  PD_DISPATCH_FLOATING_AND_HALF_TYPES(
      x.type(), "relu_cuda_forward_kernel", ([&] {
+        auto cpu_input = x.copy_to<data_t>(paddle::PlaceType::kCPU);
+        auto gpu_input = cpu_input.copy_to<data_t>(paddle::PlaceType::kGPU);
        relu_cuda_forward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
-            x.data<data_t>(), out.mutable_data<data_t>(x.place()), numel);
+            gpu_input.data<data_t>(),
+            out.mutable_data<data_t>(x.place()),
+            numel);
      }));
  return {out};

--- a/python/paddle/fluid/tests/custom_op/test_check_abi.py
+++ b/python/paddle/fluid/tests/custom_op/test_check_abi.py
@@ -64,14 +64,29 @@ class TestCheckCompiler(TestABIBase):
        # clear environ
        self.del_environ()
        compiler = 'python'  # fake wrong compiler
-        with warnings.catch_warnings(record=True) as error:
+        if not utils.IS_WINDOWS:
-            flag = utils.check_abi_compatibility(compiler, verbose=True)
+            with warnings.catch_warnings(record=True) as error:
-            # check return False
+                flag = utils.check_abi_compatibility(compiler, verbose=True)
-            self.assertFalse(flag)
+                # check return False
-            # check Compiler Compatibility WARNING
+                self.assertFalse(flag)
-            self.assertTrue(len(error) == 1)
+                # check Compiler Compatibility WARNING
-            self.assertTrue(
+                self.assertTrue(len(error) == 1)
-                "Compiler Compatibility WARNING" in str(error[0].message))
+                self.assertTrue(
+                    "Compiler Compatibility WARNING" in str(error[0].message))
+    def test_exception_windows(self):
+        # clear environ
+        self.del_environ()
+        compiler = 'fake compiler'  # not a real compiler command
+        if utils.IS_WINDOWS:  # assertions only run on Windows; elsewhere the body is skipped
+            with warnings.catch_warnings(record=True) as error:
+                flag = utils.check_abi_compatibility(compiler, verbose=True)
+                # check return False
+                self.assertFalse(flag)
+                # exactly one warning is expected: the failed compiler version check
+                self.assertTrue(len(error) == 1)
+                self.assertTrue("Failed to check compiler version for" in
+                                str(error[0].message))
    def test_exception_linux(self):
        # clear environ

--- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py
@@ -105,12 +105,12 @@ class TestJITLoad(unittest.TestCase):
                in str(e))
            if IS_WINDOWS:
                self.assertTrue(
-                    r"python\paddle\fluid\tests\custom_op\custom_relu_op.cc:47"
+                    r"python\paddle\fluid\tests\custom_op\custom_relu_op.cc" in
-                    in str(e))
+                    str(e))
            else:
                self.assertTrue(
-                    "python/paddle/fluid/tests/custom_op/custom_relu_op.cc:47"
+                    "python/paddle/fluid/tests/custom_op/custom_relu_op.cc" in
-                    in str(e))
+                    str(e))
        self.assertTrue(caught_exception)
        caught_exception = False
@@ -126,7 +126,7 @@ class TestJITLoad(unittest.TestCase):
                "function \"relu_cuda_forward_kernel\" is not implemented for data type `int32_t`"
                in str(e))
            self.assertTrue(
-                "python/paddle/fluid/tests/custom_op/custom_relu_op.cu:50" in
+                "python/paddle/fluid/tests/custom_op/custom_relu_op.cu" in
                str(e))
        self.assertTrue(caught_exception)

--- a/python/paddle/utils/cpp_extension/cpp_extension.py
+++ b/python/paddle/utils/cpp_extension/cpp_extension.py
@@ -26,7 +26,7 @@ from .extension_utils import find_cuda_home, find_rocm_home, normalize_extension
 from .extension_utils import is_cuda_file, prepare_unix_cudaflags, prepare_win_cudaflags
 from .extension_utils import _import_module_from_library, _write_setup_file, _jit_compile
 from .extension_utils import check_abi_compatibility, log_v, CustomOpInfo, parse_op_name_from
-from .extension_utils import clean_object_if_change_cflags, _reset_so_rpath
+from .extension_utils import clean_object_if_change_cflags, _reset_so_rpath, _get_fluid_path
 from .extension_utils import bootstrap_context, get_build_directory, add_std_without_repeat
 from .extension_utils import IS_WINDOWS, OS_NAME, MSVC_COMPILE_FLAGS, MSVC_COMPILE_FLAGS
@@ -69,7 +69,7 @@ def setup(**attr):
    For Linux, GCC version will be checked . For example if Paddle with CUDA 10.1 is built with GCC 8.2, 
    then the version of user's local machine should satisfy GCC >= 8.2. 
    For Windows, Visual Studio version will be checked, and it should be greater than or equal to that of 
-    PaddlePaddle (Visual Studio 2015 update3). 
+    PaddlePaddle (Visual Studio 2017). 
    If the above conditions are not met, the corresponding warning will be printed, and a fatal error may 
    occur because of ABI compatibility.
@@ -79,7 +79,7 @@ def setup(**attr):
        2. On Linux platform, we recommend to use GCC 8.2 as soft linking condidate of ``/usr/bin/cc`` .
           Then, Use ``which cc`` to ensure location of ``cc`` and using ``cc --version`` to ensure linking 
           GCC version.
-        3. On Windows platform, we recommend to install `` Visual Studio`` (>=2015 update3).
+        3. On Windows platform, we recommend to install `` Visual Studio`` (>=2017).
    Compared with Just-In-Time ``load`` interface, it only compiles once by executing
@@ -611,7 +611,7 @@ class BuildExtension(build_ext, object):
            msg = (
                'It seems that the VC environment is activated but DISTUTILS_USE_SDK is not set.'
                'This may lead to multiple activations of the VC env.'
-                'Please set `DISTUTILS_USE_SDK=1` and try again.')
+                'Please run `set DISTUTILS_USE_SDK=1` and try again.')
            raise UserWarning(msg)
    def _record_op_info(self):
@@ -724,7 +724,7 @@ def load(name,
    processes under a individual subprocess. It does not require CMake or Ninja 
    environment. On Linux platform, it requires GCC compiler whose version is 
    greater than 5.4 and it should be soft linked to ``/usr/bin/cc`` . On Windows 
-    platform, it requires Visual Studio whose version is greater than 2015 update3.
+    platform, it requires Visual Studio whose version is greater than 2017.
    On MacOS, clang++ is requited. In addition, if compiling Operators supporting 
    GPU device, please make sure ``nvcc`` compiler is installed in local environment.
@@ -735,7 +735,7 @@ def load(name,
    For Linux, GCC version will be checked . For example if Paddle with CUDA 10.1 is built with GCC 8.2, 
    then the version of user's local machine should satisfy GCC >= 8.2. 
    For Windows, Visual Studio version will be checked, and it should be greater than or equal to that of 
-    PaddlePaddle (Visual Studio 2015 update3). 
+    PaddlePaddle (Visual Studio 2017). 
    If the above conditions are not met, the corresponding warning will be printed, and a fatal error may 
    occur because of ABI compatibility.
@@ -749,7 +749,7 @@ def load(name,
        2. On Linux platform, we recommend to use GCC 8.2 as soft linking condidate of ``/usr/bin/cc`` .
           Then, Use ``which cc`` to ensure location of ``cc`` and using ``cc --version`` to ensure linking 
           GCC version.
-        3. On Windows platform, we recommend to install `` Visual Studio`` (>=2015 update3).
+        3. On Windows platform, we recommend to install `` Visual Studio`` (>=2017).
    **A simple example:**
@@ -802,9 +802,6 @@ def load(name,
    # ensure to use abs path
    build_directory = os.path.abspath(build_directory)
-    # Will load shared library from 'path' on windows
-    if IS_WINDOWS:
-        os.environ['path'] = build_directory + ';' + os.environ['path']
    log_v("build_directory: {}".format(build_directory), verbose)
@@ -827,6 +824,7 @@ def load(name,
    # write setup.py file and compile it
    build_base_dir = os.path.join(build_directory, name)
    _write_setup_file(name, sources, file_path, build_base_dir,
                      extra_include_paths, extra_cxx_cflags, extra_cuda_cflags,
                      extra_ldflags, verbose)

--- a/python/paddle/utils/cpp_extension/extension_utils.py
+++ b/python/paddle/utils/cpp_extension/extension_utils.py
@@ -55,7 +55,7 @@ CLANG_LINK_FLAGS = [
    '-dynamiclib', '-undefined', 'dynamic_lookup', '-arch', 'x86_64'
 ]
-MSVC_LINK_FLAGS = ['/MACHINE:X64', 'paddle_custom_op.lib']
+MSVC_LINK_FLAGS = ['/MACHINE:X64']
 COMMON_NVCC_FLAGS = ['-DPADDLE_WITH_CUDA', '-DEIGEN_USE_GPU']
@@ -371,10 +371,11 @@ def _get_core_name():
    Return pybind DSO module name.
    """
    import paddle
-    if paddle.fluid.core.load_noavx:
+    ext_name = '.pyd' if IS_WINDOWS else '.so'
-        return 'core_noavx.so'
+    if not paddle.fluid.core.load_noavx:
+        return 'core_avx' + ext_name
    else:
-        return 'core_avx.so'
+        return 'core_noavx' + ext_name
 def _get_lib_core_path():
@@ -386,6 +387,15 @@ def _get_lib_core_path():
    return os.path.join(_get_fluid_path(), lib_core_name)
+def _get_dll_core_path():
+    """
+    Return the path of `paddle_pybind.dll` inside the directory given by `_get_fluid_path()` (used on Windows).
+    """
+    raw_core_name = _get_core_name()  # NOTE(review): this value is not used in the returned path
+    dll_core_name = "paddle_pybind.dll"
+    return os.path.join(_get_fluid_path(), dll_core_name)
 def _reset_so_rpath(so_path):
    """
    NOTE(Aurelius84): Runtime path of core_(no)avx.so is modified into `@loader_path/../libs`
@@ -435,9 +445,12 @@ def normalize_extension_kwargs(kwargs, use_cuda=False):
        # append link flags
        extra_link_args = kwargs.get('extra_link_args', [])
        extra_link_args.extend(MSVC_LINK_FLAGS)
+        lib_core_name = create_sym_link_if_not_exist()
+        extra_link_args.append('{}'.format(lib_core_name))
        if use_cuda:
            extra_link_args.extend(['cudadevrt.lib', 'cudart_static.lib'])
        kwargs['extra_link_args'] = extra_link_args
    else:
        ########################### Linux Platform ###########################
        extra_link_args = kwargs.get('extra_link_args', [])
@@ -481,24 +494,41 @@ def create_sym_link_if_not_exist():
    """
    Create soft symbol link of `core_avx.so` or `core_noavx.so`
    """
-    assert OS_NAME.startswith('darwin')
+    assert OS_NAME.startswith('darwin') or IS_WINDOWS
    raw_core_name = _get_core_name()
    core_path = os.path.join(_get_fluid_path(), raw_core_name)
-    new_lib_core_path = _get_lib_core_path()
+    if IS_WINDOWS:
+        new_dll_core_path = _get_dll_core_path()
+        # create symbol link on windows
+        if not os.path.exists(new_dll_core_path):
+            try:
+                os.symlink(core_path, new_dll_core_path)
+            except Exception:
+                warnings.warn(
+                    "Failed to create soft symbol link for {}.\n You can run prompt as administrator and execute the "
+                    "following command manually: `mklink {} {}`. Now it will create hard link for {} trickly.".
+                    format(raw_core_name, new_dll_core_path, core_path,
+                           raw_core_name))
+                run_cmd('mklink /H {} {}'.format(new_dll_core_path, core_path))
+        # core_avx or core_noavx with lib suffix
+        assert os.path.exists(new_dll_core_path)
+        return raw_core_name[:-4] + ".lib"
-    # create symbol link
+    else:
-    if not os.path.exists(new_lib_core_path):
+        new_lib_core_path = _get_lib_core_path()
-        try:
+        # create symbol link on mac
-            os.symlink(core_path, new_lib_core_path)
+        if not os.path.exists(new_lib_core_path):
-            assert os.path.exists(new_lib_core_path)
+            try:
-        except Exception:
+                os.symlink(core_path, new_lib_core_path)
-            raise RuntimeError(
+                assert os.path.exists(new_lib_core_path)
-                "Failed to create soft symbol link for {}.\n Please execute the following command manually: `ln -s {} {}`".
+            except Exception:
-                format(raw_core_name, core_path, new_lib_core_path))
+                raise RuntimeError(
+                    "Failed to create soft symbol link for {}.\n Please execute the following command manually: `ln -s {} {}`".
+                    format(raw_core_name, core_path, new_lib_core_path))
-    # core_avx or core_noavx without suffix
+        # core_avx or core_noavx without suffix
-    return raw_core_name[:-3]
+        return raw_core_name[:-3]
 def find_cuda_home():
@@ -1054,20 +1084,20 @@ def check_abi_compatibility(compiler, verbose=False):
    if os.environ.get('PADDLE_SKIP_CHECK_ABI') in ['True', 'true', '1']:
        return True
-    which = 'where' if IS_WINDOWS else 'which'
+    if not IS_WINDOWS:
-    cmd_out = subprocess.check_output(
+        cmd_out = subprocess.check_output(
-        [which, compiler], stderr=subprocess.STDOUT)
+            ['which', compiler], stderr=subprocess.STDOUT)
-    compiler_path = os.path.realpath(cmd_out.decode()
+        compiler_path = os.path.realpath(cmd_out.decode()
-                                     if six.PY3 else cmd_out).strip()
+                                         if six.PY3 else cmd_out).strip()
-    # step 1. if not found any suitable compiler, raise error
+        # if not found any suitable compiler, raise warning
-    if not any(name in compiler_path
+        if not any(name in compiler_path
-               for name in _expected_compiler_current_platform()):
+                   for name in _expected_compiler_current_platform()):
-        warnings.warn(
+            warnings.warn(
-            WRONG_COMPILER_WARNING.format(
+                WRONG_COMPILER_WARNING.format(
-                user_compiler=compiler,
+                    user_compiler=compiler,
-                paddle_compiler=_expected_compiler_current_platform()[0],
+                    paddle_compiler=_expected_compiler_current_platform()[0],
-                platform=OS_NAME))
+                    platform=OS_NAME))
-        return False
+            return False
    version = (0, 0, 0)
    # clang++ have no ABI compatibility problem

--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -255,11 +255,15 @@ paddle_bins = ''
 if not '${WIN32}':
    paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
-package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + ('.so' if os.name != 'nt' else '.pyd')]}
+if os.name != 'nt':
+    package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + '.so']}
+else:
+    package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + '.pyd', '${FLUID_CORE_NAME}' + '.lib']}
 if '${HAS_NOAVX_CORE}' == 'ON':
    package_data['paddle.fluid'] += ['core_noavx' + ('.so' if os.name != 'nt' else '.pyd')]
 package_dir={
    '': '${PADDLE_BINARY_DIR}/python',
    # The paddle.fluid.proto will be generated while compiling.
@@ -353,14 +357,6 @@ if '${WITH_XPU}' == 'OFF' and '${XPU_SDK_ROOT}' != '':
        package_data['paddle.libs']+=['libxpurt.so']
-### New custom op extension mechanism related ###
-# copy paddle_custom_op.lib/paddle_custom_op.dll to libs on Windows
-if os.name == 'nt':
-    shutil.copy('${PADDLE_CUSTOM_OP_IMPORT_LIB}', libs_path)
-    shutil.copy('${PADDLE_CUSTOM_OP_SHARED_LIB}', libs_path)
-    package_data['paddle.libs'] += ['paddle_custom_op.lib', 'paddle_custom_op.dll']
 # remove unused paddle/libs/__init__.py
 if os.path.isfile(libs_path+'/__init__.py'):
    os.remove(libs_path+'/__init__.py')

--- a/tools/parallel_UT_rule.py
+++ b/tools/parallel_UT_rule.py
@@ -436,9 +436,172 @@ CPU_PARALLEL_JOB = [
    'assign_op_test',
    'allocator_facade_frac_flags_test',
    'aes_cipher_test',
+    'test_dist_sparse_tensor_load_adagrad',
+    'test_dist_mnist_fp16_allreduce',
+    'test_dist_mnist_gradient_merge',
+    'test_dist_allreduce_op',
+    'test_hdfs3',
+    'test_parallel_dygraph_se_resnext',
+    'test_dist_fleet_ps9',
+    'test_dist_fleet_infer',
+    'test_dist_se_resnext_sync',
+    'test_dist_oneps',
+    'test_dist_sparse_load_ps1',
+    'test_dist_mnist_batch_merge',
+    'test_dist_fleet_ctr',
+    'test_dist_fleet_ps10',
+    'test_parallel_dygraph_transformer',
+    'test_dist_mnist_fleetapi',
+    'test_dist_sparse_tensor_load_adam',
+    'test_dist_fleet_ps4',
+    'test_dist_fleet_heter_program',
+    'test_parallel_dygraph_sparse_embedding_over_height',
+    'test_hdfs2',
+    'test_dist_sharding_save',
+    'test_dist_fleet_ps_gpu_ctr',
+    'test_dist_mnist_backward_deps',
+    'test_dist_fleet_heter_base',
+    'test_dist_sparse_tensor_load_sgd',
+    'test_new_group',
+    'test_dist_mnist_with_program',
+    'test_dist_mnist_pg',
+    'test_dist_sparse_tensor_load_rmsprop',
+    'test_auto_checkpoint2',
+    'test_dist_sparse_tensor_load_ftrl',
+    'test_dist_fleet_ps6',
+    'test_dist_mnist_fleet_save',
+    'test_auto_checkpoint1',
+    'test_dist_fleet_a_sync_optimizer_sync',
+    'test_dist_fleet_ps3',
+    'test_dist_se_resnext_nccl',
+    'test_parallel_dygraph_mnist',
+    'test_auto_checkpoint_multiple',
+    'test_dist_fleet_a_sync_optimizer_auto_async',
+    'test_pipeline',
+    'test_dist_fleet_ps8',
+    'test_dist_fleet_sparse_embedding_ctr',
+    'test_dist_se_resnext_dgc',
+    'test_dist_fleet_ps7',
+    'test_dist_fleet_decay',
+    'test_dist_fleet_a_sync_optimizer_auto_geo',
+    'test_dist_fleet_geo',
+    'test_parallel_dygraph_dataparallel',
+    'test_hdfs1',
+    'test_dist_mnist_dgc_nccl',
+    'test_dist_fleet_ctr2',
+    'test_parallel_dygraph_unused_variables',
+    'test_dist_mnist_multi_comm',
+    'test_dist_sparse_tensor_load_momentum',
+    'test_gen_nccl_id_op',
+    'test_parallel_dygraph_sparse_embedding',
+    'test_dist_mnist_ring_allreduce',
+    'test_fleet_launch_async',
+    'test_dist_fleet_a_sync_optimizer_geo',
+    'test_parallel_dygraph_control_flow',
+    'test_auto_checkpoint',
+    'test_fleet_pipeline_meta_optimizer',
+    'test_dist_fleet_heter_ctr',
+    'test_fleet_graph_execution_meta_optimizer',
+    'test_fleet_run_random_port',
+    'test_dist_fleet_ps5',
+    'test_dist_fleet_a_sync_optimizer_auto',
+    'test_dist_lookup_sparse_table_fuse_ops',
+    'test_dist_fleet_a_sync_optimizer_async',
+    'test_c_comm_init_op',
+    'test_fleet_launch_nproc',
+    'test_dist_fleet_simnet',
+    'test_auto_checkpoint_dist_basic',
+    'test_fleet_launch_cloud',
+    'test_dist_fleet_ps',
+    'test_dist_op',
+    'test_dist_sparse_load_ps0',
+    'test_auto_checkpoint3',
+    'test_dist_fleet_ps2',
+    'test_dist_fleet_grad_clip',
+    'test_custom_concat',
+    'test_analyzer_transformer_fuse',
+    'test_analyzer_seq_pool1_fuse_statis',
+    'test_fc_lstm_fuse_pass_cc',
+    'test_layer_norm_fuse_pass',
+    'test_fc_gru_fuse_pass_cc',
+    'test_analyzer_save_model',
+    'test_fleet_ps',
+    'test_analyzer_multi_model_prediction',
+    'test_fleet_base_3',
+    'test_fleet_base_2',
+    'test_ascend_trigger',
+    'test_fleet_amp_meta_optimizer',
+    'test_fleetrun',
+    'test_check_abi',
+    'dense_table_test',
+    'test_adaptive_pool2d_convert_global_pass',
+    'test_fleet_recompute_meta_optimizer',
+    'test_fleet_fp16_allreduce_meta_optimizer',
+    'test_post_training_quantization_lstm_model',
+    'test_fleet_metric',
+    'test_fleet_gradient_merge_meta_optimizer',
+    'test_fleet_sharding_meta_optimizer',
+    'test_listen_and_serv_op',
+    'test_analyzer_zerocopytensor_tensor',
+    'test_conv_bn_fuse_pass_cc',
+    'test_collective_optimizer',
+    'test_bf16_utils',
+    'test_analyzer_seq_pool1_compare_determine',
+    'test_avoid_twice_initialization',
+    'test_callback_early_stop',
+    'test_fleet_distributed_strategy',
+    'test_launch_coverage',
+    'test_sgd_op_bf16',
+    'test_model_cast_to_bf16',
+    'test_hybrid_parallel_topology',
+    'barrier_table_test',
+    'test_check_error',
+    'test_fleet_lamb_meta_optimizer',
+    'test_fleet_rolemaker_2',
+    'test_distributed_strategy',
+    'test_rnn_cudnn_params_packing',
+    'test_communicator_async',
+    'brpc_utils_test',
+    'test_analyzer_capi_pd_tensor',
+    'test_recv_save_op',
+    'heter_listen_and_server_test',
+    'test_analyzer_capi_ner',
+    'test_unsqueeze2_eltwise_fuse_pass',
+    'test_dgc_optimizer',
+    'test_fleet_cc',
+    'test_repeated_fc_relu_fuse_pass_cc',
+    'heter_server_test',
+    'test_static_save_load_large',
+    'graph_node_test',
+    'test_custom_conj',
+    'test_fleet_private_function',
+    'test_fake_init_op',
+    'brpc_service_sparse_sgd_test',
+    'test_tf32_cudnn',
+    'test_communicator_geo',
+    'test_dispatch_jit',
+    'test_layer_norm_fuse_pass_cc',
+    'test_fleet_dgc_meta_optimizer',
+    'test_fc_fuse_pass_cc',
+    'test_communicator_sync',
+    'test_analyzer_capi',
+    'test_fleet_lars_meta_optimizer',
+    'test_communicator_half_async',
+    'test_fleet_localsgd_meta_optimizer',
+    'test_fleet_amp_init',
+    'test_fleet_checkpoint',
+    'test_analyzer_seq_pool1_fuse_compare_zero_copy',
+    'test_lookup_table_bf16_op',
+    'test_fleet_meta_optimizer_base',
+    'table_test',
+    'test_fleet_rolemaker_new',
+    'test_fleet_graph_executor',
+    'test_multi_out_jit',
+    'test_fleet_utils',
+    'brpc_service_dense_sgd_test',
 ]
-# It run 4 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED, 
+# Runs 4 jobs at a time. If a test fails due to insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED,
 # just remove it from this list.
 TETRAD_PARALLEL_JOB = [
    'buffered_allocator_test',
@@ -477,9 +640,53 @@ TETRAD_PARALLEL_JOB = [
    'tensor_test',
    'test_repeated_fc_relu_fuse_pass_cc',
    'test_mkldnn_caching',
+    'test_analyzer_seq_pool1',
+    'test_analyzer_ocr',
+    'test_analyzer_seq_conv1',
+    'test_analyzer_small_dam',
+    'test_analyzer_mobilenet_depthwise_conv',
+    'test_analyzer_pyramid_dnn',
+    'test_analyzer_text_classification',
+    'test_analyzer_rnn2',
+    'test_analyzer_transformer',
+    'test_analyzer_resnet50',
+    'test_analyzer_ner',
+    'test_analyzer_lac',
+    'test_analyzer_transformer_profile',
+    'test_analyzer_mobilenet_transpose',
+    'test_analyzer_rnn1',
+    'test_analyzer_seq_pool1_profile',
+    'test_analyzer_paddletensor_tensor',
+    'test_analyzer_bert',
+    'test_analyzer_googlenet',
+    'zero_copy_tensor_test',
+    'custom_tensor_test',
+    'test_fleet_base',
+    'test_imperative_container_layerdict',
+    'test_complex_simplenet',
+    'test_tensor_register_hook',
+    'test_set_value_op',
+    'test_tensor_type_promotion',
+    'test_view_op_reuse_allocation',
+    'test_complex_grad_accumulated',
+    'test_sequential',
+    'test_sequential',
+    'test_imperative_layers',
+    'test_dgc_momentum_op',
+    'test_memcpy_op',
+    'test_dgc_op',
+    'test_modelaverage',
+    'test_lookahead',
+    'test_callback_visualdl',
+    'test_new_group_api',
+    'test_collective_split_embedding_none_divisible',
+    'test_collective_wait',
+    'test_collective_split_row_linear',
+    'test_collective_split_col_linear',
+    'test_collective_split_embedding',
 ]
-# It run 2 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED, 
+# Runs 2 jobs at a time. If a test fails due to insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED,
 # just remove it from this list.
 TWO_PARALLEL_JOB = [
    'convert_model2dot_ernie',
@@ -611,7 +818,6 @@ TWO_PARALLEL_JOB = [
    'test_adam_op_multi_thread',
    'test_adamax_op',
    'test_while_loop_op',
-    'test_affine_grid_function',
    'test_transpose_flatten_concat_fuse_pass',
    'test_trace_op',
    'test_backward',
@@ -663,7 +869,6 @@ TWO_PARALLEL_JOB = [
    'test_gather_op',
    'test_partial_concat_op',
    'test_gaussian_random_op',
-    'test_paddle_imperative_double_grad',
    'test_generate_proposals_v2_op',
    'test_pad_constant_like',
    'test_grid_sample_function',
@@ -879,6 +1084,11 @@ TWO_PARALLEL_JOB = [
    'test_imperative_load_static_param',
    'test_fuse_bn_add_act_pass',
    'test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass',
+    'test_quantize_transpiler_v2',
+    'paddle_infer_api_test',
+    'test_analyzer_ernie',
+    'lite_resnet50_test',
+    'lite_mul_model_test',
 ]