Remove paddle_custom_op dynamic libraries, and link to FLUID_CORE on Windows (#32583)

* Remove paddle_custom_op dynamic libraries, change link to FLUID_CORE on windows, and check copy_to * fix CI

Remove paddle_custom_op dynamic libraries, and link to FLUID_CORE on Windows (#32583)
* Remove paddle_custom_op dynamic libraries, change link to FLUID_CORE on windows, and check copy_to * fix CI
7610c2b4 · Zhou Wei · GitHub · 9b65d4ce · 7610c2b4 · 7610c2b4
13 changed file
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -375,36 +375,3 @@ cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES})
 if(WITH_TESTING AND TEST selected_rows_test)
  set_tests_properties(selected_rows_test PROPERTIES TIMEOUT 120)
 endif()
-
-##### 2.0 New custom op extension mechanism related #####
-
-# if not deps `layer`, will cause: undefined symbol: _ZN6paddle10imperative7VarBase9name_set_
-if (WIN32)
-  set(PADDLE_CUSTOM_OP_MODULES custom_tensor op_meta_info custom_operator layer)
-
-  set(PADDLE_CUSTOM_OP_SRCS
-      ${CMAKE_CURRENT_SOURCE_DIR}/custom_operator.cc
-      ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_tensor.cc
-      ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_op_meta_info.cc
-      ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc)
-  set(PADDLE_CUSTOM_OP_SRCS ${PADDLE_CUSTOM_OP_SRCS} PARENT_SCOPE)
-
-  cc_library(paddle_custom_op_shared
-      SHARED SRCS ${PADDLE_CUSTOM_OP_SRCS} DEPS ${PADDLE_CUSTOM_OP_MODULES})
-
-  get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
-  set_target_properties(paddle_custom_op_shared PROPERTIES OUTPUT_NAME paddle_custom_op)
-  target_link_libraries(paddle_custom_op_shared ${os_dependency_modules})
-
-  if("${CMAKE_GENERATOR}" STREQUAL "Ninja")
-    set(paddle_custom_op_lib_path ${CMAKE_CURRENT_BINARY_DIR})
-  else()
-    set(paddle_custom_op_lib_path ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE})
-  endif()
-  set(PADDLE_CUSTOM_OP_IMPORT_LIB
-      ${paddle_custom_op_lib_path}/paddle_custom_op.lib
-      CACHE INTERNAL "Paddle custom op import lib")
-  set(PADDLE_CUSTOM_OP_SHARED_LIB
-      ${paddle_custom_op_lib_path}/paddle_custom_op.dll
-      CACHE INTERNAL "Paddle custom op dll")
-endif()
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -54,14 +54,14 @@ wmic process where name="python.exe" call terminate 2>NUL
 rem ------initialize common variable------
 if not defined GENERATOR set GENERATOR="Visual Studio 15 2017 Win64"
 if not defined BRANCH set BRANCH=develop
-if not defined WITH_TENSORRT set WITH_TENSORRT=ON 
+if not defined WITH_TENSORRT set WITH_TENSORRT=ON
 if not defined TENSORRT_ROOT set TENSORRT_ROOT=D:/TensorRT
 if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=Auto
 if not defined WITH_GPU set WITH_GPU=ON
 if not defined WITH_MKL set WITH_MKL=ON
 if not defined WITH_AVX set WITH_AVX=ON
 if not defined WITH_TESTING set WITH_TESTING=ON
-if not defined MSVC_STATIC_CRT set MSVC_STATIC_CRT=OFF
+if not defined MSVC_STATIC_CRT set MSVC_STATIC_CRT=ON
 if not defined WITH_PYTHON set WITH_PYTHON=ON
 if not defined ON_INFER set ON_INFER=ON
 if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON
@@ -75,6 +75,7 @@ if not defined LOG_LEVEL set LOG_LEVEL=normal
 if not defined PRECISION_TEST set PRECISION_TEST=OFF
 if not defined NIGHTLY_MODE set PRECISION_TEST=OFF
 if not defined retry_times set retry_times=2
+if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37

 rem -------set cache build directory-----------
 rmdir build\python /s/q
@@ -83,9 +84,6 @@ rmdir build\paddle_inference_install_dir /s/q
 rmdir build\paddle_inference_c_install_dir /s/q
 del build\CMakeCache.txt

-: set CI_SKIP_CPP_TEST if only *.py changed
-git diff --name-only %BRANCH% | findstr /V "\.py" || set CI_SKIP_CPP_TEST=ON
-
 if "%WITH_CACHE%"=="OFF" (
    rmdir build /s/q
    goto :mkbuild
@@ -135,58 +133,6 @@ dir .
 dir %cache_dir%
 dir paddle\fluid\pybind\Release

-rem ------initialize the python environment------
-if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37
-set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe
-set PATH=%PYTHON_ROOT%;%PYTHON_ROOT%\Scripts;%PATH%
-
-rem ToDo: virtual environment can't be deleted safely, some process not exit when task is canceled
-rem Now use system python environment temporarily
-rem %PYTHON_EXECUTABLE% -m pip install virtualenv
-rem %PYTHON_EXECUTABLE% -m virtualenv paddle_winci
-rem call paddle_winci\Scripts\activate.bat
-
-rem ------pre install python requirement----------
-where python
-where pip
-pip install wheel --user
-pip install -r %work_dir%\python\requirements.txt --user
-
-if %ERRORLEVEL% NEQ 0 (
-    echo pip install requirements.txt failed!
-    exit /b 7
-)
-
-rem ------pre install clcache and init config----------
-rem pip install clcache --user
-pip uninstall -y clcache
-:: set USE_CLCACHE to enable clcache
-rem set USE_CLCACHE=1
-:: In some scenarios, CLCACHE_HARDLINK can save one file copy.
-rem set CLCACHE_HARDLINK=1
-:: If it takes more than 1000s to obtain the right to use the cache, an error will be reported
-rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000
-:: set maximum cache size to 20G
-rem clcache.exe -M 21474836480
-
-:: install ninja if GENERATOR is Ninja
-if %GENERATOR% == "Ninja" (
-    pip install ninja
-    if %errorlevel% NEQ 0 (
-        echo pip install ninja failed!
-        exit /b 7
-    )
-)
-
-rem ------show summary of current environment----------
-cmake --version
-if "%WITH_GPU%"=="ON" (
-    nvcc --version
-    nvidia-smi
-)
-::python %work_dir%\tools\summary_env.py
-::%cache_dir%\tools\busybox64.exe bash %work_dir%\tools\get_cpu_info.sh
-
 goto :CASE_%1

 echo "Usage: paddle_build.bat [OPTION]"
@@ -266,8 +212,10 @@ rem "Other configurations are added here"
 rem :CASE_wincheck_others
 rem call ...

+
 rem ---------------------------------------------------------------------------------------------
 :cmake
+@ECHO OFF
 echo    ========================================
 echo    Step 1. Cmake ...
 echo    ========================================
@@ -281,12 +229,52 @@ set PATH=C:\Program Files (x86)\Windows Kits\10\bin\10.0.17763.0\x64;%PATH%
 for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%#
 set start=%start:~4,10%

-@ECHO ON
-if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0
+if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2
 set PATH=%TENSORRT_ROOT:/=\%\lib;%CUDA_TOOLKIT_ROOT_DIR%\bin;%CUDA_TOOLKIT_ROOT_DIR%\libnvvp;%PATH%

-rem ------set third_party cache dir------
+rem install ninja if GENERATOR is Ninja
+if %GENERATOR% == "Ninja" (
+    pip install ninja
+    if %errorlevel% NEQ 0 (
+        echo pip install ninja failed!
+        exit /b 7
+    )
+)

+rem ------show summary of current GPU environment----------
+cmake --version
+if "%WITH_GPU%"=="ON" (
+    nvcc --version
+    nvidia-smi
+)
+
+rem ------initialize the python environment------
+set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe
+set PATH=%PYTHON_ROOT%;%PYTHON_ROOT%\Scripts;%PATH%
+if %WITH_PYTHON% == "OFF" (
+    where python
+    where pip
+    pip install wheel --user
+    pip install -r %work_dir%\python\requirements.txt --user
+    if %ERRORLEVEL% NEQ 0 (
+        echo pip install requirements.txt failed!
+        exit /b 7
+    )
+)
+
+rem ------pre install clcache and init config----------
+rem pip install clcache --user
+pip uninstall -y clcache
+:: set USE_CLCACHE to enable clcache
+rem set USE_CLCACHE=1
+:: In some scenarios, CLCACHE_HARDLINK can save one file copy.
+rem set CLCACHE_HARDLINK=1
+:: If it takes more than 1000s to obtain the right to use the cache, an error will be reported
+rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000
+:: set maximum cache size to 20G
+rem clcache.exe -M 21474836480
+
+rem ------set third_party cache dir------
 : clear third party cache every once in a while
 for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%#
 set day_now=%datetime:~6,2%
@@ -500,6 +488,10 @@ echo    ========================================
 echo    Step 4. Running unit tests ...
 echo    ========================================

+
+: set CI_SKIP_CPP_TEST if only *.py changed
+git diff --name-only %BRANCH% | findstr /V "\.py" || set CI_SKIP_CPP_TEST=ON
+
 pip install -r %work_dir%\python\unittest_py\requirements.txt --user
 if %ERRORLEVEL% NEQ 0 (
    echo pip install unittest requirements.txt failed!

--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -43,9 +43,20 @@ set(FLUID_DST_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/)
 IF(WIN32)
    # Python would use the .pyd by default under Windows series platform
    set(FLUID_CORE ${FLUID_DST_DIR}/${FLUID_CORE_NAME}.pyd)
-    set(FLUID_NOAVX_CORE ${FLUID_DST_DIR}/core_noavx.pyd)
+    set(FLUID_CORE_LIB ${FLUID_DST_DIR}/${FLUID_CORE_NAME}.lib)
+    
+    add_custom_command(OUTPUT ${FLUID_CORE}
+      COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
+      COMMAND cmake -E copy $<TARGET_LINKER_FILE:paddle_pybind> ${FLUID_CORE_LIB}
+      DEPENDS paddle_pybind)
+
+    set(FLUID_NOAVX_CORE ${FLUID_DST_DIR}/core_noavx.pyd)  
 ELSE()
    set(FLUID_CORE ${FLUID_DST_DIR}/${FLUID_CORE_NAME}.so)
+    add_custom_command(OUTPUT ${FLUID_CORE}
+        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
+        DEPENDS paddle_pybind)
+
    set(FLUID_NOAVX_CORE ${FLUID_DST_DIR}/core_noavx.so)
 ENDIF()

@@ -68,9 +79,6 @@ if(HAS_NOAVX_CORE AND EXISTS "${NOAVX_CORE_FILE}")
  list(APPEND FLUID_CORE_DEPS ${FLUID_NOAVX_CORE})
 endif()

-add_custom_command(OUTPUT ${FLUID_CORE}
-        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
-        DEPENDS paddle_pybind)
 add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE_DEPS})

 IF(WIN32)
@@ -84,6 +92,7 @@ ELSE(WIN32)
    COMMAND touch stub.cc
    COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python
    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
+    COMMENT "Packing whl packages------>>>"
    DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES})
 ENDIF()


--- a/python/paddle/check_import_scipy.py
+++ b/python/paddle/check_import_scipy.py
@@ -24,6 +24,6 @@ def check_import_scipy(OsName):
            if 'DLL load failed' in print_info:
                raise ImportError(
                    print_info +
-                    "\nplease download visual C++ Redistributable for vs 2015, https://www.microsoft.com/en-us/download/details.aspx?id=48145"
+                    "\nplease download Visual C++ Redistributable from https://support.microsoft.com/en-us/topic/the-latest-supported-visual-c-downloads-2647da03-1eea-4433-9aff-95f26a218cc0"
                )
    return
--- a/python/paddle/fluid/core.py
+++ b/python/paddle/fluid/core.py
@@ -37,7 +37,10 @@ if os.path.exists(current_path + os.sep + 'core_noavx.' + core_suffix):
 try:
    if os.name == 'nt':
        third_lib_path = current_path + os.sep + '..' + os.sep + 'libs'
-        os.environ['path'] = third_lib_path + ';' + os.environ['path']
+        # Will load shared library from 'path' on windows
+        os.environ[
+            'path'] = current_path + ';' + third_lib_path + ';' + os.environ[
+                'path']
        sys.path.insert(0, third_lib_path)
        # Note: from python3.8, PATH will not take effect
        # https://github.com/python/cpython/pull/12302
@@ -298,7 +301,7 @@ if avx_supported():
                "WARNING: AVX is supported on local machine, but you have installed "
                "paddlepaddle without avx core. Hence, no_avx core which has worse "
                "preformance will be imported.\nYou could reinstall paddlepaddle by "
-                "'python -m pip install -U paddlepaddle-gpu[==version]' or rebuild "
+                "'python -m pip install --force-reinstall paddlepaddle-gpu[==version]' or rebuild "
                "paddlepaddle WITH_AVX=ON to get better performance.\n"
                "The original error is: %s\n" % cpt.get_exception_message(e))
            load_noavx = True
@@ -350,12 +353,19 @@ if load_noavx:
            sys.stderr.write(
                'Error: Can not import noavx core while this file exists: ' +
                current_path + os.sep + 'core_noavx.' + core_suffix + '\n')
+        elif avx_supported():
+            sys.stderr.write(
+                "Error: AVX is support on your machine, but you have installed "
+                "paddlepaddle without avx core, you should reinstall paddlepaddle by "
+                "'python -m pip install --force-reinstall paddlepaddle-gpu[==version]\n"
+            )
        else:
            sys.stderr.write(
                "Error: AVX is not support on your machine, but you have installed "
-                "paddlepaddle with avx core, you should reinstall paddlepaddle by "
-                "'python -m pip install -U paddlepaddle-gpu[==version] -f "
-                "https://paddlepaddle.org.cn/whl/stable_noavx.html'\n")
+                "paddlepaddle without no_avx core, you should reinstall paddlepaddle by "
+                "'python -m pip install --force-reinstall paddlepaddle-gpu[==version] -f "
+                "https://paddlepaddle.org.cn/whl/mkl/stable/noavx.html or "
+                "https://paddlepaddle.org.cn/whl/openblas/stable/noavx.html\n")
        raise e



--- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt
+++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt
-# New custom OP can support Windows/Linux now
-if(WITH_GPU OR APPLE) 
-    # GPU custom op tests: compile both .cc and .cu file
+# New custom OP can support Windows/Linux/Mac now
+if(WITH_GPU OR APPLE)
    py_test(test_custom_relu_op_setup SRCS test_custom_relu_op_setup.py)
    py_test(test_custom_relu_op_jit SRCS test_custom_relu_op_jit.py)
    py_test(test_custom_relu_model SRCS test_custom_relu_model.py)

--- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu
+++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu
@@ -45,8 +45,12 @@ std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x) {
  int grid = (numel + block - 1) / block;
  PD_DISPATCH_FLOATING_AND_HALF_TYPES(
      x.type(), "relu_cuda_forward_kernel", ([&] {
+        auto cpu_input = x.copy_to<data_t>(paddle::PlaceType::kCPU);
+        auto gpu_input = cpu_input.copy_to<data_t>(paddle::PlaceType::kGPU);
        relu_cuda_forward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
-            x.data<data_t>(), out.mutable_data<data_t>(x.place()), numel);
+            gpu_input.data<data_t>(),
+            out.mutable_data<data_t>(x.place()),
+            numel);
      }));

  return {out};

--- a/python/paddle/fluid/tests/custom_op/test_check_abi.py
+++ b/python/paddle/fluid/tests/custom_op/test_check_abi.py
@@ -64,14 +64,29 @@ class TestCheckCompiler(TestABIBase):
        # clear environ
        self.del_environ()
        compiler = 'python'  # fake wrong compiler
-        with warnings.catch_warnings(record=True) as error:
-            flag = utils.check_abi_compatibility(compiler, verbose=True)
-            # check return False
-            self.assertFalse(flag)
-            # check Compiler Compatibility WARNING
-            self.assertTrue(len(error) == 1)
-            self.assertTrue(
-                "Compiler Compatibility WARNING" in str(error[0].message))
+        if not utils.IS_WINDOWS:
+            with warnings.catch_warnings(record=True) as error:
+                flag = utils.check_abi_compatibility(compiler, verbose=True)
+                # check return False
+                self.assertFalse(flag)
+                # check Compiler Compatibility WARNING
+                self.assertTrue(len(error) == 1)
+                self.assertTrue(
+                    "Compiler Compatibility WARNING" in str(error[0].message))
+
+    def test_exception_windows(self):
+        # clear environ
+        self.del_environ()
+        compiler = 'fake compiler'  # fake command
+        if utils.IS_WINDOWS:
+            with warnings.catch_warnings(record=True) as error:
+                flag = utils.check_abi_compatibility(compiler, verbose=True)
+                # check return False
+                self.assertFalse(flag)
+                # check ABI Compatibility WARNING
+                self.assertTrue(len(error) == 1)
+                self.assertTrue("Failed to check compiler version for" in
+                                str(error[0].message))

    def test_exception_linux(self):
        # clear environ

--- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py
@@ -105,12 +105,12 @@ class TestJITLoad(unittest.TestCase):
                in str(e))
            if IS_WINDOWS:
                self.assertTrue(
-                    r"python\paddle\fluid\tests\custom_op\custom_relu_op.cc:47"
-                    in str(e))
+                    r"python\paddle\fluid\tests\custom_op\custom_relu_op.cc" in
+                    str(e))
            else:
                self.assertTrue(
-                    "python/paddle/fluid/tests/custom_op/custom_relu_op.cc:47"
-                    in str(e))
+                    "python/paddle/fluid/tests/custom_op/custom_relu_op.cc" in
+                    str(e))
        self.assertTrue(caught_exception)

        caught_exception = False
@@ -126,7 +126,7 @@ class TestJITLoad(unittest.TestCase):
                "function \"relu_cuda_forward_kernel\" is not implemented for data type `int32_t`"
                in str(e))
            self.assertTrue(
-                "python/paddle/fluid/tests/custom_op/custom_relu_op.cu:50" in
+                "python/paddle/fluid/tests/custom_op/custom_relu_op.cu" in
                str(e))
        self.assertTrue(caught_exception)


--- a/python/paddle/utils/cpp_extension/cpp_extension.py
+++ b/python/paddle/utils/cpp_extension/cpp_extension.py
@@ -26,7 +26,7 @@ from .extension_utils import find_cuda_home, find_rocm_home, normalize_extension
 from .extension_utils import is_cuda_file, prepare_unix_cudaflags, prepare_win_cudaflags
 from .extension_utils import _import_module_from_library, _write_setup_file, _jit_compile
 from .extension_utils import check_abi_compatibility, log_v, CustomOpInfo, parse_op_name_from
-from .extension_utils import clean_object_if_change_cflags, _reset_so_rpath
+from .extension_utils import clean_object_if_change_cflags, _reset_so_rpath, _get_fluid_path
 from .extension_utils import bootstrap_context, get_build_directory, add_std_without_repeat

 from .extension_utils import IS_WINDOWS, OS_NAME, MSVC_COMPILE_FLAGS, MSVC_COMPILE_FLAGS
@@ -69,7 +69,7 @@ def setup(**attr):
    For Linux, GCC version will be checked . For example if Paddle with CUDA 10.1 is built with GCC 8.2, 
    then the version of user's local machine should satisfy GCC >= 8.2. 
    For Windows, Visual Studio version will be checked, and it should be greater than or equal to that of 
-    PaddlePaddle (Visual Studio 2015 update3). 
+    PaddlePaddle (Visual Studio 2017). 
    If the above conditions are not met, the corresponding warning will be printed, and a fatal error may 
    occur because of ABI compatibility.

@@ -79,7 +79,7 @@ def setup(**attr):
        2. On Linux platform, we recommend to use GCC 8.2 as soft linking condidate of ``/usr/bin/cc`` .
           Then, Use ``which cc`` to ensure location of ``cc`` and using ``cc --version`` to ensure linking 
           GCC version.
-        3. On Windows platform, we recommend to install `` Visual Studio`` (>=2015 update3).
+        3. On Windows platform, we recommend to install `` Visual Studio`` (>=2017).


    Compared with Just-In-Time ``load`` interface, it only compiles once by executing
@@ -611,7 +611,7 @@ class BuildExtension(build_ext, object):
            msg = (
                'It seems that the VC environment is activated but DISTUTILS_USE_SDK is not set.'
                'This may lead to multiple activations of the VC env.'
-                'Please set `DISTUTILS_USE_SDK=1` and try again.')
+                'Please run `set DISTUTILS_USE_SDK=1` and try again.')
            raise UserWarning(msg)

    def _record_op_info(self):
@@ -724,7 +724,7 @@ def load(name,
    processes under a individual subprocess. It does not require CMake or Ninja 
    environment. On Linux platform, it requires GCC compiler whose version is 
    greater than 5.4 and it should be soft linked to ``/usr/bin/cc`` . On Windows 
-    platform, it requires Visual Studio whose version is greater than 2015 update3.
+    platform, it requires Visual Studio whose version is greater than 2017.
    On MacOS, clang++ is requited. In addition, if compiling Operators supporting 
    GPU device, please make sure ``nvcc`` compiler is installed in local environment.
    
@@ -735,7 +735,7 @@ def load(name,
    For Linux, GCC version will be checked . For example if Paddle with CUDA 10.1 is built with GCC 8.2, 
    then the version of user's local machine should satisfy GCC >= 8.2. 
    For Windows, Visual Studio version will be checked, and it should be greater than or equal to that of 
-    PaddlePaddle (Visual Studio 2015 update3). 
+    PaddlePaddle (Visual Studio 2017). 
    If the above conditions are not met, the corresponding warning will be printed, and a fatal error may 
    occur because of ABI compatibility.

@@ -749,7 +749,7 @@ def load(name,
        2. On Linux platform, we recommend to use GCC 8.2 as soft linking condidate of ``/usr/bin/cc`` .
           Then, Use ``which cc`` to ensure location of ``cc`` and using ``cc --version`` to ensure linking 
           GCC version.
-        3. On Windows platform, we recommend to install `` Visual Studio`` (>=2015 update3).
+        3. On Windows platform, we recommend to install `` Visual Studio`` (>=2017).


    **A simple example:**
@@ -802,9 +802,6 @@ def load(name,

    # ensure to use abs path
    build_directory = os.path.abspath(build_directory)
-    # Will load shared library from 'path' on windows
-    if IS_WINDOWS:
-        os.environ['path'] = build_directory + ';' + os.environ['path']

    log_v("build_directory: {}".format(build_directory), verbose)

@@ -827,6 +824,7 @@ def load(name,

    # write setup.py file and compile it
    build_base_dir = os.path.join(build_directory, name)
+
    _write_setup_file(name, sources, file_path, build_base_dir,
                      extra_include_paths, extra_cxx_cflags, extra_cuda_cflags,
                      extra_ldflags, verbose)

--- a/python/paddle/utils/cpp_extension/extension_utils.py
+++ b/python/paddle/utils/cpp_extension/extension_utils.py
@@ -55,7 +55,7 @@ CLANG_LINK_FLAGS = [
    '-dynamiclib', '-undefined', 'dynamic_lookup', '-arch', 'x86_64'
 ]

-MSVC_LINK_FLAGS = ['/MACHINE:X64', 'paddle_custom_op.lib']
+MSVC_LINK_FLAGS = ['/MACHINE:X64']

 COMMON_NVCC_FLAGS = ['-DPADDLE_WITH_CUDA', '-DEIGEN_USE_GPU']

@@ -371,10 +371,11 @@ def _get_core_name():
    Return pybind DSO module name.
    """
    import paddle
-    if paddle.fluid.core.load_noavx:
-        return 'core_noavx.so'
+    ext_name = '.pyd' if IS_WINDOWS else '.so'
+    if not paddle.fluid.core.load_noavx:
+        return 'core_avx' + ext_name
    else:
-        return 'core_avx.so'
+        return 'core_noavx' + ext_name


 def _get_lib_core_path():
@@ -386,6 +387,15 @@ def _get_lib_core_path():
    return os.path.join(_get_fluid_path(), lib_core_name)


+def _get_dll_core_path():
+    """
+    Return real path of libcore_(no)avx.dylib on Windows.
+    """
+    raw_core_name = _get_core_name()
+    dll_core_name = "paddle_pybind.dll"
+    return os.path.join(_get_fluid_path(), dll_core_name)
+
+
 def _reset_so_rpath(so_path):
    """
    NOTE(Aurelius84): Runtime path of core_(no)avx.so is modified into `@loader_path/../libs`
@@ -435,9 +445,12 @@ def normalize_extension_kwargs(kwargs, use_cuda=False):
        # append link flags
        extra_link_args = kwargs.get('extra_link_args', [])
        extra_link_args.extend(MSVC_LINK_FLAGS)
+        lib_core_name = create_sym_link_if_not_exist()
+        extra_link_args.append('{}'.format(lib_core_name))
        if use_cuda:
            extra_link_args.extend(['cudadevrt.lib', 'cudart_static.lib'])
        kwargs['extra_link_args'] = extra_link_args
+
    else:
        ########################### Linux Platform ###########################
        extra_link_args = kwargs.get('extra_link_args', [])
@@ -481,24 +494,41 @@ def create_sym_link_if_not_exist():
    """
    Create soft symbol link of `core_avx.so` or `core_noavx.so`
    """
-    assert OS_NAME.startswith('darwin')
+    assert OS_NAME.startswith('darwin') or IS_WINDOWS

    raw_core_name = _get_core_name()
    core_path = os.path.join(_get_fluid_path(), raw_core_name)
-    new_lib_core_path = _get_lib_core_path()
+    if IS_WINDOWS:
+        new_dll_core_path = _get_dll_core_path()
+        # create symbol link on windows
+        if not os.path.exists(new_dll_core_path):
+            try:
+                os.symlink(core_path, new_dll_core_path)
+            except Exception:
+                warnings.warn(
+                    "Failed to create soft symbol link for {}.\n You can run prompt as administrator and execute the "
+                    "following command manually: `mklink {} {}`. Now it will create hard link for {} trickly.".
+                    format(raw_core_name, new_dll_core_path, core_path,
+                           raw_core_name))
+                run_cmd('mklink /H {} {}'.format(new_dll_core_path, core_path))
+        # core_avx or core_noavx with lib suffix
+        assert os.path.exists(new_dll_core_path)
+        return raw_core_name[:-4] + ".lib"

-    # create symbol link
-    if not os.path.exists(new_lib_core_path):
-        try:
-            os.symlink(core_path, new_lib_core_path)
-            assert os.path.exists(new_lib_core_path)
-        except Exception:
-            raise RuntimeError(
-                "Failed to create soft symbol link for {}.\n Please execute the following command manually: `ln -s {} {}`".
-                format(raw_core_name, core_path, new_lib_core_path))
+    else:
+        new_lib_core_path = _get_lib_core_path()
+        # create symbol link on mac
+        if not os.path.exists(new_lib_core_path):
+            try:
+                os.symlink(core_path, new_lib_core_path)
+                assert os.path.exists(new_lib_core_path)
+            except Exception:
+                raise RuntimeError(
+                    "Failed to create soft symbol link for {}.\n Please execute the following command manually: `ln -s {} {}`".
+                    format(raw_core_name, core_path, new_lib_core_path))

-    # core_avx or core_noavx without suffix
-    return raw_core_name[:-3]
+        # core_avx or core_noavx without suffix
+        return raw_core_name[:-3]


 def find_cuda_home():
@@ -1054,20 +1084,20 @@ def check_abi_compatibility(compiler, verbose=False):
    if os.environ.get('PADDLE_SKIP_CHECK_ABI') in ['True', 'true', '1']:
        return True

-    which = 'where' if IS_WINDOWS else 'which'
-    cmd_out = subprocess.check_output(
-        [which, compiler], stderr=subprocess.STDOUT)
-    compiler_path = os.path.realpath(cmd_out.decode()
-                                     if six.PY3 else cmd_out).strip()
-    # step 1. if not found any suitable compiler, raise error
-    if not any(name in compiler_path
-               for name in _expected_compiler_current_platform()):
-        warnings.warn(
-            WRONG_COMPILER_WARNING.format(
-                user_compiler=compiler,
-                paddle_compiler=_expected_compiler_current_platform()[0],
-                platform=OS_NAME))
-        return False
+    if not IS_WINDOWS:
+        cmd_out = subprocess.check_output(
+            ['which', compiler], stderr=subprocess.STDOUT)
+        compiler_path = os.path.realpath(cmd_out.decode()
+                                         if six.PY3 else cmd_out).strip()
+        # if not found any suitable compiler, raise warning
+        if not any(name in compiler_path
+                   for name in _expected_compiler_current_platform()):
+            warnings.warn(
+                WRONG_COMPILER_WARNING.format(
+                    user_compiler=compiler,
+                    paddle_compiler=_expected_compiler_current_platform()[0],
+                    platform=OS_NAME))
+            return False

    version = (0, 0, 0)
    # clang++ have no ABI compatibility problem

--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -255,11 +255,15 @@ paddle_bins = ''

 if not '${WIN32}':
    paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
-package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + ('.so' if os.name != 'nt' else '.pyd')]}
+
+if os.name != 'nt':
+    package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + '.so']}
+else:
+    package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + '.pyd', '${FLUID_CORE_NAME}' + '.lib']}
+
 if '${HAS_NOAVX_CORE}' == 'ON':
    package_data['paddle.fluid'] += ['core_noavx' + ('.so' if os.name != 'nt' else '.pyd')]

-
 package_dir={
    '': '${PADDLE_BINARY_DIR}/python',
    # The paddle.fluid.proto will be generated while compiling.
@@ -353,14 +357,6 @@ if '${WITH_XPU}' == 'OFF' and '${XPU_SDK_ROOT}' != '':
        package_data['paddle.libs']+=['libxpurt.so']


-### New custom op extension mechanism related ###
-
-# copy paddle_custom_op.lib/paddle_custom_op.dll to libs on Windows
-if os.name == 'nt':
-    shutil.copy('${PADDLE_CUSTOM_OP_IMPORT_LIB}', libs_path)
-    shutil.copy('${PADDLE_CUSTOM_OP_SHARED_LIB}', libs_path)
-    package_data['paddle.libs'] += ['paddle_custom_op.lib', 'paddle_custom_op.dll']
-
 # remove unused paddle/libs/__init__.py
 if os.path.isfile(libs_path+'/__init__.py'):
    os.remove(libs_path+'/__init__.py')

--- a/tools/parallel_UT_rule.py
+++ b/tools/parallel_UT_rule.py
@@ -534,7 +534,6 @@ CPU_PARALLEL_JOB = [
    'test_fleetrun',
    'test_check_abi',
    'dense_table_test',
-    'test_custom_relu_op_setup',
    'test_adaptive_pool2d_convert_global_pass',
    'test_fleet_recompute_meta_optimizer',
    'test_fleet_fp16_allreduce_meta_optimizer',
@@ -670,9 +669,7 @@ TETRAD_PARALLEL_JOB = [
    'test_tensor_type_promotion',
    'test_view_op_reuse_allocation',
    'test_complex_grad_accumulated',
-    'test_tensor_methods',
    'test_sequential',
-    'test_tensor_methods',
    'test_sequential',
    'test_imperative_layers',
    'test_dgc_momentum_op',
@@ -680,7 +677,6 @@ TETRAD_PARALLEL_JOB = [
    'test_dgc_op',
    'test_modelaverage',
    'test_lookahead',
-    'test_word2vec_book',
    'test_callback_visualdl',
    'test_new_group_api',
    'test_collective_split_embedding_none_divisible',