[2.0Custom OP]Support New Custom OP on Windows (#31063)

* [2.0.1]Support New Custom OP on windows * fix CI * fix code style * fix CI * fix CI * fix coverage * fix CI * fix CI

[2.0Custom OP]Support New Custom OP on Windows (#31063)
* [2.0.1]Support New Custom OP on windows * fix CI * fix code style * fix CI * fix CI * fix coverage * fix CI * fix CI
adaec007 · Zhou Wei · GitHub · 2168f08a · adaec007 · adaec007
20 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -335,6 +335,8 @@ set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")

+add_definitions(-DPADDLE_DLL_EXPORT)
+
 if(ON_INFER)
     # you can turn off the paddle fluid and inference lib by setting ON_INFER=OFF
    message(STATUS "On inference mode, will take place some specific optimization.")

--- a/paddle/fluid/extension/include/all.h
+++ b/paddle/fluid/extension/include/all.h
@@ -18,6 +18,12 @@ limitations under the License. */
 #error C++11 or later compatible compiler is required to use Paddle.
 #endif

+#ifdef _WIN32
+#ifndef NOMINMAX
+#define NOMINMAX  // MSVC's min/max macros conflict with std::min/std::max
+#endif
+#endif
+
 #include "paddle/fluid/extension/include/dispatch.h"
 #include "paddle/fluid/extension/include/dtype.h"
 #include "paddle/fluid/extension/include/op_meta_info.h"

--- a/paddle/fluid/extension/include/dll_decl.h
+++ b/paddle/fluid/extension/include/dll_decl.h
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#if defined(_WIN32)
+#ifndef PD_DLL_DECL
+#ifdef PADDLE_DLL_EXPORT
+#define PD_DLL_DECL __declspec(dllexport)
+#else
+#define PD_DLL_DECL __declspec(dllimport)
+#endif  // PADDLE_DLL_EXPORT
+#endif  // PD_DLL_DECL
+#else
+#define PD_DLL_DECL
+#endif  // _WIN32
--- a/paddle/fluid/extension/include/op_meta_info.h
+++ b/paddle/fluid/extension/include/op_meta_info.h
@@ -14,12 +14,14 @@ limitations under the License. */

 #pragma once

+#include <iostream>
 #include <string>
 #include <unordered_map>
 #include <vector>

 #include <boost/any.hpp>

+#include "paddle/fluid/extension/include/dll_decl.h"
 #include "paddle/fluid/extension/include/tensor.h"

 /**
@@ -31,7 +33,7 @@ limitations under the License. */

 namespace paddle {
 namespace framework {
-class OpMetaInfoHelper;
+class PD_DLL_DECL OpMetaInfoHelper;
 }  // namespace framework

 using Tensor = paddle::Tensor;
@@ -43,6 +45,26 @@ using Tensor = paddle::Tensor;
  classname& operator=(const classname&) = delete; \
  classname& operator=(classname&&) = delete

+#if defined _WIN32
+// On Windows, HANDLE_THE_ERROR / END_HANDLE_THE_ERROR wrap the enclosed
+// statements in a try/catch that prints the exception message to stderr and
+// then propagates the exception. On other platforms both expand to nothing.
+//
+// The handler re-throws with a bare `throw;` so the original exception object
+// (and its dynamic type, e.g. std::runtime_error) is propagated. `throw e;`
+// would throw a sliced copy whose type is plain std::exception.
+#define HANDLE_THE_ERROR try {
+#define END_HANDLE_THE_ERROR            \
+  }                                     \
+  catch (const std::exception& e) {     \
+    std::cerr << e.what() << std::endl; \
+    throw;                              \
+  }
+#else
+#define HANDLE_THE_ERROR
+#define END_HANDLE_THE_ERROR
+#endif
+
+// Throws std::runtime_error(err_msg). The throw is placed between the
+// HANDLE_THE_ERROR markers, so on Windows the message is also written to
+// stderr before the exception leaves the statement.
+#define PD_THROW(err_msg)              \
+  do {                                 \
+    HANDLE_THE_ERROR                   \
+    throw std::runtime_error(err_msg); \
+    END_HANDLE_THE_ERROR               \
+  } while (0)
+
 ///////////////// Util Define and Function ////////////////

 inline std::string Grad(const std::string& var_name) {
@@ -106,7 +128,7 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
                                                            attr_idx + 1>(
            inputs, attrs, pargs..., arg);
      } catch (boost::bad_any_cast&) {
-        throw std::runtime_error(
+        PD_THROW(
            "Attribute cast error in custom operator. Expected int value.");
      }
    }
@@ -220,7 +242,7 @@ struct InferDtypeFuncImpl<Return (*)(Args...), impl_fn> {

 ////////////////////// Op Meta Info //////////////////////

-class OpMetaInfo {
+class PD_DLL_DECL OpMetaInfo {
 public:
  explicit OpMetaInfo(const std::string& op_name) : name_(op_name) {}
  OpMetaInfo& Inputs(std::vector<std::string>&& inputs);
@@ -246,7 +268,7 @@ class OpMetaInfo {

 //////////////// Op Meta Info Map /////////////////

-class OpMetaInfoMap {
+class PD_DLL_DECL OpMetaInfoMap {
 public:
   // This function's implementation must stay in the header file:
   // if it is moved to a .cc file, meta info cannot be added
@@ -270,14 +292,14 @@ class OpMetaInfoMap {

 //////////////// Op Meta Info Builder /////////////////

-class OpMetaInfoBuilder {
+class PD_DLL_DECL OpMetaInfoBuilder {
 public:
  explicit OpMetaInfoBuilder(std::string&& name);
  OpMetaInfoBuilder& Inputs(std::vector<std::string>&& inputs);
  OpMetaInfoBuilder& Outputs(std::vector<std::string>&& outputs);
-  OpMetaInfoBuilder& SetKernelFn(KernelFunc&& func);
-  OpMetaInfoBuilder& SetInferShapeFn(InferShapeFunc&& func);
-  OpMetaInfoBuilder& SetInferDtypeFn(InferDtypeFunc&& func);
+  OpMetaInfoBuilder& SetKernelFn(KernelFunc func);
+  OpMetaInfoBuilder& SetInferShapeFn(InferShapeFunc func);
+  OpMetaInfoBuilder& SetInferDtypeFn(InferDtypeFunc func);
  OpMetaInfoBuilder& SetBackwardOp(const std::string& bwd_op_name);

 private:
@@ -317,8 +339,12 @@ void LoadCustomOperatorLib(const std::string& dso_name);
 extern "C" {
 #endif

+#if defined(_WIN32)
 // C-API to get global OpMetaInfoMap.
-paddle::OpMetaInfoMap& PD_GetOpMetaInfoMap();
+__declspec(dllexport) inline paddle::OpMetaInfoMap& PD_GetOpMetaInfoMap() {
+  return paddle::OpMetaInfoMap::Instance();
+}
+#endif  // _WIN32

 #ifdef __cplusplus
 }

--- a/paddle/fluid/extension/include/tensor.h
+++ b/paddle/fluid/extension/include/tensor.h
@@ -16,6 +16,7 @@ limitations under the License. */

 #include <memory>
 #include <vector>
+#include "paddle/fluid/extension/include/dll_decl.h"
 #include "paddle/fluid/extension/include/dtype.h"
 #include "paddle/fluid/extension/include/place.h"

@@ -23,7 +24,7 @@ namespace paddle {
 namespace framework {
 class CustomTensorUtils;
 }  // namespace framework
-class Tensor {
+class PD_DLL_DECL Tensor {
 public:
  /// \brief Construct a Tensor on target Place for CustomOp.
  /// Generally it's only used for user to create Tensor.

--- a/paddle/fluid/extension/src/op_meta_info.cc
+++ b/paddle/fluid/extension/src/op_meta_info.cc
@@ -78,17 +78,17 @@ OpMetaInfoBuilder& OpMetaInfoBuilder::Outputs(
  return *this;
 }

-OpMetaInfoBuilder& OpMetaInfoBuilder::SetKernelFn(KernelFunc&& func) {
+OpMetaInfoBuilder& OpMetaInfoBuilder::SetKernelFn(KernelFunc func) {
  info_ptr_->SetKernelFn(std::forward<KernelFunc>(func));
  return *this;
 }

-OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferShapeFn(InferShapeFunc&& func) {
+OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferShapeFn(InferShapeFunc func) {
  info_ptr_->SetInferShapeFn(std::forward<InferShapeFunc>(func));
  return *this;
 }

-OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferDtypeFn(InferDtypeFunc&& func) {
+OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferDtypeFn(InferDtypeFunc func) {
  info_ptr_->SetInferDtypeFn(std::forward<InferDtypeFunc>(func));
  return *this;
 }
@@ -114,10 +114,17 @@ void LoadCustomOperatorLib(const std::string& dso_name) {
 }
 }  // namespace paddle

+#ifdef __cplusplus
 extern "C" {
+#endif

+#ifndef _WIN32
+// C-API to get global OpMetaInfoMap.
 paddle::OpMetaInfoMap& PD_GetOpMetaInfoMap() {
  return paddle::OpMetaInfoMap::Instance();
 }
+#endif

+#ifdef __cplusplus
 }  // end extern "C"
+#endif
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -207,73 +207,87 @@ Tensor Tensor::copy_to(const PlaceType &target_place) const {
  return target;
 }

-template Tensor Tensor::copy_to<paddle::platform::float16>(
+template PD_DLL_DECL Tensor
+Tensor::copy_to<paddle::platform::float16>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::bfloat16>(
    const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<paddle::platform::bfloat16>(
+template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex64>(
    const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<paddle::platform::complex64>(
+template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex128>(
    const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<paddle::platform::complex128>(
-    const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<float>(const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<double>(const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<int64_t>(const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<int32_t>(const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<uint8_t>(const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<int8_t>(const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<int16_t>(const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<bool>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<float>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<double>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<int64_t>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<int32_t>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<uint8_t>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<int8_t>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<int16_t>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<bool>(const PlaceType &target_place) const;

-template float *Tensor::data<float>() const;
-template double *Tensor::data<double>() const;
-template int64_t *Tensor::data<int64_t>() const;
-template int32_t *Tensor::data<int32_t>() const;
-template uint8_t *Tensor::data<uint8_t>() const;
-template int8_t *Tensor::data<int8_t>() const;
-template paddle::platform::float16 *Tensor::data<paddle::platform::float16>()
-    const;
-template paddle::platform::bfloat16 *Tensor::data<paddle::platform::bfloat16>()
-    const;
-template paddle::platform::complex128 *
+template PD_DLL_DECL float *Tensor::data<float>() const;
+template PD_DLL_DECL double *Tensor::data<double>() const;
+template PD_DLL_DECL int64_t *Tensor::data<int64_t>() const;
+template PD_DLL_DECL int32_t *Tensor::data<int32_t>() const;
+template PD_DLL_DECL uint8_t *Tensor::data<uint8_t>() const;
+template PD_DLL_DECL int8_t *Tensor::data<int8_t>() const;
+template PD_DLL_DECL paddle::platform::float16 *
+Tensor::data<paddle::platform::float16>() const;
+template PD_DLL_DECL paddle::platform::bfloat16 *
+Tensor::data<paddle::platform::bfloat16>() const;
+template PD_DLL_DECL paddle::platform::complex128 *
 Tensor::data<paddle::platform::complex128>() const;
-template paddle::platform::complex64 *
+template PD_DLL_DECL paddle::platform::complex64 *
 Tensor::data<paddle::platform::complex64>() const;
-template int16_t *Tensor::data<int16_t>() const;
-template bool *Tensor::data<bool>() const;
+template PD_DLL_DECL int16_t *Tensor::data<int16_t>() const;
+template PD_DLL_DECL bool *Tensor::data<bool>() const;

-template float *Tensor::mutable_data<float>();
-template double *Tensor::mutable_data<double>();
-template int64_t *Tensor::mutable_data<int64_t>();
-template int32_t *Tensor::mutable_data<int32_t>();
-template uint8_t *Tensor::mutable_data<uint8_t>();
-template int8_t *Tensor::mutable_data<int8_t>();
-template paddle::platform::float16 *
+template PD_DLL_DECL float *Tensor::mutable_data<float>();
+template PD_DLL_DECL double *Tensor::mutable_data<double>();
+template PD_DLL_DECL int64_t *Tensor::mutable_data<int64_t>();
+template PD_DLL_DECL int32_t *Tensor::mutable_data<int32_t>();
+template PD_DLL_DECL uint8_t *Tensor::mutable_data<uint8_t>();
+template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>();
+template PD_DLL_DECL paddle::platform::float16 *
 Tensor::mutable_data<paddle::platform::float16>();
-template paddle::platform::bfloat16 *
+template PD_DLL_DECL paddle::platform::bfloat16 *
 Tensor::mutable_data<paddle::platform::bfloat16>();
-template paddle::platform::complex128 *
+template PD_DLL_DECL paddle::platform::complex128 *
 Tensor::mutable_data<paddle::platform::complex128>();
-template paddle::platform::complex64 *
+template PD_DLL_DECL paddle::platform::complex64 *
 Tensor::mutable_data<paddle::platform::complex64>();
-template int16_t *Tensor::mutable_data<int16_t>();
-template bool *Tensor::mutable_data<bool>();
+template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>();
+template PD_DLL_DECL bool *Tensor::mutable_data<bool>();

-template float *Tensor::mutable_data<float>(const PlaceType &place);
-template double *Tensor::mutable_data<double>(const PlaceType &place);
-template int64_t *Tensor::mutable_data<int64_t>(const PlaceType &place);
-template int32_t *Tensor::mutable_data<int32_t>(const PlaceType &place);
-template uint8_t *Tensor::mutable_data<uint8_t>(const PlaceType &place);
-template int8_t *Tensor::mutable_data<int8_t>(const PlaceType &place);
-template paddle::platform::float16 *
+template PD_DLL_DECL float *Tensor::mutable_data<float>(const PlaceType &place);
+template PD_DLL_DECL double *Tensor::mutable_data<double>(
+    const PlaceType &place);
+template PD_DLL_DECL int64_t *Tensor::mutable_data<int64_t>(
+    const PlaceType &place);
+template PD_DLL_DECL int32_t *Tensor::mutable_data<int32_t>(
+    const PlaceType &place);
+template PD_DLL_DECL uint8_t *Tensor::mutable_data<uint8_t>(
+    const PlaceType &place);
+template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>(
+    const PlaceType &place);
+template PD_DLL_DECL paddle::platform::float16 *
 Tensor::mutable_data<paddle::platform::float16>(const PlaceType &place);
-template paddle::platform::bfloat16 *
+template PD_DLL_DECL paddle::platform::bfloat16 *
 Tensor::mutable_data<paddle::platform::bfloat16>(const PlaceType &place);
-template paddle::platform::complex128 *
+template PD_DLL_DECL paddle::platform::complex128 *
 Tensor::mutable_data<paddle::platform::complex128>(const PlaceType &place);
-template paddle::platform::complex64 *
+template PD_DLL_DECL paddle::platform::complex64 *
 Tensor::mutable_data<paddle::platform::complex64>(const PlaceType &place);
-template int16_t *Tensor::mutable_data<int16_t>(const PlaceType &place);
-template bool *Tensor::mutable_data<bool>(const PlaceType &place);
+template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>(
+    const PlaceType &place);
+template PD_DLL_DECL bool *Tensor::mutable_data<bool>(const PlaceType &place);

 std::vector<int> Tensor::shape() const {
  GET_CASTED_TENSOR

--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -345,9 +345,12 @@ if (LINUX)
 endif()

 if (WIN32)
+  set(FLUID_FRAMEWORK_IMPORT_LIB
+    ${PADDLE_BINARY_DIR}/paddle/fluid/framework/${CMAKE_BUILD_TYPE}/paddle_framework.lib
+    CACHE INTERNAL "Fluid framework lib")
  set(FLUID_FRAMEWORK_SHARED_LIB
-      ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_framework.dll
-      CACHE INTERNAL "Fluid framework lib")
+      ${PADDLE_BINARY_DIR}/paddle/fluid/framework/${CMAKE_BUILD_TYPE}/paddle_framework.dll
+      CACHE INTERNAL "Fluid framework dll")
 endif()

 if(APPLE)

--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -416,9 +416,6 @@ void* GetOpDsoHandle(const std::string& dso_name) {
 #if defined(__APPLE__) || defined(__OSX__)
  PADDLE_THROW(platform::errors::Unimplemented(
      "Create custom cpp op outside framework do not support Apple."));
-#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  PADDLE_THROW(platform::errors::Unimplemented(
-      "Create custom cpp op outside framework do not support Windows."));
 #else
  return GetDsoHandleFromSearchPath(FLAGS_op_dir, dso_name);
 #endif

--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -114,23 +114,24 @@ rem ------pre install python requirement----------
 where python
 where pip
 pip install wheel --user
-pip install -r %work_dir%\python\requirements.txt --user
-pip install -r %work_dir%\python\unittest_py\requirements.txt --user
+pip install --force-reinstall -r %work_dir%\python\requirements.txt --user
+pip install --force-reinstall -r %work_dir%\python\unittest_py\requirements.txt --user
 if %ERRORLEVEL% NEQ 0 (
    echo pip install requirements.txt failed!
    exit /b 7
 )

 rem ------pre install clcache and init config----------
-pip install clcache --user
+rem pip install clcache --user
+pip uninstall -y clcache
 :: set USE_CLCACHE to enable clcache
-set USE_CLCACHE=1
+rem set USE_CLCACHE=1
 :: In some scenarios, CLCACHE_HARDLINK can save one file copy.
-set CLCACHE_HARDLINK=1
+rem set CLCACHE_HARDLINK=1
 :: If it takes more than 1000s to obtain the right to use the cache, an error will be reported
-set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000
+rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000
 :: set maximum cache size to 20G
-clcache.exe -M 21474836480
+rem clcache.exe -M 21474836480

 rem ------show summary of current environment----------
 cmake --version
@@ -281,7 +282,7 @@ echo Build third_party successfully!
 set build_times=1
 :build_paddle
 :: reset clcache zero stats for collect PR's actual hit rate
-clcache.exe -z
+rem clcache.exe -z

 echo Build Paddle the %build_times% time:
 if "%WITH_CLCACHE%"=="OFF" (
@@ -305,7 +306,7 @@ echo 0 > %cache_dir%\error_code.txt
 type %cache_dir%\error_code.txt

 :: ci will collect clcache hit rate
-goto :collect_clcache_hits
+rem goto :collect_clcache_hits

 goto:eof

@@ -346,13 +347,14 @@ set /p PADDLE_WHL_FILE_WIN=< whl_file.txt
 @ECHO ON
 pip uninstall -y paddlepaddle
 pip uninstall -y paddlepaddle-gpu
-pip install -U %PADDLE_WHL_FILE_WIN% --user
+pip install %PADDLE_WHL_FILE_WIN% --user
 if %ERRORLEVEL% NEQ 0 (
    call paddle_winci\Scripts\deactivate.bat 2>NUL
    echo pip install whl package failed!
    exit /b 1
 )

+
 set CUDA_VISIBLE_DEVICES=0
 python %work_dir%\paddle\scripts\installation_validate.py
 goto:eof

--- a/python/paddle/fluid/tests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/CMakeLists.txt
@@ -9,7 +9,14 @@ endforeach()
 add_subdirectory(unittests)
 add_subdirectory(book)

-if(NOT APPLE AND NOT WIN32)
+# TODO: support New Custom OP on Mac
+# NOTE: CMake variable names are case-sensitive; the platform flag is `LINUX`.
+if(LINUX)
   add_subdirectory(custom_op)
 endif()
+
+# Windows CPU machine doesn't have CUDA, can't compile .cu file
+# if(WIN32 AND WITH_GPU)
+#   add_subdirectory(custom_op)
+# endif()
+
 set_tests_properties(test_beam_search_decoder PROPERTIES TIMEOUT 120)
--- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt
+++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt
+# New custom OP can support Windows/Linux now
+# 'test_simple_custom_op_jit/test_simple_custom_op_setup' compile .cc and .cu file
+py_test(test_simple_custom_op_setup SRCS test_simple_custom_op_setup.py)
+py_test(test_simple_custom_op_jit SRCS test_simple_custom_op_jit.py)
+
+# Compiling shared library will cost some time, but running process is very fast.
+set_tests_properties(test_simple_custom_op_setup PROPERTIES TIMEOUT 250)
+set_tests_properties(test_simple_custom_op_jit PROPERTIES TIMEOUT 180)
+
+py_test(test_sysconfig SRCS test_sysconfig.py)
+
+# 'test_dispatch' compile .cc file
+py_test(test_dispatch SRCS test_dispatch.py)
+set_tests_properties(test_dispatch PROPERTIES TIMEOUT 180)
+
+# The tests registered below this point only run on Linux. CMake variable
+# names are case-sensitive, so the check must use `LINUX`.
+if(NOT LINUX)
+    return()
+endif()
+
+# TODO(zhouwei): support test_check_abi and abi check on Windows
+py_test(test_check_abi SRCS test_check_abi.py)
+
+# Old custom OP only support Linux, only run on Linux
+py_test(test_custom_op SRCS test_custom_op.py)
+py_test(test_jit_load SRCS test_jit_load.py)
+py_test(test_setup_install SRCS test_setup_install.py)
+py_test(test_setup_build SRCS test_setup_build.py)
+
+set_tests_properties(test_jit_load PROPERTIES TIMEOUT 180)
+set_tests_properties(test_setup_install PROPERTIES TIMEOUT 180)
+set_tests_properties(test_setup_build PROPERTIES TIMEOUT 180)
+
+
 if(WITH_ROCM)
    hip_library(relu_op_shared SHARED SRCS relu_op.cc relu_op.cu DEPS paddle_framework_shared)
 elseif(WITH_GPU)
@@ -18,19 +51,3 @@ get_target_property(TARGET_LIBRARIES relu_op_shared LINK_LIBRARIES)
 LIST(REMOVE_ITEM TARGET_LIBRARIES glog)
 LIST(REMOVE_ITEM TARGET_LIBRARIES gflags)
 set_property(TARGET relu_op_shared PROPERTY LINK_LIBRARIES  ${TARGET_LIBRARIES} )
-
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-
-foreach(src ${TEST_OPS})
-    py_test(${src} SRCS ${src}.py)
-endforeach()
-
-# Compiling .so will cost some time, but running process is very fast.
-set_tests_properties(test_jit_load PROPERTIES TIMEOUT 180)
-set_tests_properties(test_setup_install PROPERTIES TIMEOUT 180)
-set_tests_properties(test_setup_build PROPERTIES TIMEOUT 180)
-set_tests_properties(test_dispatch PROPERTIES TIMEOUT 180)
-
-set_tests_properties(test_simple_custom_op_setup PROPERTIES TIMEOUT 250)
-set_tests_properties(test_simple_custom_op_jit PROPERTIES TIMEOUT 180)
--- a/python/paddle/fluid/tests/custom_op/test_dispatch.py
+++ b/python/paddle/fluid/tests/custom_op/test_dispatch.py
@@ -16,8 +16,18 @@ import os
 import unittest
 import paddle
 import numpy as np
-from paddle.utils.cpp_extension import load
+from paddle.utils.cpp_extension import load, get_build_directory
 from utils import paddle_includes, extra_compile_args
+from paddle.utils.cpp_extension.extension_utils import run_cmd
+
+# Because the shared lib already exists in the cache dir,
+# it will not be compiled again unless the cache dir is cleared.
+if os.name == 'nt':
+    cmd = 'rmdir {} /s/q'.format(get_build_directory())
+else:
+    cmd = 'rm -rf {}'.format(get_build_directory())
+
+run_cmd(cmd, True)

 dispatch_op = load(
    name='dispatch_op',

--- a/python/paddle/fluid/tests/custom_op/test_simple_custom_op_jit.py
+++ b/python/paddle/fluid/tests/custom_op/test_simple_custom_op_jit.py
@@ -13,13 +13,24 @@
 # limitations under the License.

 import os
+import subprocess
 import unittest
 import paddle
 import numpy as np
-from paddle.utils.cpp_extension import load
+from paddle.utils.cpp_extension import load, get_build_directory
+from paddle.utils.cpp_extension.extension_utils import run_cmd
 from utils import paddle_includes, extra_compile_args
 from test_simple_custom_op_setup import relu2_dynamic, relu2_static

+# Because the shared lib already exists in the cache dir,
+# it will not be compiled again unless the cache dir is cleared.
+if os.name == 'nt':
+    cmd = 'rmdir {} /s/q'.format(get_build_directory())
+else:
+    cmd = 'rm -rf {}'.format(get_build_directory())
+
+run_cmd(cmd, True)
+
 # Compile and load custom op Just-In-Time.
 custom_module = load(
    name='simple_jit_relu2',

--- a/python/paddle/fluid/tests/custom_op/test_simple_custom_op_setup.py
+++ b/python/paddle/fluid/tests/custom_op/test_simple_custom_op_setup.py
@@ -91,7 +91,12 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
    def setUp(self):
        cur_dir = os.path.dirname(os.path.abspath(__file__))
        # compile, install the custom op egg into site-packages under background
-        cmd = 'cd {} && python setup_install_simple.py install'.format(cur_dir)
+        if os.name == 'nt':
+            cmd = 'cd /d {} && python setup_install_simple.py install'.format(
+                cur_dir)
+        else:
+            cmd = 'cd {} && python setup_install_simple.py install'.format(
+                cur_dir)
        run_cmd(cmd)

        # NOTE(Aurelius84): Normally, it's no need to add following codes for users.
@@ -99,7 +104,11 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
        # sys.path has been updated. So we update it manually.

        # See: https://stackoverflow.com/questions/56974185/import-runtime-installed-module-using-pip-in-python-3
-        site_dir = site.getsitepackages()[0]
+        if os.name == 'nt':
+            # NOTE(zhouwei25): getsitepackages on windows will return a list: [python install dir, site packages dir]
+            site_dir = site.getsitepackages()[1]
+        else:
+            site_dir = site.getsitepackages()[0]
        custom_egg_path = [
            x for x in os.listdir(site_dir) if 'simple_setup_relu2' in x
        ]

--- a/python/paddle/fluid/tests/custom_op/utils.py
+++ b/python/paddle/fluid/tests/custom_op/utils.py
@@ -23,8 +23,8 @@ site_packages_path = get_python_lib()
 # paddle include directory. Because the following path is generated after installing
 # PaddlePaddle whl, we specify `include_dirs` here to avoid errors in CI.
 paddle_includes = [
-    os.path.join(site_packages_path, 'paddle/include'),
-    os.path.join(site_packages_path, 'paddle/include/third_party')
+    os.path.join(site_packages_path, 'paddle', 'include'),
+    os.path.join(site_packages_path, 'paddle', 'include', 'third_party')
 ]

 # TODO(Aurelius84): Memory layout is different if build paddle with PADDLE_WITH_MKLDNN=ON,

--- a/python/paddle/utils/cpp_extension/cpp_extension.py
+++ b/python/paddle/utils/cpp_extension/cpp_extension.py
@@ -17,16 +17,25 @@ import six
 import sys
 import textwrap
 import copy
+import re

 import setuptools
 from setuptools.command.easy_install import easy_install
 from setuptools.command.build_ext import build_ext

 from .extension_utils import find_cuda_home, normalize_extension_kwargs, add_compile_flag, bootstrap_context
-from .extension_utils import is_cuda_file, prepare_unix_cflags, add_std_without_repeat, get_build_directory
+from .extension_utils import is_cuda_file, prepare_unix_cflags, prepare_win_cflags, add_std_without_repeat, get_build_directory
 from .extension_utils import _import_module_from_library, CustomOpInfo, _write_setup_file, _jit_compile, parse_op_name_from
-from .extension_utils import check_abi_compatibility, log_v, IS_WINDOWS
-from .extension_utils import use_new_custom_op_load_method
+from .extension_utils import check_abi_compatibility, log_v, IS_WINDOWS, OS_NAME
+from .extension_utils import use_new_custom_op_load_method, MSVC_COMPILE_FLAGS
+
+# Note(zhouwei): On Windows, the function 'PyInit_[name]' is exported by default.
+# Possible solutions: 1. the user adds a function PyInit_[name]; 2. disable the export.
+# Refer to https://stackoverflow.com/questions/34689210/error-exporting-symbol-when-building-python-c-extension-in-windows
+if IS_WINDOWS and six.PY3:
+    from distutils.command.build_ext import build_ext as _du_build_ext
+    from unittest.mock import Mock
+    _du_build_ext.get_export_symbols = Mock(return_value=None)

 CUDA_HOME = find_cuda_home()

@@ -112,7 +121,7 @@ def CppExtension(sources, *args, **kwargs):
           sources(list[str]): The C++/CUDA source file names
           args(list[options]): list of config options used to compile shared library
           kwargs(dict[option]): dict of config options used to compile shared library
-           
+
       Returns:
           Extension: An instance of setuptools.Extension
    """
@@ -137,7 +146,7 @@ def CUDAExtension(sources, *args, **kwargs):
           sources(list[str]): The C++/CUDA source file names
           args(list[options]): list of config options used to compile shared library
           kwargs(dict[option]): dict of config options used to compile shared library
-           
+
       Returns:
           Extension: An instance of setuptools.Extension
    """
@@ -191,12 +200,12 @@ class BuildExtension(build_ext, object):
    def __init__(self, *args, **kwargs):
        """
         Attributes are initialized in the following order:
-        
+
            1. super(self).__init__()
            2. initialize_options(self)
             3. the rest of current __init__()
            4. finalize_options(self)
-        
+
        So, it is recommended to set attribute value in `finalize_options`.
        """
        super(BuildExtension, self).__init__(*args, **kwargs)
@@ -225,15 +234,17 @@ class BuildExtension(build_ext, object):
                for compiler in ['cxx', 'nvcc']:
                    if compiler not in extension.extra_compile_args:
                        extension.extra_compile_args[compiler] = []
-            # add determine compile flags
-            add_compile_flag(extension, '-std=c++11')

        # Consider .cu, .cu.cc as valid source extensions.
        self.compiler.src_extensions += ['.cu', '.cu.cc']
        # Save the original _compile method for later.
-        if self.compiler.compiler_type == 'msvc' or IS_WINDOWS:
-            raise NotImplementedError("Not support on MSVC currently.")
+        if self.compiler.compiler_type == 'msvc':
+            self.compiler._cpp_extensions += ['.cu', '.cuh']
+            original_compile = self.compiler.compile
+            original_spawn = self.compiler.spawn
        else:
+            # add determine compile flags
+            add_compile_flag(extension, '-std=c++11')
            original_compile = self.compiler._compile

        def unix_custom_single_compiler(obj, src, ext, cc_args, extra_postargs,
@@ -268,6 +279,81 @@ class BuildExtension(build_ext, object):
                # restore original_compiler
                self.compiler.compiler_so = original_compiler

+        def win_custom_single_compiler(sources,
+                                       output_dir=None,
+                                       macros=None,
+                                       include_dirs=None,
+                                       debug=0,
+                                       extra_preargs=None,
+                                       extra_postargs=None,
+                                       depends=None):
+
+            self.cflags = copy.deepcopy(extra_postargs)
+            extra_postargs = None
+
+            def win_custom_spawn(cmd):
+                # Using regex to modify compile options
+                compile_options = self.compiler.compile_options
+                for i in range(len(cmd)):
+                    if re.search('/MD', cmd[i]) is not None:
+                        cmd[i] = '/MT'
+                    if re.search('/W[1-4]', cmd[i]) is not None:
+                        cmd[i] = '/W0'
+
+                # Using regex to match src, obj and include files
+                src_regex = re.compile('/T(p|c)(.*)')
+                src_list = [
+                    m.group(2) for m in (src_regex.match(elem) for elem in cmd)
+                    if m
+                ]
+
+                obj_regex = re.compile('/Fo(.*)')
+                obj_list = [
+                    m.group(1) for m in (obj_regex.match(elem) for elem in cmd)
+                    if m
+                ]
+
+                include_regex = re.compile(r'((\-|\/)I.*)')
+                include_list = [
+                    m.group(1)
+                    for m in (include_regex.match(elem) for elem in cmd) if m
+                ]
+
+                assert len(src_list) == 1 and len(obj_list) == 1
+                src = src_list[0]
+                obj = obj_list[0]
+                if is_cuda_file(src):
+                    assert CUDA_HOME is not None
+                    nvcc_cmd = os.path.join(CUDA_HOME, 'bin', 'nvcc')
+                    if isinstance(self.cflags, dict):
+                        cflags = self.cflags['nvcc']
+                    elif isinstance(self.cflags, list):
+                        cflags = self.cflags
+                    else:
+                        cflags = []
+
+                    cflags = prepare_win_cflags(cflags) + ['--use-local-env']
+                    for flag in MSVC_COMPILE_FLAGS:
+                        cflags = ['-Xcompiler', flag] + cflags
+                    cmd = [nvcc_cmd, '-c', src, '-o', obj
+                           ] + include_list + cflags
+                elif isinstance(self.cflags, dict):
+                    cflags = MSVC_COMPILE_FLAGS + self.cflags['cxx']
+                    cmd += cflags
+                elif isinstance(self.cflags, list):
+                    cflags = MSVC_COMPILE_FLAGS + self.cflags
+                    cmd += cflags
+
+                return original_spawn(cmd)
+
+            try:
+                self.compiler.spawn = win_custom_spawn
+                return original_compile(sources, output_dir, macros,
+                                        include_dirs, debug, extra_preargs,
+                                        extra_postargs, depends)
+            finally:
+                self.compiler.spawn = original_spawn
+
        def object_filenames_with_cuda(origina_func, build_directory):
            """
            Decorated the function to add customized naming machanism.
@@ -280,10 +366,13 @@ class BuildExtension(build_ext, object):
                    objects = origina_func(source_filenames, strip_dir,
                                           output_dir)
                    for i, source in enumerate(source_filenames):
-                        # modify xx.o -> xx.cu.o
+                        # rename xx.o -> xx.cu.o (xx.obj -> xx.cu.obj on MSVC)
                        if is_cuda_file(source):
                            old_obj = objects[i]
-                            objects[i] = old_obj[:-1] + 'cu.o'
+                            if self.compiler.compiler_type == 'msvc':
+                                objects[i] = old_obj[:-3] + 'cu.obj'
+                            else:
+                                objects[i] = old_obj[:-1] + 'cu.o'
                    # if user set build_directory, output objects there.
                    if build_directory is not None:
                        objects = [
@@ -300,10 +389,13 @@ class BuildExtension(build_ext, object):
            return wrapper

        # customized compile process
-        self.compiler._compile = unix_custom_single_compiler
+        if self.compiler.compiler_type == 'msvc':
+            self.compiler.compile = win_custom_single_compiler
+        else:
+            self.compiler._compile = unix_custom_single_compiler
+
        self.compiler.object_filenames = object_filenames_with_cuda(
            self.compiler.object_filenames, self.build_lib)
-
        self._record_op_info()

        print("Compiling user custom op, it will cost a few seconds.....")
@@ -333,15 +425,21 @@ class BuildExtension(build_ext, object):
            compiler = self.compiler.compiler_cxx[0]
        elif IS_WINDOWS:
            compiler = os.environ.get('CXX', 'cl')
-            raise NotImplementedError("We don't support Windows Currently.")
        else:
            compiler = os.environ.get('CXX', 'c++')

        check_abi_compatibility(compiler)
+        # Warn user if VC env is activated but `DISTUTILS_USE_SDK` is not set.
+        if IS_WINDOWS and 'VSCMD_ARG_TGT_ARCH' in os.environ and 'DISTUTILS_USE_SDK' not in os.environ:
+            msg = (
+                'It seems that the VC environment is activated but DISTUTILS_USE_SDK is not set.'
+                'This may lead to multiple activations of the VC env.'
+                'Please set `DISTUTILS_USE_SDK=1` and try again.')
+            raise UserWarning(msg)

    def _record_op_info(self):
        """
-        Record custum op inforomation. 
+        Record custom op information.
        """
        # parse shared library abs path
        outputs = self.get_outputs()
@@ -380,7 +478,13 @@ class EasyInstallCommand(easy_install, object):
        # .so shared library to another name.
        for egg_file in self.outputs:
            filename, ext = os.path.splitext(egg_file)
-            if ext == '.so':
+            will_rename = False
+            if OS_NAME.startswith('linux') and ext == '.so':
+                will_rename = True
+            elif IS_WINDOWS and ext == '.pyd':
+                will_rename = True
+
+            if will_rename:
                new_so_path = filename + "_pd_" + ext
                if not os.path.exists(new_so_path):
                    os.rename(r'%s' % egg_file, r'%s' % new_so_path)
@@ -425,7 +529,7 @@ def load(name,
        extra_include_paths(list[str]): additional include path used to search header files.
                                        Default None.
        build_directory(str): specific directory path to put shared library file. If set None,
-                            it will use `PADDLE_EXTENSION_DIR` from os.environ. Use 
+                            it will use `PADDLE_EXTENSION_DIR` from os.environ. Use
                            `paddle.utils.cpp_extension.get_build_directory()` to see the location.
        interpreter(str): alias or full interpreter path to specific which one to use if have installed multiple.
                           If set None, will use `python` as default interpreter.
@@ -448,6 +552,10 @@ def load(name,

    # ensure to use abs path
    build_directory = os.path.abspath(build_directory)
+    # On Windows, prepend build_directory to 'path' so the shared library is loaded from it
+    if IS_WINDOWS:
+        os.environ['path'] = build_directory + ';' + os.environ['path']
+
    log_v("build_directory: {}".format(build_directory), verbose)

    file_path = os.path.join(build_directory, "setup.py")
@@ -460,7 +568,7 @@ def load(name,
    log_v("additonal compile_flags: [{}]".format(' '.join(compile_flags)),
          verbose)

-    # write setup.py file and compile it 
+    # write setup.py file and compile it
    _write_setup_file(name, sources, file_path, extra_include_paths,
                      compile_flags, extra_ldflags, verbose)
    _jit_compile(file_path, interpreter, verbose)

--- a/python/paddle/utils/cpp_extension/extension_utils.py
+++ b/python/paddle/utils/cpp_extension/extension_utils.py
@@ -38,9 +38,19 @@ logger = logging.getLogger("utils.cpp_extension")

 OS_NAME = sys.platform
 IS_WINDOWS = OS_NAME.startswith('win')
-NVCC_COMPILE_FLAGS = [
-    '-ccbin', 'cc', '-DPADDLE_WITH_CUDA', '-DEIGEN_USE_GPU', '-DPADDLE_USE_DSO',
-    '-Xcompiler', '-fPIC', '-w', '--expt-relaxed-constexpr', '-O3', '-DNVCC'
+
+MSVC_COMPILE_FLAGS = [
+    '/MT', '/wd4819', '/wd4251', '/wd4244', '/wd4267', '/wd4275', '/wd4018',
+    '/wd4190', '/EHsc', '/w', '/DPADDLE_WITH_CUDA', '/DEIGEN_USE_GPU',
+    '/DNDEBUG'
+]
+
+MSVC_LINK_FLAGS = [
+    '/MACHINE:X64', 'paddle_framework.lib', 'cudadevrt.lib', 'cudart_static.lib'
+]
+
+COMMON_NVCC_FLAGS = [
+    '-DPADDLE_WITH_CUDA', '-DEIGEN_USE_GPU', '-DPADDLE_USE_DSO', '-O3'
 ]

 GCC_MINI_VERSION = (5, 4, 0)
@@ -81,8 +91,8 @@ information
 USING_NEW_CUSTOM_OP_LOAD_METHOD = True


-# NOTE(chenweihang): In order to be compatible with 
-# the two custom op define method, after removing 
+# NOTE(chenweihang): In order to be compatible with
+# the two custom op define method, after removing
 # old method, we can remove them together
 def use_new_custom_op_load_method(*args):
    global USING_NEW_CUSTOM_OP_LOAD_METHOD
@@ -210,7 +220,21 @@ def prepare_unix_cflags(cflags):
    """
    Prepare all necessary compiled flags for nvcc compiling CUDA files.
    """
-    cflags = NVCC_COMPILE_FLAGS + cflags + get_cuda_arch_flags(cflags)
+    cflags = COMMON_NVCC_FLAGS + [
+        '-ccbin', 'cc', '-Xcompiler', '-fPIC', '-w', '--expt-relaxed-constexpr',
+        '-DNVCC'
+    ] + cflags + get_cuda_arch_flags(cflags)
+
+    return cflags
+
+
+def prepare_win_cflags(cflags):
+    """
+    Prepare all necessary compiled flags for nvcc compiling CUDA files.
+    """
+    cflags = COMMON_NVCC_FLAGS + [
+        '-DGOOGLE_GLOG_DLL_DECL', '-DBOOST_HAS_STATIC_ASSERT', '-w'
+    ] + cflags + get_cuda_arch_flags(cflags)

    return cflags

@@ -238,7 +262,7 @@ def get_cuda_arch_flags(cflags):


 def normalize_extension_kwargs(kwargs, use_cuda=False):
-    """ 
+    """
    Normalize include_dirs, library_dir and other attributes in kwargs.
    """
    assert isinstance(kwargs, dict)
@@ -252,52 +276,36 @@ def normalize_extension_kwargs(kwargs, use_cuda=False):
    library_dirs.extend(find_paddle_libraries(use_cuda))
    kwargs['library_dirs'] = library_dirs

-    # add runtime library dirs
-    runtime_library_dirs = kwargs.get('runtime_library_dirs', [])
-    runtime_library_dirs.extend(find_paddle_libraries(use_cuda))
-    kwargs['runtime_library_dirs'] = runtime_library_dirs
+    if IS_WINDOWS:
+        # TODO(zhouwei): may append compile flags in future
+        pass
+        # append link flags
+        extra_link_args = kwargs.get('extra_link_args', [])
+        extra_link_args.extend(MSVC_LINK_FLAGS)
+        kwargs['extra_link_args'] = extra_link_args
+    else:
+        # append compile flags
+        extra_compile_args = kwargs.get('extra_compile_args', [])
+        extra_compile_args.extend(['-g', '-w'])  # disable warnings
+        kwargs['extra_compile_args'] = extra_compile_args

-    # append compile flags
-    extra_compile_args = kwargs.get('extra_compile_args', [])
-    extra_compile_args.extend(['-g', '-w'])  # diable warnings
-    kwargs['extra_compile_args'] = extra_compile_args
+        # append link flags
+        extra_link_args = kwargs.get('extra_link_args', [])
+        extra_link_args.append('-lpaddle_framework')
+        if use_cuda:
+            extra_link_args.append('-lcudart')

-    # append link flags
-    extra_link_args = kwargs.get('extra_link_args', [])
-    extra_link_args.append('-lpaddle_framework')
-    if use_cuda:
-        extra_link_args.append('-lcudart')
+        kwargs['extra_link_args'] = extra_link_args

-    kwargs['extra_link_args'] = extra_link_args
+        # add runtime library dirs
+        runtime_library_dirs = kwargs.get('runtime_library_dirs', [])
+        runtime_library_dirs.extend(find_paddle_libraries(use_cuda))
+        kwargs['runtime_library_dirs'] = runtime_library_dirs

    kwargs['language'] = 'c++'
    return kwargs


-def find_paddle_includes(use_cuda=False):
-    """
-    Return Paddle necessary include dir path.
-    """
-    # pythonXX/site-packages/paddle/include
-    paddle_include_dir = get_include()
-    third_party_dir = os.path.join(paddle_include_dir, 'third_party')
-
-    include_dirs = [paddle_include_dir, third_party_dir]
-
-    return include_dirs
-
-
-def find_cuda_includes():
-
-    cuda_home = find_cuda_home()
-    if cuda_home is None:
-        raise ValueError(
-            "Not found CUDA runtime, please use `export CUDA_HOME=XXX` to specific it."
-        )
-
-    return [os.path.join(cuda_home, 'lib64')]
-
-
 def find_cuda_home():
    """
    Use heuristic method to find cuda path
@@ -315,19 +323,22 @@ def find_cuda_home():
                if six.PY3:
                    nvcc_path = nvcc_path.decode()
                nvcc_path = nvcc_path.rstrip('\r\n')
+                log_v(nvcc_path)
                # for example: /usr/local/cuda/bin/nvcc
                cuda_home = os.path.dirname(os.path.dirname(nvcc_path))
        except:
            if IS_WINDOWS:
                # search from default NVIDIA GPU path
                candidate_paths = glob.glob(
-                    'C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v*.*')
+                    'C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*.*'
+                )
                if len(candidate_paths) > 0:
                    cuda_home = candidate_paths[0]
            else:
                cuda_home = "/usr/local/cuda"
    # step 3. check whether path is valid
-    if not os.path.exists(cuda_home) and core.is_compiled_with_cuda():
+    if cuda_home and not os.path.exists(
+            cuda_home) and core.is_compiled_with_cuda():
        cuda_home = None
        warnings.warn(
            "Not found CUDA runtime, please use `export CUDA_HOME= XXX` to specific it."
@@ -336,15 +347,65 @@ def find_cuda_home():
    return cuda_home


+def find_cuda_includes():
+    """
+    Use heuristic method to find cuda include path
+    """
+    cuda_home = find_cuda_home()
+    if cuda_home is None:
+        raise ValueError(
+            "Not found CUDA runtime, please use `export CUDA_HOME=XXX` to specific it."
+        )
+
+    return [os.path.join(cuda_home, 'include')]
+
+
+def find_paddle_includes(use_cuda=False):
+    """
+    Return Paddle necessary include dir path.
+    """
+    # pythonXX/site-packages/paddle/include
+    paddle_include_dir = get_include()
+    third_party_dir = os.path.join(paddle_include_dir, 'third_party')
+    include_dirs = [paddle_include_dir, third_party_dir]
+
+    # TODO(zhouwei): Eigen needs cuda_runtime.h, so always
+    # extend include_dirs with cuda_include_dir for now.
+    cuda_include_dir = find_cuda_includes()
+    include_dirs.extend(cuda_include_dir)
+
+    return include_dirs
+
+
+def find_cuda_libraries():
+    """
+    Use heuristic method to find cuda static lib path
+    """
+    cuda_home = find_cuda_home()
+    if cuda_home is None:
+        raise ValueError(
+            "Not found CUDA runtime, please use `export CUDA_HOME=XXX` to specific it."
+        )
+    if IS_WINDOWS:
+        cuda_lib_dir = [os.path.join(cuda_home, 'lib', 'x64')]
+    else:
+        cuda_lib_dir = [os.path.join(cuda_home, 'lib64')]
+
+    return cuda_lib_dir
+
+
 def find_paddle_libraries(use_cuda=False):
    """
    Return Paddle necessary library dir path.
    """
    # pythonXX/site-packages/paddle/libs
    paddle_lib_dirs = [get_lib()]
-    if use_cuda:
-        cuda_dirs = find_cuda_includes()
-        paddle_lib_dirs.extend(cuda_dirs)
+
+    # TODO(zhouwei): because Eigen needs cuda_runtime.h,
+    # always extend paddle_lib_dirs with cuda_lib_dir for now.
+    cuda_lib_dir = find_cuda_libraries()
+    paddle_lib_dirs.extend(cuda_lib_dir)
+
    return paddle_lib_dirs


@@ -374,12 +435,14 @@ def get_build_directory(verbose=False):
    root_extensions_directory = os.environ.get('PADDLE_EXTENSION_DIR')
    if root_extensions_directory is None:
        dir_name = "paddle_extensions"
-        if OS_NAME.startswith('linux'):
-            root_extensions_directory = os.path.join(
-                os.path.expanduser('~/.cache'), dir_name)
-        else:
-            # TODO(Aurelius84): consider wind32/macOs
-            raise NotImplementedError("Only support Linux now.")
+        root_extensions_directory = os.path.join(
+            os.path.expanduser('~/.cache'), dir_name)
+        if IS_WINDOWS:
+            root_extensions_directory = os.path.normpath(
+                root_extensions_directory)
+        elif OS_NAME.startswith('darwin'):
+            # TODO(Aurelius84): consider macOs
+            raise NotImplementedError("Not support Mac now.")

        log_v("$PADDLE_EXTENSION_DIR is not set, using path: {} by default.".
              format(root_extensions_directory), verbose)
@@ -410,10 +473,13 @@ def parse_op_info(op_name):

 def _import_module_from_library(module_name, build_directory, verbose=False):
    """
-    Load .so shared library and import it as callable python module.
+    Load shared library and import it as callable python module.
    """
-    # TODO(Aurelius84): Consider file suffix is .dll on Windows Platform.
-    ext_path = os.path.join(build_directory, module_name + '.so')
+    if IS_WINDOWS:
+        dynamic_suffix = '.pyd'
+    else:
+        dynamic_suffix = '.so'
+    ext_path = os.path.join(build_directory, module_name + dynamic_suffix)
    if not os.path.exists(ext_path):
        raise FileNotFoundError("Extension path: {} does not exist.".format(
            ext_path))
@@ -565,12 +631,12 @@ def _write_setup_file(name,

 def list2str(args):
    """
-    Convert list[str] into string. For example: [x, y] -> "['x', 'y']"
+    Convert list[str] into string. For example: ['x', 'y'] -> "['x', 'y']"
    """
    if args is None: return '[]'
    assert isinstance(args, (list, tuple))
-    args = ["'{}'".format(arg) for arg in args]
-    return '[' + ','.join(args) + ']'
+    args = ["{}".format(arg) for arg in args]
+    return repr(args)


 def _jit_compile(file_path, interpreter=None, verbose=False):
@@ -583,7 +649,8 @@ def _jit_compile(file_path, interpreter=None, verbose=False):
    if interpreter is None:
        interpreter = 'python'
    try:
-        py_path = subprocess.check_output(['which', interpreter])
+        which = 'where' if IS_WINDOWS else 'which'
+        py_path = subprocess.check_output([which, interpreter])
        py_version = subprocess.check_output([interpreter, '-V'])
        if six.PY3:
            py_path = py_path.decode()
@@ -596,8 +663,13 @@ def _jit_compile(file_path, interpreter=None, verbose=False):
            'Failed to check Python interpreter with `{}`, errors: {}'.format(
                interpreter, error))

-    compile_cmd = 'cd {} && {} {} build'.format(ext_dir, interpreter,
-                                                setup_file)
+    if IS_WINDOWS:
+        compile_cmd = 'cd /d {} && {} {} build'.format(ext_dir, interpreter,
+                                                       setup_file)
+    else:
+        compile_cmd = 'cd {} && {} {} build'.format(ext_dir, interpreter,
+                                                    setup_file)
+
    print("Compiling user custom op, it will cost a few seconds.....")
    run_cmd(compile_cmd, verbose)

@@ -682,7 +754,7 @@ def check_abi_compatibility(compiler, verbose=False):
    try:
        if OS_NAME.startswith('linux'):
            version_info = subprocess.check_output(
-                [compiler, '-dumpfullversion'])
+                [compiler, '-dumpfullversion', '-dumpversion'])
            if six.PY3:
                version_info = version_info.decode()
            version = version_info.strip().split('.')
@@ -694,8 +766,8 @@ def check_abi_compatibility(compiler, verbose=False):
                warnings.warn(
                    ABI_INCOMPATIBILITY_WARNING.format(
                        user_compiler=compiler, version=version_info.strip()))
-        # TODO(Aurelius84): check version compatibility on windows
        elif IS_WINDOWS:
+            # TODO(zhouwei): support checking ABI compatibility on Windows
            warnings.warn("We don't support Windows now.")
    except Exception:
        _, error, _ = sys.exc_info()
@@ -714,7 +786,7 @@ def _expected_compiler_current_platform():
    return expect_compilers


-def log_v(info, verbose):
+def log_v(info, verbose=True):
    """
    Print log information on stdout.
    """

--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -3,7 +3,8 @@ numpy>=1.13, <=1.16.4 ; python_version<"3.5"
 numpy>=1.13 ; python_version>="3.5" and platform_system != "Windows"
 numpy>=1.13, <=1.19.3 ; python_version>="3.5" and platform_system == "Windows"
 protobuf>=3.1.0
-gast>=0.3.3
+gast>=0.3.3 ; platform_system != "Windows"
+gast==0.3.3 ; platform_system == "Windows"
 Pillow
 six
 decorator

--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -335,11 +335,16 @@ if '${WITH_XPU_BKCL}' == 'ON':
    shutil.copy('${XPU_BKCL_LIB}', libs_path)
    package_data['paddle.libs']+=['${XPU_BKCL_LIB_NAME}']

-# copy libfuild_framework.so to libs
-if os.name != 'nt' and sys.platform != 'darwin':
-    paddle_framework_lib='${FLUID_FRAMEWORK_SHARED_LIB}'
-    shutil.copy(paddle_framework_lib, libs_path)
-    package_data['paddle.libs'] += [('libpaddle_framework' if os.name != 'nt' else 'paddle_framework') + ext_name]
+# copy libpaddle_framework.so to libs on linux
+if sys.platform.startswith('linux'):
+    shutil.copy('${FLUID_FRAMEWORK_SHARED_LIB}', libs_path)
+    package_data['paddle.libs'] += ['libpaddle_framework.so']
+
+# copy paddle_framework.lib/paddle_framework.dll to libs on windows
+if os.name == 'nt':
+    shutil.copy('${FLUID_FRAMEWORK_IMPORT_LIB}', libs_path)
+    shutil.copy('${FLUID_FRAMEWORK_SHARED_LIB}', libs_path)
+    package_data['paddle.libs'] += ['paddle_framework.lib', 'paddle_framework.dll']

 # remove unused paddle/libs/__init__.py
 if os.path.isfile(libs_path+'/__init__.py'):
@@ -410,9 +415,9 @@ if '${WITH_GPU}' == 'ON' or '${WITH_ROCM}' == 'ON':
 class InstallCommand(InstallCommandBase):
    def finalize_options(self):
        ret = InstallCommandBase.finalize_options(self)
-        self.install_headers = os.path.join(self.install_purelib, 'paddle',
-                                            'include')
        self.install_lib = self.install_platlib
+        self.install_headers = os.path.join(self.install_platlib, 'paddle',
+                                            'include')
        return ret


@@ -463,11 +468,6 @@ class InstallHeaders(Command):
        return self.copy_file(header, install_dir)

    def run(self):
-        # only copy third_party/cudaErrorMessage.pb for cudaErrorMessage on mac or windows
-        if os.name == 'nt' or sys.platform == 'darwin':
-            if '${WITH_GPU}' == 'ON' or '${WITH_ROCM}' == 'ON':
-                self.mkdir_and_copy_file('${cudaerror_INCLUDE_DIR}/cudaErrorMessage.pb')
-            return
        hdrs = self.distribution.headers
        if not hdrs:
            return