diff --git a/CMakeLists.txt b/CMakeLists.txt
index bd9605a1abb3dbac13c8a95cdb0f90ca718d509d..f24513d605c49b608cb32425a861448a3acd6c6a 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -335,6 +335,8 @@ set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 
+add_definitions(-DPADDLE_DLL_EXPORT)
+
 if(ON_INFER)
     # you can trun off the paddle fluid and inference lib by set ON_INFER=OFF
     message(STATUS "On inference mode, will take place some specific optimization.")
diff --git a/paddle/fluid/extension/include/all.h b/paddle/fluid/extension/include/all.h
index 5aa61f8203e75320cfdf11ed34fe9a7462548c60..e2a3bc38c5f4ab3ee1d126159b7961d979a33c06 100644
--- a/paddle/fluid/extension/include/all.h
+++ b/paddle/fluid/extension/include/all.h
@@ -18,6 +18,12 @@ limitations under the License. */
 #error C++11 or later compatible compiler is required to use Paddle.
 #endif
 
+#ifdef _WIN32
+#ifndef NOMINMAX
+#define NOMINMAX  // msvc max/min macro conflict with std::min/max
+#endif
+#endif
+
 #include "paddle/fluid/extension/include/dispatch.h"
 #include "paddle/fluid/extension/include/dtype.h"
 #include "paddle/fluid/extension/include/op_meta_info.h"
diff --git a/paddle/fluid/extension/include/dll_decl.h b/paddle/fluid/extension/include/dll_decl.h
new file mode 100644
index 0000000000000000000000000000000000000000..3dbea5e6dffc271cd2edc4e399d96e18e259d936
--- /dev/null
+++ b/paddle/fluid/extension/include/dll_decl.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifndef PD_DLL_DECL  // honor a caller-supplied definition on every platform
+#if defined(_WIN32)
+#ifdef PADDLE_DLL_EXPORT
+#define PD_DLL_DECL __declspec(dllexport)
+#else
+#define PD_DLL_DECL __declspec(dllimport)
+#endif  // PADDLE_DLL_EXPORT
+#else
+#define PD_DLL_DECL  // no import/export decoration outside Windows
+#endif  // _WIN32
+#endif  // PD_DLL_DECL
diff --git a/paddle/fluid/extension/include/op_meta_info.h b/paddle/fluid/extension/include/op_meta_info.h
index 920049e2390ed38b12f3466fb35bf37c77dfbbe2..c16f61374f7cba5dd727fe5d22449bbeca772de8 100644
--- a/paddle/fluid/extension/include/op_meta_info.h
+++ b/paddle/fluid/extension/include/op_meta_info.h
@@ -14,12 +14,14 @@ limitations under the License. */
 
 #pragma once
 
+#include <iostream>
 #include <string>
 #include <unordered_map>
 #include <vector>
 
 #include <boost/any.hpp>
 
+#include "paddle/fluid/extension/include/dll_decl.h"
 #include "paddle/fluid/extension/include/tensor.h"
 
 /**
@@ -31,7 +33,7 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
-class OpMetaInfoHelper;
+class PD_DLL_DECL OpMetaInfoHelper;
 }  // namespace framework
 
 using Tensor = paddle::Tensor;
@@ -43,6 +45,26 @@ using Tensor = paddle::Tensor;
   classname& operator=(const classname&) = delete; \
   classname& operator=(classname&&) = delete
 
+#if defined _WIN32
+#define HANDLE_THE_ERROR try {
+#define END_HANDLE_THE_ERROR            \
+  }                                     \
+  catch (const std::exception& e) {     \
+    std::cerr << e.what() << std::endl; \
+    throw e;                            \
+  }
+#else
+#define HANDLE_THE_ERROR
+#define END_HANDLE_THE_ERROR
+#endif
+
+#define PD_THROW(err_msg)              \
+  do {                                 \
+    HANDLE_THE_ERROR                   \
+    throw std::runtime_error(err_msg); \
+    END_HANDLE_THE_ERROR               \
+  } while (0)
+
 ///////////////// Util Define and Function ////////////////
 
 inline std::string Grad(const std::string& var_name) {
@@ -106,7 +128,7 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
                                                             attr_idx + 1>(
             inputs, attrs, pargs..., arg);
       } catch (boost::bad_any_cast&) {
-        throw std::runtime_error(
+        PD_THROW(
             "Attribute cast error in custom operator. Expected int value.");
       }
     }
@@ -220,7 +242,7 @@ struct InferDtypeFuncImpl<Return (*)(Args...), impl_fn> {
 
 ////////////////////// Op Meta Info //////////////////////
 
-class OpMetaInfo {
+class PD_DLL_DECL OpMetaInfo {
  public:
   explicit OpMetaInfo(const std::string& op_name) : name_(op_name) {}
   OpMetaInfo& Inputs(std::vector<std::string>&& inputs);
@@ -246,7 +268,7 @@ class OpMetaInfo {
 
 //////////////// Op Meta Info Map /////////////////
 
-class OpMetaInfoMap {
+class PD_DLL_DECL OpMetaInfoMap {
  public:
   // this function's impl should keep in header file.
   // if move to cc file, meta info can not be added
@@ -270,14 +292,14 @@ class OpMetaInfoMap {
 
 //////////////// Op Meta Info Builder /////////////////
 
-class OpMetaInfoBuilder {
+class PD_DLL_DECL OpMetaInfoBuilder {
  public:
   explicit OpMetaInfoBuilder(std::string&& name);
   OpMetaInfoBuilder& Inputs(std::vector<std::string>&& inputs);
   OpMetaInfoBuilder& Outputs(std::vector<std::string>&& outputs);
-  OpMetaInfoBuilder& SetKernelFn(KernelFunc&& func);
-  OpMetaInfoBuilder& SetInferShapeFn(InferShapeFunc&& func);
-  OpMetaInfoBuilder& SetInferDtypeFn(InferDtypeFunc&& func);
+  OpMetaInfoBuilder& SetKernelFn(KernelFunc func);
+  OpMetaInfoBuilder& SetInferShapeFn(InferShapeFunc func);
+  OpMetaInfoBuilder& SetInferDtypeFn(InferDtypeFunc func);
   OpMetaInfoBuilder& SetBackwardOp(const std::string& bwd_op_name);
 
  private:
@@ -317,8 +339,12 @@ void LoadCustomOperatorLib(const std::string& dso_name);
 extern "C" {
 #endif
 
+#if defined(_WIN32)
 // C-API to get global OpMetaInfoMap.
-paddle::OpMetaInfoMap& PD_GetOpMetaInfoMap();
+__declspec(dllexport) inline paddle::OpMetaInfoMap& PD_GetOpMetaInfoMap() {
+  return paddle::OpMetaInfoMap::Instance();
+}
+#endif  // _WIN32
 
 #ifdef __cplusplus
 }
diff --git a/paddle/fluid/extension/include/tensor.h b/paddle/fluid/extension/include/tensor.h
index a5ce0d1a5858b0422e6187bf2ca0e7198b87ed57..47af4dc70a15ffde980daa65ce769f5e2371058c 100644
--- a/paddle/fluid/extension/include/tensor.h
+++ b/paddle/fluid/extension/include/tensor.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <memory>
 #include <vector>
+#include "paddle/fluid/extension/include/dll_decl.h"
 #include "paddle/fluid/extension/include/dtype.h"
 #include "paddle/fluid/extension/include/place.h"
 
@@ -23,7 +24,7 @@ namespace paddle {
 namespace framework {
 class CustomTensorUtils;
 }  // namespace framework
-class Tensor {
+class PD_DLL_DECL Tensor {
  public:
   /// \brief Construct a Tensor on target Place for CustomOp.
   /// Generally it's only used for user to create Tensor.
diff --git a/paddle/fluid/extension/src/op_meta_info.cc b/paddle/fluid/extension/src/op_meta_info.cc
index f31723e5ac83675884f950c1c4e8917c220bc474..0273dfd5d07a69a30e1ca00c3f2a42b9ff8a8c50 100644
--- a/paddle/fluid/extension/src/op_meta_info.cc
+++ b/paddle/fluid/extension/src/op_meta_info.cc
@@ -78,17 +78,17 @@ OpMetaInfoBuilder& OpMetaInfoBuilder::Outputs(
   return *this;
 }
 
-OpMetaInfoBuilder& OpMetaInfoBuilder::SetKernelFn(KernelFunc&& func) {
-  info_ptr_->SetKernelFn(std::forward<KernelFunc>(func));
+OpMetaInfoBuilder& OpMetaInfoBuilder::SetKernelFn(KernelFunc func) {
+  info_ptr_->SetKernelFn(std::move(func));  // by-value sink param
   return *this;
 }
 
-OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferShapeFn(InferShapeFunc&& func) {
-  info_ptr_->SetInferShapeFn(std::forward<InferShapeFunc>(func));
+OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferShapeFn(InferShapeFunc func) {
+  info_ptr_->SetInferShapeFn(std::move(func));  // by-value sink param
   return *this;
 }
 
-OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferDtypeFn(InferDtypeFunc&& func) {
-  info_ptr_->SetInferDtypeFn(std::forward<InferDtypeFunc>(func));
+OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferDtypeFn(InferDtypeFunc func) {
+  info_ptr_->SetInferDtypeFn(std::move(func));  // by-value sink param
   return *this;
 }
@@ -114,10 +114,17 @@ void LoadCustomOperatorLib(const std::string& dso_name) {
 }
 }  // namespace paddle
 
+#ifdef __cplusplus
 extern "C" {
+#endif
 
+#ifndef _WIN32
+// C-API to get global OpMetaInfoMap.
 paddle::OpMetaInfoMap& PD_GetOpMetaInfoMap() {
   return paddle::OpMetaInfoMap::Instance();
 }
+#endif
 
+#ifdef __cplusplus
 }  // end extern "C"
+#endif
diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index 11d505a5aab4f4d33926162445cffd3f5ca4db32..39ed27486411080c167c19f02e9adebb2c2c1d90 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -207,73 +207,87 @@ Tensor Tensor::copy_to(const PlaceType &target_place) const {
   return target;
 }
 
-template Tensor Tensor::copy_to<paddle::platform::float16>(
+template PD_DLL_DECL Tensor
+Tensor::copy_to<paddle::platform::float16>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::bfloat16>(
     const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<paddle::platform::bfloat16>(
+template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex64>(
     const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<paddle::platform::complex64>(
+template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex128>(
     const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<paddle::platform::complex128>(
-    const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<float>(const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<double>(const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<int64_t>(const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<int32_t>(const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<uint8_t>(const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<int8_t>(const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<int16_t>(const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<bool>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<float>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<double>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<int64_t>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<int32_t>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<uint8_t>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<int8_t>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<int16_t>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<bool>(const PlaceType &target_place) const;
 
-template float *Tensor::data<float>() const;
-template double *Tensor::data<double>() const;
-template int64_t *Tensor::data<int64_t>() const;
-template int32_t *Tensor::data<int32_t>() const;
-template uint8_t *Tensor::data<uint8_t>() const;
-template int8_t *Tensor::data<int8_t>() const;
-template paddle::platform::float16 *Tensor::data<paddle::platform::float16>()
-    const;
-template paddle::platform::bfloat16 *Tensor::data<paddle::platform::bfloat16>()
-    const;
-template paddle::platform::complex128 *
+template PD_DLL_DECL float *Tensor::data<float>() const;
+template PD_DLL_DECL double *Tensor::data<double>() const;
+template PD_DLL_DECL int64_t *Tensor::data<int64_t>() const;
+template PD_DLL_DECL int32_t *Tensor::data<int32_t>() const;
+template PD_DLL_DECL uint8_t *Tensor::data<uint8_t>() const;
+template PD_DLL_DECL int8_t *Tensor::data<int8_t>() const;
+template PD_DLL_DECL paddle::platform::float16 *
+Tensor::data<paddle::platform::float16>() const;
+template PD_DLL_DECL paddle::platform::bfloat16 *
+Tensor::data<paddle::platform::bfloat16>() const;
+template PD_DLL_DECL paddle::platform::complex128 *
 Tensor::data<paddle::platform::complex128>() const;
-template paddle::platform::complex64 *
+template PD_DLL_DECL paddle::platform::complex64 *
 Tensor::data<paddle::platform::complex64>() const;
-template int16_t *Tensor::data<int16_t>() const;
-template bool *Tensor::data<bool>() const;
+template PD_DLL_DECL int16_t *Tensor::data<int16_t>() const;
+template PD_DLL_DECL bool *Tensor::data<bool>() const;
 
-template float *Tensor::mutable_data<float>();
-template double *Tensor::mutable_data<double>();
-template int64_t *Tensor::mutable_data<int64_t>();
-template int32_t *Tensor::mutable_data<int32_t>();
-template uint8_t *Tensor::mutable_data<uint8_t>();
-template int8_t *Tensor::mutable_data<int8_t>();
-template paddle::platform::float16 *
+template PD_DLL_DECL float *Tensor::mutable_data<float>();
+template PD_DLL_DECL double *Tensor::mutable_data<double>();
+template PD_DLL_DECL int64_t *Tensor::mutable_data<int64_t>();
+template PD_DLL_DECL int32_t *Tensor::mutable_data<int32_t>();
+template PD_DLL_DECL uint8_t *Tensor::mutable_data<uint8_t>();
+template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>();
+template PD_DLL_DECL paddle::platform::float16 *
 Tensor::mutable_data<paddle::platform::float16>();
-template paddle::platform::bfloat16 *
+template PD_DLL_DECL paddle::platform::bfloat16 *
 Tensor::mutable_data<paddle::platform::bfloat16>();
-template paddle::platform::complex128 *
+template PD_DLL_DECL paddle::platform::complex128 *
 Tensor::mutable_data<paddle::platform::complex128>();
-template paddle::platform::complex64 *
+template PD_DLL_DECL paddle::platform::complex64 *
 Tensor::mutable_data<paddle::platform::complex64>();
-template int16_t *Tensor::mutable_data<int16_t>();
-template bool *Tensor::mutable_data<bool>();
+template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>();
+template PD_DLL_DECL bool *Tensor::mutable_data<bool>();
 
-template float *Tensor::mutable_data<float>(const PlaceType &place);
-template double *Tensor::mutable_data<double>(const PlaceType &place);
-template int64_t *Tensor::mutable_data<int64_t>(const PlaceType &place);
-template int32_t *Tensor::mutable_data<int32_t>(const PlaceType &place);
-template uint8_t *Tensor::mutable_data<uint8_t>(const PlaceType &place);
-template int8_t *Tensor::mutable_data<int8_t>(const PlaceType &place);
-template paddle::platform::float16 *
+template PD_DLL_DECL float *Tensor::mutable_data<float>(const PlaceType &place);
+template PD_DLL_DECL double *Tensor::mutable_data<double>(
+    const PlaceType &place);
+template PD_DLL_DECL int64_t *Tensor::mutable_data<int64_t>(
+    const PlaceType &place);
+template PD_DLL_DECL int32_t *Tensor::mutable_data<int32_t>(
+    const PlaceType &place);
+template PD_DLL_DECL uint8_t *Tensor::mutable_data<uint8_t>(
+    const PlaceType &place);
+template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>(
+    const PlaceType &place);
+template PD_DLL_DECL paddle::platform::float16 *
 Tensor::mutable_data<paddle::platform::float16>(const PlaceType &place);
-template paddle::platform::bfloat16 *
+template PD_DLL_DECL paddle::platform::bfloat16 *
 Tensor::mutable_data<paddle::platform::bfloat16>(const PlaceType &place);
-template paddle::platform::complex128 *
+template PD_DLL_DECL paddle::platform::complex128 *
 Tensor::mutable_data<paddle::platform::complex128>(const PlaceType &place);
-template paddle::platform::complex64 *
+template PD_DLL_DECL paddle::platform::complex64 *
 Tensor::mutable_data<paddle::platform::complex64>(const PlaceType &place);
-template int16_t *Tensor::mutable_data<int16_t>(const PlaceType &place);
-template bool *Tensor::mutable_data<bool>(const PlaceType &place);
+template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>(
+    const PlaceType &place);
+template PD_DLL_DECL bool *Tensor::mutable_data<bool>(const PlaceType &place);
 
 std::vector<int> Tensor::shape() const {
   GET_CASTED_TENSOR
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 14179172db2294a863aa42e1a05ac3bd6a73169f..b037c111865451cdcdd5512e97087eb7052c5d90 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -345,9 +345,12 @@ if (LINUX)
 endif()
 
 if (WIN32)
+  set(FLUID_FRAMEWORK_IMPORT_LIB
+    ${PADDLE_BINARY_DIR}/paddle/fluid/framework/${CMAKE_BUILD_TYPE}/paddle_framework.lib
+    CACHE INTERNAL "Fluid framework lib")
   set(FLUID_FRAMEWORK_SHARED_LIB
-      ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_framework.dll
-      CACHE INTERNAL "Fluid framework lib")
+      ${PADDLE_BINARY_DIR}/paddle/fluid/framework/${CMAKE_BUILD_TYPE}/paddle_framework.dll
+      CACHE INTERNAL "Fluid framework dll")
 endif()
 
 if(APPLE)
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index c347d82d1d10ed282300f507f77d0852b73a2830..6669d18f75cc67fb9e721374f330fa31bd5d11c5 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -416,9 +416,6 @@ void* GetOpDsoHandle(const std::string& dso_name) {
 #if defined(__APPLE__) || defined(__OSX__)
   PADDLE_THROW(platform::errors::Unimplemented(
       "Create custom cpp op outside framework do not support Apple."));
-#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  PADDLE_THROW(platform::errors::Unimplemented(
-      "Create custom cpp op outside framework do not support Windows."));
 #else
   return GetDsoHandleFromSearchPath(FLAGS_op_dir, dso_name);
 #endif
diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index eb356b5869326d44c07914af4841412075531975..8050e881a4832cdea5b548f1b9473c334a7a0354 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -114,23 +114,24 @@ rem ------pre install python requirement----------
 where python
 where pip
 pip install wheel --user
-pip install -r %work_dir%\python\requirements.txt --user
-pip install -r %work_dir%\python\unittest_py\requirements.txt --user
+pip install --force-reinstall -r %work_dir%\python\requirements.txt --user
+pip install --force-reinstall -r %work_dir%\python\unittest_py\requirements.txt --user
 if %ERRORLEVEL% NEQ 0 (
     echo pip install requirements.txt failed!
     exit /b 7
 )
 
 rem ------pre install clcache and init config----------
-pip install clcache --user
+rem pip install clcache --user
+pip uninstall -y clcache
 :: set USE_CLCACHE to enable clcache
-set USE_CLCACHE=1
+rem set USE_CLCACHE=1
 :: In some scenarios, CLCACHE_HARDLINK can save one file copy.
-set CLCACHE_HARDLINK=1
+rem set CLCACHE_HARDLINK=1
 :: If it takes more than 1000s to obtain the right to use the cache, an error will be reported
-set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000
+rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000
 :: set maximum cache size to 20G
-clcache.exe -M 21474836480
+rem clcache.exe -M 21474836480
 
 rem ------show summary of current environment----------
 cmake --version
@@ -281,7 +282,7 @@ echo Build third_party successfully!
 set build_times=1
 :build_paddle
 :: reset clcache zero stats for collect PR's actual hit rate
-clcache.exe -z
+rem clcache.exe -z
 
 echo Build Paddle the %build_times% time:
 if "%WITH_CLCACHE%"=="OFF" (
@@ -305,7 +306,7 @@ echo 0 > %cache_dir%\error_code.txt
 type %cache_dir%\error_code.txt
 
 :: ci will collect clcache hit rate
-goto :collect_clcache_hits
+rem goto :collect_clcache_hits
 
 goto:eof
 
@@ -346,13 +347,14 @@ set /p PADDLE_WHL_FILE_WIN=< whl_file.txt
 @ECHO ON
 pip uninstall -y paddlepaddle
 pip uninstall -y paddlepaddle-gpu
-pip install -U %PADDLE_WHL_FILE_WIN% --user
+pip install %PADDLE_WHL_FILE_WIN% --user
 if %ERRORLEVEL% NEQ 0 (
     call paddle_winci\Scripts\deactivate.bat 2>NUL
     echo pip install whl package failed!
     exit /b 1
 )
 
+
 set CUDA_VISIBLE_DEVICES=0
 python %work_dir%\paddle\scripts\installation_validate.py
 goto:eof
diff --git a/python/paddle/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt
index bee49945f0074f2e8dc1af9662878ec495d25644..60be92b892fbecdb12cdb408534ad62f3c6445ef 100644
--- a/python/paddle/fluid/tests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/CMakeLists.txt
@@ -9,7 +9,14 @@ endforeach()
 add_subdirectory(unittests)
 add_subdirectory(book)
 
-if(NOT APPLE AND NOT WIN32)
+# TODO: support New Custom OP on Mac (CMake variables are case-sensitive: LINUX)
+if(LINUX)
   add_subdirectory(custom_op)
 endif()
+
+# Windows CPU machine doesn't have CUDA, can't compile .cu file
+# if(WIN32 AND WITH_GPU)
+#   add_subdirectory(custom_op)
+# endif()
+
 set_tests_properties(test_beam_search_decoder PROPERTIES TIMEOUT 120)
diff --git a/python/paddle/fluid/tests/custom_op/CMakeLists.txt b/python/paddle/fluid/tests/custom_op/CMakeLists.txt
index 9b89e5ceda5b45f7b94a02542456189ff7109ebd..0daf662f551ecbdb5a232c520bedee19408f1613 100644
--- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt
+++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt
@@ -1,3 +1,36 @@
+# The new custom OP supports both Windows and Linux now.
+# 'test_simple_custom_op_jit' and 'test_simple_custom_op_setup' compile .cc and .cu files
+py_test(test_simple_custom_op_setup SRCS test_simple_custom_op_setup.py)
+py_test(test_simple_custom_op_jit SRCS test_simple_custom_op_jit.py)
+
+# Compiling shared library will cost some time, but running process is very fast.
+set_tests_properties(test_simple_custom_op_setup PROPERTIES TIMEOUT 250)
+set_tests_properties(test_simple_custom_op_jit PROPERTIES TIMEOUT 180)
+
+py_test(test_sysconfig SRCS test_sysconfig.py)
+
+# 'test_dispatch' compile .cc file
+py_test(test_dispatch SRCS test_dispatch.py)
+set_tests_properties(test_dispatch PROPERTIES TIMEOUT 180)
+
+if(NOT LINUX)  # variable names are case-sensitive; `Linux` is never defined
+    return()
+endif()
+
+# TODO(zhouwei): support test_check_abi and abi check on Windows
+py_test(test_check_abi SRCS test_check_abi.py)
+
+# The old custom OP only supports Linux, so these tests run on Linux only
+py_test(test_custom_op SRCS test_custom_op.py)
+py_test(test_jit_load SRCS test_jit_load.py)
+py_test(test_setup_install SRCS test_setup_install.py)
+py_test(test_setup_build SRCS test_setup_build.py)
+
+set_tests_properties(test_jit_load PROPERTIES TIMEOUT 180)
+set_tests_properties(test_setup_install PROPERTIES TIMEOUT 180)
+set_tests_properties(test_setup_build PROPERTIES TIMEOUT 180)
+
+
 if(WITH_ROCM)
     hip_library(relu_op_shared SHARED SRCS relu_op.cc relu_op.cu DEPS paddle_framework_shared)
 elseif(WITH_GPU)
@@ -18,19 +51,3 @@ get_target_property(TARGET_LIBRARIES relu_op_shared LINK_LIBRARIES)
 LIST(REMOVE_ITEM TARGET_LIBRARIES glog)
 LIST(REMOVE_ITEM TARGET_LIBRARIES gflags)
 set_property(TARGET relu_op_shared PROPERTY LINK_LIBRARIES  ${TARGET_LIBRARIES} )
-
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-
-foreach(src ${TEST_OPS})
-    py_test(${src} SRCS ${src}.py)
-endforeach()
-
-# Compiling .so will cost some time, but running process is very fast.
-set_tests_properties(test_jit_load PROPERTIES TIMEOUT 180)
-set_tests_properties(test_setup_install PROPERTIES TIMEOUT 180)
-set_tests_properties(test_setup_build PROPERTIES TIMEOUT 180)
-set_tests_properties(test_dispatch PROPERTIES TIMEOUT 180)
-
-set_tests_properties(test_simple_custom_op_setup PROPERTIES TIMEOUT 250)
-set_tests_properties(test_simple_custom_op_jit PROPERTIES TIMEOUT 180)
diff --git a/python/paddle/fluid/tests/custom_op/test_dispatch.py b/python/paddle/fluid/tests/custom_op/test_dispatch.py
index 1766a6042f395f34a39fc6da8d93646ca6b50597..aaca7333561ee65ef6e267a0ce5a48263800a4c2 100644
--- a/python/paddle/fluid/tests/custom_op/test_dispatch.py
+++ b/python/paddle/fluid/tests/custom_op/test_dispatch.py
@@ -16,8 +16,18 @@ import os
 import unittest
 import paddle
 import numpy as np
-from paddle.utils.cpp_extension import load
+from paddle.utils.cpp_extension import load, get_build_directory
 from utils import paddle_includes, extra_compile_args
+from paddle.utils.cpp_extension.extension_utils import run_cmd
+
+# Because the shared lib already exists in the cache dir,
+# it will not be compiled again unless the cache dir is cleared.
+if os.name == 'nt':
+    cmd = 'rmdir {} /s/q'.format(get_build_directory())
+else:
+    cmd = 'rm -rf {}'.format(get_build_directory())
+
+run_cmd(cmd, True)
 
 dispatch_op = load(
     name='dispatch_op',
diff --git a/python/paddle/fluid/tests/custom_op/test_simple_custom_op_jit.py b/python/paddle/fluid/tests/custom_op/test_simple_custom_op_jit.py
index 2c0dc1a4ca6a119c1dc9dd0bf8add15e677aaf43..2832e8070d142ab294c69ab3adab382b38d1ce9f 100644
--- a/python/paddle/fluid/tests/custom_op/test_simple_custom_op_jit.py
+++ b/python/paddle/fluid/tests/custom_op/test_simple_custom_op_jit.py
@@ -13,13 +13,24 @@
 # limitations under the License.
 
 import os
+import subprocess
 import unittest
 import paddle
 import numpy as np
-from paddle.utils.cpp_extension import load
+from paddle.utils.cpp_extension import load, get_build_directory
+from paddle.utils.cpp_extension.extension_utils import run_cmd
 from utils import paddle_includes, extra_compile_args
 from test_simple_custom_op_setup import relu2_dynamic, relu2_static
 
+# Because the shared lib already exists in the cache dir,
+# it will not be compiled again unless the cache dir is cleared.
+if os.name == 'nt':
+    cmd = 'rmdir {} /s/q'.format(get_build_directory())
+else:
+    cmd = 'rm -rf {}'.format(get_build_directory())
+
+run_cmd(cmd, True)
+
 # Compile and load custom op Just-In-Time.
 custom_module = load(
     name='simple_jit_relu2',
diff --git a/python/paddle/fluid/tests/custom_op/test_simple_custom_op_setup.py b/python/paddle/fluid/tests/custom_op/test_simple_custom_op_setup.py
index cfa2db0ba24a49a20b825e47d2b90077c3b6d463..f312508d3932043c1ed8e273ef8b4b49c02aced6 100644
--- a/python/paddle/fluid/tests/custom_op/test_simple_custom_op_setup.py
+++ b/python/paddle/fluid/tests/custom_op/test_simple_custom_op_setup.py
@@ -91,7 +91,12 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
     def setUp(self):
         cur_dir = os.path.dirname(os.path.abspath(__file__))
         # compile, install the custom op egg into site-packages under background
-        cmd = 'cd {} && python setup_install_simple.py install'.format(cur_dir)
+        if os.name == 'nt':
+            cmd = 'cd /d {} && python setup_install_simple.py install'.format(
+                cur_dir)
+        else:
+            cmd = 'cd {} && python setup_install_simple.py install'.format(
+                cur_dir)
         run_cmd(cmd)
 
         # NOTE(Aurelius84): Normally, it's no need to add following codes for users.
@@ -99,7 +104,11 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
         # sys.path has been updated. So we update it manually.
 
         # See: https://stackoverflow.com/questions/56974185/import-runtime-installed-module-using-pip-in-python-3
-        site_dir = site.getsitepackages()[0]
+        if os.name == 'nt':
+            # NOTE(zhouwei25): getsitepackages on windows will return a list: [python install dir, site packages dir]
+            site_dir = site.getsitepackages()[1]
+        else:
+            site_dir = site.getsitepackages()[0]
         custom_egg_path = [
             x for x in os.listdir(site_dir) if 'simple_setup_relu2' in x
         ]
diff --git a/python/paddle/fluid/tests/custom_op/utils.py b/python/paddle/fluid/tests/custom_op/utils.py
index f293c751942cda432400ce1786326eb14cf6a9b2..52b294dc72b4ba08ef380f954fd39cc5577918b5 100644
--- a/python/paddle/fluid/tests/custom_op/utils.py
+++ b/python/paddle/fluid/tests/custom_op/utils.py
@@ -23,8 +23,8 @@ site_packages_path = get_python_lib()
 # paddle include directory. Because the following path is generated after insalling
 # PaddlePaddle whl. So here we specific `include_dirs` to avoid errors in CI.
 paddle_includes = [
-    os.path.join(site_packages_path, 'paddle/include'),
-    os.path.join(site_packages_path, 'paddle/include/third_party')
+    os.path.join(site_packages_path, 'paddle', 'include'),
+    os.path.join(site_packages_path, 'paddle', 'include', 'third_party')
 ]
 
 # TODO(Aurelius84): Memory layout is different if build paddle with PADDLE_WITH_MKLDNN=ON,
diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py
index 121c1626125af9974519e30ac87d8130c7466f25..8c0893b16cf88728cfa199fe2cda407167b8c09b 100644
--- a/python/paddle/utils/cpp_extension/cpp_extension.py
+++ b/python/paddle/utils/cpp_extension/cpp_extension.py
@@ -17,16 +17,25 @@ import six
 import sys
 import textwrap
 import copy
+import re
 
 import setuptools
 from setuptools.command.easy_install import easy_install
 from setuptools.command.build_ext import build_ext
 
 from .extension_utils import find_cuda_home, normalize_extension_kwargs, add_compile_flag, bootstrap_context
-from .extension_utils import is_cuda_file, prepare_unix_cflags, add_std_without_repeat, get_build_directory
+from .extension_utils import is_cuda_file, prepare_unix_cflags, prepare_win_cflags, add_std_without_repeat, get_build_directory
 from .extension_utils import _import_module_from_library, CustomOpInfo, _write_setup_file, _jit_compile, parse_op_name_from
-from .extension_utils import check_abi_compatibility, log_v, IS_WINDOWS
-from .extension_utils import use_new_custom_op_load_method
+from .extension_utils import check_abi_compatibility, log_v, IS_WINDOWS, OS_NAME
+from .extension_utils import use_new_custom_op_load_method, MSVC_COMPILE_FLAGS
+
+# Note(zhouwei): On Windows, distutils exports the function 'PyInit_[name]' by default.
+# Either the user defines PyInit_[name] in the extension, or the export is disabled.
+# Refer to https://stackoverflow.com/questions/34689210/error-exporting-symbol-when-building-python-c-extension-in-windows
+if IS_WINDOWS and six.PY3:
+    from distutils.command.build_ext import build_ext as _du_build_ext
+    from unittest.mock import Mock
+    _du_build_ext.get_export_symbols = Mock(return_value=None)
 
 CUDA_HOME = find_cuda_home()
 
@@ -112,7 +121,7 @@ def CppExtension(sources, *args, **kwargs):
            sources(list[str]): The C++/CUDA source file names
            args(list[options]): list of config options used to compile shared library
            kwargs(dict[option]): dict of config options used to compile shared library
-           
+
        Returns:
            Extension: An instance of setuptools.Extension
     """
@@ -137,7 +146,7 @@ def CUDAExtension(sources, *args, **kwargs):
            sources(list[str]): The C++/CUDA source file names
            args(list[options]): list of config options used to compile shared library
            kwargs(dict[option]): dict of config options used to compile shared library
-           
+
        Returns:
            Extension: An instance of setuptools.Extension
     """
@@ -191,12 +200,12 @@ class BuildExtension(build_ext, object):
     def __init__(self, *args, **kwargs):
         """
         Attributes is initialized with following oreder:
-        
+
             1. super(self).__init__()
             2. initialize_options(self)
             3. the reset of current __init__()
             4. finalize_options(self)
-        
+
         So, it is recommended to set attribute value in `finalize_options`.
         """
         super(BuildExtension, self).__init__(*args, **kwargs)
@@ -225,15 +234,17 @@ class BuildExtension(build_ext, object):
                 for compiler in ['cxx', 'nvcc']:
                     if compiler not in extension.extra_compile_args:
                         extension.extra_compile_args[compiler] = []
-            # add determine compile flags
-            add_compile_flag(extension, '-std=c++11')
 
         # Consider .cu, .cu.cc as valid source extensions.
         self.compiler.src_extensions += ['.cu', '.cu.cc']
         # Save the original _compile method for later.
-        if self.compiler.compiler_type == 'msvc' or IS_WINDOWS:
-            raise NotImplementedError("Not support on MSVC currently.")
+        if self.compiler.compiler_type == 'msvc':
+            self.compiler._cpp_extensions += ['.cu', '.cuh']
+            original_compile = self.compiler.compile
+            original_spawn = self.compiler.spawn
         else:
+            # add determine compile flags
+            add_compile_flag(extension, '-std=c++11')
             original_compile = self.compiler._compile
 
         def unix_custom_single_compiler(obj, src, ext, cc_args, extra_postargs,
@@ -268,6 +279,81 @@ class BuildExtension(build_ext, object):
                 # restore original_compiler
                 self.compiler.compiler_so = original_compiler
 
+        def win_custom_single_compiler(sources,
+                                       output_dir=None,
+                                       macros=None,
+                                       include_dirs=None,
+                                       debug=0,
+                                       extra_preargs=None,
+                                       extra_postargs=None,
+                                       depends=None):
+
+            self.cflags = copy.deepcopy(extra_postargs)
+            extra_postargs = None
+
+            def win_custom_spawn(cmd):
+                # Using regex to modify compile options
+                compile_options = self.compiler.compile_options
+                for i in range(len(cmd)):
+                    if re.search('/MD', cmd[i]) is not None:
+                        cmd[i] = '/MT'
+                    if re.search('/W[1-4]', cmd[i]) is not None:
+                        cmd[i] = '/W0'
+
+                # Using regex to match src, obj and include files
+                src_regex = re.compile('/T(p|c)(.*)')
+                src_list = [
+                    m.group(2) for m in (src_regex.match(elem) for elem in cmd)
+                    if m
+                ]
+
+                obj_regex = re.compile('/Fo(.*)')
+                obj_list = [
+                    m.group(1) for m in (obj_regex.match(elem) for elem in cmd)
+                    if m
+                ]
+
+                include_regex = re.compile(r'((\-|\/)I.*)')
+                include_list = [
+                    m.group(1)
+                    for m in (include_regex.match(elem) for elem in cmd) if m
+                ]
+
+                assert len(src_list) == 1 and len(obj_list) == 1
+                src = src_list[0]
+                obj = obj_list[0]
+                if is_cuda_file(src):
+                    assert CUDA_HOME is not None
+                    nvcc_cmd = os.path.join(CUDA_HOME, 'bin', 'nvcc')
+                    if isinstance(self.cflags, dict):
+                        cflags = self.cflags['nvcc']
+                    elif isinstance(self.cflags, list):
+                        cflags = self.cflags
+                    else:
+                        cflags = []
+
+                    cflags = prepare_win_cflags(cflags) + ['--use-local-env']
+                    for flag in MSVC_COMPILE_FLAGS:
+                        cflags = ['-Xcompiler', flag] + cflags
+                    cmd = [nvcc_cmd, '-c', src, '-o', obj
+                           ] + include_list + cflags
+                elif isinstance(self.cflags, dict):
+                    cflags = MSVC_COMPILE_FLAGS + self.cflags['cxx']
+                    cmd += cflags
+                elif isinstance(self.cflags, list):
+                    cflags = MSVC_COMPILE_FLAGS + self.cflags
+                    cmd += cflags
+
+                return original_spawn(cmd)
+
+            try:
+                self.compiler.spawn = win_custom_spawn
+                return original_compile(sources, output_dir, macros,
+                                        include_dirs, debug, extra_preargs,
+                                        extra_postargs, depends)
+            finally:
+                self.compiler.spawn = original_spawn
+
         def object_filenames_with_cuda(origina_func, build_directory):
             """
             Decorated the function to add customized naming machanism.
@@ -280,10 +366,13 @@ class BuildExtension(build_ext, object):
                     objects = origina_func(source_filenames, strip_dir,
                                            output_dir)
                     for i, source in enumerate(source_filenames):
-                        # modify xx.o -> xx.cu.o
+                        # rename xx.o -> xx.cu.o (or xx.obj -> xx.cu.obj on MSVC)
                         if is_cuda_file(source):
                             old_obj = objects[i]
-                            objects[i] = old_obj[:-1] + 'cu.o'
+                            if self.compiler.compiler_type == 'msvc':
+                                objects[i] = old_obj[:-3] + 'cu.obj'
+                            else:
+                                objects[i] = old_obj[:-1] + 'cu.o'
                     # if user set build_directory, output objects there.
                     if build_directory is not None:
                         objects = [
@@ -300,10 +389,13 @@ class BuildExtension(build_ext, object):
             return wrapper
 
         # customized compile process
-        self.compiler._compile = unix_custom_single_compiler
+        if self.compiler.compiler_type == 'msvc':
+            self.compiler.compile = win_custom_single_compiler
+        else:
+            self.compiler._compile = unix_custom_single_compiler
+
         self.compiler.object_filenames = object_filenames_with_cuda(
             self.compiler.object_filenames, self.build_lib)
-
         self._record_op_info()
 
         print("Compiling user custom op, it will cost a few seconds.....")
@@ -333,15 +425,21 @@ class BuildExtension(build_ext, object):
             compiler = self.compiler.compiler_cxx[0]
         elif IS_WINDOWS:
             compiler = os.environ.get('CXX', 'cl')
-            raise NotImplementedError("We don't support Windows Currently.")
         else:
             compiler = os.environ.get('CXX', 'c++')
 
         check_abi_compatibility(compiler)
+        # Warn user if VC env is activated but `DISTUTILS_USE_SDK` is not set.
+        if IS_WINDOWS and 'VSCMD_ARG_TGT_ARCH' in os.environ and 'DISTUTILS_USE_SDK' not in os.environ:
+            msg = (
+                'It seems that the VC environment is activated but DISTUTILS_USE_SDK is not set.'
+                'This may lead to multiple activations of the VC env.'
+                'Please set `DISTUTILS_USE_SDK=1` and try again.')
+            raise UserWarning(msg)
 
     def _record_op_info(self):
         """
-        Record custum op inforomation. 
+        Record custum op inforomation.
         """
         # parse shared library abs path
         outputs = self.get_outputs()
@@ -380,7 +478,13 @@ class EasyInstallCommand(easy_install, object):
         # .so shared library to another name.
         for egg_file in self.outputs:
             filename, ext = os.path.splitext(egg_file)
-            if ext == '.so':
+            will_rename = False
+            if OS_NAME.startswith('linux') and ext == '.so':
+                will_rename = True
+            elif IS_WINDOWS and ext == '.pyd':
+                will_rename = True
+
+            if will_rename:
                 new_so_path = filename + "_pd_" + ext
                 if not os.path.exists(new_so_path):
                     os.rename(r'%s' % egg_file, r'%s' % new_so_path)
@@ -425,7 +529,7 @@ def load(name,
         extra_include_paths(list[str]): additional include path used to search header files.
                                         Default None.
         build_directory(str): specific directory path to put shared library file. If set None,
-                            it will use `PADDLE_EXTENSION_DIR` from os.environ. Use 
+                            it will use `PADDLE_EXTENSION_DIR` from os.environ. Use
                             `paddle.utils.cpp_extension.get_build_directory()` to see the location.
         interpreter(str): alias or full interpreter path to specific which one to use if have installed multiple.
                            If set None, will use `python` as default interpreter.
@@ -448,6 +552,10 @@ def load(name,
 
     # ensure to use abs path
     build_directory = os.path.abspath(build_directory)
+    # On Windows, prepend build_directory to 'path' so the shared library can be loaded from it
+    if IS_WINDOWS:
+        os.environ['path'] = build_directory + ';' + os.environ['path']
+
     log_v("build_directory: {}".format(build_directory), verbose)
 
     file_path = os.path.join(build_directory, "setup.py")
@@ -460,7 +568,7 @@ def load(name,
     log_v("additonal compile_flags: [{}]".format(' '.join(compile_flags)),
           verbose)
 
-    # write setup.py file and compile it 
+    # write setup.py file and compile it
     _write_setup_file(name, sources, file_path, extra_include_paths,
                       compile_flags, extra_ldflags, verbose)
     _jit_compile(file_path, interpreter, verbose)
diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py
index 52c17d77bd4771ce44f2282adfd9a25394ce97ea..f4a801fe3ec478ddbd543d5929f0456b354cde08 100644
--- a/python/paddle/utils/cpp_extension/extension_utils.py
+++ b/python/paddle/utils/cpp_extension/extension_utils.py
@@ -38,9 +38,19 @@ logger = logging.getLogger("utils.cpp_extension")
 
 OS_NAME = sys.platform
 IS_WINDOWS = OS_NAME.startswith('win')
-NVCC_COMPILE_FLAGS = [
-    '-ccbin', 'cc', '-DPADDLE_WITH_CUDA', '-DEIGEN_USE_GPU', '-DPADDLE_USE_DSO',
-    '-Xcompiler', '-fPIC', '-w', '--expt-relaxed-constexpr', '-O3', '-DNVCC'
+
+MSVC_COMPILE_FLAGS = [
+    '/MT', '/wd4819', '/wd4251', '/wd4244', '/wd4267', '/wd4275', '/wd4018',
+    '/wd4190', '/EHsc', '/w', '/DPADDLE_WITH_CUDA', '/DEIGEN_USE_GPU',
+    '/DNDEBUG'
+]
+
+MSVC_LINK_FLAGS = [
+    '/MACHINE:X64', 'paddle_framework.lib', 'cudadevrt.lib', 'cudart_static.lib'
+]
+
+COMMON_NVCC_FLAGS = [
+    '-DPADDLE_WITH_CUDA', '-DEIGEN_USE_GPU', '-DPADDLE_USE_DSO', '-O3'
 ]
 
 GCC_MINI_VERSION = (5, 4, 0)
@@ -81,8 +91,8 @@ information
 USING_NEW_CUSTOM_OP_LOAD_METHOD = True
 
 
-# NOTE(chenweihang): In order to be compatible with 
-# the two custom op define method, after removing 
+# NOTE(chenweihang): In order to be compatible with
+# the two custom op define method, after removing
 # old method, we can remove them together
 def use_new_custom_op_load_method(*args):
     global USING_NEW_CUSTOM_OP_LOAD_METHOD
@@ -210,7 +220,21 @@ def prepare_unix_cflags(cflags):
     """
     Prepare all necessary compiled flags for nvcc compiling CUDA files.
     """
-    cflags = NVCC_COMPILE_FLAGS + cflags + get_cuda_arch_flags(cflags)
+    cflags = COMMON_NVCC_FLAGS + [
+        '-ccbin', 'cc', '-Xcompiler', '-fPIC', '-w', '--expt-relaxed-constexpr',
+        '-DNVCC'
+    ] + cflags + get_cuda_arch_flags(cflags)
+
+    return cflags
+
+
+def prepare_win_cflags(cflags):
+    """
+    Prepare all necessary compiled flags for nvcc compiling CUDA files.
+    """
+    cflags = COMMON_NVCC_FLAGS + [
+        '-DGOOGLE_GLOG_DLL_DECL', '-DBOOST_HAS_STATIC_ASSERT', '-w'
+    ] + cflags + get_cuda_arch_flags(cflags)
 
     return cflags
 
@@ -238,7 +262,7 @@ def get_cuda_arch_flags(cflags):
 
 
 def normalize_extension_kwargs(kwargs, use_cuda=False):
-    """ 
+    """
     Normalize include_dirs, library_dir and other attributes in kwargs.
     """
     assert isinstance(kwargs, dict)
@@ -252,52 +276,36 @@ def normalize_extension_kwargs(kwargs, use_cuda=False):
     library_dirs.extend(find_paddle_libraries(use_cuda))
     kwargs['library_dirs'] = library_dirs
 
-    # add runtime library dirs
-    runtime_library_dirs = kwargs.get('runtime_library_dirs', [])
-    runtime_library_dirs.extend(find_paddle_libraries(use_cuda))
-    kwargs['runtime_library_dirs'] = runtime_library_dirs
+    if IS_WINDOWS:
+        # TODO(zhouwei): may append compile flags in future
+        pass
+        # append link flags
+        extra_link_args = kwargs.get('extra_link_args', [])
+        extra_link_args.extend(MSVC_LINK_FLAGS)
+        kwargs['extra_link_args'] = extra_link_args
+    else:
+        # append compile flags
+        extra_compile_args = kwargs.get('extra_compile_args', [])
+        extra_compile_args.extend(['-g', '-w'])  # disable warnings
+        kwargs['extra_compile_args'] = extra_compile_args
 
-    # append compile flags
-    extra_compile_args = kwargs.get('extra_compile_args', [])
-    extra_compile_args.extend(['-g', '-w'])  # diable warnings
-    kwargs['extra_compile_args'] = extra_compile_args
+        # append link flags
+        extra_link_args = kwargs.get('extra_link_args', [])
+        extra_link_args.append('-lpaddle_framework')
+        if use_cuda:
+            extra_link_args.append('-lcudart')
 
-    # append link flags
-    extra_link_args = kwargs.get('extra_link_args', [])
-    extra_link_args.append('-lpaddle_framework')
-    if use_cuda:
-        extra_link_args.append('-lcudart')
+        kwargs['extra_link_args'] = extra_link_args
 
-    kwargs['extra_link_args'] = extra_link_args
+        # add runtime library dirs
+        runtime_library_dirs = kwargs.get('runtime_library_dirs', [])
+        runtime_library_dirs.extend(find_paddle_libraries(use_cuda))
+        kwargs['runtime_library_dirs'] = runtime_library_dirs
 
     kwargs['language'] = 'c++'
     return kwargs
 
 
-def find_paddle_includes(use_cuda=False):
-    """
-    Return Paddle necessary include dir path.
-    """
-    # pythonXX/site-packages/paddle/include
-    paddle_include_dir = get_include()
-    third_party_dir = os.path.join(paddle_include_dir, 'third_party')
-
-    include_dirs = [paddle_include_dir, third_party_dir]
-
-    return include_dirs
-
-
-def find_cuda_includes():
-
-    cuda_home = find_cuda_home()
-    if cuda_home is None:
-        raise ValueError(
-            "Not found CUDA runtime, please use `export CUDA_HOME=XXX` to specific it."
-        )
-
-    return [os.path.join(cuda_home, 'lib64')]
-
-
 def find_cuda_home():
     """
     Use heuristic method to find cuda path
@@ -315,19 +323,22 @@ def find_cuda_home():
                 if six.PY3:
                     nvcc_path = nvcc_path.decode()
                 nvcc_path = nvcc_path.rstrip('\r\n')
+                log_v(nvcc_path)
                 # for example: /usr/local/cuda/bin/nvcc
                 cuda_home = os.path.dirname(os.path.dirname(nvcc_path))
         except:
             if IS_WINDOWS:
                 # search from default NVIDIA GPU path
                 candidate_paths = glob.glob(
-                    'C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v*.*')
+                    'C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*.*'
+                )
                 if len(candidate_paths) > 0:
                     cuda_home = candidate_paths[0]
             else:
                 cuda_home = "/usr/local/cuda"
     # step 3. check whether path is valid
-    if not os.path.exists(cuda_home) and core.is_compiled_with_cuda():
+    if cuda_home and not os.path.exists(
+            cuda_home) and core.is_compiled_with_cuda():
         cuda_home = None
         warnings.warn(
             "Not found CUDA runtime, please use `export CUDA_HOME= XXX` to specific it."
@@ -336,15 +347,65 @@ def find_cuda_home():
     return cuda_home
 
 
+def find_cuda_includes():
+    """
+    Use heuristic method to find cuda include path
+    """
+    cuda_home = find_cuda_home()
+    if cuda_home is None:
+        raise ValueError(
+            "Not found CUDA runtime, please use `export CUDA_HOME=XXX` to specific it."
+        )
+
+    return [os.path.join(cuda_home, 'include')]
+
+
+def find_paddle_includes(use_cuda=False):
+    """
+    Return Paddle necessary include dir path.
+    """
+    # pythonXX/site-packages/paddle/include
+    paddle_include_dir = get_include()
+    third_party_dir = os.path.join(paddle_include_dir, 'third_party')
+    include_dirs = [paddle_include_dir, third_party_dir]
+
+    # TODO(zhouwei): Eigen needs cuda_runtime.h, so the CUDA
+    # include dir is always appended (regardless of use_cuda).
+    cuda_include_dir = find_cuda_includes()
+    include_dirs.extend(cuda_include_dir)
+
+    return include_dirs
+
+
+def find_cuda_libraries():
+    """
+    Use heuristic method to find cuda static lib path
+    """
+    cuda_home = find_cuda_home()
+    if cuda_home is None:
+        raise ValueError(
+            "Not found CUDA runtime, please use `export CUDA_HOME=XXX` to specific it."
+        )
+    if IS_WINDOWS:
+        cuda_lib_dir = [os.path.join(cuda_home, 'lib', 'x64')]
+    else:
+        cuda_lib_dir = [os.path.join(cuda_home, 'lib64')]
+
+    return cuda_lib_dir
+
+
 def find_paddle_libraries(use_cuda=False):
     """
     Return Paddle necessary library dir path.
     """
     # pythonXX/site-packages/paddle/libs
     paddle_lib_dirs = [get_lib()]
-    if use_cuda:
-        cuda_dirs = find_cuda_includes()
-        paddle_lib_dirs.extend(cuda_dirs)
+
+    # TODO(zhouwei): Eigen needs cuda_runtime.h, so the CUDA
+    # library dir is always appended (regardless of use_cuda).
+    cuda_lib_dir = find_cuda_libraries()
+    paddle_lib_dirs.extend(cuda_lib_dir)
+
     return paddle_lib_dirs
 
 
@@ -374,12 +435,14 @@ def get_build_directory(verbose=False):
     root_extensions_directory = os.environ.get('PADDLE_EXTENSION_DIR')
     if root_extensions_directory is None:
         dir_name = "paddle_extensions"
-        if OS_NAME.startswith('linux'):
-            root_extensions_directory = os.path.join(
-                os.path.expanduser('~/.cache'), dir_name)
-        else:
-            # TODO(Aurelius84): consider wind32/macOs
-            raise NotImplementedError("Only support Linux now.")
+        root_extensions_directory = os.path.join(
+            os.path.expanduser('~/.cache'), dir_name)
+        if IS_WINDOWS:
+            root_extensions_directory = os.path.normpath(
+                root_extensions_directory)
+        elif OS_NAME.startswith('darwin'):
+            # TODO(Aurelius84): consider macOS
+            raise NotImplementedError("Not support Mac now.")
 
         log_v("$PADDLE_EXTENSION_DIR is not set, using path: {} by default.".
               format(root_extensions_directory), verbose)
@@ -410,10 +473,13 @@ def parse_op_info(op_name):
 
 def _import_module_from_library(module_name, build_directory, verbose=False):
     """
-    Load .so shared library and import it as callable python module.
+    Load shared library and import it as callable python module.
     """
-    # TODO(Aurelius84): Consider file suffix is .dll on Windows Platform.
-    ext_path = os.path.join(build_directory, module_name + '.so')
+    if IS_WINDOWS:
+        dynamic_suffix = '.pyd'
+    else:
+        dynamic_suffix = '.so'
+    ext_path = os.path.join(build_directory, module_name + dynamic_suffix)
     if not os.path.exists(ext_path):
         raise FileNotFoundError("Extension path: {} does not exist.".format(
             ext_path))
@@ -565,12 +631,12 @@ def _write_setup_file(name,
 
 def list2str(args):
     """
-    Convert list[str] into string. For example: [x, y] -> "['x', 'y']"
+    Convert list[str] into string. For example: ['x', 'y'] -> "['x', 'y']"
     """
     if args is None: return '[]'
     assert isinstance(args, (list, tuple))
-    args = ["'{}'".format(arg) for arg in args]
-    return '[' + ','.join(args) + ']'
+    args = ["{}".format(arg) for arg in args]
+    return repr(args)
 
 
 def _jit_compile(file_path, interpreter=None, verbose=False):
@@ -583,7 +649,8 @@ def _jit_compile(file_path, interpreter=None, verbose=False):
     if interpreter is None:
         interpreter = 'python'
     try:
-        py_path = subprocess.check_output(['which', interpreter])
+        which = 'where' if IS_WINDOWS else 'which'
+        py_path = subprocess.check_output([which, interpreter])
         py_version = subprocess.check_output([interpreter, '-V'])
         if six.PY3:
             py_path = py_path.decode()
@@ -596,8 +663,13 @@ def _jit_compile(file_path, interpreter=None, verbose=False):
             'Failed to check Python interpreter with `{}`, errors: {}'.format(
                 interpreter, error))
 
-    compile_cmd = 'cd {} && {} {} build'.format(ext_dir, interpreter,
-                                                setup_file)
+    if IS_WINDOWS:
+        compile_cmd = 'cd /d {} && {} {} build'.format(ext_dir, interpreter,
+                                                       setup_file)
+    else:
+        compile_cmd = 'cd {} && {} {} build'.format(ext_dir, interpreter,
+                                                    setup_file)
+
     print("Compiling user custom op, it will cost a few seconds.....")
     run_cmd(compile_cmd, verbose)
 
@@ -682,7 +754,7 @@ def check_abi_compatibility(compiler, verbose=False):
     try:
         if OS_NAME.startswith('linux'):
             version_info = subprocess.check_output(
-                [compiler, '-dumpfullversion'])
+                [compiler, '-dumpfullversion', '-dumpversion'])
             if six.PY3:
                 version_info = version_info.decode()
             version = version_info.strip().split('.')
@@ -694,8 +766,8 @@ def check_abi_compatibility(compiler, verbose=False):
                 warnings.warn(
                     ABI_INCOMPATIBILITY_WARNING.format(
                         user_compiler=compiler, version=version_info.strip()))
-        # TODO(Aurelius84): check version compatibility on windows
         elif IS_WINDOWS:
+            # TODO(zhouwei): support checking ABI compatibility on Windows
             warnings.warn("We don't support Windows now.")
     except Exception:
         _, error, _ = sys.exc_info()
@@ -714,7 +786,7 @@ def _expected_compiler_current_platform():
     return expect_compilers
 
 
-def log_v(info, verbose):
+def log_v(info, verbose=True):
     """
     Print log information on stdout.
     """
diff --git a/python/requirements.txt b/python/requirements.txt
index 77232f4fd71831585d6f2301eb802930b4444c2c..e89b3ede94fd4a624b3ddc335f5d2ea6e7b20b8a 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -3,7 +3,8 @@ numpy>=1.13, <=1.16.4 ; python_version<"3.5"
 numpy>=1.13 ; python_version>="3.5" and platform_system != "Windows"
 numpy>=1.13, <=1.19.3 ; python_version>="3.5" and platform_system == "Windows"
 protobuf>=3.1.0
-gast>=0.3.3
+gast>=0.3.3 ; platform_system != "Windows"
+gast==0.3.3 ; platform_system == "Windows"
 Pillow
 six
 decorator
diff --git a/python/setup.py.in b/python/setup.py.in
index d5c098aa9e350c14a59f014f9c30134c100c8d3c..43a74d191d804510dafeff72e67e169265e58f1b 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -335,11 +335,16 @@ if '${WITH_XPU_BKCL}' == 'ON':
     shutil.copy('${XPU_BKCL_LIB}', libs_path)
     package_data['paddle.libs']+=['${XPU_BKCL_LIB_NAME}']
 
-# copy libfuild_framework.so to libs
-if os.name != 'nt' and sys.platform != 'darwin':
-    paddle_framework_lib='${FLUID_FRAMEWORK_SHARED_LIB}'
-    shutil.copy(paddle_framework_lib, libs_path)
-    package_data['paddle.libs'] += [('libpaddle_framework' if os.name != 'nt' else 'paddle_framework') + ext_name]
+# copy libpaddle_framework.so to libs on linux
+if sys.platform.startswith('linux'):
+    shutil.copy('${FLUID_FRAMEWORK_SHARED_LIB}', libs_path)
+    package_data['paddle.libs'] += ['libpaddle_framework.so']
+
+# copy paddle_framework.lib/paddle_framework.dll to libs on windows
+if os.name == 'nt':
+    shutil.copy('${FLUID_FRAMEWORK_IMPORT_LIB}', libs_path)
+    shutil.copy('${FLUID_FRAMEWORK_SHARED_LIB}', libs_path)
+    package_data['paddle.libs'] += ['paddle_framework.lib', 'paddle_framework.dll']
 
 # remove unused paddle/libs/__init__.py
 if os.path.isfile(libs_path+'/__init__.py'):
@@ -410,9 +415,9 @@ if '${WITH_GPU}' == 'ON' or '${WITH_ROCM}' == 'ON':
 class InstallCommand(InstallCommandBase):
     def finalize_options(self):
         ret = InstallCommandBase.finalize_options(self)
-        self.install_headers = os.path.join(self.install_purelib, 'paddle',
-                                            'include')
         self.install_lib = self.install_platlib
+        self.install_headers = os.path.join(self.install_platlib, 'paddle',
+                                            'include')
         return ret
 
 
@@ -463,11 +468,6 @@ class InstallHeaders(Command):
         return self.copy_file(header, install_dir)
 
     def run(self):
-        # only copy third_party/cudaErrorMessage.pb for cudaErrorMessage on mac or windows
-        if os.name == 'nt' or sys.platform == 'darwin':
-            if '${WITH_GPU}' == 'ON' or '${WITH_ROCM}' == 'ON':
-                self.mkdir_and_copy_file('${cudaerror_INCLUDE_DIR}/cudaErrorMessage.pb')
-            return
         hdrs = self.distribution.headers
         if not hdrs:
             return