Unverified commit 777d1a45, authored by Chen Weihang, committed by GitHub

[Cherry-pick] The 4th part of new custom op (#31282)

* modify custom op dependent from paddle_framework to paddle_custom_op (#31195)

* [Custom Op] Remove unsupport dtypes (#31232)

* remove remove_unsupport_dtype

* remove remove_unsupport_dtype

* remove test dtype

* add more include

* change dtype.h's enum as enum class to avoid conflict with inference lib

* make enum as enum class

* remove additional test

* merge develop

* polish code

* [Custom OP] Support stream set on Custom Op (#31257)

* [Custom OP] change the user header file format, test=develop (#31274)

* [Custom OP]add PD_THROW and PD_CHECK for User Error message (#31253)

* [Custom OP]add PD_THROW and PD_CHECK for User error message

* PD_THROW and PD_CHECK, fix comment

* fix Windows error message

* fix Windows error message

* fix CI

* [Custom OP]add MSVC compile check on Windows (#31265)

* fix test_check_abi
Co-authored-by: Zhou Wei <52485244+zhouwei25@users.noreply.github.com>
Co-authored-by: Jiabin Yang <marsyang199376@gmail.com>
Co-authored-by: 石晓伟 <39303645+Shixiaowei02@users.noreply.github.com>
Co-authored-by: zhouwei25 <zhouwei25@baidu.com>
Parent commit: f4a69d58
@@ -189,6 +189,10 @@ copy(inference_lib_dist
        DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/crypto/)
include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io)
copy(inference_lib_dist
        SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/extension/include/*
        DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)

# CAPI inference library for only inference
set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING
    "A path setting CAPI paddle inference shared")
......
@@ -15,4 +15,4 @@ limitations under the License. */
#pragma once

// All paddle apis in C++ frontend
#include "paddle/fluid/extension/include/all.h"
#include "paddle/fluid/extension/include/ext_all.h"
@@ -24,8 +24,9 @@ limitations under the License. */
#endif
#endif

#include "paddle/fluid/extension/include/dispatch.h"
#include "paddle/fluid/extension/include/dtype.h"
#include "paddle/fluid/extension/include/op_meta_info.h"
#include "paddle/fluid/extension/include/place.h"
#include "paddle/fluid/extension/include/tensor.h"
#include "ext_dispatch.h"      // NOLINT
#include "ext_dtype.h"         // NOLINT
#include "ext_exception.h"     // NOLINT
#include "ext_op_meta_info.h"  // NOLINT
#include "ext_place.h"         // NOLINT
#include "ext_tensor.h" // NOLINT
@@ -14,7 +14,8 @@ limitations under the License. */
#pragma once

#include "paddle/fluid/extension/include/dtype.h"
#include "ext_dtype.h"      // NOLINT
#include "ext_exception.h"  // NOLINT

namespace paddle {
@@ -32,19 +33,18 @@ namespace paddle {
///////// Floating Dispatch Marco ///////////

#define PD_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...)                       \
  [&] {                                                                   \
    const auto& __dtype__ = TYPE;                                         \
    switch (__dtype__) {                                                  \
      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float,      \
                           __VA_ARGS__)                                   \
      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double,     \
                           __VA_ARGS__)                                   \
      default:                                                            \
        throw std::runtime_error("function " #NAME                        \
                                 " not implemented for data type `" +     \
                                 ::paddle::ToString(__dtype__) + "`");    \
        PD_THROW("function " #NAME " is not implemented for data type `", \
                 ::paddle::ToString(__dtype__), "`");                     \
    }                                                                     \
  }()
///////// Integral Dispatch Marco ///////////

@@ -63,29 +63,11 @@ namespace paddle {
      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT16, int16_t,        \
                           __VA_ARGS__)                                     \
      default:                                                              \
        throw std::runtime_error("function " #NAME                          \
                                 " not implemented for data type `" +       \
                                 ::paddle::ToString(__dtype__) + "`");      \
        PD_THROW("function " #NAME " is not implemented for data type `" +  \
                 ::paddle::ToString(__dtype__) + "`");                      \
    }                                                                       \
  }()
///////// Complex Dispatch Marco ///////////
#define PD_DISPATCH_COMPLEX_TYPES(TYPE, NAME, ...) \
[&] { \
const auto& __dtype__ = TYPE; \
switch (__dtype__) { \
PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64, \
::paddle::complex64, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128, \
::paddle::complex128, __VA_ARGS__) \
default: \
throw std::runtime_error("function " #NAME \
" not implemented for data type `" + \
::paddle::ToString(__dtype__) + "`"); \
} \
}()
///////// Floating and Integral Dispatch Marco ///////////

#define PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES(TYPE, NAME, ...)            \
@@ -106,60 +88,8 @@ namespace paddle {
      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT16, int16_t,        \
                           __VA_ARGS__)                                     \
      default:                                                              \
        throw std::runtime_error("function " #NAME                          \
                                 " not implemented for data type `" +       \
                                 ::paddle::ToString(__dtype__) + "`");      \
        PD_THROW("function " #NAME " is not implemented for data type `" +  \
                 ::paddle::ToString(__dtype__) + "`");                      \
    }                                                                       \
  }()
///////// Floating and Complex Dispatch Marco ///////////
#define PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, ...) \
[&] { \
const auto& __dtype__ = TYPE; \
switch (__dtype__) { \
PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float, \
__VA_ARGS__) \
PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double, \
__VA_ARGS__) \
PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64, \
::paddle::complex64, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128, \
::paddle::complex128, __VA_ARGS__) \
default: \
throw std::runtime_error("function " #NAME \
" not implemented for data type `" + \
::paddle::ToString(__dtype__) + "`"); \
} \
}()
///////// Floating, Integral and Complex Dispatch Marco ///////////
#define PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES(TYPE, NAME, ...) \
[&] { \
const auto& __dtype__ = TYPE; \
switch (__dtype__) { \
PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float, \
__VA_ARGS__) \
PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double, \
__VA_ARGS__) \
PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT64, int64_t, \
__VA_ARGS__) \
PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT8, int8_t, \
__VA_ARGS__) \
PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::UINT8, uint8_t, \
__VA_ARGS__) \
PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT16, int16_t, \
__VA_ARGS__) \
PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64, \
::paddle::complex64, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128, \
::paddle::complex128, __VA_ARGS__) \
default: \
throw std::runtime_error("function " #NAME \
" not implemented for data type `" + \
::paddle::ToString(__dtype__) + "`"); \
    }                                                                       \
  }()
......
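For reference, a minimal sketch (not part of this commit) of how a custom kernel typically uses the retained dispatch macros: PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES selects a branch by the runtime DataType and binds the matching C++ type as data_t inside the lambda, and any other dtype now falls through to the default branch and raises via PD_THROW. The AddOneCPUForward name and kernel body below are illustrative only; the include path follows the test code in this diff.

// Illustrative sketch only; not part of this commit.
#include <cstdint>
#include <vector>
#include "paddle/fluid/extension/include/ext_all.h"

std::vector<paddle::Tensor> AddOneCPUForward(const paddle::Tensor& x) {
  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
  out.reshape(x.shape());
  PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES(
      x.type(), "add_one_cpu_kernel", ([&] {
        // data_t is the concrete C++ type bound by the matching case.
        const data_t* x_data = x.data<data_t>();
        data_t* out_data = out.mutable_data<data_t>(x.place());
        for (int64_t i = 0; i < x.size(); ++i) {
          out_data[i] = x_data[i] + static_cast<data_t>(1);
        }
      }));
  return {out};
}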
@@ -11,34 +11,24 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/float16.h"
#include <cstdint>
#include <string>

#include "ext_exception.h"  // NOLINT

namespace paddle {

using float16 = paddle::platform::float16;
using bfloat16 = paddle::platform::bfloat16;
using complex64 = paddle::platform::complex64;
using complex128 = paddle::platform::complex128;

enum DataType {
enum class DataType {
  BOOL,
  INT8,
  UINT8,
  INT16,
  INT32,
  INT64,
  FLOAT16,
  BFLOAT16,
  FLOAT32,
  FLOAT64,
  COMPLEX64,
  COMPLEX128,
  // TODO(JiabinYang) support more data types if needed.
};
@@ -56,36 +46,24 @@ inline std::string ToString(DataType dtype) {
      return "int32_t";
    case DataType::INT64:
      return "int64_t";
    case DataType::FLOAT16:
      return "float16";
    case DataType::BFLOAT16:
      return "bfloat16";
    case DataType::FLOAT32:
      return "float";
    case DataType::FLOAT64:
      return "double";
    case DataType::COMPLEX64:
      return "complex64";
    case DataType::COMPLEX128:
      return "complex128";
    default:
      throw std::runtime_error("Unsupported paddle enum data type.");
      PD_THROW("Unsupported paddle enum data type.");
  }
}
#define PD_FOR_EACH_DATA_TYPE(_)      \
  _(bool, DataType::BOOL)             \
  _(int8_t, DataType::INT8)           \
  _(uint8_t, DataType::UINT8)         \
  _(int16_t, DataType::INT16)         \
  _(int, DataType::INT32)             \
  _(int64_t, DataType::INT64)         \
  _(float16, DataType::FLOAT16)       \
  _(bfloat16, DataType::BFLOAT16)     \
  _(float, DataType::FLOAT32)         \
  _(double, DataType::FLOAT64)        \
  _(complex64, DataType::COMPLEX64)   \
  _(complex128, DataType::COMPLEX128)
  _(float, DataType::FLOAT32)         \
  _(double, DataType::FLOAT64)

template <paddle::DataType T>
struct DataTypeToCPPType;
......
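The DataTypeToCPPType declaration above is normally paired with specializations generated from the same X-macro. A hypothetical sketch of that pairing follows; the helper macro name is illustrative, and it assumes the header does not already define these specializations.

// Illustrative sketch only; not part of this commit.
#include <cstdint>
#include <type_traits>
#include "paddle/fluid/extension/include/ext_dtype.h"

namespace paddle {

// Generate one specialization per entry in the reduced type list.
#define PD_SPECIALIZE_DATA_TYPE_TO_CPP_TYPE(cpp_type, data_type) \
  template <>                                                    \
  struct DataTypeToCPPType<data_type> {                          \
    using type = cpp_type;                                       \
  };

PD_FOR_EACH_DATA_TYPE(PD_SPECIALIZE_DATA_TYPE_TO_CPP_TYPE)

#undef PD_SPECIALIZE_DATA_TYPE_TO_CPP_TYPE

// Example use: DataType::INT64 maps back to int64_t at compile time.
static_assert(
    std::is_same<DataTypeToCPPType<DataType::INT64>::type, int64_t>::value,
    "DataType::INT64 maps to int64_t");

}  // namespace paddle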
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <iostream>
#include <sstream>
#include <string>
namespace paddle {
//////////////// Exception handling and Error Message /////////////////
#if !defined(_WIN32)
#define PD_UNLIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 0))
#define PD_LIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 1))
#else
#define PD_UNLIKELY(expr) (expr)
#define PD_LIKELY(expr) (expr)
#endif
struct PD_Exception : public std::exception {
public:
template <typename... Args>
explicit PD_Exception(const std::string& msg, const char* file, int line,
const char* default_msg) {
std::ostringstream sout;
if (msg.empty()) {
sout << default_msg << "\n [" << file << ":" << line << "]";
} else {
sout << msg << "\n [" << file << ":" << line << "]";
}
err_msg_ = sout.str();
}
const char* what() const noexcept override { return err_msg_.c_str(); }
private:
std::string err_msg_;
};
class ErrorMessage {
public:
template <typename... Args>
explicit ErrorMessage(const Args&... args) {
build_string(args...);
}
void build_string() { oss << ""; }
template <typename T>
void build_string(const T& t) {
oss << t;
}
template <typename T, typename... Args>
void build_string(const T& t, const Args&... args) {
build_string(t);
build_string(args...);
}
std::string to_string() { return oss.str(); }
private:
std::ostringstream oss;
};
#if defined _WIN32
#define HANDLE_THE_ERROR try {
#define END_HANDLE_THE_ERROR \
} \
catch (const std::exception& e) { \
std::cerr << e.what() << std::endl; \
throw e; \
}
#else
#define HANDLE_THE_ERROR
#define END_HANDLE_THE_ERROR
#endif
#define PD_CHECK(COND, ...) \
do { \
if (PD_UNLIKELY(!(COND))) { \
auto __message__ = ::paddle::ErrorMessage(__VA_ARGS__).to_string(); \
throw ::paddle::PD_Exception(__message__, __FILE__, __LINE__, \
"Expected " #COND \
", but it's not satisfied."); \
} \
} while (0)
#define PD_THROW(...) \
do { \
auto __message__ = ::paddle::ErrorMessage(__VA_ARGS__).to_string(); \
throw ::paddle::PD_Exception(__message__, __FILE__, __LINE__, \
"An error occured."); \
} while (0)
} // namespace paddle
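An illustrative sketch (not part of this commit) of how custom-op code is expected to use the two macros defined above; the function and its arguments are hypothetical.

// Illustrative sketch only; not part of this commit.
#include <cstdint>
#include <vector>
#include "paddle/fluid/extension/include/ext_exception.h"

std::vector<int64_t> InferConcatShape(const std::vector<int64_t>& a,
                                      const std::vector<int64_t>& b,
                                      int axis) {
  // PD_CHECK throws a paddle::PD_Exception carrying file/line information
  // when the condition fails; the variadic arguments form the message.
  PD_CHECK(a.size() == b.size(), "Input ranks must match, but got ", a.size(),
           " and ", b.size(), ".");
  if (axis < 0 || axis >= static_cast<int>(a.size())) {
    // PD_THROW unconditionally raises with the formatted message.
    PD_THROW("Invalid concat axis: ", axis);
  }
  std::vector<int64_t> out = a;
  out[axis] += b[axis];
  return out;
}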
@@ -21,8 +21,9 @@ limitations under the License. */
#include <boost/any.hpp>

#include "paddle/fluid/extension/include/dll_decl.h"
#include "paddle/fluid/extension/include/tensor.h"
#include "ext_dll_decl.h"   // NOLINT
#include "ext_exception.h"  // NOLINT
#include "ext_tensor.h"     // NOLINT

/**
 * Op Meta Info Related Define.
@@ -47,26 +48,6 @@ using Tensor = paddle::Tensor;
  classname& operator=(const classname&) = delete; \
  classname& operator=(classname&&) = delete
#if defined _WIN32
#define HANDLE_THE_ERROR try {
#define END_HANDLE_THE_ERROR \
} \
catch (const std::exception& e) { \
std::cerr << e.what() << std::endl; \
throw e; \
}
#else
#define HANDLE_THE_ERROR
#define END_HANDLE_THE_ERROR
#endif
#define PD_THROW(err_msg) \
do { \
HANDLE_THE_ERROR \
throw std::runtime_error(err_msg); \
END_HANDLE_THE_ERROR \
} while (0)
#define STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg)                  \
  struct __test_global_namespace_##uniq_name##__ {};                    \
  static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \
......
@@ -16,14 +16,37 @@ limitations under the License. */
#include <memory>
#include <vector>

#include "paddle/fluid/extension/include/dll_decl.h"
#include "paddle/fluid/extension/include/dtype.h"
#include "paddle/fluid/extension/include/place.h"
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#endif

#include "ext_dll_decl.h"  // NOLINT
#include "ext_dtype.h"     // NOLINT
#include "ext_place.h"     // NOLINT

namespace paddle {
namespace framework {
class CustomTensorUtils;
}  // namespace framework
class StreamWrapper {
public:
StreamWrapper() : stream_(nullptr), is_stream_set_(false) {}
void SetStream(void* stream) {
stream_ = stream;
is_stream_set_ = true;
}
void* GetStream() const { return stream_; }
bool IsStreamSet() const { return is_stream_set_; }
private:
// cudaStream_t stream_;
void* stream_;
bool is_stream_set_;
};
class PD_DLL_DECL Tensor {
 public:
  /// \brief Construct a Tensor on target Place for CustomOp.
@@ -87,10 +110,16 @@ class PD_DLL_DECL Tensor {
  /// \brief Cast datatype from one to another
  Tensor cast(const DataType& target_type) const;
#ifdef PADDLE_WITH_CUDA
/// \brief Get current stream of Tensor
cudaStream_t stream() const;
#endif
 private:
  friend class framework::CustomTensorUtils;
  mutable std::shared_ptr<void> tensor_;
  mutable PlaceType place_;
  StreamWrapper stream_;
};

}  // namespace paddle
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/extension/include/op_meta_info.h"
#include "paddle/fluid/extension/include/ext_op_meta_info.h"

#include <string>
#include <unordered_map>
......
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/extension/include/tensor.h"
#include "paddle/fluid/extension/include/ext_tensor.h"

#include <utility>

#include "paddle/fluid/framework/custom_tensor_utils.h"
#include "paddle/fluid/framework/lod_tensor.h"
@@ -101,8 +101,9 @@ void Tensor::reshape(const std::vector<int> &shape) {
}

Tensor::Tensor(const PlaceType &place)
    : tensor_(std::make_shared<framework::LoDTensor>()), place_(place) {}
    : tensor_(std::make_shared<framework::LoDTensor>()),
      place_(place),
      stream_(StreamWrapper()) {}

template <typename T>
T *Tensor::mutable_data(const PlaceType &place) {
  place_ = place;
@@ -159,17 +160,10 @@ DataType Tensor::type() const {
    return DataType::UINT8;
  } else if (type == framework::proto::VarType::FP64) {
    return DataType::FLOAT64;
} else if (type == framework::proto::VarType::BF16) {
return DataType::BFLOAT16;
} else if (type == framework::proto::VarType::FP16) {
return DataType::FLOAT16;
} else if (type == framework::proto::VarType::COMPLEX64) {
return DataType::COMPLEX64;
} else if (type == framework::proto::VarType::COMPLEX128) {
return DataType::COMPLEX128;
  } else if (type == framework::proto::VarType::BOOL) {
    return DataType::BOOL;
  }
  // TODO(JiabinYang) Support more dtype here
  return DataType::FLOAT32;
}
@@ -207,14 +201,6 @@ Tensor Tensor::copy_to(const PlaceType &target_place) const {
  return target;
}
template PD_DLL_DECL Tensor
Tensor::copy_to<paddle::platform::float16>(const PlaceType &target_place) const;
template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::bfloat16>(
const PlaceType &target_place) const;
template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex64>(
const PlaceType &target_place) const;
template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex128>(
const PlaceType &target_place) const;
template PD_DLL_DECL Tensor
Tensor::copy_to<float>(const PlaceType &target_place) const;
template PD_DLL_DECL Tensor
@@ -238,14 +224,6 @@ template PD_DLL_DECL int64_t *Tensor::data<int64_t>() const;
template PD_DLL_DECL int32_t *Tensor::data<int32_t>() const;
template PD_DLL_DECL uint8_t *Tensor::data<uint8_t>() const;
template PD_DLL_DECL int8_t *Tensor::data<int8_t>() const;
template PD_DLL_DECL paddle::platform::float16 *
Tensor::data<paddle::platform::float16>() const;
template PD_DLL_DECL paddle::platform::bfloat16 *
Tensor::data<paddle::platform::bfloat16>() const;
template PD_DLL_DECL paddle::platform::complex128 *
Tensor::data<paddle::platform::complex128>() const;
template PD_DLL_DECL paddle::platform::complex64 *
Tensor::data<paddle::platform::complex64>() const;
template PD_DLL_DECL int16_t *Tensor::data<int16_t>() const;
template PD_DLL_DECL bool *Tensor::data<bool>() const;
@@ -255,14 +233,6 @@ template PD_DLL_DECL int64_t *Tensor::mutable_data<int64_t>();
template PD_DLL_DECL int32_t *Tensor::mutable_data<int32_t>();
template PD_DLL_DECL uint8_t *Tensor::mutable_data<uint8_t>();
template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>();
template PD_DLL_DECL paddle::platform::float16 *
Tensor::mutable_data<paddle::platform::float16>();
template PD_DLL_DECL paddle::platform::bfloat16 *
Tensor::mutable_data<paddle::platform::bfloat16>();
template PD_DLL_DECL paddle::platform::complex128 *
Tensor::mutable_data<paddle::platform::complex128>();
template PD_DLL_DECL paddle::platform::complex64 *
Tensor::mutable_data<paddle::platform::complex64>();
template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>();
template PD_DLL_DECL bool *Tensor::mutable_data<bool>();
@@ -277,14 +247,6 @@ template PD_DLL_DECL uint8_t *Tensor::mutable_data<uint8_t>(
    const PlaceType &place);
template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>(
    const PlaceType &place);
template PD_DLL_DECL paddle::platform::float16 *
Tensor::mutable_data<paddle::platform::float16>(const PlaceType &place);
template PD_DLL_DECL paddle::platform::bfloat16 *
Tensor::mutable_data<paddle::platform::bfloat16>(const PlaceType &place);
template PD_DLL_DECL paddle::platform::complex128 *
Tensor::mutable_data<paddle::platform::complex128>(const PlaceType &place);
template PD_DLL_DECL paddle::platform::complex64 *
Tensor::mutable_data<paddle::platform::complex64>(const PlaceType &place);
template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>(
    const PlaceType &place);
template PD_DLL_DECL bool *Tensor::mutable_data<bool>(const PlaceType &place);
@@ -320,14 +282,6 @@ Tensor Tensor::cast(const DataType &target_type) const {
  auto dst_type =
      framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(target_type);
  switch (src_type) {
case framework::proto::VarType::FP16:
framework::VisitDataType(
dst_type, CastDataType<platform::float16>(*tensor, rlt_tensor_, ctx));
break;
case framework::proto::VarType::BF16:
framework::VisitDataType(dst_type, CastDataType<platform::bfloat16>(
*tensor, rlt_tensor_, ctx));
break;
    case framework::proto::VarType::FP32:
      framework::VisitDataType(dst_type,
                               CastDataType<float>(*tensor, rlt_tensor_, ctx));
@@ -356,14 +310,7 @@ Tensor Tensor::cast(const DataType &target_type) const {
      framework::VisitDataType(
          dst_type, CastDataType<uint8_t>(*tensor, rlt_tensor_, ctx));
      break;
    case framework::proto::VarType::COMPLEX64:
      framework::VisitDataType(dst_type, CastDataType<platform::complex64>(
                                             *tensor, rlt_tensor_, ctx));
      break;
    case framework::proto::VarType::COMPLEX128:
      framework::VisitDataType(dst_type, CastDataType<platform::complex128>(
                                             *tensor, rlt_tensor_, ctx));
      break;
      // TODO(JiabinYang) Support more dtype here
    default:
      PADDLE_THROW(platform::errors::Unimplemented(
          "Data type (%s) is not supported when casting data type.",
@@ -377,6 +324,18 @@ int64_t Tensor::size() const {
  return tensor->numel();
}
#ifdef PADDLE_WITH_CUDA
cudaStream_t Tensor::stream() const {
if (!stream_.IsStreamSet()) {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Stream is not Set, only input tensor will have "
"stream which is set by framework "));
} else {
return reinterpret_cast<cudaStream_t>(stream_.GetStream());
}
}
#endif
namespace framework {

void CustomTensorUtils::ShareDataTo(const paddle::Tensor &src, void *dst) {
......
@@ -321,17 +321,20 @@ message(STATUS "branch: ${PADDLE_BRANCH}")
configure_file(commit.h.in commit.h)

cc_library(custom_tensor SRCS ../extension/src/tensor.cc DEPS lod_tensor memory enforce)
cc_library(op_meta_info SRCS ../extension/src/op_meta_info.cc DEPS custom_tensor)
cc_library(custom_tensor SRCS ../extension/src/ext_tensor.cc DEPS lod_tensor memory enforce)
cc_library(op_meta_info SRCS ../extension/src/ext_op_meta_info.cc DEPS custom_tensor)
cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper custom_tensor op_meta_info)
cc_test(custom_tensor_test SRCS custom_tensor_test.cc DEPS custom_tensor glog)

include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../extension/include)

set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator)

cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES})
# Old custom op extension mechanism related, will be removed in 2.1.0
cc_library(paddle_framework_shared
    SHARED SRCS executor.cc operator.cc custom_operator.cc ../extension/src/tensor.cc
    ../extension/src/op_meta_info.cc
    SHARED SRCS executor.cc operator.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/c/c_api.cc
    ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc
    DEPS ${FLUID_FRAMEWORK_MODULES})
@@ -368,10 +371,16 @@ endif()
# if not deps `layer`, will cause: undefined symbol: _ZN6paddle10imperative7VarBase9name_set_
set(PADDLE_CUSTOM_OP_MODULES custom_tensor op_meta_info custom_operator layer)
set(PADDLE_CUSTOM_OP_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/custom_operator.cc
${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_tensor.cc
${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_op_meta_info.cc
${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc)
set(PADDLE_CUSTOM_OP_SRCS ${PADDLE_CUSTOM_OP_SRCS} PARENT_SCOPE)
cc_library(paddle_custom_op_shared
    SHARED SRCS custom_operator.cc ../extension/src/tensor.cc ../extension/src/op_meta_info.cc
    ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc
    DEPS ${PADDLE_CUSTOM_OP_MODULES})
    SHARED SRCS ${PADDLE_CUSTOM_OP_SRCS} DEPS ${PADDLE_CUSTOM_OP_MODULES})
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
set_target_properties(paddle_custom_op_shared PROPERTIES OUTPUT_NAME paddle_custom_op)
target_link_libraries(paddle_custom_op_shared ${os_dependency_modules})
@@ -383,9 +392,9 @@ if (LINUX)
endif()

if (WIN32)
    set(PADDLE_CUSTOM_OP_SHARED_LIB
    set(PADDLE_CUSTOM_OP_IMPORT_LIB
        ${PADDLE_BINARY_DIR}/paddle/fluid/framework/${CMAKE_BUILD_TYPE}/paddle_custom_op.lib
        CACHE INTERNAL "Paddle custom op lib")
        CACHE INTERNAL "Paddle custom op import lib")
    set(PADDLE_CUSTOM_OP_SHARED_LIB
        ${PADDLE_BINARY_DIR}/paddle/fluid/framework/${CMAKE_BUILD_TYPE}/paddle_custom_op.dll
        CACHE INTERNAL "Paddle custom op dll")
......
@@ -25,7 +25,7 @@ limitations under the License. */
#include <utility>
#include <vector>

#include "paddle/fluid/extension/include/tensor.h"
#include "paddle/fluid/extension/include/ext_tensor.h"
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/c/c_api.h"
#include "paddle/fluid/framework/custom_tensor_utils.h"
@@ -114,6 +114,7 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
    auto custom_in = paddle::Tensor(
        CustomTensorUtils::ConvertInnerPlaceToEnumPlace(x->place()));
    CustomTensorUtils::ShareDataFrom(static_cast<const void*>(x), custom_in);
    CustomTensorUtils::SetTensorCurrentStream(&custom_in, ctx.GetPlace());
    custom_ins.emplace_back(custom_in);
  }
......
@@ -16,7 +16,7 @@ limitations under the License. */
#include <string>

#include "paddle/fluid/extension/include/op_meta_info.h"
#include "paddle/fluid/extension/include/ext_op_meta_info.h"

namespace paddle {
namespace framework {
......
@@ -14,7 +14,7 @@
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/extension/include/all.h"
#include "paddle/fluid/extension/include/ext_all.h"
#include "paddle/fluid/framework/custom_tensor_utils.h"
#include "paddle/fluid/framework/lod_tensor.h"
@@ -91,7 +91,7 @@ void TestCast(paddle::DataType data_type) {
  t1.reshape(tensor_shape);
  t1.template mutable_data<T>();
  auto t2 = t1.cast(data_type);
  CHECK_EQ(t2.type(), data_type);
  CHECK(t2.type() == data_type);
}
void GroupTestCopy() { void GroupTestCopy() {
...@@ -99,14 +99,6 @@ void GroupTestCopy() { ...@@ -99,14 +99,6 @@ void GroupTestCopy() {
TestCopyTensor<float>(); TestCopyTensor<float>();
VLOG(2) << "Double cpu-cpu-gpu-gpu-cpu"; VLOG(2) << "Double cpu-cpu-gpu-gpu-cpu";
TestCopyTensor<double>(); TestCopyTensor<double>();
VLOG(2) << "Fp16 cpu-cpu-gpu-gpu-cpu";
TestCopyTensor<paddle::platform::float16>();
VLOG(2) << "BF16 cpu-cpu-gpu-gpu-cpu";
TestCopyTensor<paddle::platform::bfloat16>();
VLOG(2) << "complex128 cpu-cpu-gpu-gpu-cpu";
TestCopyTensor<paddle::platform::complex128>();
VLOG(2) << "complex64 cpu-cpu-gpu-gpu-cpu";
TestCopyTensor<paddle::platform::complex64>();
VLOG(2) << "int cpu-cpu-gpu-gpu-cpu"; VLOG(2) << "int cpu-cpu-gpu-gpu-cpu";
TestCopyTensor<int>(); TestCopyTensor<int>();
VLOG(2) << "int64 cpu-cpu-gpu-gpu-cpu"; VLOG(2) << "int64 cpu-cpu-gpu-gpu-cpu";
...@@ -128,31 +120,17 @@ void GroupTestCast() { ...@@ -128,31 +120,17 @@ void GroupTestCast() {
TestCast<int64_t>(paddle::DataType::FLOAT32); TestCast<int64_t>(paddle::DataType::FLOAT32);
VLOG(2) << "double cast"; VLOG(2) << "double cast";
TestCast<double>(paddle::DataType::FLOAT32); TestCast<double>(paddle::DataType::FLOAT32);
VLOG(2) << "bfloat16 cast";
TestCast<paddle::platform::bfloat16>(paddle::DataType::FLOAT32);
VLOG(2) << "float16 cast";
TestCast<paddle::platform::float16>(paddle::DataType::FLOAT32);
VLOG(2) << "bool cast"; VLOG(2) << "bool cast";
TestCast<bool>(paddle::DataType::FLOAT32); TestCast<bool>(paddle::DataType::FLOAT32);
VLOG(2) << "uint8 cast"; VLOG(2) << "uint8 cast";
TestCast<uint8_t>(paddle::DataType::FLOAT32); TestCast<uint8_t>(paddle::DataType::FLOAT32);
VLOG(2) << "float cast"; VLOG(2) << "float cast";
TestCast<float>(paddle::DataType::FLOAT32); TestCast<float>(paddle::DataType::FLOAT32);
VLOG(2) << "complex64 cast";
TestCast<float>(paddle::DataType::FLOAT32);
VLOG(2) << "complex128 cast";
TestCast<float>(paddle::DataType::FLOAT32);
} }
void GroupTestDtype() { void GroupTestDtype() {
CHECK(TestDtype<float>() == paddle::DataType::FLOAT32); CHECK(TestDtype<float>() == paddle::DataType::FLOAT32);
CHECK(TestDtype<double>() == paddle::DataType::FLOAT64); CHECK(TestDtype<double>() == paddle::DataType::FLOAT64);
CHECK(TestDtype<paddle::platform::float16>() == paddle::DataType::FLOAT16);
CHECK(TestDtype<paddle::platform::bfloat16>() == paddle::DataType::BFLOAT16);
CHECK(TestDtype<paddle::platform::complex128>() ==
paddle::DataType::COMPLEX128);
CHECK(TestDtype<paddle::platform::complex64>() ==
paddle::DataType::COMPLEX64);
CHECK(TestDtype<int>() == paddle::DataType::INT32); CHECK(TestDtype<int>() == paddle::DataType::INT32);
CHECK(TestDtype<int64_t>() == paddle::DataType::INT64); CHECK(TestDtype<int64_t>() == paddle::DataType::INT64);
CHECK(TestDtype<int16_t>() == paddle::DataType::INT16); CHECK(TestDtype<int16_t>() == paddle::DataType::INT16);
...@@ -162,24 +140,12 @@ void GroupTestDtype() { ...@@ -162,24 +140,12 @@ void GroupTestDtype() {
void GroupTestDtypeConvert() { void GroupTestDtypeConvert() {
// enum -> proto // enum -> proto
CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
paddle::DataType::COMPLEX128) ==
paddle::framework::proto::VarType::COMPLEX128);
CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
paddle::DataType::COMPLEX64) ==
paddle::framework::proto::VarType::COMPLEX64);
CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
paddle::DataType::FLOAT64) == paddle::DataType::FLOAT64) ==
paddle::framework::proto::VarType::FP64); paddle::framework::proto::VarType::FP64);
CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
paddle::DataType::FLOAT32) == paddle::DataType::FLOAT32) ==
paddle::framework::proto::VarType::FP32); paddle::framework::proto::VarType::FP32);
CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
paddle::DataType::FLOAT16) ==
paddle::framework::proto::VarType::FP16);
CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
paddle::DataType::BFLOAT16) ==
paddle::framework::proto::VarType::BF16);
CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
paddle::DataType::UINT8) == paddle::DataType::UINT8) ==
paddle::framework::proto::VarType::UINT8); paddle::framework::proto::VarType::UINT8);
...@@ -197,24 +163,12 @@ void GroupTestDtypeConvert() { ...@@ -197,24 +163,12 @@ void GroupTestDtypeConvert() {
CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
paddle::DataType::BOOL) == paddle::framework::proto::VarType::BOOL); paddle::DataType::BOOL) == paddle::framework::proto::VarType::BOOL);
// proto -> enum // proto -> enum
CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
paddle::framework::proto::VarType::COMPLEX128) ==
paddle::DataType::COMPLEX128);
CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
paddle::framework::proto::VarType::COMPLEX64) ==
paddle::DataType::COMPLEX64);
CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
paddle::framework::proto::VarType::FP64) == paddle::framework::proto::VarType::FP64) ==
paddle::DataType::FLOAT64); paddle::DataType::FLOAT64);
CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
paddle::framework::proto::VarType::FP32) == paddle::framework::proto::VarType::FP32) ==
paddle::DataType::FLOAT32); paddle::DataType::FLOAT32);
CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
paddle::framework::proto::VarType::FP16) ==
paddle::DataType::FLOAT16);
CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
paddle::framework::proto::VarType::BF16) ==
paddle::DataType::BFLOAT16);
CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
paddle::framework::proto::VarType::INT64) == paddle::framework::proto::VarType::INT64) ==
paddle::DataType::INT64); paddle::DataType::INT64);
......
@@ -16,10 +16,13 @@ limitations under the License. */
#include <memory>

#include "paddle/fluid/extension/include/tensor.h"
#include "paddle/fluid/extension/include/ext_tensor.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#endif
#include "paddle/fluid/platform/device_context.h"

namespace paddle {
namespace framework {
...@@ -39,18 +42,10 @@ class CustomTensorUtils { ...@@ -39,18 +42,10 @@ class CustomTensorUtils {
static framework::proto::VarType::Type ConvertEnumDTypeToInnerDType( static framework::proto::VarType::Type ConvertEnumDTypeToInnerDType(
const paddle::DataType& dtype) { const paddle::DataType& dtype) {
switch (dtype) { switch (dtype) {
case paddle::DataType::COMPLEX128:
return framework::proto::VarType::COMPLEX128;
case paddle::DataType::COMPLEX64:
return framework::proto::VarType::COMPLEX64;
case paddle::DataType::FLOAT64: case paddle::DataType::FLOAT64:
return framework::proto::VarType::FP64; return framework::proto::VarType::FP64;
case paddle::DataType::FLOAT32: case paddle::DataType::FLOAT32:
return framework::proto::VarType::FP32; return framework::proto::VarType::FP32;
case paddle::DataType::FLOAT16:
return framework::proto::VarType::FP16;
case paddle::DataType::BFLOAT16:
return framework::proto::VarType::BF16;
case paddle::DataType::UINT8: case paddle::DataType::UINT8:
return framework::proto::VarType::UINT8; return framework::proto::VarType::UINT8;
case paddle::DataType::INT8: case paddle::DataType::INT8:
...@@ -74,18 +69,10 @@ class CustomTensorUtils { ...@@ -74,18 +69,10 @@ class CustomTensorUtils {
static paddle::DataType ConvertInnerDTypeToEnumDType( static paddle::DataType ConvertInnerDTypeToEnumDType(
const framework::proto::VarType::Type& dtype) { const framework::proto::VarType::Type& dtype) {
switch (dtype) { switch (dtype) {
case framework::proto::VarType::COMPLEX128:
return paddle::DataType::COMPLEX128;
case framework::proto::VarType::COMPLEX64:
return paddle::DataType::COMPLEX64;
case framework::proto::VarType::FP64: case framework::proto::VarType::FP64:
return paddle::DataType::FLOAT64; return paddle::DataType::FLOAT64;
case framework::proto::VarType::FP32: case framework::proto::VarType::FP32:
return paddle::DataType::FLOAT32; return paddle::DataType::FLOAT32;
case framework::proto::VarType::FP16:
return paddle::DataType::FLOAT16;
case framework::proto::VarType::BF16:
return paddle::DataType::BFLOAT16;
case framework::proto::VarType::INT64: case framework::proto::VarType::INT64:
return paddle::DataType::INT64; return paddle::DataType::INT64;
case framework::proto::VarType::INT32: case framework::proto::VarType::INT32:
...@@ -139,6 +126,19 @@ class CustomTensorUtils { ...@@ -139,6 +126,19 @@ class CustomTensorUtils {
} }
return PlaceType::kUNK; return PlaceType::kUNK;
} }
static void SetTensorCurrentStream(paddle::Tensor* src,
const platform::Place& pc) {
if (platform::is_gpu_place(pc)) {
#ifdef PADDLE_WITH_CUDA
auto* dev_ctx = static_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(pc));
src->stream_.SetStream(reinterpret_cast<void*>(dev_ctx->stream()));
#endif
} else {
return;
}
}
}; };
} // namespace framework } // namespace framework
......
@@ -17,7 +17,7 @@ limitations under the License. */
#include <string>
#include <vector>

#include "paddle/fluid/extension/include/op_meta_info.h"
#include "paddle/fluid/extension/include/ext_op_meta_info.h"

namespace paddle {
namespace framework {
......
...@@ -84,6 +84,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins, ...@@ -84,6 +84,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
const framework::AttributeMap& attrs) { const framework::AttributeMap& attrs) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place); auto* dev_ctx = pool.Get(place);
framework::RuntimeContext ctx({}, {}); framework::RuntimeContext ctx({}, {});
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
......
@@ -73,7 +73,8 @@ set(SHARED_INFERENCE_SRCS
    ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/utils/io_utils.cc
    ${mkldnn_quantizer_src_file})
    ${mkldnn_quantizer_src_file}
    ${PADDLE_CUSTOM_OP_SRCS})

# shared inference library deps
set(SHARED_INFERENCE_DEPS ${fluid_modules} analysis_predictor)
......
@@ -28,6 +28,7 @@ if not exist %cache_dir%\tools (
)
taskkill /f /im op_function_generator.exe
wmic process where name="op_function_generator.exe" call terminate
taskkill /f /im python.exe 2>NUL

rem ------initialize common variable------
if not defined BRANCH set BRANCH=develop
@@ -45,6 +46,14 @@ set INFERENCE_DEMO_INSTALL_DIR=%cache_dir:\=/%/inference_demo
rem -------set cache build work directory-----------
rmdir build\python /s/q
rmdir build\paddle_install_dir /s/q
rmdir build\paddle_inference_install_dir /s/q
rmdir build\paddle_inference_c_install_dir /s/q
del build\CMakeCache.txt
: set CI_SKIP_CPP_TEST if only *.py changed
git diff --name-only %BRANCH% | findstr /V "\.py" || set CI_SKIP_CPP_TEST=ON
if "%WITH_CACHE%"=="OFF" ( if "%WITH_CACHE%"=="OFF" (
rmdir build /s/q rmdir build /s/q
goto :mkbuild goto :mkbuild
...@@ -62,8 +71,8 @@ setlocal enabledelayedexpansion ...@@ -62,8 +71,8 @@ setlocal enabledelayedexpansion
git show-ref --verify --quiet refs/heads/last_pr git show-ref --verify --quiet refs/heads/last_pr
if %ERRORLEVEL% EQU 0 ( if %ERRORLEVEL% EQU 0 (
git diff HEAD last_pr --stat --name-only git diff HEAD last_pr --stat --name-only
git diff HEAD last_pr --stat --name-only | findstr ".cmake CMakeLists.txt paddle_build.bat" git diff HEAD last_pr --stat --name-only | findstr "setup.py.in"
if !ERRORLEVEL! EQU 0 ( if %ERRORLEVEL% EQU 0 (
rmdir build /s/q rmdir build /s/q
) )
git branch -D last_pr git branch -D last_pr
@@ -208,11 +217,15 @@ set /p day_before=< %cache_dir%\day.txt
if %day_now% NEQ %day_before% (
    echo %day_now% > %cache_dir%\day.txt
    type %cache_dir%\day.txt
    if %day_now% EQU 25 (
    if %day_now% EQU 21 (
rmdir %cache_dir%\third_party_GPU/ /s/q
rmdir %cache_dir%\third_party/ /s/q
)
if %day_now% EQU 11 (
        rmdir %cache_dir%\third_party_GPU/ /s/q
        rmdir %cache_dir%\third_party/ /s/q
    )
    if %day_now% EQU 10 (
    if %day_now% EQU 01 (
        rmdir %cache_dir%\third_party_GPU/ /s/q
        rmdir %cache_dir%\third_party/ /s/q
    )
...@@ -597,6 +610,22 @@ goto:eof ...@@ -597,6 +610,22 @@ goto:eof
:check_change_of_unittest_error :check_change_of_unittest_error
exit /b 1 exit /b 1
rem ---------------------------------------------------------------------------------------------
:zip_file
tree /F %cd%\paddle_inference_install_dir\paddle
if exist paddle_inference.zip del paddle_inference.zip
python -c "import shutil;shutil.make_archive('paddle_inference', 'zip', root_dir='paddle_inference_install_dir')"
%cache_dir%\tools\busybox64.exe du -h -k paddle_inference.zip > lib_size.txt
set /p libsize=< lib_size.txt
for /F %%i in ("%libsize%") do (
set /a libsize_m=%%i/1024
echo "Windows Paddle_Inference ZIP Size: !libsize_m!M"
)
goto:eof
:zip_file_error
echo Tar inference library failed!
exit /b 1
:timestamp :timestamp
setlocal enabledelayedexpansion setlocal enabledelayedexpansion
......
@@ -23,13 +23,14 @@ set_tests_properties(test_multi_out_jit PROPERTIES TIMEOUT 120)
py_test(test_custom_attrs_jit SRCS test_custom_attrs_jit.py)
set_tests_properties(test_custom_attrs_jit PROPERTIES TIMEOUT 120)
py_test(test_check_abi SRCS test_check_abi.py)
cc_test(test_check_error SRCS test_check_error.cc DEPS gtest)
if(NOT LINUX)
    return()
endif()
# TODO(zhouwei): support test_check_abi and abi check on Windows
py_test(test_check_abi SRCS test_check_abi.py)
# Old custom OP only support Linux, only run on Linux
py_test(test_custom_op SRCS test_custom_op.py)
py_test(test_jit_load SRCS test_jit_load.py)
......
@@ -39,8 +39,8 @@ void relu_cpu_backward_kernel(const data_t* grad_out_data,
std::vector<paddle::Tensor> relu_cpu_forward(const paddle::Tensor& x) {
  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
  out.reshape(x.shape());

  PD_DISPATCH_FLOATING_TYPES(
      x.type(), "relu_cpu_forward", ([&] {
        relu_cpu_forward_kernel<data_t>(
@@ -79,7 +79,7 @@ std::vector<paddle::Tensor> ReluForward(const paddle::Tensor& x) {
  } else if (x.place() == paddle::PlaceType::kGPU) {
    return relu_cuda_forward(x);
  } else {
    throw std::runtime_error("Not implemented.");
    PD_THROW("Not implemented.");
  }
}
@@ -92,7 +92,7 @@ std::vector<paddle::Tensor> ReluBackward(const paddle::Tensor& x,
  } else if (x.place() == paddle::PlaceType::kGPU) {
    return relu_cuda_backward(x, out, grad_out);
  } else {
    throw std::runtime_error("Not implemented.");
    PD_THROW("Not implemented.");
  }
}
......
@@ -37,14 +37,14 @@ __global__ void relu_cuda_backward_kernel(const data_t* dy,
std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x) {
  auto out = paddle::Tensor(paddle::PlaceType::kGPU);
  out.reshape(x.shape());

  int numel = x.size();
  int block = 512;
  int grid = (numel + block - 1) / block;
  PD_DISPATCH_FLOATING_TYPES(
      x.type(), "relu_cuda_forward_kernel", ([&] {
        relu_cuda_forward_kernel<data_t><<<grid, block>>>(
        relu_cuda_forward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
            x.data<data_t>(), out.mutable_data<data_t>(x.place()), numel);
      }));
@@ -62,7 +62,7 @@ std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& x,
  int grid = (numel + block - 1) / block;
  PD_DISPATCH_FLOATING_TYPES(
      out.type(), "relu_cuda_backward_kernel", ([&] {
        relu_cuda_backward_kernel<data_t><<<grid, block>>>(
        relu_cuda_backward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
            grad_out.data<data_t>(),
            out.data<data_t>(),
            grad_x.mutable_data<data_t>(x.place()),
......
@@ -44,24 +44,6 @@ PD_BUILD_OP(dispatch_test_integer)
    .Outputs({"Out"})
    .SetKernelFn(PD_KERNEL(DispatchTestInterger));
std::vector<paddle::Tensor> DispatchTestComplex(const paddle::Tensor& x) {
auto out = paddle::Tensor(paddle::PlaceType::kCPU);
out.reshape(x.shape());
PD_DISPATCH_COMPLEX_TYPES(
x.type(), "assign_cpu_kernel", ([&] {
assign_cpu_kernel<data_t>(
x.data<data_t>(), out.mutable_data<data_t>(), x.size());
}));
return {out};
}
PD_BUILD_OP(dispatch_test_complex)
.Inputs({"X"})
.Outputs({"Out"})
.SetKernelFn(PD_KERNEL(DispatchTestComplex));
std::vector<paddle::Tensor> DispatchTestFloatAndInteger(
    const paddle::Tensor& x) {
  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
@@ -80,41 +62,3 @@ PD_BUILD_OP(dispatch_test_float_and_integer)
    .Inputs({"X"})
    .Outputs({"Out"})
    .SetKernelFn(PD_KERNEL(DispatchTestFloatAndInteger));
std::vector<paddle::Tensor> DispatchTestFloatAndComplex(
const paddle::Tensor& x) {
auto out = paddle::Tensor(paddle::PlaceType::kCPU);
out.reshape(x.shape());
PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
x.type(), "assign_cpu_kernel", ([&] {
assign_cpu_kernel<data_t>(
x.data<data_t>(), out.mutable_data<data_t>(), x.size());
}));
return {out};
}
PD_BUILD_OP(dispatch_test_float_and_complex)
.Inputs({"X"})
.Outputs({"Out"})
.SetKernelFn(PD_KERNEL(DispatchTestFloatAndComplex));
std::vector<paddle::Tensor> DispatchTestFloatAndIntegerAndComplex(
const paddle::Tensor& x) {
auto out = paddle::Tensor(paddle::PlaceType::kCPU);
out.reshape(x.shape());
PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES(
x.type(), "assign_cpu_kernel", ([&] {
assign_cpu_kernel<data_t>(
x.data<data_t>(), out.mutable_data<data_t>(), x.size());
}));
return {out};
}
PD_BUILD_OP(dispatch_test_float_and_integer_and_complex)
.Inputs({"X"})
.Outputs({"Out"})
.SetKernelFn(PD_KERNEL(DispatchTestFloatAndIntegerAndComplex));
@@ -22,10 +22,11 @@ import paddle.utils.cpp_extension.extension_utils as utils
class TestABIBase(unittest.TestCase):
    def test_environ(self):
        compiler = 'gcc'
        for flag in ['1', 'True', 'true']:
            os.environ['PADDLE_SKIP_CHECK_ABI'] = flag
            self.assertTrue(utils.check_abi_compatibility(compiler))
        compiler_list = ['gcc', 'cl']
        for compiler in compiler_list:
            for flag in ['1', 'True', 'true']:
                os.environ['PADDLE_SKIP_CHECK_ABI'] = flag
                self.assertTrue(utils.check_abi_compatibility(compiler))
def del_environ(self): def del_environ(self):
key = 'PADDLE_SKIP_CHECK_ABI' key = 'PADDLE_SKIP_CHECK_ABI'
...@@ -33,43 +34,49 @@ class TestABIBase(unittest.TestCase): ...@@ -33,43 +34,49 @@ class TestABIBase(unittest.TestCase):
del os.environ[key] del os.environ[key]
class TestCheckCompiler(TestABIBase):
    def test_expected_compiler(self):
        if utils.OS_NAME.startswith('linux'):
            gt = ['gcc', 'g++', 'gnu-c++', 'gnu-cc']
        elif utils.IS_WINDOWS:
            gt = ['cl']
        elif utils.OS_NAME.startswith('darwin'):
            gt = ['clang', 'clang++']

        self.assertListEqual(utils._expected_compiler_current_platform(), gt)

    def test_compiler_version(self):
        # clear environ
        self.del_environ()
        if utils.OS_NAME.startswith('linux'):
            compiler = 'g++'
        elif utils.IS_WINDOWS:
            compiler = 'cl'
        # Linux: all CI gcc version > 5.4.0
        # Windows: all CI MSVC version > 19.00.24215
        # Mac: clang has no version limitation, always return true
        self.assertTrue(utils.check_abi_compatibility(compiler, verbose=True))

    def test_wrong_compiler_warning(self):
        # clear environ
        self.del_environ()
        compiler = 'python'  # fake wrong compiler
        with warnings.catch_warnings(record=True) as error:
            flag = utils.check_abi_compatibility(compiler, verbose=True)
            # check return False
            self.assertFalse(flag)
            # check Compiler Compatibility WARNING
            self.assertTrue(len(error) == 1)
            self.assertTrue(
                "Compiler Compatibility WARNING" in str(error[0].message))

    def test_exception(self):
        # clear environ
        self.del_environ()
        compiler = 'python'  # fake command
        if utils.OS_NAME.startswith('linux'):
            # to skip _expected_compiler_current_platform
            def fake():
                return [compiler]
...@@ -89,32 +96,6 @@ class TestCheckLinux(TestABIBase):
            utils._expected_compiler_current_platform = raw_func
class TestCheckMacOs(TestABIBase):
def test_expected_compiler(self):
if utils.OS_NAME.startswith('darwin'):
gt = ['clang', 'clang++']
self.assertListEqual(utils._expected_compiler_current_platform(),
gt)
def test_gcc_version(self):
# clear environ
self.del_environ()
if utils.OS_NAME.startswith('darwin'):
# clang has no version limitation.
self.assertTrue(utils.check_abi_compatibility())
class TestCheckWindows(TestABIBase):
def test_gcc_version(self):
# clear environ
self.del_environ()
if utils.IS_WINDOWS:
# we skip windows now
self.assertTrue(utils.check_abi_compatibility())
class TestJITCompilerException(unittest.TestCase):
    def test_exception(self):
        with self.assertRaisesRegexp(RuntimeError,
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include <string>
#include "gtest/gtest.h"
#include "paddle/fluid/extension/include/ext_exception.h"
TEST(PD_THROW, empty) {
bool caught_exception = false;
try {
PD_THROW();
} catch (const std::exception& e) {
caught_exception = true;
std::string err_msg = e.what();
EXPECT_TRUE(err_msg.find("An error occured.") != std::string::npos);
#if _WIN32
EXPECT_TRUE(err_msg.find("tests\\custom_op\\test_check_error.cc:20") !=
std::string::npos);
#else
EXPECT_TRUE(
err_msg.find(
"python/paddle/fluid/tests/custom_op/test_check_error.cc:20") !=
std::string::npos);
#endif
}
EXPECT_TRUE(caught_exception);
}
TEST(PD_THROW, non_empty) {
bool caught_exception = false;
try {
PD_THROW("PD_THROW returns ",
false,
". DataType of ",
1,
" is INT. ",
"DataType of ",
0.23,
" is FLOAT. ");
} catch (const std::exception& e) {
caught_exception = true;
std::string err_msg = e.what();
EXPECT_TRUE(err_msg.find("PD_THROW returns 0. DataType of 1 is INT. ") !=
std::string::npos);
#if _WIN32
EXPECT_TRUE(err_msg.find("tests\\custom_op\\test_check_error.cc") !=
std::string::npos);
#else
EXPECT_TRUE(
err_msg.find(
"python/paddle/fluid/tests/custom_op/test_check_error.cc") !=
std::string::npos);
#endif
}
EXPECT_TRUE(caught_exception);
}
TEST(PD_CHECK, OK) {
PD_CHECK(true);
PD_CHECK(true, "PD_CHECK returns ", true, "now");
const size_t a = 1;
const size_t b = 10;
PD_CHECK(a < b);
PD_CHECK(a < b, "PD_CHECK returns ", true, a, "should < ", b);
}
TEST(PD_CHECK, FAILED) {
bool caught_exception = false;
try {
PD_CHECK(false);
} catch (const std::exception& e) {
caught_exception = true;
std::string err_msg = e.what();
EXPECT_TRUE(err_msg.find("Expected false, but it's not satisfied.") !=
std::string::npos);
#if _WIN32
EXPECT_TRUE(err_msg.find("tests\\custom_op\\test_check_error.cc") !=
std::string::npos);
#else
EXPECT_TRUE(
err_msg.find(
"python/paddle/fluid/tests/custom_op/test_check_error.cc") !=
std::string::npos);
#endif
}
EXPECT_TRUE(caught_exception);
caught_exception = false;
try {
PD_CHECK(false,
"PD_CHECK returns ",
false,
". DataType of ",
1,
" is INT. ",
"DataType of ",
0.23,
" is FLOAT. ");
} catch (const std::exception& e) {
caught_exception = true;
std::string err_msg = e.what();
EXPECT_TRUE(err_msg.find("PD_CHECK returns 0. DataType of 1 is INT. ") !=
std::string::npos);
#if _WIN32
EXPECT_TRUE(err_msg.find("tests\\custom_op\\test_check_error.cc") !=
std::string::npos);
#else
EXPECT_TRUE(
err_msg.find(
"python/paddle/fluid/tests/custom_op/test_check_error.cc") !=
std::string::npos);
#endif
}
EXPECT_TRUE(caught_exception);
const size_t a = 1;
const size_t b = 10;
caught_exception = false;
try {
PD_CHECK(a > b);
} catch (const std::exception& e) {
caught_exception = true;
std::string err_msg = e.what();
EXPECT_TRUE(err_msg.find("Expected a > b, but it's not satisfied.") !=
std::string::npos);
#if _WIN32
EXPECT_TRUE(err_msg.find("tests\\custom_op\\test_check_error.cc") !=
std::string::npos);
#else
EXPECT_TRUE(
err_msg.find(
"python/paddle/fluid/tests/custom_op/test_check_error.cc") !=
std::string::npos);
#endif
}
EXPECT_TRUE(caught_exception);
const size_t c = 123;
const float d = 0.345;
caught_exception = false;
try {
PD_CHECK(c < d, "PD_CHECK returns ", false, ", because ", c, " > ", d);
} catch (const std::exception& e) {
caught_exception = true;
std::string err_msg = e.what();
EXPECT_TRUE(err_msg.find("PD_CHECK returns 0, because 123 > 0.345") !=
std::string::npos);
#if _WIN32
EXPECT_TRUE(err_msg.find("tests\\custom_op\\test_check_error.cc") !=
std::string::npos);
#else
EXPECT_TRUE(
err_msg.find(
"python/paddle/fluid/tests/custom_op/test_check_error.cc") !=
std::string::npos);
#endif
}
EXPECT_TRUE(caught_exception);
}
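For context, a hedged sketch of how a custom op author might use PD_CHECK and PD_THROW to produce messages of the kind asserted above; the op name, shape checks, and message text here are illustrative and not part of this diff.

#include <vector>
#include "paddle/extension.h"

std::vector<paddle::Tensor> MyMatmul(const paddle::Tensor& x,
                                     const paddle::Tensor& y) {
  // PD_CHECK(cond) / PD_CHECK(cond, msg...) throw with the failed condition
  // (or the custom message) plus the file:line suffix the tests search for
  PD_CHECK(x.shape().size() == 2, "x must be a 2-D tensor.");
  PD_CHECK(y.shape().size() == 2, "y must be a 2-D tensor.");
  if (x.shape()[1] != y.shape()[0]) {
    PD_THROW("Cannot multiply: x has ", x.shape()[1], " columns but y has ",
             y.shape()[0], " rows.");
  }
  // ... actual matmul kernel omitted in this sketch ...
  return {x};
}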
...@@ -19,7 +19,7 @@ import paddle
import numpy as np
from paddle.utils.cpp_extension import load, get_build_directory
from paddle.utils.cpp_extension.extension_utils import run_cmd
from utils import paddle_includes, extra_compile_args, IS_WINDOWS
from test_custom_relu_op_setup import custom_relu_dynamic, custom_relu_static

# Because Windows don't use docker, the shared lib already exists in the
...@@ -84,6 +84,40 @@ class TestJITLoad(unittest.TestCase):
                "custom op x grad: {},\n paddle api x grad: {}".format(
                    x_grad, pd_x_grad))
def test_exception(self):
caught_exception = False
try:
x = np.random.uniform(-1, 1, [4, 8]).astype('int32')
custom_relu_dynamic(custom_module.custom_relu, 'cpu', 'float32', x)
except OSError as e:
caught_exception = True
self.assertTrue(
"function \"relu_cpu_forward\" is not implemented for data type `int32_t`"
in str(e))
if IS_WINDOWS:
self.assertTrue(
r"python\paddle\fluid\tests\custom_op\custom_relu_op.cc:48"
in str(e))
else:
self.assertTrue(
"python/paddle/fluid/tests/custom_op/custom_relu_op.cc:48"
in str(e))
self.assertTrue(caught_exception)
caught_exception = False
try:
x = np.random.uniform(-1, 1, [4, 8]).astype('int64')
custom_relu_dynamic(custom_module.custom_relu, 'gpu', 'float32', x)
except OSError as e:
caught_exception = True
self.assertTrue(
"function \"relu_cuda_forward_kernel\" is not implemented for data type `int64_t`"
in str(e))
self.assertTrue(
"python/paddle/fluid/tests/custom_op/custom_relu_op.cu:49" in
str(e))
self.assertTrue(caught_exception)
if __name__ == '__main__':
    unittest.main()
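For context, a rough sketch of the CPU path the test_exception case above exercises: only floating dtypes are dispatched, so an int32 or int64 input falls into the dispatch macro's default branch and raises the "is not implemented for data type" error. The contents of custom_relu_op.cc are paraphrased here, not quoted from the diff.

#include <algorithm>
#include <vector>
#include "paddle/extension.h"

template <typename data_t>
void relu_cpu_forward_kernel(const data_t* x_data,
                             data_t* out_data,
                             int64_t x_numel) {
  for (int64_t i = 0; i < x_numel; ++i) {
    out_data[i] = std::max(static_cast<data_t>(0.), x_data[i]);
  }
}

std::vector<paddle::Tensor> relu_cpu_forward(const paddle::Tensor& x) {
  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
  out.reshape(x.shape());
  // non-floating inputs reach the macro's error branch, which throws
  //   function "relu_cpu_forward" is not implemented for data type `int32_t`
  PD_DISPATCH_FLOATING_TYPES(
      x.type(), "relu_cpu_forward", ([&] {
        relu_cpu_forward_kernel<data_t>(
            x.data<data_t>(), out.mutable_data<data_t>(), x.size());
      }));
  return {out};
}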
...@@ -55,11 +55,6 @@ class TestJitDispatch(unittest.TestCase):
        for dtype in dtypes:
            self.run_dispatch_test(dispatch_op.dispatch_test_integer, dtype)
def test_dispatch_complex(self):
dtypes = ["complex64", "complex128"]
for dtype in dtypes:
self.run_dispatch_test(dispatch_op.dispatch_test_complex, dtype)
    def test_dispatch_float_and_integer(self):
        dtypes = [
            "float32", "float64", "int32", "int64", "int8", "uint8", "int16"
...@@ -68,21 +63,6 @@ class TestJitDispatch(unittest.TestCase):
            self.run_dispatch_test(dispatch_op.dispatch_test_float_and_integer,
                                   dtype)
def test_dispatch_float_and_complex(self):
dtypes = ["float32", "float64", "complex64", "complex128"]
for dtype in dtypes:
self.run_dispatch_test(dispatch_op.dispatch_test_float_and_complex,
dtype)
def test_dispatch_float_and_integer_and_complex(self):
dtypes = [
"float32", "float64", "int32", "int64", "int8", "uint8", "int16",
"complex64", "complex128"
]
for dtype in dtypes:
self.run_dispatch_test(
dispatch_op.dispatch_test_float_and_integer_and_complex, dtype)
if __name__ == '__main__':
    unittest.main()
...@@ -46,11 +46,12 @@ MSVC_COMPILE_FLAGS = [
    '/DBOOST_HAS_STATIC_ASSERT', '/DNDEBUG', '/DPADDLE_USE_DSO'
]

MSVC_LINK_FLAGS = ['/MACHINE:X64', 'paddle_custom_op.lib']

COMMON_NVCC_FLAGS = ['-DPADDLE_WITH_CUDA', '-DEIGEN_USE_GPU', '-O3']

GCC_MINI_VERSION = (5, 4, 0)
MSVC_MINI_VERSION = (19, 0, 24215)
# Give warning if using wrong compiler
WRONG_COMPILER_WARNING = '''
*************************************
...@@ -64,7 +65,7 @@ built Paddle for this platform, which is {paddle_compiler} on {platform}. Please
use {paddle_compiler} to compile your custom op. Or you may compile Paddle from
source using {user_compiler}, and then also use it compile your custom op.
See https://www.paddlepaddle.org.cn/documentation/docs/zh/install/compile/fromsource.html
for help with compiling Paddle from source.
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
...@@ -877,13 +878,12 @@ def check_abi_compatibility(compiler, verbose=False):
    Check whether GCC version on user local machine is compatible with Paddle in
    site-packages.
    """
    if os.environ.get('PADDLE_SKIP_CHECK_ABI') in ['True', 'true', '1']:
        return True

    which = 'where' if IS_WINDOWS else 'which'
    cmd_out = subprocess.check_output(
        [which, compiler], stderr=subprocess.STDOUT)
    compiler_path = os.path.realpath(cmd_out.decode()
                                     if six.PY3 else cmd_out).strip()
    # step 1. if not found any suitable compiler, raise error
...@@ -896,32 +896,41 @@ def check_abi_compatibility(compiler, verbose=False):
                platform=OS_NAME))
        return False
    version = (0, 0, 0)
    # clang++ have no ABI compatibility problem
    if OS_NAME.startswith('darwin'):
        return True
    try:
        if OS_NAME.startswith('linux'):
            mini_required_version = GCC_MINI_VERSION
            version_info = subprocess.check_output(
                [compiler, '-dumpfullversion', '-dumpversion'])
            if six.PY3:
                version_info = version_info.decode()
            version = version_info.strip().split('.')
        elif IS_WINDOWS:
            mini_required_version = MSVC_MINI_VERSION
            compiler_info = subprocess.check_output(
                compiler, stderr=subprocess.STDOUT)
            if six.PY3:
                compiler_info = compiler_info.decode()
            match = re.search(r'(\d+)\.(\d+)\.(\d+)', compiler_info.strip())
            if match is not None:
                version = match.groups()
    except Exception:
        # check compiler version failed
        _, error, _ = sys.exc_info()
        warnings.warn('Failed to check compiler version for {}: {}'.format(
            compiler, error))
        return False

    # check version compatibility
    assert len(version) == 3
    if tuple(map(int, version)) >= mini_required_version:
        return True
    warnings.warn(
        ABI_INCOMPATIBILITY_WARNING.format(
            user_compiler=compiler, version='.'.join(version)))
    return False
...@@ -929,8 +938,12 @@ def _expected_compiler_current_platform():
    """
    Returns supported compiler string on current platform
    """
    if OS_NAME.startswith('darwin'):
        expect_compilers = ['clang', 'clang++']
    elif OS_NAME.startswith('linux'):
        expect_compilers = ['gcc', 'g++', 'gnu-c++', 'gnu-cc']
    elif IS_WINDOWS:
        expect_compilers = ['cl']
    return expect_compilers
......
...@@ -334,21 +334,23 @@ if '${WITH_XPU_BKCL}' == 'ON':
        shutil.copy('${XPU_BKCL_LIB}', libs_path)
        package_data['paddle.libs']+=['${XPU_BKCL_LIB_NAME}']

### Old custom op extension mechanism related, will be removed in 2.1.0 ###
# copy libpaddle_framework.so to libs on linux
if sys.platform.startswith('linux'):
    shutil.copy('${FLUID_FRAMEWORK_SHARED_LIB}', libs_path)
    package_data['paddle.libs'] += ['libpaddle_framework.so']

### New custom op extension mechanism related ###
# copy libpaddle_custom_op.so to libs on linux
if sys.platform.startswith('linux'):
    shutil.copy('${PADDLE_CUSTOM_OP_SHARED_LIB}', libs_path)
    package_data['paddle.libs'] += ['libpaddle_custom_op.so']

# copy paddle_custom_op.lib/paddle_custom_op.dll to libs on Windows
if os.name == 'nt':
    shutil.copy('${PADDLE_CUSTOM_OP_IMPORT_LIB}', libs_path)
    shutil.copy('${PADDLE_CUSTOM_OP_SHARED_LIB}', libs_path)
    package_data['paddle.libs'] += ['paddle_custom_op.lib', 'paddle_custom_op.dll']

# remove unused paddle/libs/__init__.py
if os.path.isfile(libs_path+'/__init__.py'):
......