feat(whl/imperative): compat for build python whl imperative and legacy runtime

GitOrigin-RevId: 7f6629ae1f84b4aec3a4211f22b1d8d18d36a1b7

feat(whl/imperative): compat for build python whl imperative and legacy runtime
GitOrigin-RevId: 7f6629ae1f84b4aec3a4211f22b1d8d18d36a1b7
6e882c1a · Megvii Engine Team · 40d18c89 · 6e882c1a · 6e882c1a · 6e882c1a
33 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -697,8 +697,10 @@ endif()
 if(MGE_WITH_PYTHON_MODULE)
    if(MGE_BUILD_IMPERATIVE_RT)
        add_subdirectory(imperative)
+        message("-- Enable imperative python wrapper runtime")
    else()
        add_subdirectory(python_module)
+        message("-- Enable legacy python wrapper runtime")
    endif()
 endif()


--- a/dnn/src/common/utils.h
+++ b/dnn/src/common/utils.h
@@ -342,7 +342,11 @@ template <typename T>
 struct SafeMultiplies;

 template <typename T>
+#if __cplusplus >= 201703L
+struct _SafeMultipliesImplUnsigned {
+#else
 struct _SafeMultipliesImplUnsigned : public std::binary_function<T, T, T> {
+#endif
    static MEGDNN_CONSTEXPR size_t nbits = sizeof(T) * 8;

    static size_t clz(unsigned x) {

--- a/dnn/test/CMakeLists.txt
+++ b/dnn/test/CMakeLists.txt
@@ -70,8 +70,10 @@ if (MEG_WITH_ROCM)
    target_link_libraries (megdnn_test ${MGE_ROCM_LIBS})
 endif ()

-if(APPLE OR ANDROID)
-    target_link_libraries(megdnn_test dl)
-else()
-    target_link_libraries(megdnn_test dl rt)
+if(UNIX)
+    if(APPLE OR ANDROID)
+        target_link_libraries(megdnn_test dl)
+    else()
+        target_link_libraries(megdnn_test dl rt)
+    endif()
 endif()
--- a/dnn/test/common/mesh_indexing.h
+++ b/dnn/test/common/mesh_indexing.h
@@ -89,7 +89,7 @@ public:
        auto ptr = tensor.ptr<int>();
        for (size_t n = 0; n < size; ++n) {
            std::set<int> used;
-            std::random_shuffle(seq.begin(), seq.end());
+            COMPAT_RANDOM(seq.begin(), seq.end());
            for (size_t step = 0; step < stride; ++step) {
                megdnn_assert(used.size() < m_size);
                ptr[n * stride + step] = seq[step];

--- a/dnn/test/common/rng.cpp
+++ b/dnn/test/common/rng.cpp
@@ -75,7 +75,7 @@ Float16PeriodicalRNG::Float16PeriodicalRNG() : m_offset(0) {
        i2f.i = static_cast<uint16_t>(x);
        m_sequence.push_back(i2f.f);
    }
-    std::random_shuffle(m_sequence.begin(), m_sequence.end());
+    COMPAT_RANDOM(m_sequence.begin(), m_sequence.end());
 }

 Float16PeriodicalRNG::Float16PeriodicalRNG(size_t range) : m_offset(0) {
@@ -99,7 +99,7 @@ Float16PeriodicalRNG::Float16PeriodicalRNG(size_t range) : m_offset(0) {
        m_sequence.push_back(i2f.f);
    }

-    std::random_shuffle(m_sequence.begin(), m_sequence.end());
+    COMPAT_RANDOM(m_sequence.begin(), m_sequence.end());
 }

 void Float16PeriodicalRNG::gen(const TensorND& tensor) {

--- a/dnn/test/common/rng.h
+++ b/dnn/test/common/rng.h
@@ -19,6 +19,23 @@
 namespace megdnn {
 namespace test {

+/*!
+ * \brief shuffle [begin, end); drop-in for std::random_shuffle, which was
+ *      removed in C++17
+ *
+ * The engine is static so that its state advances across calls; a freshly
+ * default-constructed engine would replay the same permutation every time.
+ */
+#if __cplusplus >= 201703L
+#define COMPAT_RANDOM(begin, end)                            \
+    do {                                                     \
+        static std::default_random_engine compat_rng_engine; \
+        std::shuffle(begin, end, compat_rng_engine);         \
+    } while (0)
+#else
+#define COMPAT_RANDOM(begin, end) std::random_shuffle(begin, end)
+#endif
+
 class RNG {
 protected:
    class RNGxorshf;

--- a/dnn/test/cuda/argmxx.cpp
+++ b/dnn/test/cuda/argmxx.cpp
@@ -24,15 +24,16 @@ class ArgmxxRNG final: public RNG {
        void gen(const TensorND &tensor) override {
            auto offset = tensor.layout.span().low_elem;
            auto nr_elems = tensor.layout.span().dist_elem();
-#define cb(DType) \
-            if (tensor.layout.dtype == DType()) { \
-                using ctype = typename DTypeTrait<DType>::ctype; \
-                auto ptr = tensor.ptr<ctype>(); \
-                for (size_t i = 0; i < nr_elems; ++i) { \
-                    ptr[offset+i] = i; \
-                } \
-                std::random_shuffle(ptr + offset, ptr + offset + nr_elems); \
-            }
+
+#define cb(DType)                                             \
+    if (tensor.layout.dtype == DType()) {                     \
+        using ctype = typename DTypeTrait<DType>::ctype;      \
+        auto ptr = tensor.ptr<ctype>();                       \
+        for (size_t i = 0; i < nr_elems; ++i) {               \
+            ptr[offset + i] = i;                              \
+        }                                                     \
+        COMPAT_RANDOM(ptr + offset, ptr + offset + nr_elems); \
+    }
            MEGDNN_FOREACH_COMPUTING_DTYPE(cb);
 #undef cb
        }

--- a/dnn/test/cuda/argsort.cpp
+++ b/dnn/test/cuda/argsort.cpp
@@ -32,7 +32,7 @@ class ArgsortRNG final : public RNG {
        } else {
            for (int i = 0; i < n; ++i)
                ptr[i] = static_cast<T>(i - n / 2);
-            std::random_shuffle(ptr, ptr + n);
+            COMPAT_RANDOM(ptr, ptr + n);
        }
    }

@@ -86,7 +86,7 @@ void run_backward_test(Handle* handle, DType dtype) {
                for (size_t j = 0; j < n; ++j) {
                    ptr[j] = j;
                }
-                std::random_shuffle(ptr, ptr + n);
+                COMPAT_RANDOM(ptr, ptr + n);
                ptr += n;
            }
        }

--- a/dnn/test/cuda/relayout.cpp
+++ b/dnn/test/cuda/relayout.cpp
@@ -361,9 +361,8 @@ TEST_F(CUDA, BENCHMARK_RELAYOUT_7) {
    for (size_t r = 0; r < _dim.size(); r++)
        permutation[r] = r;
    for (int nsample = 0; nsample < 50; nsample++) {
-        std::random_shuffle(_dim.begin(), _dim.end());
-
-        std::random_shuffle(permutation.begin(), permutation.end());
+        COMPAT_RANDOM(_dim.begin(), _dim.end());
+        COMPAT_RANDOM(permutation.begin(), permutation.end());
        if (!isTrivial(permutation)) {
            run({{_dim[0], _dim[1], _dim[2], _dim[3], _dim[4], _dim[5],
                  _dim[6]},
@@ -451,9 +450,10 @@ TEST_F(CUDA, BENCHMARK_RELAYOUT_5) {
            printf("vol %d cur_ratio %lf | %lf\n", vol, cur_ratio, vol_re);
            // printVec(dim);

-            std::random_shuffle(dim.begin(), dim.end());
+            COMPAT_RANDOM(dim.begin(), dim.end());
+
            while (isTrivial(permutation)) {
-                std::random_shuffle(permutation.begin(), permutation.end());
+                COMPAT_RANDOM(permutation.begin(), permutation.end());
            }

            run({{dim[0], dim[1], dim[2], dim[3], dim[4]}, dtype::Int32()},
@@ -603,8 +603,9 @@ TEST_F(CUDA, BENCHMARK_LAST_CONTIG_ALIGN_TEST) {
    for (size_t r = 0; r < _dim.size(); r++)
        permutation[r] = r;
    for (int nsample = 0; nsample < 20; nsample++) {
-        std::random_shuffle(_dim.begin(), _dim.end() - 1);
-        std::random_shuffle(permutation.begin(), permutation.end() - 1);
+        COMPAT_RANDOM(_dim.begin(), _dim.end() - 1);
+
+        COMPAT_RANDOM(permutation.begin(), permutation.end() - 1);

        if (nsample < 5)
            _dim[5] = (u.gen_single_val() / 4 + 1) * 4;

--- a/dnn/test/cuda/sleep.cpp
+++ b/dnn/test/cuda/sleep.cpp
@@ -24,7 +24,7 @@ using namespace test;


 TEST_F(CUDA, SLEEP) {
-    auto opr = this->handle_cuda()->create_operator<Sleep>();
+    auto opr = this->handle_cuda()->create_operator<megdnn::SleepForward>();

    auto run = [&](float time) -> double {
        opr->param() = {time};

--- a/dnn/test/rocm/argmxx.cpp
+++ b/dnn/test/rocm/argmxx.cpp
@@ -24,16 +24,17 @@ class ArgmxxRNG final: public RNG {
        void gen(const TensorND &tensor) override {
            auto offset = tensor.layout.span().low_elem;
            auto nr_elems = tensor.layout.span().dist_elem();
-#define cb(DType) \
-            if (tensor.layout.dtype == DType()) { \
-                using ctype = typename DTypeTrait<DType>::ctype; \
-                auto ptr = tensor.ptr<ctype>(); \
-                for (size_t i = 0; i < nr_elems; ++i) { \
-                    ptr[offset+i] = i; \
-                } \
-                std::random_shuffle(ptr + offset, ptr + offset + nr_elems); \
-                return; \
-            }
+
+#define cb(DType)                                             \
+    if (tensor.layout.dtype == DType()) {                     \
+        using ctype = typename DTypeTrait<DType>::ctype;      \
+        auto ptr = tensor.ptr<ctype>();                       \
+        for (size_t i = 0; i < nr_elems; ++i) {               \
+            ptr[offset + i] = i;                              \
+        }                                                     \
+        COMPAT_RANDOM(ptr + offset, ptr + offset + nr_elems); \
+        return;                                               \
+    }
            MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb);
 #undef cb
            megdnn_throw(megdnn_mangle(ssprintf("Unsupported DType: %s",

--- a/imperative/CMakeLists.txt
+++ b/imperative/CMakeLists.txt
@@ -76,7 +76,11 @@ add_custom_target(_version_ld SOURCES ${VERSION_SCRIPT})

 add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/pybind11 ${PROJECT_BINARY_DIR}/third_party/pybind11)
 pybind11_add_module(${MODULE_NAME} NO_EXTRAS ${SRCS})
-target_link_libraries(${MODULE_NAME} PRIVATE gen_op_def megbrain megdnn -Wl,--version-script=${VERSION_SCRIPT})
+if (APPLE OR MSVC OR WIN32)
+    target_link_libraries(${MODULE_NAME} PRIVATE gen_op_def megbrain megdnn)
+else()
+    target_link_libraries(${MODULE_NAME} PRIVATE gen_op_def megbrain megdnn -Wl,--version-script=${VERSION_SCRIPT})
+endif()
 if (MGE_WITH_DISTRIBUTED)
    message("Imperative configured to link megray")
    target_link_libraries(${MODULE_NAME} PRIVATE megray)
@@ -91,6 +95,10 @@ set_target_properties(${MODULE_NAME} PROPERTIES
    SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}
    LIBRARY_OUTPUT_DIRECTORY ${MEGENGINE_DIR}/${PACKAGE_NAME}/core
 )
+if (APPLE OR MSVC OR WIN32)
+    message("-- overwriting SUFFIX at macos and windows before config by set_target_properties")
+    pybind11_extension(${MODULE_NAME})
+endif()
 add_dependencies(${MODULE_NAME} gen_opr_py _version_ld)

 if(MGE_WITH_TEST AND MGE_ENABLE_RTTI)

--- a/imperative/python/megengine/__init__.py
+++ b/imperative/python/megengine/__init__.py
@@ -8,6 +8,67 @@
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 import os
 import sys
+import platform
+import ctypes
+
+if sys.platform == "win32":
+    lib_path = os.path.join(os.path.dirname(__file__), "core/lib")
+    dll_paths = list(filter(os.path.exists, [lib_path,]))
+    assert len(dll_paths) > 0
+
+    kernel32 = ctypes.WinDLL("kernel32.dll", use_last_error=True)
+    has_load_library_attr = hasattr(kernel32, "AddDllDirectory")
+    old_error_mode = kernel32.SetErrorMode(0x0001)
+
+    kernel32.LoadLibraryW.restype = ctypes.c_void_p
+    if has_load_library_attr:
+        kernel32.AddDllDirectory.restype = ctypes.c_void_p
+        kernel32.LoadLibraryExW.restype = ctypes.c_void_p
+
+    for dll_path in dll_paths:
+        if sys.version_info >= (3, 8):
+            os.add_dll_directory(dll_path)
+        elif has_load_library_attr:
+            res = kernel32.AddDllDirectory(dll_path)
+            if res is None:
+                err = ctypes.WinError(ctypes.get_last_error())
+                err.strerror += ' Error adding "{}" to the DLL search PATH.'.format(
+                    dll_path
+                )
+                raise err
+        else:
+            print("WARN: python or OS env have some issue, may load DLL failed!!!")
+
+    import glob
+
+    dlls = glob.glob(os.path.join(lib_path, "*.dll"))
+    path_patched = False
+    for dll in dlls:
+        is_loaded = False
+        if has_load_library_attr:
+            res = kernel32.LoadLibraryExW(dll, None, 0x00001100)
+            last_error = ctypes.get_last_error()
+            if res is None and last_error != 126:
+                err = ctypes.WinError(last_error)
+                err.strerror += ' Error loading "{}" or one of its dependencies.'.format(
+                    dll
+                )
+                raise err
+            elif res is not None:
+                is_loaded = True
+        if not is_loaded:
+            if not path_patched:
+                os.environ["PATH"] = ";".join(dll_paths + [os.environ["PATH"]])
+                path_patched = True
+            res = kernel32.LoadLibraryW(dll)
+            if res is None:
+                err = ctypes.WinError(ctypes.get_last_error())
+                err.strerror += ' Error loading "{}" or one of its dependencies.'.format(
+                    dll
+                )
+                raise err
+
+    kernel32.SetErrorMode(old_error_mode)

 from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func
 from .device import *

--- a/imperative/python/megengine/utils/max_recursion_limit.py
+++ b/imperative/python/megengine/utils/max_recursion_limit.py
@@ -6,10 +6,14 @@
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-import resource
+import platform
 import sys
 import threading

+# The resource module is not available on Windows
+if platform.system() != "Windows":
+    import resource
+

 class AlternativeRecursionLimit:
    r"""A reentrant context manager for setting global recursion limits.
@@ -28,16 +32,26 @@ class AlternativeRecursionLimit:
        with self.lock:
            if self.count == 0:
                self.orig_py_limit = sys.getrecursionlimit()
+            # Touch the stack rlimit on the outermost entry only: a nested
+            # entry would overwrite the saved soft limit with the raised one.
+            if self.count == 0 and platform.system() != "Windows":
                (
                    self.orig_rlim_stack_soft,
                    self.orig_rlim_stack_hard,
                ) = resource.getrlimit(resource.RLIMIT_STACK)
-                resource.setrlimit(
-                    resource.RLIMIT_STACK,
-                    (self.orig_rlim_stack_hard, self.orig_rlim_stack_hard),
-                )
-                # increase recursion limit
-                sys.setrecursionlimit(self.new_py_limit)
+                # FIXME: https://bugs.python.org/issue34602, python3 release version
+                # on Macos always have this issue, not all user install python3 from src
+                try:
+                    resource.setrlimit(
+                        resource.RLIMIT_STACK,
+                        (self.orig_rlim_stack_hard, self.orig_rlim_stack_hard),
+                    )
+                except ValueError as exc:
+                    if platform.system() != "Darwin":
+                        raise exc
+
+            # increase recursion limit
+            sys.setrecursionlimit(self.new_py_limit)
            self.count += 1

    def __exit__(self, type, value, traceback):
@@ -45,10 +59,17 @@ class AlternativeRecursionLimit:
            self.count -= 1
            if self.count == 0:
                sys.setrecursionlimit(self.orig_py_limit)
-                resource.setrlimit(
-                    resource.RLIMIT_STACK,
-                    (self.orig_rlim_stack_soft, self.orig_rlim_stack_hard),
-                )
+
+            # Restore the stack rlimit only once the outermost context exits.
+            if self.count == 0 and platform.system() != "Windows":
+                try:
+                    resource.setrlimit(
+                        resource.RLIMIT_STACK,
+                        (self.orig_rlim_stack_soft, self.orig_rlim_stack_hard),
+                    )
+                except ValueError as exc:
+                    if platform.system() != "Darwin":
+                        raise exc


 _max_recursion_limit_context_manager = AlternativeRecursionLimit(2 ** 31 - 1)

--- a/imperative/python/setup.py
+++ b/imperative/python/setup.py
@@ -9,6 +9,7 @@
 import os
 import re
 import pathlib
+import platform
 from distutils.file_util import copy_file
 from setuptools import setup, find_packages, Extension
 from setuptools.command.build_ext import build_ext as _build_ext
@@ -29,7 +30,10 @@ class build_ext(_build_ext):
            extdir.parent.mkdir(parents=True, exist_ok=True)

            modpath = self.get_ext_fullname(ext.name).split('.')
-            modpath[-1] += '.so'
+            if platform.system() == 'Windows':
+                modpath[-1] += '.pyd'
+            else:
+                modpath[-1] += '.so'
            modpath = str(pathlib.Path(*modpath).resolve())

            copy_file(modpath, fullpath, verbose=self.verbose, dry_run=self.dry_run)
@@ -47,6 +51,14 @@ if local_version:
    __version__ = '{}+{}'.format(__version__, local_version)

 packages = find_packages(exclude=['test'])
+package_data = [
+    str(f.relative_to('megengine'))
+    for f in pathlib.Path('megengine', 'core', 'include').glob('**/*')
+]
+package_data += [
+    str(f.relative_to('megengine'))
+    for f in pathlib.Path('megengine', 'core', 'lib').glob('**/*')
+]

 with open('requires.txt') as f:
    requires = f.read().splitlines()
@@ -63,6 +75,9 @@ setup_kwargs = dict(
    author='Megvii Engine Team',
    author_email=email,
    packages=packages,
+    package_data={
+        'megengine': package_data,
+    },
    ext_modules=[PrecompiledExtesion('megengine.core._imperative_rt')],
    install_requires=requires,
    extras_require={

--- a/imperative/python/src/helper.cpp
+++ b/imperative/python/src/helper.cpp
@@ -9,15 +9,6 @@
 #include "megbrain/utils/mempool.h"
 #include "./numpy_dtypes.h"

-/*
- * demangle typeid, see
- * http://stackoverflow.com/questions/281818/unmangling-the-result-of-stdtype-infoname
- */
-#ifdef __GNUG__
-#include <cstdlib>
-#include <memory>
-#include <cxxabi.h>
-
 namespace py = pybind11;

 PyTaskDipatcher py_task_q = {};
@@ -34,10 +25,18 @@ py::module rel_import(py::str name, py::module m, int level) {
    return import(name, m.attr("__dict__"), py::arg("level")=level);
 }

+/*
+ * demangle typeid, see
+ * http://stackoverflow.com/questions/281818/unmangling-the-result-of-stdtype-infoname
+ */
+#ifdef __GNUG__
+#include <cxxabi.h>
+#include <cstdlib>
+#include <memory>
+
 namespace {

 std::string demangle_typeid(const char* name) {
-
    int status = -4; // some arbitrary value to eliminate the compiler warning

    // enable c++11 by passing the flag -std=c++11 to g++
@@ -48,7 +47,7 @@ std::string demangle_typeid(const char* name) {

    return (status==0) ? res.get() : name ;
 }
-}
+}  // namespace
 #else

 namespace {

--- a/imperative/python/src/utils.cpp
+++ b/imperative/python/src/utils.cpp
 #include "utils.h"
+#ifdef WIN32
+#include <stdio.h>
+#include <windows.h>
+#endif

 #include <pybind11/operators.h>
 #include <atomic>

--- a/imperative/python/test/integration/test_dp_correctness.py
+++ b/imperative/python/test/integration/test_dp_correctness.py
@@ -8,6 +8,7 @@
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 import multiprocessing as mp
 import os
+import platform
 import re
 import subprocess
 import sys
@@ -196,6 +197,9 @@ def run_test(


 @pytest.mark.isolated_distributed
+@pytest.mark.skipif(
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
+)
 def test_dp_correctness():
    model_name = "mnist_model_with_test.mge"
    model_path = os.path.join(os.path.dirname(__file__), model_name)

--- a/imperative/python/test/unit/functional/test_distributed.py
+++ b/imperative/python/test/unit/functional/test_distributed.py
@@ -35,7 +35,7 @@ from megengine.functional.distributed import (
    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
 @pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 @pytest.mark.isolated_distributed
 def test_reduce_sum():
@@ -77,7 +77,7 @@ def test_reduce_sum():
    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
 @pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 @pytest.mark.isolated_distributed
 def test_broadcast():
@@ -115,7 +115,7 @@ def test_broadcast():
    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
 @pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 @pytest.mark.isolated_distributed
 def test_all_gather():
@@ -154,7 +154,7 @@ def test_all_gather():
    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
 @pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 @pytest.mark.isolated_distributed
 def test_reduce_scatter_sum():
@@ -193,7 +193,7 @@ def test_reduce_scatter_sum():
    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
 @pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 @pytest.mark.isolated_distributed
 def test_all_reduce_sum():
@@ -232,7 +232,7 @@ def test_all_reduce_sum():
    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
 @pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 @pytest.mark.isolated_distributed
 def test_all_reduce_max():
@@ -271,7 +271,7 @@ def test_all_reduce_max():
    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
 @pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 @pytest.mark.isolated_distributed
 def test_all_reduce_min():
@@ -310,7 +310,7 @@ def test_all_reduce_min():
    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
 @pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 @pytest.mark.isolated_distributed
 def test_gather():
@@ -352,7 +352,7 @@ def test_gather():
    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
 @pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 @pytest.mark.isolated_distributed
 def test_scatter():
@@ -390,7 +390,7 @@ def test_scatter():
    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
 @pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 @pytest.mark.isolated_distributed
 def test_all_to_all():
@@ -430,7 +430,7 @@ def test_all_to_all():
    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
 @pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 @pytest.mark.isolated_distributed
 def test_io_remote():

--- a/imperative/python/test/unit/test_autodiff.py
+++ b/imperative/python/test/unit/test_autodiff.py
@@ -6,6 +6,7 @@
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import platform
 import weakref

 import numpy as np
@@ -51,6 +52,9 @@ def save_to(self, name="grad"):


 @pytest.mark.isolated_distributed
+@pytest.mark.skipif(
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
+)
 def test_dist_grad():
    world_size = 2
    x_np = np.random.rand(10).astype("float32")

--- a/imperative/src/impl/profiler.cpp
+++ b/imperative/src/impl/profiler.cpp
@@ -9,7 +9,17 @@

 #include "megbrain/imperative/profiler.h"

+#if defined(_MSC_VER) || defined(WIN32)
+#include <windows.h>
+#define getpid GetCurrentProcessId
+#else
 #include <sys/unistd.h>
+#endif
+
+#if defined(__APPLE__) || defined(__MACOSX)
+#include <unistd.h>
+#endif
+
 #include <variant>

 #include "megbrain/imperative/ops/opr_attr.h"

--- a/imperative/src/impl/proxy_graph.cpp
+++ b/imperative/src/impl/proxy_graph.cpp
@@ -16,6 +16,10 @@
 #include "megbrain/imperative/ops/opr_attr.h"
 #include "megbrain/imperative/ops/backward_graph.h"

+#if __cplusplus >= 201703L
+#include <optional>
+#endif
+
 namespace mgb {
 namespace imperative {


--- a/imperative/test/CMakeLists.txt
+++ b/imperative/test/CMakeLists.txt
@@ -38,8 +38,11 @@ if(CXX_SUPPORT_WCLASS_MEMACCESS)
 endif()

 if(UNIX)
-    target_link_libraries(imperative_test dl rt)
+    if(APPLE OR ANDROID)
+        target_link_libraries(imperative_test dl)
+    else()
+        target_link_libraries(imperative_test dl rt)
+    endif()
 endif()

-
 install(TARGETS imperative_test RUNTIME DESTINATION test)
--- a/python_module/CMakeLists.txt
+++ b/python_module/CMakeLists.txt
@@ -81,7 +81,10 @@ else()
    target_link_libraries(mgb megbrain megdnn -Wl,--version-script=${VERSION_SCRIPT})
 endif()
 target_include_directories(mgb PRIVATE ${PYTHON_INCLUDE_DIRS} src/cpp ${CMAKE_CURRENT_BINARY_DIR} ${NUMPY_INCLUDE_DIR})
-target_link_libraries(mgb ${PYTHON_LIBRARIES})
+# only windows need link PYTHON_LIBRARIES
+if(MSVC OR WIN32)
+    target_link_libraries(mgb ${PYTHON_LIBRARIES})
+endif()

 if (MGE_WITH_DISTRIBUTED)
    target_link_libraries(mgb megray)

--- a/scripts/cmake-build/BUILD_README.md
+++ b/scripts/cmake-build/BUILD_README.md
@@ -30,11 +30,17 @@
    4e: add C:\Program Files\NVIDIA GPU Computing Toolkit\cudnn-10.1-windows10-x64-v7.6.5.32\cuda\bin to system Path env
    4f: add C:\Program Files\NVIDIA GPU Computing Toolkit\TensorRT-6.0.1.5\lib Path
    if u do not do 4d/4e/4f, CUDA runtime can not find dll
+    5: install python3 (DFT 3.8.3) to /c/Users/${USER}/mge_whl_python_env/3.8.3 and
+    put it to PATH env and run python3 -m pip install numpy (if u want to build with training mode or build python whl)
+    6: install swig from install gui (if u want to build with training mode or build python whl)
+       a: download swig: https://nchc.dl.sourceforge.net/project/swig/swigwin/swigwin-4.0.2/swigwin-4.0.2.zip
+       b: install swig to /c/Users/${USER}/swigwin-4.0.2
+       c: apply scripts/whl/windows/fix-ptr-define-issue.patch to /c/Users/${USER}/swigwin-4.0.2
    ```
 ### linux host build
    ```
    1: cmake, which version > 3.14.4
-    2: gcc/g++, which version > 6
+    2: gcc/g++, which version > 6, (gcc/g++ >= 7, if need build training)
    3: install build-essential git git-lfs gfortran libgfortran-6-dev autoconf gnupg flex bison gperf curl 
    4: zlib1g-dev gcc-multilib g++-multilib lib32ncurses5-dev libxml2-utils xsltproc unzip libtool:
    5: librdmacm-dev rdmacm-utils python3-dev swig python3-numpy texinfo
@@ -47,6 +53,7 @@
    3: brew install python python3 swig coreutils
    4: install at least xcode command line tool: https://developer.apple.com/xcode/
    5: about cuda: we do not support CUDA on macos
+    6: python3 -m pip install numpy (if u want to build with training mode or build python whl)
    ```
 ### cross build for arm-android
    now we support windows/linux/macos cross build to arm-android

--- a/scripts/cmake-build/host_build.sh
+++ b/scripts/cmake-build/host_build.sh
@@ -9,6 +9,7 @@ function usage() {
    echo "-t : Build with training mode, default inference only"
    echo "-m : Build with m32 mode(only for windows build), default m64"
    echo "-r : remove old build dir before make, default off"
+    echo "-n : enable new python runtime(valid when training mode with -t, default is legacy runtime)"
    echo "-h : show usage"
    echo "append other cmake config by export EXTRA_CMAKE_ARGS=..."
    echo "example: $0 -d"
@@ -22,9 +23,10 @@ MGE_WINDOWS_BUILD_ARCH=x64
 MGE_WINDOWS_BUILD_MARCH=m64
 MGE_ARCH=x86_64
 REMOVE_OLD_BUILD=false
+MGE_BUILD_IMPERATIVE_RT=OFF
 echo "EXTRA_CMAKE_ARGS: ${EXTRA_CMAKE_ARGS}"

-while getopts "rhdctm" arg
+while getopts "rhdctmn" arg
 do
    case $arg in
        d)
@@ -48,11 +50,15 @@ do
            REMOVE_OLD_BUILD=true
            ;;
        m)
-            echo "build for m32(only use for windows)"
+            echo "build for m32(only valid use for windows)"
            MGE_WINDOWS_BUILD_ARCH=x86
            MGE_WINDOWS_BUILD_MARCH=m32
            MGE_ARCH=i386
            ;;
+        n)
+            echo "Enable imperative python wrapper runtime"
+            MGE_BUILD_IMPERATIVE_RT=ON
+            ;;
        ?)
            echo "unkonw argument"
            usage
@@ -101,6 +107,7 @@ function cmake_build() {
    cmake \
        -DCMAKE_BUILD_TYPE=$BUILD_TYPE \
        -DMGE_INFERENCE_ONLY=$MGE_INFERENCE_ONLY \
+        -DMGE_BUILD_IMPERATIVE_RT=${MGE_BUILD_IMPERATIVE_RT} \
        -DMGE_WITH_CUDA=$MGE_WITH_CUDA \
        -DCMAKE_INSTALL_PREFIX=$INSTALL_DIR \
        ${EXTRA_CMAKE_ARGS} \
@@ -112,7 +119,7 @@ function cmake_build() {

 function windows_env_err() {
    echo "check windows env failed!!"
-    echo "please install LLVM/clang-cl/cmake/python at Visual Studio Extensions"
+    echo "please install env refs for: scripts/cmake-build/BUILD_README.md"
    exit -1
 }

@@ -178,6 +185,25 @@ function prepare_env_for_windows_build() {
    export CPATH=$CPATH:$NIVIDA_INSTALL_PRE/${TRT_V}/include:$NIVIDA_INSTALL_PRE/CUDA/${CUDA_V}/include:$NIVIDA_INSTALL_PRE/CUDA/${CUDA_V}/include/nvtx3:$PC_CUDNN_INCLUDE_DIRS
    export LIBRARY_PATH=$LIBRARY_PATH:$LD_LIBRARY_PATH
    export INCLUDE=$INCLUDE:$CPATH
+
+    # python version will be config by whl build script or ci script, we need
+    # a DFT version for build success when we just call host_build.sh
+    if [[ -z ${ALREADY_CONFIG_PYTHON_VER} ]]
+    then
+        echo "config a default python3"
+        DFT_PYTHON_BIN=/c/Users/${USER}/mge_whl_python_env/3.8.3
+        if [ ! -f "${DFT_PYTHON_BIN}/python3.exe" ]; then
+            echo "ERR: can not find ${DFT_PYTHON_BIN}/python3.exe , Invalid env"
+            windows_env_err
+        else
+            echo "put python3 to env..."
+            export PATH=${DFT_PYTHON_BIN}:$PATH
+            which python3
+        fi
+    fi
+
+    echo "export swig pwd to PATH"
+    export PATH=/c/Users/${USER}/swigwin-4.0.2:$PATH  # single ':' - an empty PATH entry would mean the current directory
 }

 WINDOWS_BUILD_TARGET="Ninja all > build.log"
@@ -218,6 +244,7 @@ function cmake_build_windows() {
        vcvarsall.bat $MGE_WINDOWS_BUILD_ARCH && cmake  -G "Ninja" \
        -DMGE_ARCH=$MGE_ARCH \
        -DMGE_INFERENCE_ONLY=$MGE_INFERENCE_ONLY \
+        -DMGE_BUILD_IMPERATIVE_RT=${MGE_BUILD_IMPERATIVE_RT} \
        -DMGE_WITH_CUDA=$MGE_WITH_CUDA \
        -DCMAKE_BUILD_TYPE=$BUILD_TYPE \
        -DCMAKE_INSTALL_PREFIX:PATH=$INSTALL_DIR  \
@@ -230,8 +257,18 @@ function cmake_build_windows() {
        ${WINDOWS_BUILD_TARGET}"
 }

+if [ ${MGE_BUILD_IMPERATIVE_RT} = "ON" ] && [ ${MGE_INFERENCE_ONLY} = "ON" ]; then
+    echo "ERR: MGE_BUILD_IMPERATIVE_RT(-n) only valid when enable training mode(-t)"
+    echo "pls remove -n or add -t"
+    exit -1
+fi

 if [[ $OS =~ "NT" ]]; then
+    if [ ${MGE_ARCH} = "i386" ] && [ ${MGE_INFERENCE_ONLY} = "OFF" ]; then
+        echo "ERR: training mode(-t) only support 64 bit mode"
+        echo "pls remove -t or remove -m"
+        exit -1
+    fi
    config_windows_build_target
    cmake_build_windows $MGE_WITH_CUDA $MGE_INFERENCE_ONLY $BUILD_TYPE
 else

--- a/scripts/whl/BUILD_PYTHON_WHL_README.md
+++ b/scripts/whl/BUILD_PYTHON_WHL_README.md
@@ -53,10 +53,6 @@
       d0: /c/Users/${USER}/mge_whl_python_env/3.8.3/python3.exe -m pip install --upgrade pip
       d1: /c/Users/${USER}/mge_whl_python_env/3.8.3/python3.exe -m pip install -r python_module/requires-test.txt
       d2: /c/Users/${USER}/mge_whl_python_env/3.8.3/python3.exe -m pip install numpy wheel requests tqdm tabulate
-    5: install swig from install gui
-       a: download swig: https://nchc.dl.sourceforge.net/project/swig/swigwin/swigwin-4.0.2/swigwin-4.0.2.zip
-       b: install swig to /c/Users/${USER}/swigwin-4.0.2
-       c: apply scripts/whl/windows/fix-ptr-define-issue.patch to c/Users/${USER}/swigwin-4.0.2
    ```

 # how to build
@@ -90,6 +86,11 @@
    ```
    ALL_PYTHON=3.5.9 ./scripts/whl/macos/macos_build_whl.sh
    ```
+    If you want to build with the imperative runtime (rt), set the env variable BUILD_IMPERATIVE="ON", e.g.:
+
+    ```
+    ALL_PYTHON=3.5.9 BUILD_IMPERATIVE="ON" ./scripts/whl/macos/macos_build_whl.sh
+    ```
 ## build for windows
    ```
    ./scripts/whl/windows/windows_build_whl.sh
@@ -102,5 +103,7 @@
    If you want to build windows whl with cuda, also a specific Python verison. eg:

    ```
-    WINDOWS_WHL_WITH_CUDA="true" ALL_PYTHON=3.5.4 ./scripts/whl/windows/windows_build_whl.sh
+    WINDOWS_WHL_WITH_CUDA="ON" ALL_PYTHON=3.5.4 ./scripts/whl/windows/windows_build_whl.sh
    ```
+    If you want to build with the imperative runtime (rt), set the env variable BUILD_IMPERATIVE="ON", e.g.:
+    BUILD_IMPERATIVE="ON" WINDOWS_WHL_WITH_CUDA="ON" ALL_PYTHON=3.5.4 ./scripts/whl/windows/windows_build_whl.sh
--- a/scripts/whl/macos/macos_build_whl.sh
+++ b/scripts/whl/macos/macos_build_whl.sh
@@ -65,16 +65,18 @@ function config_python_env() {
    fi
    echo ${ver}

-    #config a dir to trick cmake find a null pythonlib
-    PYTHON_LIBRARY=${PYTHON_DIR}lib/
    if [ "$1" = "3.5.9" ]; then
        PYTHON_INCLUDE_DIR=${PYTHON_DIR}include/python3.5m
+        PYTHON_LIBRARY=${PYTHON_DIR}/lib/libpython3.5m.dylib
    elif [ "$1" = "3.6.10" ]; then
        PYTHON_INCLUDE_DIR=${PYTHON_DIR}include/python3.6m
+        PYTHON_LIBRARY=${PYTHON_DIR}/lib/libpython3.6m.dylib
    elif [ "$1" = "3.7.7" ]; then
        PYTHON_INCLUDE_DIR=${PYTHON_DIR}include/python3.7m
+        PYTHON_LIBRARY=${PYTHON_DIR}/lib/libpython3.7m.dylib
    elif [ "$1" = "3.8.3" ]; then
        PYTHON_INCLUDE_DIR=${PYTHON_DIR}include/python3.8
+        PYTHON_LIBRARY=${PYTHON_DIR}/lib/libpython3.8.dylib
    else
        echo "ERR: DO NOT SUPPORT PYTHON VERSION"
        echo "now support list: ${FULL_PYTHON_VER}"
@@ -82,6 +84,11 @@ function config_python_env() {
    fi
 }

+if [[ -z ${BUILD_IMPERATIVE} ]]
+then
+    BUILD_IMPERATIVE="OFF"
+fi
+
 function do_build() {
    for ver in ${ALL_PYTHON}
    do
@@ -89,7 +96,7 @@ function do_build() {
        config_python_env ${ver}

        #check env
-        if [ ! -d "$PYTHON_LIBRARY" ]; then
+        if [ ! -f "$PYTHON_LIBRARY" ]; then
            echo "ERR: can not find $PYTHON_LIBRARY , Invalid python package"
            err_env
        fi
@@ -102,14 +109,20 @@ function do_build() {
        #append cmake args for config python
        export EXTRA_CMAKE_ARGS="-DCMAKE_PREFIX_PATH=${PYTHON_DIR} -DPYTHON_LIBRARY=${PYTHON_LIBRARY} -DPYTHON_INCLUDE_DIR=${PYTHON_INCLUDE_DIR} "
        #config build type to RelWithDebInfo to enable MGB_ENABLE_DEBUG_UTIL etc
-        export EXTRA_CMAKE_ARGS=${EXTRA_CMAKE_ARGS}" -DCMAKE_BUILD_TYPE=RelWithDebInfo "
+        export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCMAKE_BUILD_TYPE=RelWithDebInfo "

        #call build and install
        #FIXME: cmake do not triger update python config, after
        #change PYTHON_LIBRARY and PYTHON_INCLUDE_DIR, so add
        #-r to remove build cache after a new ver build, which
        #will be more slow build than without -r
-        ${SRC_DIR}/scripts/cmake-build/host_build.sh -t -r
+        if [ ${BUILD_IMPERATIVE} = "ON" ]; then
+            echo "build whl with IMPERATIVE python rt"
+            ${SRC_DIR}/scripts/cmake-build/host_build.sh -t -n -r
+        else
+            echo "build whl with legacy python rt"
+            ${SRC_DIR}/scripts/cmake-build/host_build.sh -t -r
+        fi

        #call setup.py
        BUILD_DIR=${SRC_DIR}/build_dir/host/MGE_WITH_CUDA_OFF/MGE_INFERENCE_ONLY_OFF/Release/build/
@@ -121,12 +134,47 @@ function do_build() {
        fi
        mkdir -p staging

+        if [ ${BUILD_IMPERATIVE} = "ON" ]; then
+            echo "build whl with IMPERATIVE python rt"
+            cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
+            cd ${BUILD_DIR}/staging/megengine/core
+            rt_file=`ls _imperative_rt.*.so`
+            echo "rt file is: ${rt_file}"
+            if [[ -z ${rt_file} ]]
+            then
+                echo "ERR: can not find valid rt file"
+                exit -1
+            fi
+            llvm-strip -s ${rt_file}
+            mv ${rt_file} _imperative_rt.so
+            echo "check so valid or not..."
+            otool_out=`otool -L _imperative_rt.so`
+            if [[ "${otool_out}" =~ "ython" ]]; then
+                echo "ERR: invalid _imperative_rt.so which depend on python lib, detail: log"
+                echo ${otool_out}
+                exit -1
+            else
+                echo "valid..."
+            fi
+        else
+            echo "build whl with legacy python rt"
+
+            cp -a python_module/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
+            cd ${BUILD_DIR}/staging/megengine/_internal
+            #FIXME: set lib suffix to dylib may be better, BUT we find after distutils.file_util.copy_file
+            #will change to .so at macos even we set suffix to dylib, at the same time, macos also support .so
+            echo "check so valid or not..."
+            llvm-strip -s _mgb.so
+            otool_out=`otool -L _mgb.so`
+            if [[ "${otool_out}" =~ "ython" ]]; then
+                echo "ERR: invalid _mgb.so which depend on python lib, detail: log"
+                echo ${otool_out}
+                exit -1
+            else
+                echo "valid..."
+            fi
+        fi

-        cp -a python_module/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
-        cd ${BUILD_DIR}/staging/megengine/_internal
-        #FIXME: set lib suffix to dylib may be better, BUT we find after distutils.file_util.copy_file
-        #will change to .so at macos even we set suffix to dylib, at the same time, macos also support .so
-        llvm-strip -s _mgb.so
        cd ${BUILD_DIR}/staging
        ${PYTHON_DIR}/bin/python3 setup.py bdist_wheel
        cd ${BUILD_DIR}/staging/dist/

--- a/scripts/whl/windows/windows_build_whl.sh
+++ b/scripts/whl/windows/windows_build_whl.sh
@@ -14,8 +14,6 @@ function err_env() {
 }

 function append_path_env_and_check() {
-    echo "export swig pwd to PATH"
-    export PATH=/c/Users/${USER}/swigwin-4.0.2::$PATH
    echo  "export vs2019 install path"
    export VS_PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2019/Enterprise
    # for llvm-strip
@@ -62,7 +60,7 @@ function config_python_env() {

 if [[ -z ${WINDOWS_WHL_WITH_CUDA} ]]
 then
-    WINDOWS_WHL_WITH_CUDA="false"
+    WINDOWS_WHL_WITH_CUDA="OFF"
 fi


@@ -74,26 +72,46 @@ CUBLAS_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/cublas6
 CURAND_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/curand64_10.dll"
 CUBLASLT_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/cublasLt64_10.dll"
 CUDART_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/cudart64_101.dll"
+function depend_real_copy() {  # copy the seven dependency libs named below into the dir passed as $1
+    REAL_DST=$1  # destination dir; not declared local, so it stays set after the call
+    echo "real copy lib to $1"
+    cp "${TRT_LIB}" ${REAL_DST}  # NOTE(review): TRT/CUDNN/CUSOLVER paths are defined outside this hunk - confirm
+    cp "${CUDNN_LIB}" ${REAL_DST}
+    cp "${CUSOLVER_LIB}" ${REAL_DST}
+    cp "${CUBLAS_LIB}" ${REAL_DST}  # source paths are quoted because they contain spaces ("Program Files")
+    cp "${CURAND_LIB}" ${REAL_DST}
+    cp "${CUBLASLT_LIB}" ${REAL_DST}
+    cp "${CUDART_LIB}" ${REAL_DST}  # REAL_DST itself is unquoted, so it must not contain spaces
+}
+
 function copy_more_dll() {
    # for python whl real use
-    CP_DST=${BUILD_DIR}/staging/megengine/_internal/lib
-    rm -rf ${CP_DST}
-    mkdir ${CP_DST}
+    if [ ${BUILD_IMPERATIVE} = "ON" ]; then
+        echo "config BUILD_IMPERATIVE core lib dir"
+        CP_WHL_DST=${BUILD_DIR}/staging/megengine/core/lib
+    else
+        echo "config legacy python lib dir"
+        CP_WHL_DST=${BUILD_DIR}/staging/megengine/_internal/lib
+    fi
+    rm -rf ${CP_WHL_DST}
+    mkdir ${CP_WHL_DST}
+    # workaround for the cpu-only version failing to import: use an
+    # empty.file to trigger setup.py to create the otherwise-empty lib dir
+    echo "empty" > ${CP_WHL_DST}/empty.file


-    if [ ${WINDOWS_WHL_WITH_CUDA} = "true" ]; then
+    if [ ${WINDOWS_WHL_WITH_CUDA} = "ON" ]; then
        echo "copy nvidia lib to whl use...."
-        cp "${TRT_LIB}" ${CP_DST}
-        cp "${CUDNN_LIB}" ${CP_DST}
-        cp "${CUSOLVER_LIB}" ${CP_DST}
-        cp "${CUBLAS_LIB}" ${CP_DST}
-        cp "${CURAND_LIB}" ${CP_DST}
-        cp "${CUBLASLT_LIB}" ${CP_DST}
-        cp "${CUDART_LIB}" ${CP_DST}
+        depend_real_copy ${CP_WHL_DST}

    fi
 }

+if [[ -z ${BUILD_IMPERATIVE} ]]
+then
+    BUILD_IMPERATIVE="OFF"
+fi
+
 function do_build() {
    for ver in ${ALL_PYTHON}
    do
@@ -118,21 +136,31 @@ function do_build() {
        #force LINK a real PYTHON_LIBRARY file, after test we do not find the symbols conflict with python
        #export EXTRA_CMAKE_ARGS="-DPYTHON_LIBRARY=${PYTHON_LIBRARY} -DPYTHON_INCLUDE_DIR=${PYTHON_INCLUDE_DIR} "
        #config build type to RelWithDebInfo to enable MGB_ENABLE_DEBUG_UTIL etc
-        export EXTRA_CMAKE_ARGS=${EXTRA_CMAKE_ARGS}" -DCMAKE_BUILD_TYPE=RelWithDebInfo "
+        export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCMAKE_BUILD_TYPE=RelWithDebInfo "

        #call build and install
        #FIXME: cmake do not triger update python config, after
        #change PYTHON_LIBRARY and PYTHON_INCLUDE_DIR, so add
        #-r to remove build cache after a new ver build, which
        #will be more slow build than without -r
-        if [ ${WINDOWS_WHL_WITH_CUDA} = "true" ]; then
+        BUILD_ARGS=" -t -r"
+        if [ ${BUILD_IMPERATIVE} = "ON" ]; then
+            echo "build whl with IMPERATIVE python rt"
+            BUILD_ARGS="${BUILD_ARGS} -n "
+        else
+            echo "build whl with legacy python rt"
+        fi
+
+        if [ ${WINDOWS_WHL_WITH_CUDA} = "ON" ]; then
            echo "build windows whl with cuda"
-            ${SRC_DIR}/scripts/cmake-build/host_build.sh -t -r -c
+            BUILD_ARGS="${BUILD_ARGS} -c "
        else
            echo "build windows whl with cpu only"
-            ${SRC_DIR}/scripts/cmake-build/host_build.sh -t -r
        fi

+        echo "host_build.sh BUILD_ARGS: ${BUILD_ARGS}"
+        ${SRC_DIR}/scripts/cmake-build/host_build.sh ${BUILD_ARGS}
+
        #call setup.py
        BUILD_DIR=${SRC_DIR}/build_dir/host/build/
        cd ${BUILD_DIR}
@@ -143,10 +171,27 @@ function do_build() {
        fi
        mkdir -p staging

+        if [ ${BUILD_IMPERATIVE} = "ON" ]; then
+            echo "build whl with IMPERATIVE python rt"
+            cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
+            cd ${BUILD_DIR}/staging/megengine/core
+            rt_file=`ls _imperative_rt.*.pyd`
+            echo "rt file is: ${rt_file}"
+            if [[ -z ${rt_file} ]]
+            then
+                echo "ERR: can not find valid rt file"
+                exit -1
+            fi
+            llvm-strip -s ${rt_file}
+            mv ${rt_file} _imperative_rt.pyd
+        else
+            echo "build whl with legacy python rt"
+
+            cp -a python_module/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
+            cd ${BUILD_DIR}/staging/megengine/_internal
+            llvm-strip -s _mgb.pyd
+        fi

-        cp -a python_module/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
-        cd ${BUILD_DIR}/staging/megengine/_internal
-        llvm-strip -s _mgb.pyd
        copy_more_dll
        cd ${BUILD_DIR}/staging
        ${PYTHON_DIR}/python3 setup.py bdist_wheel
@@ -175,5 +220,6 @@ function third_party_prepare() {
 }

 ######################
+export ALREADY_CONFIG_PYTHON_VER="yes"
 third_party_prepare
 do_build
--- a/src/core/impl/graph/seq_sublinear_memory.cpp
+++ b/src/core/impl/graph/seq_sublinear_memory.cpp
@@ -33,6 +33,11 @@ class RNGxorshf {
    uint64_t s[2];

 public:
+#if __cplusplus >= 201703L
+    typedef uint64_t result_type;
+    static constexpr uint64_t min() { return 0; }
+    static constexpr uint64_t max() { return UINT64_MAX; }
+#endif
    RNGxorshf(uint64_t seed) {
        std::mt19937_64 gen(seed);
        s[0] = gen();
@@ -936,8 +941,12 @@ void SeqModifierForSublinearMemory::ActionSearcherSingleCN::search_genetic() {
            }
        }
        m_cur_records = records;
+#if __cplusplus >= 201703L
+        std::shuffle(perm.begin(), perm.end(), rng);
+#else
        std::random_shuffle(perm.begin(), perm.end(),
                            [&](size_t x) { return rng() % x; });
+#endif
        for (size_t i = 0; i < length; ++i) {
            invoke_search(mutation(mutation(records[i].first)));
            invoke_search(crossover(records[i].first, records[perm[i]].first));

--- a/src/opr/test/blas.cpp
+++ b/src/opr/test/blas.cpp
@@ -705,7 +705,12 @@ TEST(TestOprBlas, MatrixInverse) {
        }
        auto ptr = inp[0]->ptr<float>();
        for (size_t i = 0; i < batch; ++i, ptr += n * n) {
+#if __cplusplus >= 201703L
+            std::default_random_engine rng_engine;
+            std::shuffle(perm.begin(), perm.end(), rng_engine);
+#else
            std::random_shuffle(perm.begin(), perm.end());
+#endif
            for (size_t j = 0; j < n; ++j) {
                ptr[j * n + perm[j]] += 5;
            }

--- a/src/opr/test/muxing.cpp
+++ b/src/opr/test/muxing.cpp
@@ -36,7 +36,12 @@ void run_all_gather(const std::vector<size_t>& axis_size, bool& success,
        sleep_time.push_back(i * 0.05 + 0.1);
        tot_axis_size += axis_size[i];
    }
+#if __cplusplus >= 201703L
+    std::default_random_engine rng_engine;
+    std::shuffle(sleep_time.begin(), sleep_time.end(), rng_engine);
+#else
    std::random_shuffle(sleep_time.begin(), sleep_time.end());
+#endif

    auto constexpr DEVICE_TYPE = CompNode::DeviceType::CUDA;
    size_t nr_dev = std::min<size_t>(

--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -18,7 +18,11 @@ endif()

 add_executable(megbrain_test ${SOURCES})
 target_link_libraries(megbrain_test gtest)
-target_link_libraries(megbrain_test megengine)
+if(MSVC OR WIN32)
+    target_link_libraries(megbrain_test megbrain megdnn)
+else()
+    target_link_libraries(megbrain_test megengine)
+endif()
 if(CXX_SUPPORT_WCLASS_MEMACCESS)
    if(MGE_WITH_CUDA)
        target_compile_options(megbrain_test PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-Wno-class-memaccess>"
@@ -28,10 +32,12 @@ if(CXX_SUPPORT_WCLASS_MEMACCESS)
    endif()
 endif()

-if(APPLE OR ANDROID)
-    target_link_libraries(megbrain_test dl)
-else()
-    target_link_libraries(megbrain_test dl rt)
+if(UNIX)
+    if(APPLE OR ANDROID)
+        target_link_libraries(megbrain_test dl)
+    else()
+        target_link_libraries(megbrain_test dl rt)
+    endif()
 endif()

 if (MGE_WITH_DISTRIBUTED)