diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1c4ef52e64df2106fdcc137b5d9e7d2ad62e0b8e..833bd064bd7ebde0a33e4446dfdbe51364756cff 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -697,8 +697,10 @@ endif()
 if(MGE_WITH_PYTHON_MODULE)
     if(MGE_BUILD_IMPERATIVE_RT)
         add_subdirectory(imperative)
+        message("-- Enable imperative python wrapper runtime")
     else()
         add_subdirectory(python_module)
+        message("-- Enable legacy python wrapper runtime")
     endif()
 endif()
 
diff --git a/dnn/src/common/utils.h b/dnn/src/common/utils.h
index 449c9b04eff77e2d2f286a42e395a5f6df9e633c..66807c964afbc814ed5c6f72f94d71699004bb19 100644
--- a/dnn/src/common/utils.h
+++ b/dnn/src/common/utils.h
@@ -342,7 +342,11 @@ template <typename T>
 struct SafeMultiplies;
 
 template <typename T>
+#if __cplusplus >= 201703L
+struct _SafeMultipliesImplUnsigned {
+#else
 struct _SafeMultipliesImplUnsigned : public std::binary_function<T, T, T> {
+#endif
     static MEGDNN_CONSTEXPR size_t nbits = sizeof(T) * 8;
 
     static size_t clz(unsigned x) {
diff --git a/dnn/test/CMakeLists.txt b/dnn/test/CMakeLists.txt
index b37be5d9a1aa3df0919be1795693424a5681eca3..5d24a6131891a04cba20b56db7ce2be4ce3b1e37 100644
--- a/dnn/test/CMakeLists.txt
+++ b/dnn/test/CMakeLists.txt
@@ -70,8 +70,10 @@ if (MEG_WITH_ROCM)
     target_link_libraries (megdnn_test ${MGE_ROCM_LIBS})
 endif ()
 
-if(APPLE OR ANDROID)
-    target_link_libraries(megdnn_test dl)
-else()
-    target_link_libraries(megdnn_test dl rt)
+if(UNIX)
+    if(APPLE OR ANDROID)
+        target_link_libraries(megdnn_test dl)
+    else()
+        target_link_libraries(megdnn_test dl rt)
+    endif()
 endif()
diff --git a/dnn/test/common/mesh_indexing.h b/dnn/test/common/mesh_indexing.h
index 27612212cd01c190eb4cb1e15a9be1f2f7996aca..c3db52233fdbe2835481af5dd8fbe195a2501e1b 100644
--- a/dnn/test/common/mesh_indexing.h
+++ b/dnn/test/common/mesh_indexing.h
@@ -89,7 +89,7 @@ public:
         auto ptr = tensor.ptr<int>();
         for (size_t n = 0; n < size; ++n) {
             std::set<int> used;
-            std::random_shuffle(seq.begin(), seq.end());
+            COMPAT_RANDOM(seq.begin(), seq.end());
             for (size_t step = 0; step < stride; ++step) {
                 megdnn_assert(used.size() < m_size);
                 ptr[n * stride + step] = seq[step];
diff --git a/dnn/test/common/rng.cpp b/dnn/test/common/rng.cpp
index 4f7200bb60b9d0b88b5ebd79c2c8454eb43ded10..10c7f213a169f411e230016c7acc9c8293ecfc48 100644
--- a/dnn/test/common/rng.cpp
+++ b/dnn/test/common/rng.cpp
@@ -75,7 +75,7 @@ Float16PeriodicalRNG::Float16PeriodicalRNG() : m_offset(0) {
         i2f.i = static_cast<uint16_t>(x);
         m_sequence.push_back(i2f.f);
     }
-    std::random_shuffle(m_sequence.begin(), m_sequence.end());
+    COMPAT_RANDOM(m_sequence.begin(), m_sequence.end());
 }
 
 Float16PeriodicalRNG::Float16PeriodicalRNG(size_t range) : m_offset(0) {
@@ -99,7 +99,7 @@ Float16PeriodicalRNG::Float16PeriodicalRNG(size_t range) : m_offset(0) {
         m_sequence.push_back(i2f.f);
     }
 
-    std::random_shuffle(m_sequence.begin(), m_sequence.end());
+    COMPAT_RANDOM(m_sequence.begin(), m_sequence.end());
 }
 
 void Float16PeriodicalRNG::gen(const TensorND& tensor) {
diff --git a/dnn/test/common/rng.h b/dnn/test/common/rng.h
index 7af67117573ae87f6e3d304ccfa19f6ff17690c2..f20c0e5f4604bd4ac0f4ec2d4b9009c6061e962d 100644
--- a/dnn/test/common/rng.h
+++ b/dnn/test/common/rng.h
@@ -19,6 +19,25 @@
 namespace megdnn {
 namespace test {
 
+/*!
+ * Shuffle [begin, end) in place. std::random_shuffle was removed in C++17, so
+ * std::shuffle is used there instead.
+ *
+ * The engine is static so that successive calls from one call site yield
+ * different permutations; a freshly default-constructed engine would replay
+ * the same sequence on every call. do/while(0) makes the macro a single
+ * statement, and both branches require the caller to write the trailing
+ * semicolon.
+ */
+#if __cplusplus >= 201703L
+#define COMPAT_RANDOM(begin, end)                     \
+    do {                                              \
+        static std::default_random_engine rng_engine; \
+        std::shuffle(begin, end, rng_engine);         \
+    } while (0)
+#else
+#define COMPAT_RANDOM(begin, end) std::random_shuffle(begin, end)
+#endif
+
 class RNG {
 protected:
     class RNGxorshf;
diff --git a/dnn/test/cuda/argmxx.cpp b/dnn/test/cuda/argmxx.cpp
index e90333ad98830bc99b4464b0dbbd23ba9e75babc..89e6cadb81a5621136ec9f8fc511313b7918b552 100644
--- a/dnn/test/cuda/argmxx.cpp
+++ b/dnn/test/cuda/argmxx.cpp
@@ -24,15 +24,16 @@ class ArgmxxRNG final: public RNG {
         void gen(const TensorND &tensor) override {
             auto offset = tensor.layout.span().low_elem;
             auto nr_elems = tensor.layout.span().dist_elem();
-#define cb(DType) \
-            if (tensor.layout.dtype == DType()) { \
-                using ctype = typename DTypeTrait<DType>::ctype; \
-                auto ptr = tensor.ptr<ctype>(); \
-                for (size_t i = 0; i < nr_elems; ++i) { \
-                    ptr[offset+i] = i; \
-                } \
-                std::random_shuffle(ptr + offset, ptr + offset + nr_elems); \
-            }
+
+#define cb(DType)                                             \
+    if (tensor.layout.dtype == DType()) {                     \
+        using ctype = typename DTypeTrait<DType>::ctype;      \
+        auto ptr = tensor.ptr<ctype>();                       \
+        for (size_t i = 0; i < nr_elems; ++i) {               \
+            ptr[offset + i] = i;                              \
+        }                                                     \
+        COMPAT_RANDOM(ptr + offset, ptr + offset + nr_elems); \
+    }
             MEGDNN_FOREACH_COMPUTING_DTYPE(cb);
 #undef cb
         }
diff --git a/dnn/test/cuda/argsort.cpp b/dnn/test/cuda/argsort.cpp
index 7c1f57524e94e52c06c2ad17a5726d58822b505a..b8779f72f4a125f1568320d9bc3377a7c266b654 100644
--- a/dnn/test/cuda/argsort.cpp
+++ b/dnn/test/cuda/argsort.cpp
@@ -32,7 +32,7 @@ class ArgsortRNG final : public RNG {
         } else {
             for (int i = 0; i < n; ++i)
                 ptr[i] = static_cast<T>(i - n / 2);
-            std::random_shuffle(ptr, ptr + n);
+            COMPAT_RANDOM(ptr, ptr + n);
         }
     }
 
@@ -86,7 +86,7 @@ void run_backward_test(Handle* handle, DType dtype) {
                 for (size_t j = 0; j < n; ++j) {
                     ptr[j] = j;
                 }
-                std::random_shuffle(ptr, ptr + n);
+                COMPAT_RANDOM(ptr, ptr + n);
                 ptr += n;
             }
         }
diff --git a/dnn/test/cuda/relayout.cpp b/dnn/test/cuda/relayout.cpp
index 24d1aebfc534ba3f2d3cc25b54de54572ecef00a..a5fdc4f906b1f9e4cd64bf80e54d1c56b2abf71b 100644
--- a/dnn/test/cuda/relayout.cpp
+++ b/dnn/test/cuda/relayout.cpp
@@ -361,9 +361,8 @@ TEST_F(CUDA, BENCHMARK_RELAYOUT_7) {
     for (size_t r = 0; r < _dim.size(); r++)
         permutation[r] = r;
     for (int nsample = 0; nsample < 50; nsample++) {
-        std::random_shuffle(_dim.begin(), _dim.end());
-
-        std::random_shuffle(permutation.begin(), permutation.end());
+        COMPAT_RANDOM(_dim.begin(), _dim.end());
+        COMPAT_RANDOM(permutation.begin(), permutation.end());
         if (!isTrivial(permutation)) {
             run({{_dim[0], _dim[1], _dim[2], _dim[3], _dim[4], _dim[5],
                   _dim[6]},
@@ -451,9 +450,10 @@ TEST_F(CUDA, BENCHMARK_RELAYOUT_5) {
             printf("vol %d cur_ratio %lf | %lf\n", vol, cur_ratio, vol_re);
             // printVec(dim);
 
-            std::random_shuffle(dim.begin(), dim.end());
+            COMPAT_RANDOM(dim.begin(), dim.end());
+
             while (isTrivial(permutation)) {
-                std::random_shuffle(permutation.begin(), permutation.end());
+                COMPAT_RANDOM(permutation.begin(), permutation.end());
             }
 
             run({{dim[0], dim[1], dim[2], dim[3], dim[4]}, dtype::Int32()},
@@ -603,8 +603,9 @@ TEST_F(CUDA, BENCHMARK_LAST_CONTIG_ALIGN_TEST) {
     for (size_t r = 0; r < _dim.size(); r++)
         permutation[r] = r;
     for (int nsample = 0; nsample < 20; nsample++) {
-        std::random_shuffle(_dim.begin(), _dim.end() - 1);
-        std::random_shuffle(permutation.begin(), permutation.end() - 1);
+        COMPAT_RANDOM(_dim.begin(), _dim.end() - 1);
+
+        COMPAT_RANDOM(permutation.begin(), permutation.end() - 1);
 
         if (nsample < 5)
             _dim[5] = (u.gen_single_val() / 4 + 1) * 4;
diff --git a/dnn/test/cuda/sleep.cpp b/dnn/test/cuda/sleep.cpp
index 5395b04276fb57c315f565873d68a9041a3f96ad..52d2a05ef8f1199c9d484dbe0888f1162260e128 100644
--- a/dnn/test/cuda/sleep.cpp
+++ b/dnn/test/cuda/sleep.cpp
@@ -24,7 +24,7 @@ using namespace test;
 
 
 TEST_F(CUDA, SLEEP) {
-    auto opr = this->handle_cuda()->create_operator<Sleep>();
+    auto opr = this->handle_cuda()->create_operator<megdnn::SleepForward>();
 
     auto run = [&](float time) -> double {
         opr->param() = {time};
diff --git a/dnn/test/rocm/argmxx.cpp b/dnn/test/rocm/argmxx.cpp
index f94b259e854ff3935201a111cd1d755425b1dcde..1fada4e58c7b3ef4e9b53fe6529fb81321a4e9dc 100644
--- a/dnn/test/rocm/argmxx.cpp
+++ b/dnn/test/rocm/argmxx.cpp
@@ -24,16 +24,17 @@ class ArgmxxRNG final: public RNG {
         void gen(const TensorND &tensor) override {
             auto offset = tensor.layout.span().low_elem;
             auto nr_elems = tensor.layout.span().dist_elem();
-#define cb(DType) \
-            if (tensor.layout.dtype == DType()) { \
-                using ctype = typename DTypeTrait<DType>::ctype; \
-                auto ptr = tensor.ptr<ctype>(); \
-                for (size_t i = 0; i < nr_elems; ++i) { \
-                    ptr[offset+i] = i; \
-                } \
-                std::random_shuffle(ptr + offset, ptr + offset + nr_elems); \
-                return; \
-            }
+
+#define cb(DType)                                             \
+    if (tensor.layout.dtype == DType()) {                     \
+        using ctype = typename DTypeTrait<DType>::ctype;      \
+        auto ptr = tensor.ptr<ctype>();                       \
+        for (size_t i = 0; i < nr_elems; ++i) {               \
+            ptr[offset + i] = i;                              \
+        }                                                     \
+        COMPAT_RANDOM(ptr + offset, ptr + offset + nr_elems); \
+        return;                                               \
+    }
             MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb);
 #undef cb
             megdnn_throw(megdnn_mangle(ssprintf("Unsupported DType: %s",
diff --git a/imperative/CMakeLists.txt b/imperative/CMakeLists.txt
index 55a97a20f5ea2b2b92e33bb92ca87a31d76230a5..3bbdeffd0befd82baa263a72ec689eb90cfd8f53 100644
--- a/imperative/CMakeLists.txt
+++ b/imperative/CMakeLists.txt
@@ -76,7 +76,11 @@ add_custom_target(_version_ld SOURCES ${VERSION_SCRIPT})
 
 add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/pybind11 ${PROJECT_BINARY_DIR}/third_party/pybind11)
 pybind11_add_module(${MODULE_NAME} NO_EXTRAS ${SRCS})
-target_link_libraries(${MODULE_NAME} PRIVATE gen_op_def megbrain megdnn -Wl,--version-script=${VERSION_SCRIPT})
+if (APPLE OR MSVC OR WIN32)
+    target_link_libraries(${MODULE_NAME} PRIVATE gen_op_def megbrain megdnn)
+else()
+    target_link_libraries(${MODULE_NAME} PRIVATE gen_op_def megbrain megdnn -Wl,--version-script=${VERSION_SCRIPT})
+endif()
 if (MGE_WITH_DISTRIBUTED)
     message("Imperative configured to link megray")
     target_link_libraries(${MODULE_NAME} PRIVATE megray)
@@ -91,6 +95,10 @@ set_target_properties(${MODULE_NAME} PROPERTIES
     SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}
     LIBRARY_OUTPUT_DIRECTORY ${MEGENGINE_DIR}/${PACKAGE_NAME}/core
 )
+if (APPLE OR MSVC OR WIN32)
+    message("-- overwriting SUFFIX at macos and windows before config by set_target_properties")
+    pybind11_extension(${MODULE_NAME})
+endif()
 add_dependencies(${MODULE_NAME} gen_opr_py _version_ld)
 
 if(MGE_WITH_TEST AND MGE_ENABLE_RTTI)
diff --git a/imperative/python/megengine/__init__.py b/imperative/python/megengine/__init__.py
index f27cdc7270dfb0dd99f640611906e4b0d7a03757..ef9b67d604df795a4a7580fe76b1f31eccebb419 100644
--- a/imperative/python/megengine/__init__.py
+++ b/imperative/python/megengine/__init__.py
@@ -8,6 +8,91 @@
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 import os
 import sys
+import platform
+import ctypes
+
+
+def _preload_windows_dlls():
+    """Register ``core/lib`` as a DLL search directory and load every DLL in it.
+
+    Called once at import time on Windows, before the ``_imperative_rt``
+    extension module is imported below.
+
+    Raises:
+        OSError: if a directory cannot be added to the DLL search path or a
+            DLL fails to load.
+    """
+    import glob
+
+    lib_path = os.path.join(os.path.dirname(__file__), "core/lib")
+    dll_paths = list(filter(os.path.exists, [lib_path,]))
+    assert len(dll_paths) > 0, "bundled DLL directory not found: " + lib_path
+
+    kernel32 = ctypes.WinDLL("kernel32.dll", use_last_error=True)
+    has_load_library_attr = hasattr(kernel32, "AddDllDirectory")
+    # 0x0001 is SEM_FAILCRITICALERRORS: report load failures to the caller
+    # instead of popping up a system error dialog
+    old_error_mode = kernel32.SetErrorMode(0x0001)
+
+    try:
+        # the returned handles are pointer sized; c_void_p also maps NULL to
+        # None, which the failure checks below rely on
+        kernel32.LoadLibraryW.restype = ctypes.c_void_p
+        if has_load_library_attr:
+            kernel32.AddDllDirectory.restype = ctypes.c_void_p
+            kernel32.LoadLibraryExW.restype = ctypes.c_void_p
+
+        for dll_path in dll_paths:
+            if sys.version_info >= (3, 8):
+                os.add_dll_directory(dll_path)
+            elif has_load_library_attr:
+                res = kernel32.AddDllDirectory(dll_path)
+                if res is None:
+                    err = ctypes.WinError(ctypes.get_last_error())
+                    err.strerror += ' Error adding "{}" to the DLL search PATH.'.format(
+                        dll_path
+                    )
+                    raise err
+            else:
+                print("WARN: python or OS env have some issue, may load DLL failed!!!")
+
+        dlls = glob.glob(os.path.join(lib_path, "*.dll"))
+        path_patched = False
+        for dll in dlls:
+            is_loaded = False
+            if has_load_library_attr:
+                # 0x00001100 = LOAD_LIBRARY_SEARCH_DEFAULT_DIRS
+                #              | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR
+                res = kernel32.LoadLibraryExW(dll, None, 0x00001100)
+                last_error = ctypes.get_last_error()
+                # 126 is ERROR_MOD_NOT_FOUND: fall through to the PATH based
+                # lookup below instead of failing right away
+                if res is None and last_error != 126:
+                    err = ctypes.WinError(last_error)
+                    err.strerror += ' Error loading "{}" or one of its dependencies.'.format(
+                        dll
+                    )
+                    raise err
+                elif res is not None:
+                    is_loaded = True
+            if not is_loaded:
+                if not path_patched:
+                    os.environ["PATH"] = ";".join(dll_paths + [os.environ["PATH"]])
+                    path_patched = True
+                res = kernel32.LoadLibraryW(dll)
+                if res is None:
+                    err = ctypes.WinError(ctypes.get_last_error())
+                    err.strerror += ' Error loading "{}" or one of its dependencies.'.format(
+                        dll
+                    )
+                    raise err
+    finally:
+        # restore the process error mode even when loading raised
+        kernel32.SetErrorMode(old_error_mode)
+
+
+if sys.platform == "win32":
+    _preload_windows_dlls()
 
 from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func
 from .device import *
diff --git a/imperative/python/megengine/utils/max_recursion_limit.py b/imperative/python/megengine/utils/max_recursion_limit.py
index 0870b7fa0e48bff3bc53aa98d2206ae81b1d2aaa..d7bce6e8b493d2c0e89948d521c905209665a2e8 100644
--- a/imperative/python/megengine/utils/max_recursion_limit.py
+++ b/imperative/python/megengine/utils/max_recursion_limit.py
@@ -6,10 +6,14 @@
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-import resource
+import platform
 import sys
 import threading
 
+# Windows does not provide the resource module
+if platform.system() != "Windows":
+    import resource
+
 
 class AlternativeRecursionLimit:
     r"""A reentrant context manager for setting global recursion limits.
@@ -28,16 +32,28 @@ class AlternativeRecursionLimit:
         with self.lock:
             if self.count == 0:
                 self.orig_py_limit = sys.getrecursionlimit()
-                (
-                    self.orig_rlim_stack_soft,
-                    self.orig_rlim_stack_hard,
-                ) = resource.getrlimit(resource.RLIMIT_STACK)
-                resource.setrlimit(
-                    resource.RLIMIT_STACK,
-                    (self.orig_rlim_stack_hard, self.orig_rlim_stack_hard),
-                )
-                # increase recursion limit
-                sys.setrecursionlimit(self.new_py_limit)
+                # only the outermost entry saves and raises the limits, so a
+                # nested entry cannot overwrite the saved originals; the
+                # resource module is not imported on Windows
+                if platform.system() != "Windows":
+                    (
+                        self.orig_rlim_stack_soft,
+                        self.orig_rlim_stack_hard,
+                    ) = resource.getrlimit(resource.RLIMIT_STACK)
+                    # FIXME: https://bugs.python.org/issue34602, release builds of
+                    # python3 on macOS always hit this issue and not every user
+                    # builds python3 from source, so the error is ignored there
+                    try:
+                        resource.setrlimit(
+                            resource.RLIMIT_STACK,
+                            (self.orig_rlim_stack_hard, self.orig_rlim_stack_hard),
+                        )
+                    except ValueError:
+                        if platform.system() != "Darwin":
+                            raise
+
+                # increase recursion limit
+                sys.setrecursionlimit(self.new_py_limit)
             self.count += 1
 
     def __exit__(self, type, value, traceback):
@@ -45,10 +61,17 @@ class AlternativeRecursionLimit:
             self.count -= 1
             if self.count == 0:
                 sys.setrecursionlimit(self.orig_py_limit)
-                resource.setrlimit(
-                    resource.RLIMIT_STACK,
-                    (self.orig_rlim_stack_soft, self.orig_rlim_stack_hard),
-                )
+
+                # restore the stack limit only when the outermost context exits
+                if platform.system() != "Windows":
+                    try:
+                        resource.setrlimit(
+                            resource.RLIMIT_STACK,
+                            (self.orig_rlim_stack_soft, self.orig_rlim_stack_hard),
+                        )
+                    except ValueError:
+                        if platform.system() != "Darwin":
+                            raise
 
 
 _max_recursion_limit_context_manager = AlternativeRecursionLimit(2 ** 31 - 1)
diff --git a/imperative/python/setup.py b/imperative/python/setup.py
index e583cce4412f89331ef8de62eb1455b46f75a524..c788b75c93b38c44159025c712108918e484616b 100644
--- a/imperative/python/setup.py
+++ b/imperative/python/setup.py
@@ -9,6 +9,7 @@
 import os
 import re
 import pathlib
+import platform
 from distutils.file_util import copy_file
 from setuptools import setup, find_packages, Extension
 from setuptools.command.build_ext import build_ext as _build_ext
@@ -29,7 +30,10 @@ class build_ext(_build_ext):
             extdir.parent.mkdir(parents=True, exist_ok=True)
 
             modpath = self.get_ext_fullname(ext.name).split('.')
-            modpath[-1] += '.so'
+            if platform.system() == 'Windows':
+                modpath[-1] += '.pyd'
+            else:
+                modpath[-1] += '.so'
             modpath = str(pathlib.Path(*modpath).resolve())
 
             copy_file(modpath, fullpath, verbose=self.verbose, dry_run=self.dry_run)
@@ -47,6 +51,14 @@ if local_version:
     __version__ = '{}+{}'.format(__version__, local_version)
 
 packages = find_packages(exclude=['test'])
+package_data = [
+    str(f.relative_to('megengine'))
+    for f in pathlib.Path('megengine', 'core', 'include').glob('**/*')
+]
+package_data += [
+    str(f.relative_to('megengine'))
+    for f in pathlib.Path('megengine', 'core', 'lib').glob('**/*')
+]
 
 with open('requires.txt') as f:
     requires = f.read().splitlines()
@@ -63,6 +75,9 @@ setup_kwargs = dict(
     author='Megvii Engine Team',
     author_email=email,
     packages=packages,
+    package_data={
+        'megengine': package_data,
+    },
     ext_modules=[PrecompiledExtesion('megengine.core._imperative_rt')],
     install_requires=requires,
     extras_require={
diff --git a/imperative/python/src/helper.cpp b/imperative/python/src/helper.cpp
index a1b8b27759e7b3873d60223023fa84b1049dd687..13c16099c218dd9693a02f43539a24863d7126cd 100644
--- a/imperative/python/src/helper.cpp
+++ b/imperative/python/src/helper.cpp
@@ -9,15 +9,6 @@
 #include "megbrain/utils/mempool.h"
 #include "./numpy_dtypes.h"
 
-/*
- * demangle typeid, see
- * http://stackoverflow.com/questions/281818/unmangling-the-result-of-stdtype-infoname
- */
-#ifdef __GNUG__
-#include <cstdlib>
-#include <memory>
-#include <cxxabi.h>
-
 namespace py = pybind11;
 
 PyTaskDipatcher py_task_q = {};
@@ -34,10 +25,18 @@ py::module rel_import(py::str name, py::module m, int level) {
     return import(name, m.attr("__dict__"), py::arg("level")=level);
 }
 
+/*
+ * demangle typeid, see
+ * http://stackoverflow.com/questions/281818/unmangling-the-result-of-stdtype-infoname
+ */
+#ifdef __GNUG__
+#include <cxxabi.h>
+#include <cstdlib>
+#include <memory>
+
 namespace {
 
 std::string demangle_typeid(const char* name) {
-
     int status = -4; // some arbitrary value to eliminate the compiler warning
 
     // enable c++11 by passing the flag -std=c++11 to g++
@@ -48,7 +47,7 @@ std::string demangle_typeid(const char* name) {
 
     return (status==0) ? res.get() : name ;
 }
-}
+}  // namespace
 #else
 
 namespace {
diff --git a/imperative/python/src/utils.cpp b/imperative/python/src/utils.cpp
index b0e615a437e63d9aade2a1d17f2c1bc56ec0aa8b..3d4548a2747628594f19f0a17ca2a080d033ba9e 100644
--- a/imperative/python/src/utils.cpp
+++ b/imperative/python/src/utils.cpp
@@ -1,4 +1,8 @@
 #include "utils.h"
+#ifdef WIN32
+#include <stdio.h>
+#include <windows.h>
+#endif
 
 #include <pybind11/operators.h>
 #include <atomic>
diff --git a/imperative/python/test/integration/test_dp_correctness.py b/imperative/python/test/integration/test_dp_correctness.py
index 5719136942cced84a8e17f0bc0351f1b5d5c618c..b706adb6a1914fbf79d852f123bc4ca7ac598d35 100644
--- a/imperative/python/test/integration/test_dp_correctness.py
+++ b/imperative/python/test/integration/test_dp_correctness.py
@@ -8,6 +8,7 @@
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 import multiprocessing as mp
 import os
+import platform
 import re
 import subprocess
 import sys
@@ -196,6 +197,9 @@ def run_test(
 
 
 @pytest.mark.isolated_distributed
+@pytest.mark.skipif(
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
+)
 def test_dp_correctness():
     model_name = "mnist_model_with_test.mge"
     model_path = os.path.join(os.path.dirname(__file__), model_name)
diff --git a/imperative/python/test/unit/functional/test_distributed.py b/imperative/python/test/unit/functional/test_distributed.py
index 9ff2031907b51240faccb2ea30dd23619bb88d41..70b30fb28ae20258363c729446f4fc3592922ee7 100644
--- a/imperative/python/test/unit/functional/test_distributed.py
+++ b/imperative/python/test/unit/functional/test_distributed.py
@@ -35,7 +35,7 @@ from megengine.functional.distributed import (
     platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
 @pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 @pytest.mark.isolated_distributed
 def test_reduce_sum():
@@ -77,7 +77,7 @@ def test_reduce_sum():
     platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
 @pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 @pytest.mark.isolated_distributed
 def test_broadcast():
@@ -115,7 +115,7 @@ def test_broadcast():
     platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
 @pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 @pytest.mark.isolated_distributed
 def test_all_gather():
@@ -154,7 +154,7 @@ def test_all_gather():
     platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
 @pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 @pytest.mark.isolated_distributed
 def test_reduce_scatter_sum():
@@ -193,7 +193,7 @@ def test_reduce_scatter_sum():
     platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
 @pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 @pytest.mark.isolated_distributed
 def test_all_reduce_sum():
@@ -232,7 +232,7 @@ def test_all_reduce_sum():
     platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
 @pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 @pytest.mark.isolated_distributed
 def test_all_reduce_max():
@@ -271,7 +271,7 @@ def test_all_reduce_max():
     platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
 @pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 @pytest.mark.isolated_distributed
 def test_all_reduce_min():
@@ -310,7 +310,7 @@ def test_all_reduce_min():
     platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
 @pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 @pytest.mark.isolated_distributed
 def test_gather():
@@ -352,7 +352,7 @@ def test_gather():
     platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
 @pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 @pytest.mark.isolated_distributed
 def test_scatter():
@@ -390,7 +390,7 @@ def test_scatter():
     platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
 @pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 @pytest.mark.isolated_distributed
 def test_all_to_all():
@@ -430,7 +430,7 @@ def test_all_to_all():
     platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
 @pytest.mark.skipif(
-    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
 @pytest.mark.isolated_distributed
 def test_io_remote():
diff --git a/imperative/python/test/unit/test_autodiff.py b/imperative/python/test/unit/test_autodiff.py
index 929e967cae28069bc287dcaae5159160d4b61d2d..85b60e82aff48d7dec099038363a3088b7cd765e 100644
--- a/imperative/python/test/unit/test_autodiff.py
+++ b/imperative/python/test/unit/test_autodiff.py
@@ -6,6 +6,7 @@
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import platform
 import weakref
 
 import numpy as np
@@ -51,6 +52,9 @@ def save_to(self, name="grad"):
 
 
 @pytest.mark.isolated_distributed
+@pytest.mark.skipif(
+    platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
+)
 def test_dist_grad():
     world_size = 2
     x_np = np.random.rand(10).astype("float32")
diff --git a/imperative/src/impl/profiler.cpp b/imperative/src/impl/profiler.cpp
index f35f5b3c9232c276ac3b0db93308e66ea5833ba0..623522ac47976b20dd899c160a76c6ce1b8cefea 100644
--- a/imperative/src/impl/profiler.cpp
+++ b/imperative/src/impl/profiler.cpp
@@ -9,7 +9,15 @@
 
 #include "megbrain/imperative/profiler.h"
 
-#include <sys/unistd.h>
+// getpid(): POSIX declares it in <unistd.h>; on Windows the name is mapped
+// onto GetCurrentProcessId from <windows.h> instead
+#if defined(_WIN32)
+#include <windows.h>
+#define getpid GetCurrentProcessId
+#else
+#include <unistd.h>
+#endif
+
 #include <variant>
 
 #include "megbrain/imperative/ops/opr_attr.h"
diff --git a/imperative/src/impl/proxy_graph.cpp b/imperative/src/impl/proxy_graph.cpp
index b750749ff921e5293b838a0b31b0021f53892bd4..7b0409f3decfa78b04c244e2aa29bcd73b20caa5 100644
--- a/imperative/src/impl/proxy_graph.cpp
+++ b/imperative/src/impl/proxy_graph.cpp
@@ -16,6 +16,10 @@
 #include "megbrain/imperative/ops/opr_attr.h"
 #include "megbrain/imperative/ops/backward_graph.h"
 
+#if __cplusplus >= 201703L
+#include <optional>
+#endif
+
 namespace mgb {
 namespace imperative {
 
diff --git a/imperative/test/CMakeLists.txt b/imperative/test/CMakeLists.txt
index 6b766cddce55b0893d49907bfa010d159d662fb7..7e50124a30927a35092010333b9b5a287e5ae55e 100644
--- a/imperative/test/CMakeLists.txt
+++ b/imperative/test/CMakeLists.txt
@@ -38,8 +38,11 @@ if(CXX_SUPPORT_WCLASS_MEMACCESS)
 endif()
 
 if(UNIX)
-    target_link_libraries(imperative_test dl rt)
+    if(APPLE OR ANDROID)
+        target_link_libraries(imperative_test dl)
+    else()
+        target_link_libraries(imperative_test dl rt)
+    endif()
 endif()
 
-
 install(TARGETS imperative_test RUNTIME DESTINATION test)
diff --git a/python_module/CMakeLists.txt b/python_module/CMakeLists.txt
index c64b520d234b98995fe89ae6783b712631ecd81d..e23c2488da168c501bdbab4dc0d2ade4ee19ed3c 100644
--- a/python_module/CMakeLists.txt
+++ b/python_module/CMakeLists.txt
@@ -81,7 +81,10 @@ else()
     target_link_libraries(mgb megbrain megdnn -Wl,--version-script=${VERSION_SCRIPT})
 endif()
 target_include_directories(mgb PRIVATE ${PYTHON_INCLUDE_DIRS} src/cpp ${CMAKE_CURRENT_BINARY_DIR} ${NUMPY_INCLUDE_DIR})
-target_link_libraries(mgb ${PYTHON_LIBRARIES})
+# only windows need link PYTHON_LIBRARIES
+if(MSVC OR WIN32)
+    target_link_libraries(mgb ${PYTHON_LIBRARIES})
+endif()
 
 if (MGE_WITH_DISTRIBUTED)
     target_link_libraries(mgb megray)
diff --git a/scripts/cmake-build/BUILD_README.md b/scripts/cmake-build/BUILD_README.md
index f9c70a510254655e776d71f0d19f606305a20318..457adcd667ded01b5543bcf10e9e2e2bee43c868 100644
--- a/scripts/cmake-build/BUILD_README.md
+++ b/scripts/cmake-build/BUILD_README.md
@@ -30,11 +30,17 @@
     4e: add C:\Program Files\NVIDIA GPU Computing Toolkit\cudnn-10.1-windows10-x64-v7.6.5.32\cuda\bin to system Path env
     4f: add C:\Program Files\NVIDIA GPU Computing Toolkit\TensorRT-6.0.1.5\lib Path
     if u do not do 4d/4e/4f, CUDA runtime can not find dll
+    5: install python3 (default 3.8.3) to /c/Users/${USER}/mge_whl_python_env/3.8.3, add it to the PATH env,
+    and run python3 -m pip install numpy (if you want to build with training mode or build the python whl)
+    6: install swig from install gui (if you want to build with training mode or build the python whl)
+       a: download swig: https://nchc.dl.sourceforge.net/project/swig/swigwin/swigwin-4.0.2/swigwin-4.0.2.zip
+       b: install swig to /c/Users/${USER}/swigwin-4.0.2
+       c: apply scripts/whl/windows/fix-ptr-define-issue.patch to /c/Users/${USER}/swigwin-4.0.2
     ```
 ### linux host build
     ```
     1: cmake, which version > 3.14.4
-    2: gcc/g++, which version > 6
+    2: gcc/g++, which version > 6 (gcc/g++ >= 7 is needed to build with training mode)
     3: install build-essential git git-lfs gfortran libgfortran-6-dev autoconf gnupg flex bison gperf curl 
     4: zlib1g-dev gcc-multilib g++-multilib lib32ncurses5-dev libxml2-utils xsltproc unzip libtool:
     5: librdmacm-dev rdmacm-utils python3-dev swig python3-numpy texinfo
@@ -47,6 +53,7 @@
     3: brew install python python3 swig coreutils
     4: install at least xcode command line tool: https://developer.apple.com/xcode/
     5: about cuda: we do not support CUDA on macos
+    6: python3 -m pip install numpy (if u want to build with training mode or build python whl)
     ```
 ### cross build for arm-android
     now we support windows/linux/macos cross build to arm-android
diff --git a/scripts/cmake-build/host_build.sh b/scripts/cmake-build/host_build.sh
index 78e1562e3b817855bd0b076d647de061eff665d2..8a8f1508fb9f7490301d22688fc65b26f36b40ac 100755
--- a/scripts/cmake-build/host_build.sh
+++ b/scripts/cmake-build/host_build.sh
@@ -9,6 +9,7 @@ function usage() {
     echo "-t : Build with training mode, default inference only"
     echo "-m : Build with m32 mode(only for windows build), default m64"
     echo "-r : remove old build dir before make, default off"
+    echo "-n : enable new python runtime(valid when training mode with -t, default is legacy runtime)"
     echo "-h : show usage"
     echo "append other cmake config by export EXTRA_CMAKE_ARGS=..."
     echo "example: $0 -d"
@@ -22,9 +23,10 @@ MGE_WINDOWS_BUILD_ARCH=x64
 MGE_WINDOWS_BUILD_MARCH=m64
 MGE_ARCH=x86_64
 REMOVE_OLD_BUILD=false
+MGE_BUILD_IMPERATIVE_RT=OFF
 echo "EXTRA_CMAKE_ARGS: ${EXTRA_CMAKE_ARGS}"
 
-while getopts "rhdctm" arg
+while getopts "rhdctmn" arg
 do
     case $arg in
         d)
@@ -48,11 +50,15 @@ do
             REMOVE_OLD_BUILD=true
             ;;
         m)
-            echo "build for m32(only use for windows)"
+            echo "build for m32(only valid use for windows)"
             MGE_WINDOWS_BUILD_ARCH=x86
             MGE_WINDOWS_BUILD_MARCH=m32
             MGE_ARCH=i386
             ;;
+        n)
+            echo "Enable imperative python wrapper runtime"
+            MGE_BUILD_IMPERATIVE_RT=ON
+            ;;
         ?)
             echo "unkonw argument"
             usage
@@ -101,6 +107,7 @@ function cmake_build() {
     cmake \
         -DCMAKE_BUILD_TYPE=$BUILD_TYPE \
         -DMGE_INFERENCE_ONLY=$MGE_INFERENCE_ONLY \
+        -DMGE_BUILD_IMPERATIVE_RT=${MGE_BUILD_IMPERATIVE_RT} \
         -DMGE_WITH_CUDA=$MGE_WITH_CUDA \
         -DCMAKE_INSTALL_PREFIX=$INSTALL_DIR \
         ${EXTRA_CMAKE_ARGS} \
@@ -112,7 +119,7 @@ function cmake_build() {
 
 function windows_env_err() {
     echo "check windows env failed!!"
-    echo "please install LLVM/clang-cl/cmake/python at Visual Studio Extensions"
+    echo "please install env refs for: scripts/cmake-build/BUILD_README.md"
     exit -1
 }
 
@@ -178,6 +185,25 @@ function prepare_env_for_windows_build() {
     export CPATH=$CPATH:$NIVIDA_INSTALL_PRE/${TRT_V}/include:$NIVIDA_INSTALL_PRE/CUDA/${CUDA_V}/include:$NIVIDA_INSTALL_PRE/CUDA/${CUDA_V}/include/nvtx3:$PC_CUDNN_INCLUDE_DIRS
     export LIBRARY_PATH=$LIBRARY_PATH:$LD_LIBRARY_PATH
     export INCLUDE=$INCLUDE:$CPATH
+
+    # python version will be config by whl build script or ci script, we need
+    # a DFT version for build success when we just call host_build.sh
+    if [[ -z ${ALREADY_CONFIG_PYTHON_VER} ]]
+    then
+        echo "config a default python3"
+        DFT_PYTHON_BIN=/c/Users/${USER}/mge_whl_python_env/3.8.3
+        if [ ! -f "${DFT_PYTHON_BIN}/python3.exe" ]; then
+            echo "ERR: can not find ${DFT_PYTHON_BIN}/python3.exe , Invalid env"
+            windows_env_err
+        else
+            echo "put python3 to env..."
+            export PATH=${DFT_PYTHON_BIN}:$PATH
+            which python3
+        fi
+    fi
+
+    echo "export swig pwd to PATH"
+    export PATH=/c/Users/${USER}/swigwin-4.0.2:$PATH # single ':' on purpose: an empty PATH element ("::") is searched as the current directory
 }
 
 WINDOWS_BUILD_TARGET="Ninja all > build.log"
@@ -218,6 +244,7 @@ function cmake_build_windows() {
         vcvarsall.bat $MGE_WINDOWS_BUILD_ARCH && cmake  -G "Ninja" \
         -DMGE_ARCH=$MGE_ARCH \
         -DMGE_INFERENCE_ONLY=$MGE_INFERENCE_ONLY \
+        -DMGE_BUILD_IMPERATIVE_RT=${MGE_BUILD_IMPERATIVE_RT} \
         -DMGE_WITH_CUDA=$MGE_WITH_CUDA \
         -DCMAKE_BUILD_TYPE=$BUILD_TYPE \
         -DCMAKE_INSTALL_PREFIX:PATH=$INSTALL_DIR  \
@@ -230,8 +257,18 @@ function cmake_build_windows() {
         ${WINDOWS_BUILD_TARGET}"
 }
 
+if [ ${MGE_BUILD_IMPERATIVE_RT} = "ON" ] && [ ${MGE_INFERENCE_ONLY} = "ON" ]; then
+    echo "ERR: MGE_BUILD_IMPERATIVE_RT(-n) only valid when enable training mode(-t)"
+    echo "pls remove -n or add -t"
+    exit -1
+fi
 
 if [[ $OS =~ "NT" ]]; then
+    if [ ${MGE_ARCH} = "i386" ] && [ ${MGE_INFERENCE_ONLY} = "OFF" ]; then
+        echo "ERR: training mode(-t) only support 64 bit mode"
+        echo "pls remove -t or remove -m"
+        exit -1
+    fi
     config_windows_build_target
     cmake_build_windows $MGE_WITH_CUDA $MGE_INFERENCE_ONLY $BUILD_TYPE
 else
diff --git a/scripts/whl/BUILD_PYTHON_WHL_README.md b/scripts/whl/BUILD_PYTHON_WHL_README.md
index 5636560414bff09a6e69360c48c082d9892ed241..07e55febded09009d84e169812a0298211ce78d5 100644
--- a/scripts/whl/BUILD_PYTHON_WHL_README.md
+++ b/scripts/whl/BUILD_PYTHON_WHL_README.md
@@ -53,10 +53,6 @@
        d0: /c/Users/${USER}/mge_whl_python_env/3.8.3/python3.exe -m pip install --upgrade pip
        d1: /c/Users/${USER}/mge_whl_python_env/3.8.3/python3.exe -m pip install -r python_module/requires-test.txt
        d2: /c/Users/${USER}/mge_whl_python_env/3.8.3/python3.exe -m pip install numpy wheel requests tqdm tabulate
-    5: install swig from install gui
-       a: download swig: https://nchc.dl.sourceforge.net/project/swig/swigwin/swigwin-4.0.2/swigwin-4.0.2.zip
-       b: install swig to /c/Users/${USER}/swigwin-4.0.2
-       c: apply scripts/whl/windows/fix-ptr-define-issue.patch to c/Users/${USER}/swigwin-4.0.2
     ```
 
 # how to build
@@ -90,6 +86,11 @@
     ```
     ALL_PYTHON=3.5.9 ./scripts/whl/macos/macos_build_whl.sh
     ```
+    If you want to build with imperative rt, set env BUILD_IMPERATIVE="ON", eg:
+
+    ```
+    ALL_PYTHON=3.5.9 BUILD_IMPERATIVE="ON" ./scripts/whl/macos/macos_build_whl.sh
+    ```
 ## build for windows
     ```
     ./scripts/whl/windows/windows_build_whl.sh
@@ -102,5 +103,7 @@
     If you want to build windows whl with cuda, also a specific Python verison. eg:
 
     ```
-    WINDOWS_WHL_WITH_CUDA="true" ALL_PYTHON=3.5.4 ./scripts/whl/windows/windows_build_whl.sh
+    WINDOWS_WHL_WITH_CUDA="ON" ALL_PYTHON=3.5.4 ./scripts/whl/windows/windows_build_whl.sh
     ```
+    If you want to build with imperative rt, set env BUILD_IMPERATIVE="ON", eg:
+    BUILD_IMPERATIVE="ON" WINDOWS_WHL_WITH_CUDA="ON" ALL_PYTHON=3.5.4 ./scripts/whl/windows/windows_build_whl.sh
diff --git a/scripts/whl/macos/macos_build_whl.sh b/scripts/whl/macos/macos_build_whl.sh
index b3d1a70ec894fabc5daf71cd3b463d94f9e1e945..a10912ddf668a605e1422d75f445585809624d51 100755
--- a/scripts/whl/macos/macos_build_whl.sh
+++ b/scripts/whl/macos/macos_build_whl.sh
@@ -65,16 +65,18 @@ function config_python_env() {
     fi
     echo ${ver}
 
-    #config a dir to trick cmake find a null pythonlib
-    PYTHON_LIBRARY=${PYTHON_DIR}lib/
     if [ "$1" = "3.5.9" ]; then
         PYTHON_INCLUDE_DIR=${PYTHON_DIR}include/python3.5m
+        PYTHON_LIBRARY=${PYTHON_DIR}/lib/libpython3.5m.dylib
     elif [ "$1" = "3.6.10" ]; then
         PYTHON_INCLUDE_DIR=${PYTHON_DIR}include/python3.6m
+        PYTHON_LIBRARY=${PYTHON_DIR}/lib/libpython3.6m.dylib
     elif [ "$1" = "3.7.7" ]; then
         PYTHON_INCLUDE_DIR=${PYTHON_DIR}include/python3.7m
+        PYTHON_LIBRARY=${PYTHON_DIR}/lib/libpython3.7m.dylib
     elif [ "$1" = "3.8.3" ]; then
         PYTHON_INCLUDE_DIR=${PYTHON_DIR}include/python3.8
+        PYTHON_LIBRARY=${PYTHON_DIR}/lib/libpython3.8.dylib
     else
         echo "ERR: DO NOT SUPPORT PYTHON VERSION"
         echo "now support list: ${FULL_PYTHON_VER}"
@@ -82,6 +84,11 @@ function config_python_env() {
     fi
 }
 
+# Fall back to "OFF" when the caller did not export BUILD_IMPERATIVE (or left it
+# empty), so the later [ ${BUILD_IMPERATIVE} = "ON" ] tests always get a value.
+# ":=" assigns the default for both the unset and the empty case, like -z does.
+: "${BUILD_IMPERATIVE:=OFF}"
+
 function do_build() {
     for ver in ${ALL_PYTHON}
     do
@@ -89,7 +96,7 @@ function do_build() {
         config_python_env ${ver}
 
         #check env
-        if [ ! -d "$PYTHON_LIBRARY" ]; then
+        if [ ! -f "$PYTHON_LIBRARY" ]; then
             echo "ERR: can not find $PYTHON_LIBRARY , Invalid python package"
             err_env
         fi
@@ -102,14 +109,20 @@ function do_build() {
         #append cmake args for config python
         export EXTRA_CMAKE_ARGS="-DCMAKE_PREFIX_PATH=${PYTHON_DIR} -DPYTHON_LIBRARY=${PYTHON_LIBRARY} -DPYTHON_INCLUDE_DIR=${PYTHON_INCLUDE_DIR} "
         #config build type to RelWithDebInfo to enable MGB_ENABLE_DEBUG_UTIL etc
-        export EXTRA_CMAKE_ARGS=${EXTRA_CMAKE_ARGS}" -DCMAKE_BUILD_TYPE=RelWithDebInfo "
+        export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCMAKE_BUILD_TYPE=RelWithDebInfo "
 
         #call build and install
         #FIXME: cmake do not triger update python config, after
         #change PYTHON_LIBRARY and PYTHON_INCLUDE_DIR, so add
         #-r to remove build cache after a new ver build, which
         #will be more slow build than without -r
-        ${SRC_DIR}/scripts/cmake-build/host_build.sh -t -r
+        if [ ${BUILD_IMPERATIVE} = "ON" ]; then
+            echo "build whl with IMPERATIVE python rt"
+            ${SRC_DIR}/scripts/cmake-build/host_build.sh -t -n -r
+        else
+            echo "build whl with legacy python rt"
+            ${SRC_DIR}/scripts/cmake-build/host_build.sh -t -r
+        fi
 
         #call setup.py
         BUILD_DIR=${SRC_DIR}/build_dir/host/MGE_WITH_CUDA_OFF/MGE_INFERENCE_ONLY_OFF/Release/build/
@@ -121,12 +134,47 @@ function do_build() {
         fi
         mkdir -p staging
 
+        if [ ${BUILD_IMPERATIVE} = "ON" ]; then
+            echo "build whl with IMPERATIVE python rt"
+            cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
+            cd ${BUILD_DIR}/staging/megengine/core
+            rt_file=`ls _imperative_rt.*.so`
+            echo "rt file is: ${rt_file}"
+            if [[ -z ${rt_file} ]]
+            then
+                echo "ERR: can not find valid rt file"
+                exit -1
+            fi
+            llvm-strip -s ${rt_file}
+            mv ${rt_file} _imperative_rt.so
+            echo "check so valid or not..."
+            otool_out=`otool -L _imperative_rt.so`
+            if [[ "${otool_out}" =~ "ython" ]]; then
+                echo "ERR: invalid _imperative_rt.so which depend on python lib, detail: log"
+                echo ${otool_out}
+                exit -1
+            else
+                echo "valid..."
+            fi
+        else
+            echo "build whl with legacy python rt"
+
+            cp -a python_module/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
+            cd ${BUILD_DIR}/staging/megengine/_internal
+            #FIXME: set lib suffix to dylib may be better, BUT we find after distutils.file_util.copy_file
+            #will change to .so at macos even we set suffix to dylib, at the same time, macos also support .so
+            echo "check so valid or not..."
+            llvm-strip -s _mgb.so
+            otool_out=`otool -L _mgb.so`
+            if [[ "${otool_out}" =~ "ython" ]]; then
+                echo "ERR: invalid _mgb.so which depend on python lib, detail: log"
+                echo ${otool_out}
+                exit -1
+            else
+                echo "valid..."
+            fi
+        fi
 
-        cp -a python_module/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
-        cd ${BUILD_DIR}/staging/megengine/_internal
-        #FIXME: set lib suffix to dylib may be better, BUT we find after distutils.file_util.copy_file
-        #will change to .so at macos even we set suffix to dylib, at the same time, macos also support .so
-        llvm-strip -s _mgb.so
         cd ${BUILD_DIR}/staging
         ${PYTHON_DIR}/bin/python3 setup.py bdist_wheel
         cd ${BUILD_DIR}/staging/dist/
diff --git a/scripts/whl/windows/windows_build_whl.sh b/scripts/whl/windows/windows_build_whl.sh
index 434f3ed5c099ec51852720375f01b8c98265b063..1e1d553ad9d42359d732caa10bef93e8cbab6d9a 100755
--- a/scripts/whl/windows/windows_build_whl.sh
+++ b/scripts/whl/windows/windows_build_whl.sh
@@ -14,8 +14,6 @@ function err_env() {
 }
 
 function append_path_env_and_check() {
-    echo "export swig pwd to PATH"
-    export PATH=/c/Users/${USER}/swigwin-4.0.2::$PATH
     echo  "export vs2019 install path"
     export VS_PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2019/Enterprise
     # for llvm-strip
@@ -62,7 +60,7 @@ function config_python_env() {
 
 if [[ -z ${WINDOWS_WHL_WITH_CUDA} ]]
 then
-    WINDOWS_WHL_WITH_CUDA="false"
+    WINDOWS_WHL_WITH_CUDA="OFF"
 fi
 
 
@@ -74,26 +72,46 @@ CUBLAS_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/cublas6
 CURAND_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/curand64_10.dll"
 CUBLASLT_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/cublasLt64_10.dll"
 CUDART_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/cudart64_101.dll"
+# Copy each dependency dll named by the *_LIB variables into the directory $1.
+function depend_real_copy() {
+    REAL_DST=$1
+    echo "real copy lib to $1"
+    local dep_lib
+    for dep_lib in "${TRT_LIB}" "${CUDNN_LIB}" "${CUSOLVER_LIB}" "${CUBLAS_LIB}" \
+        "${CURAND_LIB}" "${CUBLASLT_LIB}" "${CUDART_LIB}"
+    do
+        cp "${dep_lib}" ${REAL_DST}
+    done
+}
+
 function copy_more_dll() {
     # for python whl real use
-    CP_DST=${BUILD_DIR}/staging/megengine/_internal/lib
-    rm -rf ${CP_DST}
-    mkdir ${CP_DST}
+    if [ ${BUILD_IMPERATIVE} = "ON" ]; then
+        echo "config BUILD_IMPERATIVE core lib dir"
+        CP_WHL_DST=${BUILD_DIR}/staging/megengine/core/lib
+    else
+        echo "config legacy python lib dir"
+        CP_WHL_DST=${BUILD_DIR}/staging/megengine/_internal/lib
+    fi
+    rm -rf ${CP_WHL_DST}
+    mkdir ${CP_WHL_DST}
+    # workaround for cpu-only version import failure: write an
+    # empty.file to trigger setup.py to create the otherwise empty lib dir
+    echo "empty" > ${CP_WHL_DST}/empty.file
 
 
-    if [ ${WINDOWS_WHL_WITH_CUDA} = "true" ]; then
+    if [ ${WINDOWS_WHL_WITH_CUDA} = "ON" ]; then
         echo "copy nvidia lib to whl use...."
-        cp "${TRT_LIB}" ${CP_DST}
-        cp "${CUDNN_LIB}" ${CP_DST}
-        cp "${CUSOLVER_LIB}" ${CP_DST}
-        cp "${CUBLAS_LIB}" ${CP_DST}
-        cp "${CURAND_LIB}" ${CP_DST}
-        cp "${CUBLASLT_LIB}" ${CP_DST}
-        cp "${CUDART_LIB}" ${CP_DST}
+        depend_real_copy ${CP_WHL_DST}
 
     fi
 }
 
+# Fall back to "OFF" when the caller did not export BUILD_IMPERATIVE (or left it
+# empty), so the [ ${BUILD_IMPERATIVE} = "ON" ] tests in this script always get
+# a value. ":=" covers both the unset and the empty case, like -z does.
+: "${BUILD_IMPERATIVE:=OFF}"
+
 function do_build() {
     for ver in ${ALL_PYTHON}
     do
@@ -118,21 +136,31 @@ function do_build() {
         #force LINK a real PYTHON_LIBRARY file, after test we do not find the symbols conflict with python
         #export EXTRA_CMAKE_ARGS="-DPYTHON_LIBRARY=${PYTHON_LIBRARY} -DPYTHON_INCLUDE_DIR=${PYTHON_INCLUDE_DIR} "
         #config build type to RelWithDebInfo to enable MGB_ENABLE_DEBUG_UTIL etc
-        export EXTRA_CMAKE_ARGS=${EXTRA_CMAKE_ARGS}" -DCMAKE_BUILD_TYPE=RelWithDebInfo "
+        export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCMAKE_BUILD_TYPE=RelWithDebInfo "
 
         #call build and install
         #FIXME: cmake do not triger update python config, after
         #change PYTHON_LIBRARY and PYTHON_INCLUDE_DIR, so add
         #-r to remove build cache after a new ver build, which
         #will be more slow build than without -r
-        if [ ${WINDOWS_WHL_WITH_CUDA} = "true" ]; then
+        BUILD_ARGS=" -t -r"
+        if [ ${BUILD_IMPERATIVE} = "ON" ]; then
+            echo "build whl with IMPERATIVE python rt"
+            BUILD_ARGS="${BUILD_ARGS} -n "
+        else
+            echo "build whl with legacy python rt"
+        fi
+
+        if [ ${WINDOWS_WHL_WITH_CUDA} = "ON" ]; then
             echo "build windows whl with cuda"
-            ${SRC_DIR}/scripts/cmake-build/host_build.sh -t -r -c
+            BUILD_ARGS="${BUILD_ARGS} -c "
         else
             echo "build windows whl with cpu only"
-            ${SRC_DIR}/scripts/cmake-build/host_build.sh -t -r
         fi
 
+        echo "host_build.sh BUILD_ARGS: ${BUILD_ARGS}"
+        ${SRC_DIR}/scripts/cmake-build/host_build.sh ${BUILD_ARGS}
+
         #call setup.py
         BUILD_DIR=${SRC_DIR}/build_dir/host/build/
         cd ${BUILD_DIR}
@@ -143,10 +171,27 @@ function do_build() {
         fi
         mkdir -p staging
 
+        if [ ${BUILD_IMPERATIVE} = "ON" ]; then
+            echo "build whl with IMPERATIVE python rt"
+            cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
+            cd ${BUILD_DIR}/staging/megengine/core
+            rt_file=`ls _imperative_rt.*.pyd`
+            echo "rt file is: ${rt_file}"
+            if [[ -z ${rt_file} ]]
+            then
+                echo "ERR: can not find valid rt file"
+                exit -1
+            fi
+            llvm-strip -s ${rt_file}
+            mv ${rt_file} _imperative_rt.pyd
+        else
+            echo "build whl with legacy python rt"
+
+            cp -a python_module/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
+            cd ${BUILD_DIR}/staging/megengine/_internal
+            llvm-strip -s _mgb.pyd
+        fi
 
-        cp -a python_module/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
-        cd ${BUILD_DIR}/staging/megengine/_internal
-        llvm-strip -s _mgb.pyd
         copy_more_dll
         cd ${BUILD_DIR}/staging
         ${PYTHON_DIR}/python3 setup.py bdist_wheel
@@ -175,5 +220,6 @@ function third_party_prepare() {
 }
 
 ######################
+export ALREADY_CONFIG_PYTHON_VER="yes"
 third_party_prepare
 do_build
diff --git a/src/core/impl/graph/seq_sublinear_memory.cpp b/src/core/impl/graph/seq_sublinear_memory.cpp
index 59750d80a9c318c0559d5911cd10772f446ca18e..04d89309c9ebff9123cf477409a4ac23dfc6a882 100644
--- a/src/core/impl/graph/seq_sublinear_memory.cpp
+++ b/src/core/impl/graph/seq_sublinear_memory.cpp
@@ -33,6 +33,11 @@ class RNGxorshf {
     uint64_t s[2];
 
 public:
+#if __cplusplus >= 201703L
+    typedef uint64_t result_type;
+    static constexpr uint64_t min() { return 0; }
+    static constexpr uint64_t max() { return UINT64_MAX; }
+#endif
     RNGxorshf(uint64_t seed) {
         std::mt19937_64 gen(seed);
         s[0] = gen();
@@ -936,8 +941,12 @@ void SeqModifierForSublinearMemory::ActionSearcherSingleCN::search_genetic() {
             }
         }
         m_cur_records = records;
+#if __cplusplus >= 201703L
+        std::shuffle(perm.begin(), perm.end(), rng);
+#else
         std::random_shuffle(perm.begin(), perm.end(),
                             [&](size_t x) { return rng() % x; });
+#endif
         for (size_t i = 0; i < length; ++i) {
             invoke_search(mutation(mutation(records[i].first)));
             invoke_search(crossover(records[i].first, records[perm[i]].first));
diff --git a/src/opr/test/blas.cpp b/src/opr/test/blas.cpp
index dab00573b2be80bb404616a4205aa10922234fff..71c4fb2bbae00a04e82c1c13369beb97bb0d7e39 100644
--- a/src/opr/test/blas.cpp
+++ b/src/opr/test/blas.cpp
@@ -705,7 +705,12 @@ TEST(TestOprBlas, MatrixInverse) {
         }
         auto ptr = inp[0]->ptr<float>();
         for (size_t i = 0; i < batch; ++i, ptr += n * n) {
+#if __cplusplus >= 201703L
+            std::default_random_engine rng_engine;
+            std::shuffle(perm.begin(), perm.end(), rng_engine);
+#else
             std::random_shuffle(perm.begin(), perm.end());
+#endif
             for (size_t j = 0; j < n; ++j) {
                 ptr[j * n + perm[j]] += 5;
             }
diff --git a/src/opr/test/muxing.cpp b/src/opr/test/muxing.cpp
index fdc09b02319f44e580d071e9036888d8a7e110e6..5fec111bf1a3e41866f4ad67e372fc21725124fd 100644
--- a/src/opr/test/muxing.cpp
+++ b/src/opr/test/muxing.cpp
@@ -36,7 +36,12 @@ void run_all_gather(const std::vector<size_t>& axis_size, bool& success,
         sleep_time.push_back(i * 0.05 + 0.1);
         tot_axis_size += axis_size[i];
     }
+#if __cplusplus >= 201703L
+    std::default_random_engine rng_engine;
+    std::shuffle(sleep_time.begin(), sleep_time.end(), rng_engine);
+#else
     std::random_shuffle(sleep_time.begin(), sleep_time.end());
+#endif
 
     auto constexpr DEVICE_TYPE = CompNode::DeviceType::CUDA;
     size_t nr_dev = std::min<size_t>(
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 45cc3d690352318f9130aa6a919e29d1fc532473..4c76dfbf14c540625da001de1377228bd1d50580 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -18,7 +18,11 @@ endif()
 
 add_executable(megbrain_test ${SOURCES})
 target_link_libraries(megbrain_test gtest)
-target_link_libraries(megbrain_test megengine)
+if(MSVC OR WIN32)
+    target_link_libraries(megbrain_test megbrain megdnn)
+else()
+    target_link_libraries(megbrain_test megengine)
+endif()
 if(CXX_SUPPORT_WCLASS_MEMACCESS)
     if(MGE_WITH_CUDA)
         target_compile_options(megbrain_test PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-Wno-class-memaccess>"
@@ -28,10 +32,12 @@ if(CXX_SUPPORT_WCLASS_MEMACCESS)
     endif()
 endif()
 
-if(APPLE OR ANDROID)
-    target_link_libraries(megbrain_test dl)
-else()
-    target_link_libraries(megbrain_test dl rt)
+if(UNIX)
+    if(APPLE OR ANDROID)
+        target_link_libraries(megbrain_test dl)
+    else()
+        target_link_libraries(megbrain_test dl rt)
+    endif()
 endif()
 
 if (MGE_WITH_DISTRIBUTED)