diff --git a/cmake/init.cmake b/cmake/init.cmake
index b11156d2e9986f879dcf4dd63354edb81c493260..4bdcaeb4c5f3c088778126bf841f05d5157b7de9 100644
--- a/cmake/init.cmake
+++ b/cmake/init.cmake
@@ -18,6 +18,16 @@ if(NOT WIN32)
     set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
     set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG")
 else()
+    set(CMAKE_C_FLAGS_DEBUG "/Zi /DEBUG")
+    set(CMAKE_C_FLAGS_RELEASE "/O2 /DNDEBUG")
+    set(CMAKE_C_FLAGS_RELWITHDEBINFO "/O2 /DNDEBUG")
+    set(CMAKE_C_FLAGS_MINSIZEREL "/Os /DNDEBUG")
+
+    set(CMAKE_CXX_FLAGS_DEBUG "/Zi /DEBUG")
+    set(CMAKE_CXX_FLAGS_RELEASE "/O2 /DNDEBUG")
+    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/O2 /DNDEBUG")
+    set(CMAKE_CXX_FLAGS_MINSIZEREL "/Os /DNDEBUG")
+
     # It can specify CUDA compile flag manualy,
     # its use is to remvoe /Zi to reduce GPU static library size. But it's dangerous
     # because CUDA will update by nvidia, then error will occur.
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 49da54080734cf9f49a566c2861678c6c6c73599..5fcb1e30fbe677a8f87d3d5b3ad2228269e54f92 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -128,14 +128,20 @@ if(WITH_PYTHON)
     else()
       set(op_function_generator_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}")
     endif()
+    file(TO_NATIVE_PATH ${op_function_generator_path} op_function_generator_path)
+    file(TO_NATIVE_PATH ${impl_file} impl_file)
+    file(TO_NATIVE_PATH ${tmp_impl_file} tmp_impl_file)
+
     file(WRITE ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat ""
       "set build_times=1\n"
       ":retry\n"
      "ECHO op_function_generator run %build_times% time\n"
-      "${op_function_generator_path}/op_function_generator ${impl_file}\n"
+      "if exist ${tmp_impl_file} del ${tmp_impl_file}\n"
+      "taskkill /f /im op_function_generator.exe 2>NUL\n"
+      "${op_function_generator_path}\\op_function_generator.exe ${tmp_impl_file}\n"
       "if %ERRORLEVEL% NEQ 0 (\n"
       "  set /a build_times=%build_times%+1\n"
-      "  if %build_times% GTR 5 (\n"
+      "  if %build_times% GEQ 3 (\n"
       "    exit /b 1\n"
       "  ) else (\n"
       "    goto :retry\n"
@@ -145,6 +151,8 @@ if(WITH_PYTHON)
     add_custom_command(TARGET op_function_generator
       POST_BUILD
       COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file}
+      COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}"
     )

     if(${CBLAS_PROVIDER} STREQUAL MKLML)
@@ -176,7 +184,7 @@ if(WITH_PYTHON)
           "${CMAKE_CURRENT_BINARY_DIR}/op_function_generator"
           "${tmp_impl_file}"
       COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file}
-      COMMENT "copy_if_different ${impl_file}"
+      COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}"
       VERBATIM
     )
     if(WITH_MKL)
diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index e53828ff10be602dc2b1adc04512aab947fdec9c..76915061842d81b80eb39e12a84b2b44fb6e3583 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -26,11 +26,10 @@ set cache_dir=%work_dir:Paddle=cache%
 if not exist %cache_dir%\tools (
     git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools
 )
-taskkill /f /im op_function_generator.exe 2>NUL
 taskkill /f /im cmake.exe 2>NUL
 taskkill /f /im MSBuild.exe 2>NUL
-taskkill /f /im CL.exe 2>NUL
-taskkill /f /im Lib.exe 2>NUL
+taskkill /f /im cl.exe 2>NUL
+taskkill /f /im lib.exe 2>NUL
 taskkill /f /im link.exe 2>NUL
 taskkill /f /im vctip.exe 2>NUL
 taskkill /f /im cvtres.exe 2>NUL
@@ -47,8 +46,8 @@ wmic process where name="op_function_generator.exe" call terminate 2>NUL
 wmic process where name="test_api_impl.exe" call terminate 2>NUL
 wmic process where name="cvtres.exe" call terminate 2>NUL
 wmic process where name="rc.exe" call terminate 2>NUL
-wmic process where name="CL.exe" call terminate 2>NUL
-wmic process where name="Lib.exe" call terminate 2>NUL
+wmic process where name="cl.exe" call terminate 2>NUL
+wmic process where name="lib.exe" call terminate 2>NUL
 wmic process where name="python.exe" call terminate 2>NUL

 rem ------initialize common variable------
@@ -79,6 +78,7 @@ if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37

 rem -------set cache build directory-----------
 rmdir build\python /s/q
+rmdir build\paddle\fluid\pybind /s/q
 rmdir build\paddle_install_dir /s/q
 rmdir build\paddle_inference_install_dir /s/q
 rmdir build\paddle_inference_c_install_dir /s/q
@@ -112,6 +112,17 @@ if %ERRORLEVEL% EQU 0 (
     git branch last_pr
 )

+for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%#
+set day_now=%datetime:~6,2%
+set day_before=-1
+set /p day_before=< %cache_dir%\day.txt
+if %day_now% NEQ %day_before% (
+    echo %day_now% > %cache_dir%\day.txt
+    type %cache_dir%\day.txt
+    rmdir build /s/q
+    goto :mkbuild
+)
+
 :: git diff HEAD origin/develop --stat --name-only
 :: git diff HEAD origin/develop --stat --name-only | findstr ".cmake CMakeLists.txt paddle_build.bat"
 :: if %ERRORLEVEL% EQU 0 (
@@ -137,10 +148,11 @@ goto :CASE_%1

 echo "Usage: paddle_build.bat [OPTION]"
 echo "OPTION:"
-echo "wincheck_mkl: run Windows MKL/GPU/UnitTest CI tasks on Windows"
-echo "wincheck_openbals: run Windows OPENBLAS/CPU CI tasks on Windows"
+echo "wincheck_mkl: run Windows MKL/GPU PR CI tasks on Windows"
+echo "wincheck_openbals: run Windows OPENBLAS/CPU PR CI tasks on Windows"
 echo "build_avx_whl: build Windows avx whl package on Windows"
 echo "build_no_avx_whl: build Windows no avx whl package on Windows"
+echo "build_inference_lib: build Windows inference library on Windows"
 exit /b 1

 rem ------PR CI windows check for MKL/GPU----------
@@ -200,6 +212,7 @@ goto:success

 rem ------Build windows inference library------
 :CASE_build_inference_lib
+set ON_INFER=ON
 set WITH_PYTHON=OFF
 set CUDA_ARCH_NAME=All

@@ -249,9 +262,10 @@ if "%WITH_GPU%"=="ON" (
 )

 rem ------initialize the python environment------
+@ECHO ON
 set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe
 set PATH=%PYTHON_ROOT%;%PYTHON_ROOT%\Scripts;%PATH%
-if %WITH_PYTHON% == "OFF" (
+if %WITH_PYTHON% == "ON" (
     where python
     where pip
     pip install wheel --user
@@ -373,6 +387,7 @@ set build_times=1
 rem clcache.exe -z

 rem -------clean up environment again-----------
+taskkill /f /im cmake.exe 2>NUL
 taskkill /f /im MSBuild.exe 2>NUL
 taskkill /f /im cl.exe 2>NUL
 taskkill /f /im lib.exe 2>NUL
@@ -387,12 +402,13 @@ taskkill /f /im cicc.exe 2>NUL
 taskkill /f /im ptxas.exe 2>NUL
 taskkill /f /im test_api_impl.exe 2>NUL
 taskkill /f /im op_function_generator.exe 2>NUL
+wmic process where name="cmake.exe" call terminate 2>NUL
 wmic process where name="op_function_generator.exe" call terminate 2>NUL
 wmic process where name="test_api_impl.exe" call terminate 2>NUL
 wmic process where name="cvtres.exe" call terminate 2>NUL
 wmic process where name="rc.exe" call terminate 2>NUL
-wmic process where name="CL.exe" call terminate 2>NUL
-wmic process where name="Lib.exe" call terminate 2>NUL
+wmic process where name="cl.exe" call terminate 2>NUL
+wmic process where name="lib.exe" call terminate 2>NUL

 echo Build Paddle the %build_times% time:
 if %GENERATOR% == "Ninja" (
@@ -766,8 +782,8 @@ wmic process where name="op_function_generator.exe" call terminate 2>NUL
 wmic process where name="test_api_impl.exe" call terminate 2>NUL
 wmic process where name="cvtres.exe" call terminate 2>NUL
 wmic process where name="rc.exe" call terminate 2>NUL
-wmic process where name="CL.exe" call terminate 2>NUL
-wmic process where name="Lib.exe" call terminate 2>NUL
+wmic process where name="cl.exe" call terminate 2>NUL
+wmic process where name="lib.exe" call terminate 2>NUL
 wmic process where name="python.exe" call terminate 2>NUL
 echo Windows CI run successfully!
 exit /b 0
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index e0aec2ba50bd41cee81a65446221437c37442217..7d9a01106285bed530c2d13b6d79f31330afeb3d 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -1262,11 +1262,13 @@ set +x
                 testcase=''
         done <<< "$test_cases";

-        card_test "$single_card_tests_high_parallel" 1 8    # run cases the most each time with single GPU
+        card_test "$single_card_tests_high_parallel" 1 6    # run cases the most each time with single GPU
         card_test "$single_card_tests_two_parallel" 1 2     # run cases 2 job each time with single GPU
         card_test "$single_card_tests_non_parallel" 1       # run cases 1 job each time with single GPU

+        card_test "$multiple_card_tests_two_parallel" 2 2   # run cases 2 job each time with two GPUs
         card_test "$multiple_card_tests_non_parallel" 2     # run cases 1 job each time with two GPUs

+        card_test "$exclusive_tests_two_parallel" -1 2      # run cases exclusively, in this cases would be run with 2/4/8 GPUs
         card_test "$exclusive_tests_non_parallel" -1        # run cases exclusively, in this cases would be run with 2/4/8 GPUs
         collect_failed_tests
diff --git a/python/paddle/fluid/tests/unittests/test_dataloader_keep_order.py b/python/paddle/fluid/tests/unittests/test_dataloader_keep_order.py
index 5796e13336ccf680930e0704e5f1fe5eca623937..6e8ee5589db77b8df14c2371452164515a7b50e9 100644
--- a/python/paddle/fluid/tests/unittests/test_dataloader_keep_order.py
+++ b/python/paddle/fluid/tests/unittests/test_dataloader_keep_order.py
@@ -77,7 +77,11 @@ class DataLoaderKeepOrderTestBase(unittest.TestCase):
     def get_places(self):
         place_list = [fluid.cpu_places(1), fluid.cpu_places(4)]
         if fluid.is_compiled_with_cuda():
-            place_list.extend([fluid.cuda_places(0), fluid.cuda_places([0, 1])])
+            if os.name == "nt":
+                place_list.extend([fluid.cuda_places(0)])
+            else:
+                place_list.extend(
+                    [fluid.cuda_places(0), fluid.cuda_places([0, 1])])
         return place_list

     def test_main(self):
diff --git a/python/paddle/fluid/tests/unittests/test_dataloader_unkeep_order.py b/python/paddle/fluid/tests/unittests/test_dataloader_unkeep_order.py
index 89bbc88e01eaffecfef4a16dd4c007af2135ce97..f779d762fb3026afa8b33d2883b28177779c06f1 100644
--- a/python/paddle/fluid/tests/unittests/test_dataloader_unkeep_order.py
+++ b/python/paddle/fluid/tests/unittests/test_dataloader_unkeep_order.py
@@ -96,7 +96,11 @@ class DataLoaderKeepOrderTestBase(unittest.TestCase):
     def get_places(self):
         place_list = [fluid.cpu_places(1), fluid.cpu_places(4)]
         if fluid.is_compiled_with_cuda():
-            place_list.extend([fluid.cuda_places(0), fluid.cuda_places([0, 1])])
+            if os.name == "nt":
+                place_list.extend([fluid.cuda_places(0)])
+            else:
+                place_list.extend(
+                    [fluid.cuda_places(0), fluid.cuda_places([0, 1])])
         return place_list

     def test_main(self):
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py
index d64aa510f4e1a523d4d30439fc3427e884ed0024..a34982ef3dd67d70cbf40101c1c4a027bd4012da 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py
@@ -17,6 +17,7 @@ import numpy as np
 import six
 import paddle.fluid as fluid
 import paddle
+import os


 def enable_parallel_ssa_executor(enabled=True):
@@ -65,6 +66,9 @@ class TestParallelExecutorFetchIsolatedVarBase(unittest.TestCase):
             if fluid.core.globals()[
                     'FLAGS_enable_parallel_graph'] and not use_gpu:
                 return
+            # windows has only 1 GPU
+            if use_gpu and dev_cnt > 1 and os.name == "nt":
+                return
         else:
             if use_gpu:
                 return
diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py
index 9d03ae22de28f2946f98fde81ab52ae0339f3dc0..b36643a11023cfadb9814b0691613851345659e4 100644
--- a/tools/parallel_UT_rule.py
+++ b/tools/parallel_UT_rule.py
@@ -131,7 +131,6 @@ CPU_PARALLEL_JOB = [
     'test_ones_op',
     'test_npair_loss_op',
     'test_nn_functional_embedding_static',
-    'test_nce',
     'test_name_scope',
     'test_naive_executor',
     'test_multiprocess_dataloader_iterable_dataset_split',
@@ -293,8 +292,6 @@ CPU_PARALLEL_JOB = [
     'test_dataset_imdb',
     'test_dataset_conll05',
     'test_dataset_cifar',
-    'test_dataloader_unkeep_order',
-    'test_dataloader_keep_order',
     'test_dataloader_dataset',
     'test_data_generator',
     'test_data_feeder',
@@ -571,8 +568,6 @@ CPU_PARALLEL_JOB = [
     'test_fleet_cc',
     'test_repeated_fc_relu_fuse_pass_cc',
     'heter_server_test',
-    'test_static_save_load_large',
-    'graph_node_test',
     'test_custom_conj',
     'test_fleet_private_function',
     'test_fake_init_op',
@@ -604,27 +599,21 @@ CPU_PARALLEL_JOB = [
 # It run 4 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED,
 # just remove it from this list.
 TETRAD_PARALLEL_JOB = [
+    'graph_node_test',
+    'test_assert',
+    'test_nce',
     'buffered_allocator_test',
     'allocator_facade_frac_flags_test',
     'cuda_helper_test',
-    'sequence_padding_test',
     'test_auto_growth_gpu_memory_limit',
-    'test_imperative_framework',
     'device_context_test',
     'test_reference_count_pass_last_lived_ops',
     'copy_same_tensor_test',
-    'float16_gpu_test',
-    'test_leaky_relu_grad_grad_functor',
-    'sequence_pooling_test',
     'mixed_vector_test',
     'op_registry_test',
-    'strided_memcpy_test',
-    'selected_rows_functor_gpu_test',
     'test_prepare_op',
     'data_device_transform_test',
-    'test_tensor_to_numpy',
     'test_naive_best_fit_gpu_memory_limit',
-    'vol2col_test',
     'test_imperative_using_non_zero_gpu',
     'retry_allocator_test',
     'system_allocator_test',
@@ -659,23 +648,16 @@ TETRAD_PARALLEL_JOB = [
     'test_analyzer_paddletensor_tensor',
     'test_analyzer_bert',
     'test_analyzer_googlenet',
-    'zero_copy_tensor_test',
-    'custom_tensor_test',
     'test_fleet_base',
     'test_imperative_container_layerdict',
-    'test_complex_simplenet',
-    'test_tensor_register_hook',
     'test_set_value_op',
-    'test_tensor_type_promotion',
     'test_view_op_reuse_allocation',
-    'test_complex_grad_accumulated',
     'test_sequential',
     'test_sequential',
     'test_imperative_layers',
     'test_dgc_momentum_op',
     'test_memcpy_op',
     'test_dgc_op',
-    'test_modelaverage',
     'test_lookahead',
     'test_callback_visualdl',
     'test_new_group_api',
@@ -684,32 +666,37 @@ TETRAD_PARALLEL_JOB = [
     'test_collective_split_row_linear',
     'test_collective_split_col_linear',
     'test_collective_split_embedding',
+    'test_custom_attrs_jit',
+    'float16_gpu_test',
+    'test_leaky_relu_grad_grad_functor',
+    'test_complex_simplenet',
+    'selected_rows_functor_gpu_test',
+    'test_imperative_framework',
 ]

 # It run 2 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED,
 # just remove it from this list.
 TWO_PARALLEL_JOB = [
+    'test_tensor_to_numpy',
+    'zero_copy_tensor_test',
+    'sequence_pooling_test',
+    'sequence_padding_test',
+    'vol2col_test',
     'convert_model2dot_ernie',
     'im2col_test',
-    'test_elementwise_add_grad_grad',
     'test_logical_op',
     'test_imperative_mnist',
     'test_imperative_deepcf',
     'test_cholesky_op',
-    'test_multiprocess_dataloader_iterable_dataset_static',
     'test_sample_logits_op',
     'test_ir_fc_fuse_pass',
-    'test_imperative_qat_channelwise',
     'test_fleet_base_single',
-    'test_imperative_out_scale',
     'test_multiprocess_dataloader_iterable_dataset_dynamic',
     'test_fill_op',
     'test_slice_op',
     'test_cond',
-    'test_compiled_program',
     'test_lstm',
     'test_ema',
-    'test_py_reader_using_executor',
     'test_nan_inf',
     'test_isinstance',
     'test_jit_save_load',
@@ -749,13 +736,11 @@ TWO_PARALLEL_JOB = [
     'test_anchor_generator_op',
     'test_imperative_ptb_rnn',
     'test_gather_nd_op',
-    'test_flatten_contiguous_range_op',
     'test_network_with_dtype',
     'test_elementwise_sub_op',
     'test_assert_op',
     'test_elementwise_div_op',
     'test_gather_tree_op',
-    'test_decoupled_py_reader',
     'test_imperative_named_members',
     'test_seqconv_eltadd_relu_fuse_pass',
     'test_analysis_predictor',
@@ -771,7 +756,6 @@ TWO_PARALLEL_JOB = [
     'test_traced_layer_err_msg',
     'test_unique_with_counts',
     'test_auc_single_pred_op',
-    'test_stack_op',
     'test_conv_bn_fuse_pass',
     'test_instance_norm_op_v2',
     'test_softmax_bf16_mkldnn_op',
@@ -793,10 +777,8 @@ TWO_PARALLEL_JOB = [
     'test_ctc_align',
     'test_imperative_save_load_v2',
     'test_decayed_adagrad_op',
-    'test_generator_dataloader',
     'test_dropout_op',
     'test_functional_conv3d',
-    'test_executor_return_tensor_not_overwriting',
     'test_flatten2_op',
     'test_fsp_op',
     'test_fusion_transpose_flatten_concat_op',
@@ -812,7 +794,6 @@ TWO_PARALLEL_JOB = [
     'test_temporal_shift_op',
     'test_case',
     'test_transformer_api',
-    'test_bmm_op',
     'test_adagrad_op',
     'test_batch_norm_mkldnn_op',
     'test_adam_op_multi_thread',
@@ -973,7 +954,6 @@ TWO_PARALLEL_JOB = [
     'test_auc_op',
     'test_adam_op',
     'test_bilinear_tensor_product_op',
-    'test_break_continue',
     'test_transpose_mkldnn_op',
     'test_callback_reduce_lr_on_plateau',
     'test_cast_op',
@@ -990,7 +970,6 @@ TWO_PARALLEL_JOB = [
     'test_functional_conv2d_transpose',
     'test_functional_conv3d_transpose',
     'test_dot_op',
-    'test_gru_op',
     'test_device',
     'test_imperative_layer_apply',
     'test_dataloader_early_reset',
@@ -1064,26 +1043,21 @@ TWO_PARALLEL_JOB = [
     'test_imperative_optimizer',
     'test_assign_value_op',
     'test_roi_pool_op',
-    'test_imperative_basic',
     'test_word2vec',
     'test_manual_seed',
-    'test_buffer_shared_memory_reuse_pass',
     'test_range',
     'test_box_decoder_and_assign_op',
     'test_imperative_optimizer_v2',
     'test_python_operator_overriding',
     'test_is_empty_op',
-    'test_imperative_qat',
     'test_py_reader_pin_memory',
     'test_train_recognize_digits',
     'test_parallel_executor_feed_persistable_var',
     'test_mnist',
     'test_update_loss_scaling_op',
     'test_rnn_cell_api',
-    'test_parallel_executor_fetch_isolated_var',
     'test_imperative_load_static_param',
     'test_fuse_bn_add_act_pass',
-    'test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass',
     'test_quantize_transpiler_v2',
     'paddle_infer_api_test',
     'test_analyzer_ernie',
diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh
index d2cefcc441f6c26f66ecb2fad4f570eb3b949d5c..a89dcb61fb7e327cc362c7ffa28eaca14a81112f 100644
--- a/tools/windows/run_unittests.sh
+++ b/tools/windows/run_unittests.sh
@@ -46,81 +46,44 @@ if [ ${WITH_GPU:-OFF} == "ON" ];then
     set -e
 fi

-
 # /*==================Fixed Disabled Windows GPU MKL unittests==============================*/
 # TODO: fix these unittest that is bound to fail
-diable_wingpu_test="^lite_mul_model_test$|\
-^test_analyzer_int8_resnet50$|\
-^test_gradient_clip$|\
-^test_translated_layer$|\
-^test_imperative_resnet$|\
-^test_imperative_resnet_sorted_gradient$|\
-^test_model$|\
+disable_wingpu_test="^test_model$|\
+^test_dataloader_early_reset$|\
+^test_add_reader_dependency$|\
 ^test_decoupled_py_reader$|\
 ^test_generator_dataloader$|\
-^test_multiprocess_dataloader_iterable_dataset_static$|\
+^test_parallel_dygraph_sync_batch_norm$|\
 ^test_py_reader_using_executor$|\
-^test_parallel_executor_feed_persistable_var$|\
-^test_parallel_executor_fetch_isolated_var$|\
-^test_parallel_executor_inference_feed_partial_data$|\
 ^test_parallel_executor_seresnext_base_gpu$|\
 ^test_parallel_executor_seresnext_with_fuse_all_reduce_gpu$|\
 ^test_parallel_executor_seresnext_with_reduce_gpu$|\
-^test_parallel_ssa_graph_inference_feed_partial_data$|\
-^test_sync_batch_norm_op$|\
-^test_fuse_relu_depthwise_conv_pass$|\
-^test_buffer_shared_memory_reuse_pass$|\
-^test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass$|\
-^test_dataloader_keep_order$|\
-^test_dataloader_unkeep_order$|\
-^test_add_reader_dependency$|\
-^test_cholesky_op$|\
-^test_dataloader_early_reset$|\
+^test_program_prune_backward$|\
 ^test_decoupled_py_reader_data_check$|\
 ^test_fleet_base_single$|\
-^test_fuse_optimizer_pass$|\
 ^test_multiprocess_dataloader_iterable_dataset_dynamic$|\
-^test_parallel_dygraph_sync_batch_norm$|\
-^test_partial_eager_deletion_transformer$|\
-^test_rnn_nets$|\
+^test_parallel_executor_feed_persistable_var$|\
+^test_parallel_executor_inference_feed_partial_data$|\
+^test_parallel_ssa_graph_inference_feed_partial_data$|\
 ^test_py_reader_combination$|\
 ^test_py_reader_pin_memory$|\
 ^test_py_reader_push_pop$|\
 ^test_reader_reset$|\
-^test_imperative_se_resnext$|\
+^test_sync_batch_norm_op$|\
 ^test_imperative_static_runner_while$|\
+^test_dataloader_keep_order$|\
+^test_dataloader_unkeep_order$|\
+^test_multiprocess_dataloader_iterable_dataset_static$|\
 ^test_fuse_bn_act_pass$|\
 ^test_fuse_bn_add_act_pass$|\
-^test_gru_rnn_op$|\
-^test_rnn_op$|\
-^test_simple_rnn_op$|\
-^test_lstm_cudnn_op$|\
-^test_crypto$|\
-^test_program_prune_backward$|\
-^test_imperative_ocr_attention_model$|\
-^test_sentiment$|\
-^test_imperative_basic$|\
-^test_jit_save_load$|\
-^test_imperative_mnist$|\
-^test_imperative_mnist_sorted_gradient$|\
-^test_imperative_static_runner_mnist$|\
-^test_fuse_all_reduce_pass$|\
-^test_bert$|\
-^test_lac$|\
-^test_mnist$|\
-^test_mobile_net$|\
-^test_ptb_lm$|\
-^test_ptb_lm_v2$|\
-^test_se_resnet$|\
-^test_imperative_qat_channelwise$|\
-^test_imperative_qat$|\
-^test_imperative_out_scale$|\
-^diable_wingpu_test$"
+^disable_wingpu_test$"
+
+
 # /*============================================================================*/

 # /*==================Fixed Disabled Windows CPU OPENBLAS unittests==============================*/
 # TODO: fix these unittest that is bound to fail
-diable_wincpu_test="^jit_kernel_test$|\
+disable_wincpu_test="^jit_kernel_test$|\
 ^test_analyzer_transformer$|\
 ^test_vision_models$|\
 ^test_dygraph_multi_forward$|\
@@ -134,10 +97,11 @@ diable_wincpu_test="^jit_kernel_test$|\
 ^test_mobile_net$|\
 ^test_resnet_v2$|\
 ^test_se_resnet$|\
-^diable_wincpu_test$"
+^disable_wincpu_test$"

 # these unittest that cost long time, diabled temporarily, Maybe moved to the night
 long_time_test="^best_fit_allocator_test$|\
+^test_gru_op$|\
 ^decorator_test$|\
 ^test_dataset_cifar$|\
 ^test_dataset_imdb$|\
@@ -223,7 +187,6 @@ long_time_test="^best_fit_allocator_test$|\
 ^test_strided_slice_op$"

 if [ ${WITH_GPU:-OFF} == "ON" ];then
-    export FLAGS_call_stack_level=2
     export FLAGS_fraction_of_gpu_memory_to_use=0.92
     export CUDA_VISIBLE_DEVICES=0

@@ -274,7 +237,7 @@ function collect_failed_tests() {

 function run_unittest_cpu() {
     tmpfile=$tmp_dir/$RANDOM
-    (ctest -E "$disable_ut_quickly|$diable_wincpu_test" -LE "${nightly_label}" --output-on-failure -C Release -j 8 | tee $tmpfile) &
+    (ctest -E "$disable_ut_quickly|$disable_wincpu_test" -LE "${nightly_label}" --output-on-failure -C Release -j 8 | tee $tmpfile) &
     wait;
 }

@@ -292,16 +255,11 @@ function run_unittest_gpu() {
     echo "************************************************************************"
     export CUDA_VISIBLE_DEVICES=0
     tmpfile=$tmp_dir/$RANDOM
-    (ctest -R "$test_case" -E "$disable_ut_quickly|$diable_wingpu_test|$long_time_test" -LE "${nightly_label}" --output-on-failure -C Release -j $parallel_job | tee $tmpfile ) &
+    (ctest -R "$test_case" -E "$disable_ut_quickly|$disable_wingpu_test|$long_time_test" -LE "${nightly_label}" --output-on-failure -C Release -j $parallel_job | tee $tmpfile ) &
     wait;
 }

 function unittests_retry(){
-    if [ "${WITH_GPU:-OFF}" == "ON" ];then
-        parallel_job=1
-    else
-        parallel_job=4
-    fi
     is_retry_execuate=0
     wintest_error=1
     retry_time=3
@@ -338,7 +296,7 @@ function unittests_retry(){
             echo "========================================="
             rm -f $tmp_dir/*
             failed_test_lists=''
-            (ctest -R "($retry_unittests_regular)" --output-on-failure -C Release -j $parallel_job| tee $tmpfile ) &
+            (ctest -R "($retry_unittests_regular)" --output-on-failure -C Release -j 1 | tee $tmpfile ) &
             wait;
             collect_failed_tests
             exec_times=$(echo $exec_times | awk '{print $0+1}')
@@ -382,6 +340,7 @@ function show_ut_retry_result() {

 set +e

+export FLAGS_call_stack_level=2
 if [ "${WITH_GPU:-OFF}" == "ON" ];then
     if [ -f "$PADDLE_ROOT/added_ut" ];then
         added_uts=^$(awk BEGIN{RS=EOF}'{gsub(/\n/,"$|^");print}' $PADDLE_ROOT/added_ut)$