Unverified commit eeca9639, authored by Zhou Wei, committed by GitHub

Polish Windows CI and open the normal GPU unittest on CI (#32794)

* fix windows CI

* fix windows CI
Parent 24ffcd0d
@@ -18,6 +18,16 @@ if(NOT WIN32)
     set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
     set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG")
 else()
+    set(CMAKE_C_FLAGS_DEBUG "/Zi /DEBUG")
+    set(CMAKE_C_FLAGS_RELEASE "/O2 /DNDEBUG")
+    set(CMAKE_C_FLAGS_RELWITHDEBINFO "/O2 /DNDEBUG")
+    set(CMAKE_C_FLAGS_MINSIZEREL "/Os /DNDEBUG")
+    set(CMAKE_CXX_FLAGS_DEBUG "/Zi /DEBUG")
+    set(CMAKE_CXX_FLAGS_RELEASE "/O2 /DNDEBUG")
+    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/O2 /DNDEBUG")
+    set(CMAKE_CXX_FLAGS_MINSIZEREL "/Os /DNDEBUG")
     # It can specify CUDA compile flag manualy,
     # its use is to remvoe /Zi to reduce GPU static library size. But it's dangerous
     # because CUDA will update by nvidia, then error will occur.
...
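For reference, the per-configuration MSVC switches pinned above behave roughly as follows (an illustrative summary, not part of the commit; note that /DEBUG is normally a linker option rather than a compiler option, so its effect inside the C/C++ flag sets should be treated with care):

    # Build-type -> compiler flags set by the CMake block above (MSVC).
    # /Zi       emit full debug information into a PDB
    # /O2       optimize for speed
    # /Os       favor smaller code size
    # /DNDEBUG  define NDEBUG, which disables assert()
    MSVC_FLAGS = {
        "Debug": "/Zi /DEBUG",
        "Release": "/O2 /DNDEBUG",
        "RelWithDebInfo": "/O2 /DNDEBUG",
        "MinSizeRel": "/Os /DNDEBUG",
    }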
@@ -128,14 +128,20 @@ if(WITH_PYTHON)
     else()
         set(op_function_generator_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}")
     endif()
+    file(TO_NATIVE_PATH ${op_function_generator_path} op_function_generator_path)
+    file(TO_NATIVE_PATH ${impl_file} impl_file)
+    file(TO_NATIVE_PATH ${tmp_impl_file} tmp_impl_file)
     file(WRITE ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat ""
         "set build_times=1\n"
         ":retry\n"
         "ECHO op_function_generator run %build_times% time\n"
-        "${op_function_generator_path}/op_function_generator ${impl_file}\n"
+        "if exist ${tmp_impl_file} del ${tmp_impl_file}\n"
+        "taskkill /f /im op_function_generator.exe 2>NUL\n"
+        "${op_function_generator_path}\\op_function_generator.exe ${tmp_impl_file}\n"
         "if %ERRORLEVEL% NEQ 0 (\n"
         "    set /a build_times=%build_times%+1\n"
-        "    if %build_times% GTR 5 (\n"
+        "    if %build_times% GEQ 3 (\n"
         "        exit /b 1\n"
         "    ) else (\n"
         "        goto :retry\n"
@@ -145,6 +151,8 @@ if(WITH_PYTHON)
     add_custom_command(TARGET op_function_generator POST_BUILD
         COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file}
+        COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}"
     )
     if(${CBLAS_PROVIDER} STREQUAL MKLML)
@@ -176,7 +184,7 @@ if(WITH_PYTHON)
         "${CMAKE_CURRENT_BINARY_DIR}/op_function_generator"
         "${tmp_impl_file}"
     COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file}
-    COMMENT "copy_if_different ${impl_file}"
+    COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}"
     VERBATIM
 )
 if(WITH_MKL)
...
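The generated .bat above gives op_function_generator a bounded number of attempts (the cap drops from build_times > 5 to build_times >= 3), and the new copy_if_different step promotes the temporary output only when its content actually changed, so an unchanged generated header no longer triggers a pybind rebuild. A rough Python equivalent of the pattern (hypothetical paths; the real mechanism is the CMake/batch code above):

    import filecmp
    import os
    import shutil
    import subprocess

    def generate_with_retry(exe, tmp_out, final_out, max_tries=3):
        """Run a flaky generator up to max_tries times, then promote its
        output only if it differs (mirrors copy_if_different)."""
        for attempt in range(1, max_tries + 1):
            if os.path.exists(tmp_out):
                os.remove(tmp_out)                   # start from a clean slate
            result = subprocess.run([exe, tmp_out])  # may fail spuriously on CI
            if result.returncode == 0:
                break
        else:
            raise RuntimeError(f"{exe} failed {max_tries} times")
        # Only touch the real file when content changed, preserving its
        # mtime so incremental builds don't recompile every includer.
        if not (os.path.exists(final_out)
                and filecmp.cmp(tmp_out, final_out, shallow=False)):
            shutil.copyfile(tmp_out, final_out)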
@@ -26,11 +26,10 @@ set cache_dir=%work_dir:Paddle=cache%
 if not exist %cache_dir%\tools (
     git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools
 )
-taskkill /f /im op_function_generator.exe 2>NUL
 taskkill /f /im cmake.exe 2>NUL
 taskkill /f /im MSBuild.exe 2>NUL
-taskkill /f /im CL.exe 2>NUL
-taskkill /f /im Lib.exe 2>NUL
+taskkill /f /im cl.exe 2>NUL
+taskkill /f /im lib.exe 2>NUL
 taskkill /f /im link.exe 2>NUL
 taskkill /f /im vctip.exe 2>NUL
 taskkill /f /im cvtres.exe 2>NUL
@@ -47,8 +46,8 @@ wmic process where name="op_function_generator.exe" call terminate 2>NUL
 wmic process where name="test_api_impl.exe" call terminate 2>NUL
 wmic process where name="cvtres.exe" call terminate 2>NUL
 wmic process where name="rc.exe" call terminate 2>NUL
-wmic process where name="CL.exe" call terminate 2>NUL
-wmic process where name="Lib.exe" call terminate 2>NUL
+wmic process where name="cl.exe" call terminate 2>NUL
+wmic process where name="lib.exe" call terminate 2>NUL
 wmic process where name="python.exe" call terminate 2>NUL

 rem ------initialize common variable------
@@ -79,6 +78,7 @@ if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37
 rem -------set cache build directory-----------
 rmdir build\python /s/q
+rmdir build\paddle\fluid\pybind /s/q
 rmdir build\paddle_install_dir /s/q
 rmdir build\paddle_inference_install_dir /s/q
 rmdir build\paddle_inference_c_install_dir /s/q
@@ -112,6 +112,17 @@ if %ERRORLEVEL% EQU 0 (
     git branch last_pr
 )
+
+for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%#
+set day_now=%datetime:~6,2%
+set day_before=-1
+set /p day_before=< %cache_dir%\day.txt
+if %day_now% NEQ %day_before% (
+    echo %day_now% > %cache_dir%\day.txt
+    type %cache_dir%\day.txt
+    rmdir build /s/q
+    goto :mkbuild
+)

 :: git diff HEAD origin/develop --stat --name-only
 :: git diff HEAD origin/develop --stat --name-only | findstr ".cmake CMakeLists.txt paddle_build.bat"
 :: if %ERRORLEVEL% EQU 0 (
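The new block above forces one full rebuild per calendar day: the current day-of-month is compared against %cache_dir%\day.txt, and when it differs the cached build directory is wiped. The same idea in Python (a sketch; the file name day.txt matches the script):

    import datetime
    import pathlib
    import shutil

    def maybe_wipe_build(cache_dir: pathlib.Path, build_dir: pathlib.Path):
        """Force one clean build per calendar day, as the batch code above does."""
        stamp = cache_dir / "day.txt"
        today = f"{datetime.date.today():%d}"  # day of month, e.g. "07"
        before = stamp.read_text().strip() if stamp.exists() else "-1"
        if today != before:
            stamp.write_text(today)
            shutil.rmtree(build_dir, ignore_errors=True)  # start clean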
@@ -137,10 +148,11 @@ goto :CASE_%1
 echo "Usage: paddle_build.bat [OPTION]"
 echo "OPTION:"
-echo "wincheck_mkl: run Windows MKL/GPU/UnitTest CI tasks on Windows"
-echo "wincheck_openbals: run Windows OPENBLAS/CPU CI tasks on Windows"
+echo "wincheck_mkl: run Windows MKL/GPU PR CI tasks on Windows"
+echo "wincheck_openbals: run Windows OPENBLAS/CPU PR CI tasks on Windows"
 echo "build_avx_whl: build Windows avx whl package on Windows"
 echo "build_no_avx_whl: build Windows no avx whl package on Windows"
+echo "build_inference_lib: build Windows inference library on Windows"
 exit /b 1

 rem ------PR CI windows check for MKL/GPU----------
@@ -200,6 +212,7 @@ goto:success
 rem ------Build windows inference library------
 :CASE_build_inference_lib
+set ON_INFER=ON
 set WITH_PYTHON=OFF
 set CUDA_ARCH_NAME=All
@@ -249,9 +262,10 @@ if "%WITH_GPU%"=="ON" (
 )

 rem ------initialize the python environment------
+@ECHO ON
 set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe
 set PATH=%PYTHON_ROOT%;%PYTHON_ROOT%\Scripts;%PATH%
-if %WITH_PYTHON% == "OFF" (
+if %WITH_PYTHON% == "ON" (
     where python
     where pip
     pip install wheel --user
@@ -373,6 +387,7 @@ set build_times=1
 rem clcache.exe -z

 rem -------clean up environment again-----------
+taskkill /f /im cmake.exe 2>NUL
 taskkill /f /im MSBuild.exe 2>NUL
 taskkill /f /im cl.exe 2>NUL
 taskkill /f /im lib.exe 2>NUL
@@ -387,12 +402,13 @@ taskkill /f /im cicc.exe 2>NUL
 taskkill /f /im ptxas.exe 2>NUL
 taskkill /f /im test_api_impl.exe 2>NUL
 taskkill /f /im op_function_generator.exe 2>NUL
+wmic process where name="cmake.exe" call terminate 2>NUL
 wmic process where name="op_function_generator.exe" call terminate 2>NUL
 wmic process where name="test_api_impl.exe" call terminate 2>NUL
 wmic process where name="cvtres.exe" call terminate 2>NUL
 wmic process where name="rc.exe" call terminate 2>NUL
-wmic process where name="CL.exe" call terminate 2>NUL
-wmic process where name="Lib.exe" call terminate 2>NUL
+wmic process where name="cl.exe" call terminate 2>NUL
+wmic process where name="lib.exe" call terminate 2>NUL

 echo Build Paddle the %build_times% time:
 if %GENERATOR% == "Ninja" (
@@ -766,8 +782,8 @@ wmic process where name="op_function_generator.exe" call terminate 2>NUL
 wmic process where name="test_api_impl.exe" call terminate 2>NUL
 wmic process where name="cvtres.exe" call terminate 2>NUL
 wmic process where name="rc.exe" call terminate 2>NUL
-wmic process where name="CL.exe" call terminate 2>NUL
-wmic process where name="Lib.exe" call terminate 2>NUL
+wmic process where name="cl.exe" call terminate 2>NUL
+wmic process where name="lib.exe" call terminate 2>NUL
 wmic process where name="python.exe" call terminate 2>NUL

 echo Windows CI run successfully!
 exit /b 0
...
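Before and during the build, the script force-kills every tool that could be left over from a previous CI run, now including cmake.exe, since a stale process can hold file locks on the build tree and fail the next job. A Python sketch of that cleanup (process names taken from the script; taskkill is the same Windows command it uses):

    import subprocess

    # Processes that commonly survive a cancelled Windows CI run and keep
    # file locks on the build tree (names taken from the script above).
    STALE = ["cmake.exe", "MSBuild.exe", "cl.exe", "lib.exe", "link.exe",
             "op_function_generator.exe", "test_api_impl.exe"]

    def kill_stale_processes():
        for name in STALE:
            # The script calls both taskkill and wmic; either alone usually
            # suffices, the duplication is defensive.
            subprocess.run(["taskkill", "/f", "/im", name],
                           capture_output=True)  # ignore "not found" errors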
@@ -1262,11 +1262,13 @@ set +x
         testcase=''
     done <<< "$test_cases";

-    card_test "$single_card_tests_high_parallel" 1 8    # run cases the most each time with single GPU
+    card_test "$single_card_tests_high_parallel" 1 6    # run cases the most each time with single GPU
     card_test "$single_card_tests_two_parallel" 1 2     # run cases 2 job each time with single GPU
     card_test "$single_card_tests_non_parallel" 1       # run cases 1 job each time with single GPU
     card_test "$multiple_card_tests_two_parallel" 2 2   # run cases 2 job each time with two GPUs
     card_test "$multiple_card_tests_non_parallel" 2     # run cases 1 job each time with two GPUs
     card_test "$exclusive_tests_two_parallel" -1 2      # run cases exclusively, in this cases would be run with 2/4/8 GPUs
     card_test "$exclusive_tests_non_parallel" -1        # run cases exclusively, in this cases would be run with 2/4/8 GPUs
     collect_failed_tests
...
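card_test takes a list of tests, the number of GPU cards to expose, and an optional per-batch concurrency; the change lowers the most aggressive single-card bucket from 8 concurrent jobs to 6. A sketch of what such a helper does (illustrative Python; the real card_test is a bash function in paddle_build.sh):

    import os
    import subprocess

    def card_test(tests, cards, parallel=1):
        """Run `parallel` ctest jobs at a time, pinned to `cards` GPUs
        (-1 means use all visible GPUs, exclusively)."""
        if cards > 0:
            visible = ",".join(str(i) for i in range(cards))
        else:
            visible = os.environ.get("CUDA_VISIBLE_DEVICES", "0,1")
        regex = "^(" + "|".join(tests) + ")$"
        subprocess.run(
            ["ctest", "-R", regex, "-j", str(parallel), "--output-on-failure"],
            env={**os.environ, "CUDA_VISIBLE_DEVICES": visible},
        )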
@@ -77,7 +77,11 @@ class DataLoaderKeepOrderTestBase(unittest.TestCase):
     def get_places(self):
         place_list = [fluid.cpu_places(1), fluid.cpu_places(4)]
         if fluid.is_compiled_with_cuda():
-            place_list.extend([fluid.cuda_places(0), fluid.cuda_places([0, 1])])
+            if os.name == "nt":
+                place_list.extend([fluid.cuda_places(0)])
+            else:
+                place_list.extend(
+                    [fluid.cuda_places(0), fluid.cuda_places([0, 1])])
         return place_list

     def test_main(self):
...
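Both dataloader-order tests now request the two-GPU place list only off Windows, because the Windows CI machines expose a single GPU. An alternative guard (a sketch, not what the commit does) would key on the visible device count rather than the OS:

    import paddle.fluid as fluid

    def get_cuda_place_list():
        # fluid.cuda_places() with no argument returns one place per visible
        # GPU, so its length works as a device count.
        place_list = [fluid.cuda_places(0)]
        if len(fluid.cuda_places()) >= 2:
            place_list.append(fluid.cuda_places([0, 1]))
        return place_list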
@@ -96,7 +96,11 @@ class DataLoaderKeepOrderTestBase(unittest.TestCase):
     def get_places(self):
         place_list = [fluid.cpu_places(1), fluid.cpu_places(4)]
         if fluid.is_compiled_with_cuda():
-            place_list.extend([fluid.cuda_places(0), fluid.cuda_places([0, 1])])
+            if os.name == "nt":
+                place_list.extend([fluid.cuda_places(0)])
+            else:
+                place_list.extend(
+                    [fluid.cuda_places(0), fluid.cuda_places([0, 1])])
         return place_list

     def test_main(self):
...
@@ -17,6 +17,7 @@ import numpy as np
 import six
 import paddle.fluid as fluid
 import paddle
+import os

 def enable_parallel_ssa_executor(enabled=True):
@@ -65,6 +66,9 @@ class TestParallelExecutorFetchIsolatedVarBase(unittest.TestCase):
             if fluid.core.globals()[
                     'FLAGS_enable_parallel_graph'] and not use_gpu:
                 return
+            # windows has only 1 GPU
+            if use_gpu and dev_cnt > 1 and os.name == "nt":
+                return
         else:
             if use_gpu:
                 return
...
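Here the guard is an early return inside the test body, which skips the multi-GPU combination silently. A more explicit variant (a sketch, not the commit's approach) is a skip decorator, which records the skip in the test report; run_impl below is a hypothetical helper standing in for the test's real driver:

    import os
    import unittest

    class TestFetchIsolatedVar(unittest.TestCase):
        @unittest.skipIf(os.name == "nt",
                         "Windows CI machines expose only one GPU")
        def test_multi_gpu_fetch(self):
            self.run_impl(use_gpu=True, dev_cnt=2)  # hypothetical helper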
@@ -131,7 +131,6 @@ CPU_PARALLEL_JOB = [
     'test_ones_op',
     'test_npair_loss_op',
     'test_nn_functional_embedding_static',
-    'test_nce',
     'test_name_scope',
     'test_naive_executor',
     'test_multiprocess_dataloader_iterable_dataset_split',
@@ -293,8 +292,6 @@ CPU_PARALLEL_JOB = [
     'test_dataset_imdb',
     'test_dataset_conll05',
     'test_dataset_cifar',
-    'test_dataloader_unkeep_order',
-    'test_dataloader_keep_order',
     'test_dataloader_dataset',
     'test_data_generator',
     'test_data_feeder',
@@ -571,8 +568,6 @@ CPU_PARALLEL_JOB = [
     'test_fleet_cc',
     'test_repeated_fc_relu_fuse_pass_cc',
     'heter_server_test',
-    'test_static_save_load_large',
-    'graph_node_test',
     'test_custom_conj',
     'test_fleet_private_function',
     'test_fake_init_op',
@@ -604,27 +599,21 @@ CPU_PARALLEL_JOB = [
 # It run 4 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED,
 # just remove it from this list.
 TETRAD_PARALLEL_JOB = [
+    'graph_node_test',
+    'test_assert',
+    'test_nce',
     'buffered_allocator_test',
     'allocator_facade_frac_flags_test',
     'cuda_helper_test',
-    'sequence_padding_test',
     'test_auto_growth_gpu_memory_limit',
-    'test_imperative_framework',
     'device_context_test',
     'test_reference_count_pass_last_lived_ops',
     'copy_same_tensor_test',
-    'float16_gpu_test',
-    'test_leaky_relu_grad_grad_functor',
-    'sequence_pooling_test',
     'mixed_vector_test',
     'op_registry_test',
-    'strided_memcpy_test',
-    'selected_rows_functor_gpu_test',
     'test_prepare_op',
     'data_device_transform_test',
-    'test_tensor_to_numpy',
     'test_naive_best_fit_gpu_memory_limit',
-    'vol2col_test',
     'test_imperative_using_non_zero_gpu',
     'retry_allocator_test',
     'system_allocator_test',
@@ -659,23 +648,16 @@ TETRAD_PARALLEL_JOB = [
     'test_analyzer_paddletensor_tensor',
     'test_analyzer_bert',
     'test_analyzer_googlenet',
-    'zero_copy_tensor_test',
-    'custom_tensor_test',
     'test_fleet_base',
     'test_imperative_container_layerdict',
-    'test_complex_simplenet',
-    'test_tensor_register_hook',
     'test_set_value_op',
-    'test_tensor_type_promotion',
     'test_view_op_reuse_allocation',
-    'test_complex_grad_accumulated',
     'test_sequential',
     'test_sequential',
     'test_imperative_layers',
     'test_dgc_momentum_op',
     'test_memcpy_op',
     'test_dgc_op',
-    'test_modelaverage',
     'test_lookahead',
     'test_callback_visualdl',
     'test_new_group_api',
@@ -684,32 +666,37 @@ TETRAD_PARALLEL_JOB = [
     'test_collective_split_row_linear',
     'test_collective_split_col_linear',
     'test_collective_split_embedding',
+    'test_custom_attrs_jit',
+    'float16_gpu_test',
+    'test_leaky_relu_grad_grad_functor',
+    'test_complex_simplenet',
+    'selected_rows_functor_gpu_test',
+    'test_imperative_framework',
 ]

 # It run 2 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED,
 # just remove it from this list.
 TWO_PARALLEL_JOB = [
+    'test_tensor_to_numpy',
+    'zero_copy_tensor_test',
+    'sequence_pooling_test',
+    'sequence_padding_test',
+    'vol2col_test',
     'convert_model2dot_ernie',
     'im2col_test',
+    'test_elementwise_add_grad_grad',
     'test_logical_op',
     'test_imperative_mnist',
     'test_imperative_deepcf',
     'test_cholesky_op',
-    'test_multiprocess_dataloader_iterable_dataset_static',
     'test_sample_logits_op',
     'test_ir_fc_fuse_pass',
-    'test_imperative_qat_channelwise',
     'test_fleet_base_single',
-    'test_imperative_out_scale',
     'test_multiprocess_dataloader_iterable_dataset_dynamic',
     'test_fill_op',
     'test_slice_op',
     'test_cond',
-    'test_compiled_program',
     'test_lstm',
     'test_ema',
-    'test_py_reader_using_executor',
     'test_nan_inf',
     'test_isinstance',
     'test_jit_save_load',
@@ -749,13 +736,11 @@ TWO_PARALLEL_JOB = [
     'test_anchor_generator_op',
     'test_imperative_ptb_rnn',
     'test_gather_nd_op',
-    'test_flatten_contiguous_range_op',
     'test_network_with_dtype',
     'test_elementwise_sub_op',
     'test_assert_op',
     'test_elementwise_div_op',
     'test_gather_tree_op',
-    'test_decoupled_py_reader',
     'test_imperative_named_members',
     'test_seqconv_eltadd_relu_fuse_pass',
     'test_analysis_predictor',
@@ -771,7 +756,6 @@ TWO_PARALLEL_JOB = [
     'test_traced_layer_err_msg',
     'test_unique_with_counts',
     'test_auc_single_pred_op',
-    'test_stack_op',
     'test_conv_bn_fuse_pass',
     'test_instance_norm_op_v2',
     'test_softmax_bf16_mkldnn_op',
@@ -793,10 +777,8 @@ TWO_PARALLEL_JOB = [
     'test_ctc_align',
     'test_imperative_save_load_v2',
     'test_decayed_adagrad_op',
-    'test_generator_dataloader',
     'test_dropout_op',
     'test_functional_conv3d',
-    'test_executor_return_tensor_not_overwriting',
     'test_flatten2_op',
     'test_fsp_op',
     'test_fusion_transpose_flatten_concat_op',
@@ -812,7 +794,6 @@ TWO_PARALLEL_JOB = [
     'test_temporal_shift_op',
     'test_case',
     'test_transformer_api',
-    'test_bmm_op',
     'test_adagrad_op',
     'test_batch_norm_mkldnn_op',
     'test_adam_op_multi_thread',
@@ -973,7 +954,6 @@ TWO_PARALLEL_JOB = [
     'test_auc_op',
     'test_adam_op',
     'test_bilinear_tensor_product_op',
-    'test_break_continue',
     'test_transpose_mkldnn_op',
     'test_callback_reduce_lr_on_plateau',
     'test_cast_op',
@@ -990,7 +970,6 @@ TWO_PARALLEL_JOB = [
     'test_functional_conv2d_transpose',
     'test_functional_conv3d_transpose',
     'test_dot_op',
-    'test_gru_op',
     'test_device',
     'test_imperative_layer_apply',
     'test_dataloader_early_reset',
@@ -1064,26 +1043,21 @@ TWO_PARALLEL_JOB = [
     'test_imperative_optimizer',
     'test_assign_value_op',
     'test_roi_pool_op',
-    'test_imperative_basic',
     'test_word2vec',
     'test_manual_seed',
-    'test_buffer_shared_memory_reuse_pass',
     'test_range',
     'test_box_decoder_and_assign_op',
     'test_imperative_optimizer_v2',
     'test_python_operator_overriding',
     'test_is_empty_op',
-    'test_imperative_qat',
     'test_py_reader_pin_memory',
     'test_train_recognize_digits',
     'test_parallel_executor_feed_persistable_var',
     'test_mnist',
     'test_update_loss_scaling_op',
     'test_rnn_cell_api',
-    'test_parallel_executor_fetch_isolated_var',
     'test_imperative_load_static_param',
     'test_fuse_bn_add_act_pass',
-    'test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass',
     'test_quantize_transpiler_v2',
     'paddle_infer_api_test',
     'test_analyzer_ernie',
...
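The three lists classify Windows unittests by how many of them can safely share a GPU at once; this commit mostly reshuffles tests between the 4-way and 2-way buckets while enrolling newly enabled tests. A hypothetical consumer showing what membership means (the real consumers are the CI scripts; the import path assumes the file above is tools/parallel_UT_rule.py):

    from parallel_UT_rule import (CPU_PARALLEL_JOB, TETRAD_PARALLEL_JOB,
                                  TWO_PARALLEL_JOB)

    def parallel_level(test_name):
        if test_name in CPU_PARALLEL_JOB:
            return "cpu-only, safe to run widely in parallel"
        if test_name in TETRAD_PARALLEL_JOB:
            return "light GPU test, 4 jobs at a time"
        if test_name in TWO_PARALLEL_JOB:
            return "heavier GPU test, 2 jobs at a time"
        return "run serially (default for unlisted tests)"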
@@ -46,81 +46,44 @@ if [ ${WITH_GPU:-OFF} == "ON" ];then
     set -e
 fi

 # /*==================Fixed Disabled Windows GPU MKL unittests==============================*/
 # TODO: fix these unittest that is bound to fail
-diable_wingpu_test="^lite_mul_model_test$|\
-^test_analyzer_int8_resnet50$|\
-^test_gradient_clip$|\
-^test_translated_layer$|\
-^test_imperative_resnet$|\
-^test_imperative_resnet_sorted_gradient$|\
-^test_model$|\
+disable_wingpu_test="^test_model$|\
+^test_dataloader_early_reset$|\
+^test_add_reader_dependency$|\
 ^test_decoupled_py_reader$|\
 ^test_generator_dataloader$|\
-^test_multiprocess_dataloader_iterable_dataset_static$|\
+^test_parallel_dygraph_sync_batch_norm$|\
 ^test_py_reader_using_executor$|\
-^test_parallel_executor_feed_persistable_var$|\
-^test_parallel_executor_fetch_isolated_var$|\
-^test_parallel_executor_inference_feed_partial_data$|\
 ^test_parallel_executor_seresnext_base_gpu$|\
 ^test_parallel_executor_seresnext_with_fuse_all_reduce_gpu$|\
 ^test_parallel_executor_seresnext_with_reduce_gpu$|\
-^test_parallel_ssa_graph_inference_feed_partial_data$|\
-^test_sync_batch_norm_op$|\
-^test_fuse_relu_depthwise_conv_pass$|\
-^test_buffer_shared_memory_reuse_pass$|\
-^test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass$|\
-^test_dataloader_keep_order$|\
-^test_dataloader_unkeep_order$|\
-^test_add_reader_dependency$|\
-^test_cholesky_op$|\
-^test_dataloader_early_reset$|\
+^test_program_prune_backward$|\
 ^test_decoupled_py_reader_data_check$|\
 ^test_fleet_base_single$|\
-^test_fuse_optimizer_pass$|\
 ^test_multiprocess_dataloader_iterable_dataset_dynamic$|\
-^test_parallel_dygraph_sync_batch_norm$|\
-^test_partial_eager_deletion_transformer$|\
-^test_rnn_nets$|\
+^test_parallel_executor_feed_persistable_var$|\
+^test_parallel_executor_inference_feed_partial_data$|\
+^test_parallel_ssa_graph_inference_feed_partial_data$|\
 ^test_py_reader_combination$|\
 ^test_py_reader_pin_memory$|\
 ^test_py_reader_push_pop$|\
 ^test_reader_reset$|\
-^test_imperative_se_resnext$|\
+^test_sync_batch_norm_op$|\
 ^test_imperative_static_runner_while$|\
-^test_dataloader_keep_order$|\
-^test_dataloader_unkeep_order$|\
-^test_multiprocess_dataloader_iterable_dataset_static$|\
 ^test_fuse_bn_act_pass$|\
 ^test_fuse_bn_add_act_pass$|\
-^test_gru_rnn_op$|\
-^test_rnn_op$|\
-^test_simple_rnn_op$|\
-^test_lstm_cudnn_op$|\
-^test_crypto$|\
-^test_program_prune_backward$|\
-^test_imperative_ocr_attention_model$|\
-^test_sentiment$|\
-^test_imperative_basic$|\
-^test_jit_save_load$|\
-^test_imperative_mnist$|\
-^test_imperative_mnist_sorted_gradient$|\
-^test_imperative_static_runner_mnist$|\
-^test_fuse_all_reduce_pass$|\
-^test_bert$|\
-^test_lac$|\
-^test_mnist$|\
-^test_mobile_net$|\
-^test_ptb_lm$|\
-^test_ptb_lm_v2$|\
-^test_se_resnet$|\
-^test_imperative_qat_channelwise$|\
-^test_imperative_qat$|\
-^test_imperative_out_scale$|\
-^diable_wingpu_test$"
+^disable_wingpu_test$"

 # /*============================================================================*/

 # /*==================Fixed Disabled Windows CPU OPENBLAS unittests==============================*/
 # TODO: fix these unittest that is bound to fail
-diable_wincpu_test="^jit_kernel_test$|\
+disable_wincpu_test="^jit_kernel_test$|\
 ^test_analyzer_transformer$|\
 ^test_vision_models$|\
 ^test_dygraph_multi_forward$|\
@@ -134,10 +97,11 @@ disable_wincpu_test="^jit_kernel_test$|\
 ^test_mobile_net$|\
 ^test_resnet_v2$|\
 ^test_se_resnet$|\
-^diable_wincpu_test$"
+^disable_wincpu_test$"

 # these unittest that cost long time, diabled temporarily, Maybe moved to the night
 long_time_test="^best_fit_allocator_test$|\
+^test_gru_op$|\
 ^decorator_test$|\
 ^test_dataset_cifar$|\
 ^test_dataset_imdb$|\
@@ -223,7 +187,6 @@ long_time_test="^best_fit_allocator_test$|\
 ^test_strided_slice_op$"

 if [ ${WITH_GPU:-OFF} == "ON" ];then
-    export FLAGS_call_stack_level=2
     export FLAGS_fraction_of_gpu_memory_to_use=0.92
     export CUDA_VISIBLE_DEVICES=0
@@ -274,7 +237,7 @@ function collect_failed_tests() {
 function run_unittest_cpu() {
     tmpfile=$tmp_dir/$RANDOM
-    (ctest -E "$disable_ut_quickly|$diable_wincpu_test" -LE "${nightly_label}" --output-on-failure -C Release -j 8 | tee $tmpfile) &
+    (ctest -E "$disable_ut_quickly|$disable_wincpu_test" -LE "${nightly_label}" --output-on-failure -C Release -j 8 | tee $tmpfile) &
     wait;
 }
@@ -292,16 +255,11 @@ function run_unittest_gpu() {
     echo "************************************************************************"
     export CUDA_VISIBLE_DEVICES=0
     tmpfile=$tmp_dir/$RANDOM
-    (ctest -R "$test_case" -E "$disable_ut_quickly|$diable_wingpu_test|$long_time_test" -LE "${nightly_label}" --output-on-failure -C Release -j $parallel_job | tee $tmpfile ) &
+    (ctest -R "$test_case" -E "$disable_ut_quickly|$disable_wingpu_test|$long_time_test" -LE "${nightly_label}" --output-on-failure -C Release -j $parallel_job | tee $tmpfile ) &
     wait;
 }

 function unittests_retry(){
-    if [ "${WITH_GPU:-OFF}" == "ON" ];then
-        parallel_job=1
-    else
-        parallel_job=4
-    fi
     is_retry_execuate=0
     wintest_error=1
     retry_time=3
@@ -338,7 +296,7 @@ function unittests_retry(){
         echo "========================================="
         rm -f $tmp_dir/*
         failed_test_lists=''
-        (ctest -R "($retry_unittests_regular)" --output-on-failure -C Release -j $parallel_job| tee $tmpfile ) &
+        (ctest -R "($retry_unittests_regular)" --output-on-failure -C Release -j 1 | tee $tmpfile ) &
         wait;
         collect_failed_tests
         exec_times=$(echo $exec_times | awk '{print $0+1}')
@@ -382,6 +340,7 @@ function show_ut_retry_result() {
 set +e
+export FLAGS_call_stack_level=2
 if [ "${WITH_GPU:-OFF}" == "ON" ];then
     if [ -f "$PADDLE_ROOT/added_ut" ];then
         added_uts=^$(awk BEGIN{RS=EOF}'{gsub(/\n/,"$|^");print}' $PADDLE_ROOT/added_ut)$
...
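unittests_retry previously chose its rerun concurrency from WITH_GPU; it now always reruns failures with ctest -j 1, trading speed for stability. A Python sketch of that retry loop (collect_failed is a hypothetical helper standing in for the script's log parsing):

    import subprocess

    def unittests_retry(failed_tests, retry_time=3):
        """Re-run failing tests serially up to retry_time times; a test
        that eventually passes is treated as flaky rather than broken."""
        for _ in range(retry_time):
            if not failed_tests:
                return True
            regular = "^(" + "|".join(failed_tests) + ")$"
            # -j 1 (the change above): serial re-runs avoid the GPU-memory
            # contention that often caused the original failure.
            subprocess.run(["ctest", "-R", regular, "--output-on-failure",
                            "-C", "Release", "-j", "1"])
            failed_tests = collect_failed(failed_tests)  # hypothetical helper
        return not failed_tests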