diff --git a/CMakeLists.txt b/CMakeLists.txt
index 91820123da48315e9a0382db7805121ad3f68873..2faa0a2bbbcb3f7b1b30e5f8d11da6fee0a7407a 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -74,25 +74,39 @@ if(WIN32)
         endforeach(flag_var)
     endif()
 
-    # windows build turn off warnings.
+    # windows build turn off warnings, use parallel compiling.
     foreach(flag_var
         CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
         CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
         CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
         CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
         string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
+        set(${flag_var} "${${flag_var}} /MP /bigobj")
     endforeach(flag_var)
     foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
         set(${flag_var} "${${flag_var}} /w")
     endforeach(flag_var)
 
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP")
-    message(STATUS "Using parallel compiling (/MP)")
-    set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221")
-    set(CMAKE_STATIC_LINKER_FLAGS  "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
-    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
-    set(CMAKE_EXE_LINKER_FLAGS  "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
+    # Windows Remove /Zi, /ZI for Release, MinSizeRel builds
+    foreach(flag_var
+        CMAKE_C_FLAGS CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL
+        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL)
+        if(${flag_var} MATCHES "/Z[iI]")
+            string(REGEX REPLACE "/Z[iI]" "" ${flag_var} "${${flag_var}}")
+        endif()
+    endforeach(flag_var)
+
+    foreach(flag_var 
+        CMAKE_STATIC_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS 
+        CMAKE_EXE_LINKER_FLAGS)
+        set(${flag_var} "${${flag_var}} /IGNORE:4006 /IGNORE:4098 /ignore:4049 /IGNORE:4217 /IGNORE:4221")
+        if(${flag_var} MATCHES "/INCREMENTAL" AND NOT ${flag_var} MATCHES "/INCREMENTAL:NO")
+            string(REGEX REPLACE "/INCREMENTAL" "/INCREMENTAL:NO" ${flag_var} "${${flag_var}}")
+        endif()
+    endforeach(flag_var)
+
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838")
 else(WIN32)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations")
 endif(WIN32)
diff --git a/cmake/init.cmake b/cmake/init.cmake
index 5f36a9adf1ae63e83bbffa021562cfca867270ba..aea02088750df4edc71a4909489c8ba250c8bb64 100644
--- a/cmake/init.cmake
+++ b/cmake/init.cmake
@@ -1,7 +1,7 @@
 # Attention: cmake will append these flags to compile command automatically.
 # So if you want to add global option, change this file rather than flags.cmake
 
-# NOT WIN32
+# Linux
 # DEBUG:  default: "-g"
 # RELEASE:  default: "-O3 -DNDEBUG"
 # RELWITHDEBINFO: default: "-O2 -g -DNDEBUG"
@@ -17,6 +17,8 @@ if(NOT WIN32)
     set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG")
     set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
     set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG")
+else()
+    set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props)
 endif()
 
 if(WITH_GPU)
@@ -25,9 +27,3 @@ if(WITH_GPU)
     set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
     set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG")
 endif()
-
-if(WIN32)
-    set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props)
-    set(CMAKE_CXX_FLAGS_RELEASE "-O3 -Os -DNDEBUG")
-endif()
-
diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index d557cad1c4c6fc22190980f210959e292fadcb0b..450cb7546fd4c34326a6926d03f6ad2dcab7c843 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -1,3 +1,4 @@
+@ECHO OFF
 rem Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 rem
 rem Licensed under the Apache License, Version 2.0 (the "License");
@@ -22,15 +23,16 @@ setlocal
 rem -------clean up environment-----------
 set work_dir=%cd%
 set cache_dir=%work_dir:Paddle=cache%
+if not exist %cache_dir%\tools (
+    git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools
+)
 taskkill /f /im op_function_generator.exe
 wmic process where name="op_function_generator.exe" call terminate
 
 rem ------initialize common variable------
-if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0"
 if not defined BRANCH set BRANCH=develop
 if not defined TENSORRT_ROOT set TENSORRT_ROOT="C:/TensorRT-5.1.5.0"
 if not defined WITH_MKL set WITH_MKL=ON
-if not defined WITH_GPU set WITH_GPU=OFF
 if not defined WITH_AVX set WITH_AVX=ON
 if not defined WITH_TESTING set WITH_TESTING=ON
 if not defined WITH_PYTHON set WITH_PYTHON=ON
@@ -60,7 +62,7 @@ setlocal enabledelayedexpansion
 git show-ref --verify --quiet refs/heads/last_pr
 if %ERRORLEVEL% EQU 0 (
     git diff HEAD last_pr --stat --name-only
-    git diff HEAD last_pr --stat --name-only | findstr "cmake CMakeLists.txt paddle_build.bat"
+    git diff HEAD last_pr --stat --name-only | findstr ".cmake CMakeLists.txt paddle_build.bat"
     if !ERRORLEVEL! EQU 0 (
         rmdir build /s/q
     )
@@ -71,19 +73,19 @@ if %ERRORLEVEL% EQU 0 (
     git branch last_pr
 )
 
-for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%#
-set day_now=%datetime:~6,2%
-set day_before=-1
-set /p day_before=< %cache_dir%\day.txt
-if %day_now% NEQ %day_before% (
-    echo %day_now% > %cache_dir%\day.txt
-    type %cache_dir%\day.txt
-    rmdir build /s/q
-    goto :mkbuild
-)
+:: for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%#
+:: set day_now=%datetime:~6,2%
+:: set day_before=-1
+:: set /p day_before=< %cache_dir%\day.txt
+:: if %day_now% NEQ %day_before% (
+::     echo %day_now% > %cache_dir%\day.txt
+::     type %cache_dir%\day.txt
+::     rmdir build /s/q
+::     goto :mkbuild
+:: )
 
 :: git diff HEAD origin/develop --stat --name-only
-:: git diff HEAD origin/develop --stat --name-only | findstr "cmake CMakeLists.txt paddle_build.bat"
+:: git diff HEAD origin/develop --stat --name-only | findstr ".cmake CMakeLists.txt paddle_build.bat"
 :: if %ERRORLEVEL% EQU 0 (
 ::     rmdir build /s/q
 :: )
@@ -117,13 +119,12 @@ pip install gym --user
 pip install -U -r %work_dir%\python\requirements.txt --user
 pip install -U -r %work_dir%\python\unittest_py\requirements.txt --user
 if %ERRORLEVEL% NEQ 0 (
-    call paddle_winci\Scripts\deactivate.bat 2>NUL
     echo pip install requirements.txt failed!
     exit /b 7
 )
 
 rem ------pre install clcache and init config----------
-pip install clcache
+pip install clcache --user
 :: set USE_CLCACHE to enable clcache
 set USE_CLCACHE=1
 :: In some scenarios, CLCACHE_HARDLINK can save one file copy.
@@ -133,29 +134,9 @@ set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000
 :: set maximum cache size to 20G
 clcache.exe -M 21474836480
 
-rem ------set cache third_party------
-if not exist %cache_dir%\tools (
-    git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools
-)
-
-if "%WITH_TPCACHE%"=="OFF" (
-    set THIRD_PARTY_PATH=%work_dir:\=/%/build/third_party
-    goto :CASE_%1
-)
-
-echo set -ex > cache.sh
-echo md5_content=$(cat %work_dir:\=/%/cmake/external/*.cmake  ^|md5sum ^| awk '{print $1}') >> cache.sh
-echo echo ${md5_content}^>md5.txt >> cache.sh
-
-%cache_dir%\tools\busybox64.exe cat cache.sh
-%cache_dir%\tools\busybox64.exe bash cache.sh
-
-set /p md5=< md5.txt
-if "%WITH_GPU%"=="ON" (
-    set THIRD_PARTY_PATH=%cache_dir:\=/%/third_party_GPU/%md5%
-) else (
-    set THIRD_PARTY_PATH=%cache_dir:\=/%/third_party/%md5%
-)
+rem ------show summary of current environment----------
+python %work_dir%\tools\summary_env.py
+%cache_dir%\tools\busybox64.exe bash %work_dir%\tools\get_cpu_info.sh
 
 goto :CASE_%1
 
@@ -166,52 +147,88 @@ echo "wincheck_openbals: run Windows OPENBLAS/CPU CI tasks on Windows"
 exit /b 1
 
 :CASE_wincheck_mkl
+
+rem ------initialize cmake variable for mkl------
 set WITH_MKL=ON
 set WITH_GPU=OFF
 set MSVC_STATIC_CRT=ON
+
 call :cmake || goto cmake_error
 call :build || goto build_error
 call :test_whl_pacakage || goto test_whl_pacakage_error
 call :unit_test || goto unit_test_error
 call :test_inference || goto test_inference_error
-call :check_change_of_unittest || goto check_change_of_unittest_error
+:: call :check_change_of_unittest || goto check_change_of_unittest_error
 goto:success
 
 :CASE_wincheck_openblas
-set WITH_MKL=OFF
+
+rem ------initialize cmake variable for openblas------
+set WITH_MKL=ON
 set WITH_GPU=ON
 set MSVC_STATIC_CRT=OFF
 rem Temporarily turn off WITH_INFERENCE_API_TEST on GPU due to compile hang
 set WITH_INFERENCE_API_TEST=OFF
+
 call :cmake || goto cmake_error
 call :build || goto build_error
 call :test_whl_pacakage || goto test_whl_pacakage_error
+:: call :unit_test || goto unit_test_error
 :: call :test_inference || goto test_inference_error
+:: call :check_change_of_unittest || goto check_change_of_unittest_error
 goto:success
 
 rem "Other configurations are added here"
 rem :CASE_wincheck_others
 rem call ...
 
-
 rem ---------------------------------------------------------------------------------------------
 :cmake
 echo    ========================================
 echo    Step 1. Cmake ...
 echo    ========================================
+
 call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64
 
 for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%#
 set start=%start:~4,10%
+
+@ECHO ON
+if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0
+set PATH=%CUDA_TOOLKIT_ROOT_DIR%\bin;%CUDA_TOOLKIT_ROOT_DIR%\libnvvp;%PATH%
+set CUDA_PATH=%CUDA_TOOLKIT_ROOT_DIR%
+
+rem ------set third_party cache dir------
+
+if "%WITH_TPCACHE%"=="OFF" (
+    set THIRD_PARTY_PATH=%work_dir:\=/%/build/third_party
+    goto :cmake_impl
+)
+
+echo set -ex > cache.sh
+echo md5_content=$(cat %work_dir:\=/%/cmake/external/*.cmake  ^|md5sum ^| awk '{print $1}') >> cache.sh
+echo echo ${md5_content}^>md5.txt >> cache.sh
+
+%cache_dir%\tools\busybox64.exe cat cache.sh
+%cache_dir%\tools\busybox64.exe bash cache.sh
+
+set /p md5=< md5.txt
+if "%WITH_GPU%"=="ON" (
+    set THIRD_PARTY_PATH=%cache_dir:\=/%/third_party_GPU/%md5%
+) else (
+    set THIRD_PARTY_PATH=%cache_dir:\=/%/third_party/%md5%
+)
+
+:cmake_impl
 echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
--DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^
--DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
+-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DON_INFER=%ON_INFER% ^
+-DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
 -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^
 -DTENSORRT_ROOT=%TENSORRT_ROOT% -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT%
 
 cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
--DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^
--DON_INFER=%ON_INFER%  -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
+-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DON_INFER=%ON_INFER% ^
+-DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
 -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^
 -DTENSORRT_ROOT=%TENSORRT_ROOT% -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT%
 goto:eof
@@ -224,6 +241,7 @@ exit /b 7
 
 rem ---------------------------------------------------------------------------------------------
 :build
+@ECHO OFF
 echo    ========================================
 echo    Step 2. Buile Paddle ...
 echo    ========================================
@@ -270,6 +288,7 @@ exit /b 7
 
 rem ---------------------------------------------------------------------------------------------
 :test_whl_pacakage
+@ECHO OFF
 echo    ========================================
 echo    Step 3. Test pip install whl package ...
 echo    ========================================
@@ -282,7 +301,7 @@ call :timestamp "%start%" "%end%" "Build"
 tree /F %cd%\paddle_inference_install_dir\paddle
 %cache_dir%\tools\busybox64.exe du -h -d 0 -k %cd%\paddle_inference_install_dir\paddle\lib > lib_size.txt
 set /p libsize=< lib_size.txt
-@ECHO OFF
+
 for /F %%i in ("%libsize%") do (
     set /a libsize_m=%%i/1024
     echo "Windows Paddle_Inference Size: !libsize_m!M"
@@ -303,17 +322,19 @@ if %ERRORLEVEL% NEQ 0 (
     exit /b 1
 )
 
+set CUDA_VISIBLE_DEVICES=0
 python %work_dir%\paddle\scripts\installation_validate.py
 goto:eof
 
 :test_whl_pacakage_error
-echo 1 > %cache_dir%\error_code.txt
-type %cache_dir%\error_code.txt
+::echo 1 > %cache_dir%\error_code.txt
+::type %cache_dir%\error_code.txt
 echo Test import paddle failed, will exit!
 exit /b 1
 
 rem ---------------------------------------------------------------------------------------------
 :unit_test
+@ECHO OFF
 echo    ========================================
 echo    Step 4. Running unit tests ...
 echo    ========================================
@@ -339,6 +360,7 @@ if %errorlevel%==0 (
 set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\install\openblas\bin;^
 %THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;^
 %THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;%PATH%
+
 if "%NIGHTLY_MODE%"=="ON" (
     set nightly_label="()"
     ) else (
@@ -348,12 +370,82 @@ if "%NIGHTLY_MODE%"=="ON" (
     echo    ========================================
 )
 
+if "%WITH_GPU%"=="ON" (
+    goto:parallel_test_base_gpu
+) else (
+    goto:parallel_test_base_cpu
+)
+
+:parallel_test_base_gpu
+echo    ========================================
+echo    Running GPU unit tests in parallel way ...
+echo    ========================================
+
+set FLAGS_fraction_of_gpu_memory_to_use=0.75
+
+nvidia-smi -L
+for /F %%# in ('nvidia-smi -L ^| findstr "GPU" /C /I') do set CUDA_DEVICE_COUNT=%%#
+if !errorlevel! NEQ 0 exit /b 8
+
+rem TODO: fix these unittest that is bound to fail
+rem /*==================Disabled Windows==============================*/
+set diable_wingpu_test=tensor_util_test^|lod_tensor_test^|selected_rows_test^|broadcast_op_test^|fused_broadcast_op_test^|assign_op_test^|save_load_op_test^|save_load_combine_op_test^|im2col_test^|^
+beam_search_test^|test_analysis_predictor^|test_model^|test_add_reader_dependency^|test_bilateral_slice_op^|test_buffer_shared_memory_reuse_pass^|test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass^|^
+test_cholesky_op^|test_dataloader_early_reset^|test_dataloader_keep_order^|test_dataloader_unkeep_order^|test_decoupled_py_reader^|test_decoupled_py_reader_data_check^|test_eager_deletion_delete_vars^|^
+test_eager_deletion_while_op^|test_feed_data_check_shape_type^|test_fetch_lod_tensor_array^|test_fetch_unmerged^|test_fleet_base_single^|test_fuse_all_reduce_pass^|test_fuse_elewise_add_act_pass^|^
+test_fuse_optimizer_pass^|test_generator_dataloader^|test_gpu_package_without_gpu_device^|test_ir_memory_optimize_ifelse_op^|test_ir_memory_optimize_nlp^|test_lr_scheduler^|^
+test_multiprocess_dataloader_iterable_dataset_dynamic^|test_multiprocess_dataloader_iterable_dataset_static^|test_nvprof^|test_parallel_dygraph_sync_batch_norm^|test_parallel_executor_drop_scope^|^
+test_parallel_executor_dry_run^|test_parallel_executor_feed_persistable_var^|test_parallel_executor_fetch_isolated_var^|test_parallel_executor_inference_feed_partial_data^|test_parallel_executor_mnist^|^
+test_parallel_executor_seresnext_base_gpu^|test_parallel_executor_seresnext_with_fuse_all_reduce_gpu^|test_parallel_executor_seresnext_with_reduce_gpu^|test_parallel_executor_test_while_train^|^
+test_parallel_ssa_graph_inference_feed_partial_data^|test_partial_eager_deletion_transformer^|test_program_prune_backward^|test_prune^|test_py_reader_combination^|test_py_reader_pin_memory^|^
+test_py_reader_push_pop^|test_py_reader_using_executor^|test_reader_reset^|test_sync_batch_norm_op^|test_update_loss_scaling_op^|test_imperative_static_runner_while^|test_parallel_executor_crf^|^
+test_parallel_executor_profiler^|test_parallel_executor_transformer^|test_parallel_executor_transformer_auto_growth^|test_parallel_executor_seresnext_base_cpu^|test_yolov3^|^
+test_parallel_executor_seresnext_with_reduce_cpu^|test_parallel_executor_seresnext_with_fuse_all_reduce_cpu^|test_flags_use_mkldnn^|test_spawn_and_init_parallel_env^|test_train_recognize_digits^|^
+test_optimizer_in_control_flow^|test_fuse_bn_act_pass^|test_fuse_bn_add_act_pass^|test_activation_mkldnn_op^|test_tsm
+rem /*===============================================================*/
+
+rem these unittest that cost long time, diabled temporarily, greater than 10s
+set long_time_test=test_trilinear_interp_v2_op^|best_fit_allocator_test^|timer_test^|best_fit_allocator_test^|test_image_classification^|test_recognize_digits^|decorator_test^|test_callbacks^|^
+test_dataset_cifar^|test_dataset_imdb^|test_dataset_movielens^|test_datasets^|test_pretrained_model^|test_concat_op^|test_elementwise_add_op^|test_elementwise_sub_op^|test_gather_op^|test_gather_nd_op^|^
+test_sequence_concat^|test_sequence_conv^|test_sequence_pool^|test_sequence_slice_op^|test_space_to_depth_op^|test_activation_nn_grad^|test_activation_op^|test_auto_growth_gpu_memory_limit^|^
+test_bicubic_interp_op^|test_bicubic_interp_v2_op^|test_bilinear_interp_v2_op^|test_conv2d_op^|test_conv3d_op^|test_conv3d_transpose_part2_op^|test_conv_nn_grad^|test_crop_tensor_op^|^
+test_cross_entropy2_op^|test_cross_op^|test_deformable_conv_v1_op^|test_dropout_op^|test_dygraph_multi_forward^|test_elementwise_div_op^|test_elementwise_nn_grad^|test_empty_op^|^
+test_fused_elemwise_activation_op^|test_group_norm_op^|test_gru_op^|test_gru_unit_op^|test_imperative_lod_tensor_to_selected_rows^|test_imperative_optimizer^|test_imperative_ptb_rnn^|^
+test_imperative_save_load^|test_imperative_selected_rows_to_lod_tensor^|test_imperative_star_gan_with_gradient_penalty^|test_imperative_transformer_sorted_gradient^|test_layer_norm_op^|^
+test_lstm_cudnn_op^|test_masked_select_op^|test_matmul_v2_op^|test_multiclass_nms_op^|test_naive_best_fit_gpu_memory_limit^|test_nearest_interp_v2_op^|test_nn_grad^|test_norm_nn_grad^|^
+test_normal^|test_pool3d_op^|test_pool2d_op^|test_prroi_pool_op^|test_regularizer^|test_regularizer_api^|test_sgd_op^|test_softmax_with_cross_entropy_op^|test_static_save_load^|^
+test_trilinear_interp_op^|test_trilinear_interp_v2_op^|test_weight_decay^|test_bilinear_interp_op^|test_nearest_interp_op^|test_sequence_conv^|test_transformer^|test_imperative_out_scale^|^
+test_imperative_qat^|test_imperative_qat_channelwise^|test_quantization_pass^|test_beam_search_decoder^|test_argsort_op^|test_eager_deletion_gru_net^|test_lstmp_op^|test_label_semantic_roles^|^
+test_graph^|test_user_defined_quantization
+
+set /a end=CUDA_DEVICE_COUNT-1
+
+set parallel_test=''
+
+for /L %%# in (0,1,%end%) do (
+    set CUDA_VISIBLE_DEVICES=%%#
+    ctest.exe -I %%#,,%CUDA_DEVICE_COUNT% -R %parallel_test% -E "%disable_ut_quickly%|%diable_wingpu_test%|%long_time_test%" -LE %nightly_label% --output-on-failure -C Release -j 2 --repeat until-pass:4 after-timeout:4
+    if !errorlevel! NEQ 0 exit /b 8
+)
+
+for /L %%# in (0,1,%end%) do (
+    set CUDA_VISIBLE_DEVICES=%%#
+    ctest.exe -I %%#,,%CUDA_DEVICE_COUNT% -E "%disable_ut_quickly%|%parallel_test%|%diable_wingpu_test%|%long_time_test%" -LE %nightly_label% --output-on-failure -C Release -j 1 --repeat until-pass:4 after-timeout:4
+    if !errorlevel! NEQ 0 exit /b 8
+)
+goto:eof
+
+:parallel_test_base_cpu
+echo    ========================================
+echo    Running CPU unit tests in parallel way ...
+echo    ========================================
 ctest.exe -E "(%disable_ut_quickly%)" -LE %nightly_label% --output-on-failure -C Release -j 8 --repeat until-pass:4 after-timeout:4
+
 goto:eof
 
 :unit_test_error
-echo 8 > %cache_dir%\
-type %cache_dir%\error_code.txt
+:: echo 8 > %cache_dir%\error_code.txt
+:: type %cache_dir%\error_code.txt
 for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%#
 set end=%end:~4,10%
 call :timestamp "%start%" "%end%" "1 card TestCases Total"
@@ -363,6 +455,7 @@ exit /b 8
 
 rem ---------------------------------------------------------------------------------------------
 :test_inference
+@ECHO OFF
 echo    ========================================
 echo    Step 5. Testing fluid library for inference ...
 echo    ========================================
@@ -377,18 +470,18 @@ cd %work_dir%\paddle\fluid\inference\api\demo_ci
 goto:eof
 
 :test_inference_error
-echo 1 > %cache_dir%\error_code.txt
-type %cache_dir%\error_code.txt
+::echo 1 > %cache_dir%\error_code.txt
+::type %cache_dir%\error_code.txt
 echo Testing fluid library for inference failed!
 exit /b 1
 
 rem ---------------------------------------------------------------------------------------------
 :check_change_of_unittest
+@ECHO OFF
 echo    ========================================
 echo    Step 6. Check whether deleting a unit test ...
 echo    ========================================
 
-@ECHO OFF
 cd /d %work_dir%\build
 echo set -e>  check_change_of_unittest.sh
 echo set +x>> check_change_of_unittest.sh
@@ -398,6 +491,7 @@ echo BRANCH=%BRANCH%>>  check_change_of_unittest.sh
 echo if [ "${GITHUB_API_TOKEN}" == "" ] ^|^| [ "${GIT_PR_ID}" == "" ];then>> check_change_of_unittest.sh
 echo     exit 0 >>  check_change_of_unittest.sh
 echo fi>>  check_change_of_unittest.sh
+echo set -x>> check_change_of_unittest.sh
 echo cat ^<^<EOF>>  check_change_of_unittest.sh
 echo     ============================================ >>  check_change_of_unittest.sh
 echo     Generate unit tests.spec of this PR.         >>  check_change_of_unittest.sh
@@ -411,8 +505,8 @@ echo UPSTREAM_URL='https://github.com/PaddlePaddle/Paddle'>>  check_change_of_un
 echo origin_upstream_url=`git remote -v ^| awk '{print $1, $2}' ^| uniq ^| grep upstream ^| awk '{print $2}'`>>  check_change_of_unittest.sh
 echo if [ "$origin_upstream_url" == "" ]; then>>  check_change_of_unittest.sh
 echo     git remote add upstream $UPSTREAM_URL.git>>  check_change_of_unittest.sh
-echo elif [ "$origin_upstream_url" != "$UPSTREAM_URL" ] \>>  check_change_of_unittest.sh
-echo         ^&^& [ "$origin_upstream_url" != "$UPSTREAM_URL.git" ]; then>>  check_change_of_unittest.sh
+echo elif [ "$origin_upstream_url" ^!= "$UPSTREAM_URL" ] ^\>>  check_change_of_unittest.sh
+echo         ^&^& [ "$origin_upstream_url" ^!= "$UPSTREAM_URL.git" ]; then>>  check_change_of_unittest.sh
 echo     git remote remove upstream>>  check_change_of_unittest.sh
 echo     git remote add upstream $UPSTREAM_URL.git>>  check_change_of_unittest.sh
 echo fi>>  check_change_of_unittest.sh
@@ -422,9 +516,10 @@ echo fi>>  check_change_of_unittest.sh
 echo git checkout -b origin_pr >>  check_change_of_unittest.sh
 echo git checkout -f $BRANCH >>  check_change_of_unittest.sh
 echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
--DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^
--DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
--DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% >>  check_change_of_unittest.sh
+-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DON_INFER=%ON_INFER% ^
+-DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
+-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^
+-DTENSORRT_ROOT=%TENSORRT_ROOT% -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% >>  check_change_of_unittest.sh
 echo cat ^<^<EOF>>  check_change_of_unittest.sh
 echo     ============================================       >>  check_change_of_unittest.sh
 echo     Generate unit tests.spec of develop.               >>  check_change_of_unittest.sh
@@ -433,10 +528,11 @@ echo EOF>>  check_change_of_unittest.sh
 echo spec_path=$(pwd)/UNITTEST_DEV.spec>>  check_change_of_unittest.sh
 echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>>  check_change_of_unittest.sh
 echo unittest_spec_diff=`python $(pwd)/../tools/diff_unittest.py $(pwd)/UNITTEST_DEV.spec $(pwd)/UNITTEST_PR.spec`>>  check_change_of_unittest.sh
-echo if [ "$unittest_spec_diff" != "" ]; then>>  check_change_of_unittest.sh
-echo     # approval_user_list: XiaoguangHu01 46782768,luotao1 6836917,phlrain 43953930,lanxianghit 47554610, zhouwei25 52485244, kolinwei 22165420>>  check_change_of_unittest.sh
+echo if [ "$unittest_spec_diff" ^!= "" ]; then>>  check_change_of_unittest.sh
+echo     set +x>> check_change_of_unittest.sh
 echo     approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`>>  check_change_of_unittest.sh
-echo     if [ "$approval_line" != "" ]; then>>  check_change_of_unittest.sh
+echo     set -x>> check_change_of_unittest.sh
+echo     if [ "$approval_line" ^!= "" ]; then>>  check_change_of_unittest.sh
 echo         APPROVALS=`echo ${approval_line} ^|python $(pwd)/../tools/check_pr_approval.py 1 22165420 52485244 6836917`>>  check_change_of_unittest.sh
 echo         echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}">>  check_change_of_unittest.sh
 echo         if [ "${APPROVALS}" == "FALSE" ]; then>>  check_change_of_unittest.sh
@@ -458,13 +554,12 @@ echo git checkout -f origin_pr >>  check_change_of_unittest.sh
 goto:eof
 
 :check_change_of_unittest_error
-echo 1 > %cache_dir%\error_code.txt
-type %cache_dir%\error_code.txt
 exit /b 1
 
 
 :timestamp
 setlocal enabledelayedexpansion
+@ECHO OFF
 set start=%~1
 set dd=%start:~2,2%
 set /a dd=100%dd%%%100