diff --git a/examples/aishell/asr0/local/train.sh b/examples/aishell/asr0/local/train.sh index 256b30d22277e707d63cfc0ea47465b8191c2c3a..2b71b7f763fee57cde35aa402de7d512c8aa099c 100755 --- a/examples/aishell/asr0/local/train.sh +++ b/examples/aishell/asr0/local/train.sh @@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True fi +# default memory allocator strategy may cause gpu training to hang +# because no OOM is raised when memory is exhausted +export FLAGS_allocator_strategy=naive_best_fit + if [ ${ngpu} == 0 ]; then python3 -u ${BIN_DIR}/train.py \ --ngpu ${ngpu} \ diff --git a/examples/aishell/asr1/local/train.sh b/examples/aishell/asr1/local/train.sh index f514de303b2b270e9090d655ab9eb4156231e959..bfa8dd97d445c858950642ab0603f94a1357d873 100755 --- a/examples/aishell/asr1/local/train.sh +++ b/examples/aishell/asr1/local/train.sh @@ -35,6 +35,10 @@ echo ${ips_config} mkdir -p exp +# default memory allocator strategy may cause gpu training to hang +# because no OOM is raised when memory is exhausted +export FLAGS_allocator_strategy=naive_best_fit + if [ ${ngpu} == 0 ]; then python3 -u ${BIN_DIR}/train.py \ --ngpu ${ngpu} \ diff --git a/examples/librispeech/asr0/local/train.sh b/examples/librispeech/asr0/local/train.sh index 71659e28dd97fb4a55a1b86785dc9d0b71e32143..bb41fd5549e6a7d7bc85da7a1228c7d2407c3614 100755 --- a/examples/librispeech/asr0/local/train.sh +++ b/examples/librispeech/asr0/local/train.sh @@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True fi +# default memory allocator strategy may cause gpu training to hang +# because no OOM is raised when memory is exhausted +export FLAGS_allocator_strategy=naive_best_fit + if [ ${ngpu} == 0 ]; then python3 -u ${BIN_DIR}/train.py \ --ngpu ${ngpu} \ diff --git a/examples/librispeech/asr1/local/train.sh b/examples/librispeech/asr1/local/train.sh index f729ed22c997df13335f8d22048d721e57e777b1..e274b91339e1f7b73f0318f294cfb4860bb0d06e 100755 --- a/examples/librispeech/asr1/local/train.sh +++ 
b/examples/librispeech/asr1/local/train.sh @@ -29,6 +29,10 @@ fi # export FLAGS_cudnn_exhaustive_search=true # export FLAGS_conv_workspace_size_limit=4000 +# default memory allocator strategy may cause gpu training to hang +# because no OOM is raised when memory is exhausted +export FLAGS_allocator_strategy=naive_best_fit + if [ ${ngpu} == 0 ]; then python3 -u ${BIN_DIR}/train.py \ --ngpu ${ngpu} \ diff --git a/examples/librispeech/asr2/local/train.sh b/examples/librispeech/asr2/local/train.sh index 1f414ad41caa9e067ad74796688df8e6d59a99e1..c2f2d4b650e21381b04a3d296ae87b9f9fa2d31b 100755 --- a/examples/librispeech/asr2/local/train.sh +++ b/examples/librispeech/asr2/local/train.sh @@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True fi +# default memory allocator strategy may cause gpu training to hang +# because no OOM is raised when memory is exhausted +export FLAGS_allocator_strategy=naive_best_fit + if [ ${ngpu} == 0 ]; then python3 -u ${BIN_DIR}/train.py \ --ngpu ${ngpu} \ diff --git a/examples/timit/asr1/local/train.sh b/examples/timit/asr1/local/train.sh index 661407582cc2776e1676249cbe002bcef8c3c2e7..1088c7ffab4fe6fe0f694d3c359adec4d2d58ca2 100755 --- a/examples/timit/asr1/local/train.sh +++ b/examples/timit/asr1/local/train.sh @@ -19,6 +19,10 @@ if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True fi +# default memory allocator strategy may cause gpu training to hang +# because no OOM is raised when memory is exhausted +export FLAGS_allocator_strategy=naive_best_fit + if [ ${ngpu} == 0 ]; then python3 -u ${BIN_DIR}/train.py \ --ngpu ${ngpu} \ diff --git a/examples/tiny/asr0/local/train.sh b/examples/tiny/asr0/local/train.sh index 8b67902fee9074901392f71a056f209b13d806ec..e233a0c0a158c8752a5d45e7e31b6d1c01e54aaf 100755 --- a/examples/tiny/asr0/local/train.sh +++ b/examples/tiny/asr0/local/train.sh @@ -32,6 +32,10 @@ fi mkdir -p exp +# default memory allocator strategy may cause gpu training to hang +# because no OOM is raised when memory is exhausted +export 
FLAGS_allocator_strategy=naive_best_fit + if [ ${ngpu} == 0 ]; then python3 -u ${BIN_DIR}/train.py \ --ngpu ${ngpu} \ diff --git a/examples/tiny/asr1/local/train.sh b/examples/tiny/asr1/local/train.sh index 459f2e21843273be284d74f6617aafb4c2e6d95d..fbfb41f6fdb1a8ff1537be96ae4de92ffc9663ad 100755 --- a/examples/tiny/asr1/local/train.sh +++ b/examples/tiny/asr1/local/train.sh @@ -34,6 +34,10 @@ fi mkdir -p exp +# default memory allocator strategy may cause gpu training to hang +# because no OOM is raised when memory is exhausted +export FLAGS_allocator_strategy=naive_best_fit + if [ ${ngpu} == 0 ]; then python3 -u ${BIN_DIR}/train.py \ --ngpu ${ngpu} \ diff --git a/examples/wenetspeech/asr1/local/train.sh b/examples/wenetspeech/asr1/local/train.sh index 01af00b61efb8a55d1b8da99bfd2a754e732cb1a..6813d270ce0c56d726652bb54a3ec2fe989a4fca 100755 --- a/examples/wenetspeech/asr1/local/train.sh +++ b/examples/wenetspeech/asr1/local/train.sh @@ -35,6 +35,10 @@ echo ${ips_config} mkdir -p exp +# default memory allocator strategy may cause gpu training to hang +# because no OOM is raised when memory is exhausted +export FLAGS_allocator_strategy=naive_best_fit + if [ ${ngpu} == 0 ]; then python3 -u ${BIN_DIR}/train.py \ --ngpu ${ngpu} \