未验证 提交 a657cc3e 编写于 作者: H Hui Zhang 提交者: GitHub

Merge pull request #2478 from Zth9730/allocator_strategy

[ASR] Chang memory allocator strategy to fix gpu training hang
...@@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then ...@@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True export FLAGS_cudnn_deterministic=True
fi fi
# default memeory allocator strategy may case gpu training hang
# for no OOM raised when memory exhaused
export FLAGS_allocator_strategy=naive_best_fit
if [ ${ngpu} == 0 ]; then if [ ${ngpu} == 0 ]; then
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
......
...@@ -35,6 +35,10 @@ echo ${ips_config} ...@@ -35,6 +35,10 @@ echo ${ips_config}
mkdir -p exp mkdir -p exp
# default memeory allocator strategy may case gpu training hang
# for no OOM raised when memory exhaused
export FLAGS_allocator_strategy=naive_best_fit
if [ ${ngpu} == 0 ]; then if [ ${ngpu} == 0 ]; then
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
......
...@@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then ...@@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True export FLAGS_cudnn_deterministic=True
fi fi
# default memeory allocator strategy may case gpu training hang
# for no OOM raised when memory exhaused
export FLAGS_allocator_strategy=naive_best_fit
if [ ${ngpu} == 0 ]; then if [ ${ngpu} == 0 ]; then
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
......
...@@ -29,6 +29,10 @@ fi ...@@ -29,6 +29,10 @@ fi
# export FLAGS_cudnn_exhaustive_search=true # export FLAGS_cudnn_exhaustive_search=true
# export FLAGS_conv_workspace_size_limit=4000 # export FLAGS_conv_workspace_size_limit=4000
# default memeory allocator strategy may case gpu training hang
# for no OOM raised when memory exhaused
export FLAGS_allocator_strategy=naive_best_fit
if [ ${ngpu} == 0 ]; then if [ ${ngpu} == 0 ]; then
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
......
...@@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then ...@@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True export FLAGS_cudnn_deterministic=True
fi fi
# default memeory allocator strategy may case gpu training hang
# for no OOM raised when memory exhaused
export FLAGS_allocator_strategy=naive_best_fit
if [ ${ngpu} == 0 ]; then if [ ${ngpu} == 0 ]; then
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
......
...@@ -19,6 +19,10 @@ if [ ${seed} != 0 ]; then ...@@ -19,6 +19,10 @@ if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True export FLAGS_cudnn_deterministic=True
fi fi
# default memeory allocator strategy may case gpu training hang
# for no OOM raised when memory exhaused
export FLAGS_allocator_strategy=naive_best_fit
if [ ${ngpu} == 0 ]; then if [ ${ngpu} == 0 ]; then
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
......
...@@ -32,6 +32,10 @@ fi ...@@ -32,6 +32,10 @@ fi
mkdir -p exp mkdir -p exp
# default memeory allocator strategy may case gpu training hang
# for no OOM raised when memory exhaused
export FLAGS_allocator_strategy=naive_best_fit
if [ ${ngpu} == 0 ]; then if [ ${ngpu} == 0 ]; then
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
......
...@@ -34,6 +34,10 @@ fi ...@@ -34,6 +34,10 @@ fi
mkdir -p exp mkdir -p exp
# default memeory allocator strategy may case gpu training hang
# for no OOM raised when memory exhaused
export FLAGS_allocator_strategy=naive_best_fit
if [ ${ngpu} == 0 ]; then if [ ${ngpu} == 0 ]; then
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
......
...@@ -35,6 +35,10 @@ echo ${ips_config} ...@@ -35,6 +35,10 @@ echo ${ips_config}
mkdir -p exp mkdir -p exp
# default memeory allocator strategy may case gpu training hang
# for no OOM raised when memory exhaused
export FLAGS_allocator_strategy=naive_best_fit
if [ ${ngpu} == 0 ]; then if [ ${ngpu} == 0 ]; then
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册