...
 
Commits (17)
    https://gitcode.net/Oneflow-Inc/OneFlow-Benchmark/-/commit/d5a9552fb00c2562f0a056089698ffa245662938 Create cnn_e2e.yml 2021-09-18T10:51:45+08:00 XIE Xuan xiexuanx2@gmail.com https://gitcode.net/Oneflow-Inc/OneFlow-Benchmark/-/commit/d35822594d9229b30853ca6d812a3d5cc0a03e0c test 2021-09-18T10:59:52+08:00 ShawnXuan xiexuanx2@gmail.com https://gitcode.net/Oneflow-Inc/OneFlow-Benchmark/-/commit/558fc8dc01e0eb08c6898a926c70387c637f750d test 2021-09-18T12:27:06+08:00 ShawnXuan xiexuanx2@gmail.com https://gitcode.net/Oneflow-Inc/OneFlow-Benchmark/-/commit/ff00620d6a6199ffd5c1573c99b6eb10a83af0e4 Merge pull request #221 from Oneflow-Inc/dev_cnn_ci_e2e 2021-09-18T14:21:09+08:00 XIE Xuan xiexuanx2@gmail.com Create cnn_e2e.yml https://gitcode.net/Oneflow-Inc/OneFlow-Benchmark/-/commit/d6956cab86d928f4c5675295f22d7441b441bd64 Merge pull request #222 from Oneflow-Inc/dev_fix_gpt_kv_store 2021-09-28T11:28:05+08:00 leaves-zwx kunta0932@gmail.com Zero after output https://gitcode.net/Oneflow-Inc/OneFlow-Benchmark/-/commit/afec85f2a1b060acd695941b7e1dedebb2b35d4d Add model save parameters 2021-09-28T12:17:36+08:00 ouyangyu xuanjiuye@gmail.com https://gitcode.net/Oneflow-Inc/OneFlow-Benchmark/-/commit/f28f384f564bed81ad302dbc003749d3df5a4d66 Merge pull request #223 from Oneflow-Inc/dev_refine_model_parameter 2021-09-28T12:21:17+08:00 XIE Xuan xiexuanx2@gmail.com Add model save parameters https://gitcode.net/Oneflow-Inc/OneFlow-Benchmark/-/commit/d314dc143d731d8d1678d2a66ba3a6000b4257d0 remove unaccessed import 2021-09-28T19:45:42+08:00 ShawnXuan xiexuanx2@gmail.com https://gitcode.net/Oneflow-Inc/OneFlow-Benchmark/-/commit/04c88418cc230f9fb2c8f6a9b3b2c7615801e83c rm pandas from requirements 2021-09-29T09:39:54+08:00 ShawnXuan xiexuanx2@gmail.com https://gitcode.net/Oneflow-Inc/OneFlow-Benchmark/-/commit/1c29d1a8df308a826086b33e4936a7680f60503c rm requirements.txt 2021-09-29T09:43:20+08:00 ShawnXuan xiexuanx2@gmail.com 
https://gitcode.net/Oneflow-Inc/OneFlow-Benchmark/-/commit/302fa6a8184895b0e3522d86e7d7cf8d068520d4 Merge pull request #224 from Oneflow-Inc/rm_unaccessed_import 2021-09-29T09:52:22+08:00 Yu OuYang xuanjiuye@gmail.com remove unaccessed import https://gitcode.net/Oneflow-Inc/OneFlow-Benchmark/-/commit/0a02d1f2b4b94956572c0dff9e5f50a7edab2c86 fix range epochs 2021-09-29T17:45:54+08:00 ouyangyu xuanjiuye@gmail.com https://gitcode.net/Oneflow-Inc/OneFlow-Benchmark/-/commit/19831b4754274f60c71ff953d870dab9332f6aec Merge pull request #225 from Oneflow-Inc/dev_fix_range_epochs 2021-09-29T17:46:48+08:00 XIE Xuan xiexuanx2@gmail.com fix range epochs https://gitcode.net/Oneflow-Inc/OneFlow-Benchmark/-/commit/fe1e78e0b375c32d5695bde2707d82c085c0b9e8 use Environment Variable ONEFLOW_COMM_NET_IB_ENABLE 2021-09-29T17:50:25+08:00 ouyangyu xuanjiuye@gmail.com https://gitcode.net/Oneflow-Inc/OneFlow-Benchmark/-/commit/9c7bac4eac0452af2bef3ccc95e83403dc70d456 Merge pull request #226 from Oneflow-Inc/dev_remove_use_rdma 2021-09-29T17:56:13+08:00 XIE Xuan xiexuanx2@gmail.com use Environment Variable ONEFLOW_COMM_NET_IB_ENABLE https://gitcode.net/Oneflow-Inc/OneFlow-Benchmark/-/commit/c73fcc9dbeeaca940725a46431f392c41673f634 merge master 2021-10-19T18:31:02+08:00 ouyangyu xuanjiuye@gmail.com https://gitcode.net/Oneflow-Inc/OneFlow-Benchmark/-/commit/e61477bc35dd4b7c647e0b89f2ea6ab1db3b3956 refine 2021-10-19T18:43:20+08:00 ouyangyu xuanjiuye@gmail.com
# GitHub Actions workflow: runs the resnet end-to-end test job.
# Triggered when a review is requested on a PR (any branch), or manually
# via workflow_dispatch (the "placeholder" input has no effect).
name: 'resnet e2e test'
on:
  pull_request:
    types: [review_requested]
    branches:
      - "*"
  workflow_dispatch:
    inputs:
      placeholder:
        description: "placeholder, no effect"
        required: false
jobs:
  build:
    name: 'Build and test this repo'
    runs-on: ubuntu-latest
    steps:
      - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event."
rm -rf core.*
#! /bin/bash
# set -ex
# bash args_train.sh ${NUM_NODES} ${NUM_GPUS_PER_NODE} ${BATCH_SIZE} ${USE_FP16} ${NUM_EPOCH} ${LOSS_PRINT_ITER} ${TRAIN_DATA_PATH} ${VAL_DATA_PATH} ${PYTHON_BIN} ${NODE_IPS} ${DEBUG_AND_NCCL} ${NSYS_BIN} ${ITER_NUM}
# bash args_train.sh ${NUM_NODES} ${NUM_GPUS_PER_NODE} ${BATCH_SIZE} ${USE_FP16} ${NUM_EPOCH} ${LOSS_PRINT_ITER} ${TRAIN_DATA_PATH} ${VAL_DATA_PATH} ${PYTHON_BIN} ${NODE_IPS} ${DEBUG_AND_NCCL} ${NSYS_BIN} ${RUN_COMMIT}
NUM_NODES=${1:-1}
NUM_GPUS_PER_NODE=${2:-8}
......@@ -29,7 +29,7 @@ LOG_FILENAME=$LOG_FOLDER/${TRAN_MODEL}_lazy_${NUM_NODES}n${NUM_GPUS_PER_NODE}g_b
export PYTHONUNBUFFERED=1
echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED
export NCCL_LAUNCH_MODE=PARALLEL
export NCCL_LAUNCH_MODE=GROUP
echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE
echo DEBUG_AND_NCCL=$DEBUG_AND_NCCL
if $DEBUG_AND_NCCL; then
......@@ -39,6 +39,10 @@ if $DEBUG_AND_NCCL; then
echo NCCL_DEBUG=$NCCL_DEBUG
fi
if [[ ${NUM_NODES} -gt 1 ]]; then
export ONEFLOW_COMM_NET_IB_ENABLE=1
fi
CMD=""
if [[ ! -z "${NSYS_BIN}" ]]; then
......
......@@ -92,12 +92,27 @@ def get_parser(parser=None):
parser.add_argument(
"--model_load_dir", type=str, default=None, help="model load directory if need"
)
parser.add_argument(
"--save_epoch_interval",
type=int,
default=10,
help="Number of iterations between checkpoint saves.",
)
parser.add_argument(
"--save_last",
action="store_true",
default=False,
help="save model snapshot for last iteration",
)
parser.add_argument(
"--save_init",
action="store_true",
default=False,
help="save model snapshot for inited",
)
parser.add_argument("--batch_size_per_device", type=int, default=64)
parser.add_argument("--val_batch_size_per_device", type=int, default=8)
parser.add_argument(
"--use_rdma", type=str2bool, nargs="?", const=True, help="Use rdma.",
)
parser.add_argument(
"--nccl_fusion_threshold_mb",
type=int,
......
......@@ -64,8 +64,6 @@ if args.nccl_fusion_threshold_mb:
if args.nccl_fusion_max_ops:
flow.config.collective_boxing.nccl_fusion_max_ops(args.nccl_fusion_max_ops)
if args.num_nodes > 1 and args.use_rdma:
flow.config.use_rdma(True)
def label_smoothing(labels, classes, eta, dtype):
assert classes > 0
......@@ -132,11 +130,11 @@ def main():
InitNodes(args)
flow.env.log_dir(args.log_dir)
snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
snapshot = Snapshot(args.model_save_dir, args.model_load_dir, args.save_init)
print(" {} iter per epoch...".format(epoch_size))
for epoch in range(args.num_epochs):
for epoch in range(1, args.num_epochs + 1):
metric = Metric(
desc="train",
calculate_batches=args.loss_print_every_n_iter,
......@@ -154,7 +152,11 @@ def main():
)
for i in range(num_val_steps):
InferenceNet().async_get(metric.metric_cb(epoch, i))
#snapshot.save("epoch_{}".format(epoch))
#if epoch % args.save_epoch_interval == 0:
# snapshot.save("epoch_{}".format(epoch))
if args.save_last:
snapshot.save("epoch_{}".format("last"))
if __name__ == "__main__":
......
......@@ -24,6 +24,7 @@ export PYTHONUNBUFFERED=1
echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED
export NCCL_LAUNCH_MODE=PARALLEL
echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE
export ONEFLOW_COMM_NET_IB_ENABLE=1
python3 of_cnn_train_val.py \
--train_data_dir=$DATA_ROOT/train \
......
......@@ -17,8 +17,6 @@ limitations under the License.
import os
import time
import numpy as np
import pandas as pd
from datetime import datetime
import oneflow.compatible.single_client as flow
......@@ -36,14 +34,14 @@ def InitNodes(args):
class Snapshot(object):
def __init__(self, model_save_dir, model_load_dir):
def __init__(self, model_save_dir, model_load_dir, save_init=False):
self._model_save_dir = model_save_dir
if model_load_dir:
assert os.path.isdir(model_load_dir)
print("Restoring model from {}.".format(model_load_dir))
flow.load_variables(flow.checkpoint.get(model_load_dir))
else:
# flow.checkpoint.save("initial_model")
elif save_init:
flow.checkpoint.save("initial_model")
print("Init model on demand.")
def save(self, name):
......
rm -rf core.*
rm -rf ./output/logs/$HOSTNAME ./output/$HOSTNAME ./initial_model
#! /bin/bash
# set -ex
# bash args_run_pretraining.sh ${NUM_NODES} ${NUM_GPUS_PER_NODE} ${BSZ_PER_DEVICE} ${USE_FP16} ${ITER_NUM} ${LOSS_PRINT_ITER} ${DATA_DIR} ${DATA_PART_NUM} ${SEQ_LENGHT} ${NUM_HIDDEN_LAYERS} ${NUM_ATTENTION_HEADS} ${PYTHON_BIN} ${NODE_IPS} ${NSYS_BIN}
# bash args_run_pretraining.sh ${NUM_NODES} ${NUM_GPUS_PER_NODE} ${BSZ_PER_DEVICE} ${USE_FP16} ${ITER_NUM} ${LOSS_PRINT_ITER} ${DATA_DIR} ${DATA_PART_NUM} ${SEQ_LENGHT} ${NUM_HIDDEN_LAYERS} ${NUM_ATTENTION_HEADS} ${PYTHON_BIN} ${NODE_IPS} ${NSYS_BIN} ${RUN_COMMIT}
NUM_NODES=${1:-1}
NUM_GPUS_PER_NODE=${2:-8}
......@@ -20,7 +20,7 @@ PYTHON_BIN=${12:-"python3"}
NODE_IPS=${13:-"10.11.0.2,10.11.0.3,10.11.0.4,10.11.0.5"}
DEBUG_AND_NCCL=${14:-false}
NSYS_BIN=${15:-""}
ITER_N=${16:-1}
RUN_COMMIT=${16:-"master"}
NUM_ACC_STEP=${17:-1}
OPTIMIZER_TYPE=${18:-"adam"}
......@@ -28,10 +28,11 @@ OPTIMIZER_TYPE=${18:-"adam"}
RUN_TIME=$(date "+%Y%m%d_%H%M%S%N")
LOG_FOLDER=./output/logs/$HOSTNAME/${NUM_NODES}n${NUM_GPUS_PER_NODE}g
mkdir -p $LOG_FOLDER
LOG_FILENAME=$LOG_FOLDER/bert_${RUN_TIME}_${NUM_NODES}n${NUM_GPUS_PER_NODE}g_sq${SEQ_LENGHT}_nhl${NUM_HIDDEN_LAYERS}_nah${NUM_ATTENTION_HEADS}_bsz${BSZ_PER_DEVICE}_${OPTIMIZER_TYPE}_iter${ITER_N}.log
LOG_FILENAME=$LOG_FOLDER/bert_${NUM_NODES}n${NUM_GPUS_PER_NODE}g_sq${SEQ_LENGHT}_nhl${NUM_HIDDEN_LAYERS}_nah${NUM_ATTENTION_HEADS}_bsz${BSZ_PER_DEVICE}_${OPTIMIZER_TYPE}_${RUN_COMMIT}_${RUN_TIME}.log
export PYTHONUNBUFFERED=1
export GLOG_v=3
export NCCL_LAUNCH_MODE=GROUP
echo DEBUG_AND_NCCL=$DEBUG_AND_NCCL
if $DEBUG_AND_NCCL; then
......@@ -41,14 +42,14 @@ if $DEBUG_AND_NCCL; then
echo NCCL_DEBUG=$NCCL_DEBUG
fi
# if [ $NUM_GPUS_PER_NODE -eq 1 ]; then
# export CUDA_VISIBLE_DEVICES=$(($ITER_N-1))
# fi
if [[ ${NUM_NODES} -gt 1 ]]; then
export ONEFLOW_COMM_NET_IB_ENABLE=1
fi
CMD=""
if [[ ! -z "${NSYS_BIN}" ]]; then
CMD+="${NSYS_BIN} profile --stats true --output ${TRAN_MODEL}_v0.5.0_${NUM_NODES}_${NUM_GPUS_PER_NODE}_%h_%p "
CMD+="${NSYS_BIN} profile --stats true --output bert_${NUM_NODES}n${NUM_GPUS_PER_NODE}g_sq${SEQ_LENGHT}_nhl${NUM_HIDDEN_LAYERS}_nah${NUM_ATTENTION_HEADS}_bsz${BSZ_PER_DEVICE}_${OPTIMIZER_TYPE}_${RUN_COMMIT}_%h_%p "
fi
CMD+="${PYTHON_BIN} run_pretraining.py "
......@@ -88,8 +89,3 @@ echo "Rum cmd ${CMD}"
$CMD 2>&1 | tee ${LOG_FILENAME}
echo "Writting log to ${LOG_FILENAME}"
if [ ! -d "./test_result" ]; then
mkdir ./test_result
fi
cp -r $LOG_FOLDER ./test_result/
......@@ -121,6 +121,13 @@ def get_parser(parser=None):
required=False,
help="model save directory",
)
parser.add_argument(
"--model_save_init",
action="store_true",
default=False,
help="save model snapshot for inited",
)
parser.add_argument(
"--save_last_snapshot",
type=str2bool,
......
......@@ -124,7 +124,7 @@ def main():
InitNodes(args)
snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
snapshot = Snapshot(args.model_save_dir, args.model_load_dir, args.model_save_init)
print("num_accumulation_steps:", args.num_accumulation_steps)
metric = Metric(
......
......@@ -16,10 +16,7 @@ limitations under the License.
import os
import time
import numpy as np
from collections import OrderedDict
import pandas as pd
from datetime import datetime
import oneflow.compatible.single_client as flow
......@@ -37,13 +34,13 @@ def InitNodes(args):
class Snapshot(object):
def __init__(self, model_save_dir, model_load_dir):
def __init__(self, model_save_dir, model_load_dir, model_save_init=False):
self._model_save_dir = model_save_dir
if model_load_dir:
assert os.path.isdir(model_load_dir)
print("Restoring model from {}.".format(model_load_dir))
flow.load_variables(flow.checkpoint.get(model_load_dir))
else:
elif model_save_init:
flow.checkpoint.save("initial_model")
print("Init model on demand.")
......
......@@ -173,7 +173,7 @@ python3 tools/launch_container.py \
bash examples/pretrain_1n8d_2x4x1_16_1536x16.sh
```
如果需要进一步扩充设备数量,由于单台机器适配的 GPU 设备数量有限,我们需要更多的物理机器 node。此时,我们需要配置 `--num-nodes``--node-ips` 参数,并且分别在每台机器上启动训练命令(`traning.py`)。同时在有 rdma 的环境中,可以开启 `--use-rdma` 来带来更佳的训练效率。以下是4机8卡下,各种并行方式混合的示例:
如果需要进一步扩充设备数量,由于单台机器适配的 GPU 设备数量有限,我们需要更多的物理机器 node。此时,我们需要配置 `--num-nodes` 和 `--node-ips` 参数,并且分别在每台机器上启动训练命令(`training.py`)。同时在有 rdma 的环境中,可以开启 `export ONEFLOW_COMM_NET_IB_ENABLE=1` 来带来更佳的训练效率。以下是4机8卡下,各种并行方式混合的示例:
```
bash examples/distribute_pretrain_4n8d_2x4x4_512_2304x24.sh
......
#! /bin/bash
# set -ex
# Runs the "117M" parameter model
# bash args_pretrain_gpt.sh $NUM_NODES $NUM_GPUS_PER_NODE $M_P $P_P $MICRO_BATCH_SIZE $GLOABAL_BATCH_SIZE $USE_FP16 $TRAIN_ITERS $LOG_INTERVAL $DATA_PATH $NUM_LAYERS $HIDDEN_SIZE $NUM_ATTENTION_HEADS $SEQ_LENGTH $PYTHON_BIN $NODE_IPS $DEBUG_AND_NCCL $NSYS_BIN $ITER_NUM
# bash args_pretrain_gpt.sh $NUM_NODES $NUM_GPUS_PER_NODE $M_P $P_P $MICRO_BATCH_SIZE $GLOABAL_BATCH_SIZE $USE_FP16 $TRAIN_ITERS $LOG_INTERVAL $DATA_PATH $NUM_LAYERS $HIDDEN_SIZE $NUM_ATTENTION_HEADS $SEQ_LENGTH $PYTHON_BIN $NODE_IPS $DEBUG_AND_NCCL $NSYS_BIN $RUN_COMMIT
rm -rf core.*
rm -rf ./output/logs/$HOSTNAME ./model_save
NUM_NODES=${1:-1}
NUM_GPUS_PER_NODE=${2:-8}
......@@ -25,18 +24,19 @@ PYTHON_BIN=${15:-"python3"}
NODE_IPS=${16:-"127.0.0.1"}
DEBUG_AND_NCCL=${17:-false}
NSYS_BIN=${18:-""}
ITER_NUM=${19:-1}
RUN_COMMIT=${19:-1}
WORLD_SIZE=$(($NUM_GPUS_PER_NODE*$NUM_NODES))
D_P=$(($WORLD_SIZE/$M_P/$P_P))
RUN_TIME=$(date "+%Y%m%d_%H%M%S%N")
LOG_FOLDER=./output/logs/$HOSTNAME/${NUM_NODES}n${NUM_GPUS_PER_NODE}g
mkdir -p $LOG_FOLDER
LOG_FILENAME=$LOG_FOLDER/oneflow_gpt_${NUM_NODES}n${NUM_GPUS_PER_NODE}g_dp${D_P}_mp${M_P}_pp${P_P}_mbs${MICRO_BATCH_SIZE}_gbs${GLOABAL_BATCH_SIZE}_sql${SEQ_LENGTH}_l${NUM_LAYERS}_hsz${HIDDEN_SIZE}_ahs${NUM_ATTENTION_HEADS}_pretrain_iter${ITER_NUM}.log
LOG_FILENAME=$LOG_FOLDER/oneflow_gpt_${NUM_NODES}n${NUM_GPUS_PER_NODE}g_dp${D_P}_mp${M_P}_pp${P_P}_mbs${MICRO_BATCH_SIZE}_gbs${GLOABAL_BATCH_SIZE}_sql${SEQ_LENGTH}_l${NUM_LAYERS}_hsz${HIDDEN_SIZE}_ahs${NUM_ATTENTION_HEADS}_${RUN_COMMIT}_${RUN_TIME}.log
CHECKPOINT_PATH=./model_save
mkdir -p $CHECKPOINT_PATH
# save model
# CHECKPOINT_PATH=./model_save
# mkdir -p $CHECKPOINT_PATH
export PYTHONUNBUFFERED=1
......@@ -54,14 +54,14 @@ if $DEBUG_AND_NCCL; then
echo NCCL_DEBUG=$NCCL_DEBUG
fi
if [ $NUM_GPUS_PER_NODE -eq 1 ]; then
export CUDA_VISIBLE_DEVICES=$(($ITER_NUM-1))
if [[ ${NUM_NODES} -gt 1 ]]; then
export ONEFLOW_COMM_NET_IB_ENABLE=1
fi
CMD=""
if [[ ! -z "${NSYS_BIN}" ]]; then
CMD+="${NSYS_BIN} profile --stats true --output oneflow_gpt_${NUM_NODES}n${NUM_GPUS_PER_NODE}g_%h_%p "
CMD+="${NSYS_BIN} profile --stats true --output oneflow_gpt_${NUM_NODES}n${NUM_GPUS_PER_NODE}g_dp${D_P}_mp${M_P}_pp${P_P}_mbs${MICRO_BATCH_SIZE}_gbs${GLOABAL_BATCH_SIZE}_sql${SEQ_LENGTH}_l${NUM_LAYERS}_hsz${HIDDEN_SIZE}_ahs${NUM_ATTENTION_HEADS}_${RUN_COMMIT}_%h_%p "
fi
CMD+="${PYTHON_BIN} oneflow_gpt/training.py "
......@@ -93,8 +93,8 @@ CMD+=" --lr-warmup-fraction 0.01"
CMD+=" --optimizer adamw"
CMD+=" --weight-decay 1e-2"
CMD+=" --clip-grad 1.0"
CMD+=" --save ${CHECKPOINT_PATH}"
CMD+=" --save-interval 100000"
# CMD+=" --save ${CHECKPOINT_PATH}"
# CMD+=" --save-interval 100000"
CMD+=" --log-interval ${LOG_INTERVAL}"
CMD+=" --checkpoint-activations"
CMD+=" --multihead-attention-fusion"
......@@ -104,9 +104,6 @@ if $USE_FP16; then
CMD+=" --fp16"
fi
if [[ ${NUM_NODES} -gt 1 ]]; then
CMD+=" --use-rdma"
fi
if [[ ! -z "${NSYS_BIN}" ]]; then
CMD+=" --profile-transformer-layer"
......@@ -118,9 +115,3 @@ echo "Rum cmd ${CMD}"
$CMD 2>&1 | tee ${LOG_FILENAME}
echo "Writting log to ${LOG_FILENAME}"
if [ ! -d "./test_result" ]; then
mkdir ./test_result
fi
cp -r $LOG_FOLDER ./test_result/
......@@ -84,7 +84,7 @@ cmd+=" --multihead-attention-fusion"
cmd+=" --fp16"
if [[ ${num_nodes} -gt 1 ]]; then
cmd+=" --use-rdma"
export ONEFLOW_COMM_NET_IB_ENABLE=1
fi
if [[ ! -z "${ONEFLOW_GTP_PROFILE_FILE}" ]]; then
......
......@@ -635,11 +635,7 @@ def _add_misc_args(parser):
action="store_true",
help="open transformer layer profiler",
)
group.add_argument(
"--use-rdma",
action="store_true",
help="Use rdma.",
)
return parser
......
......@@ -47,8 +47,6 @@ def _init_config(args):
" please try other version."
)
if args.use_rdma:
flow.config.use_rdma(True)
flow.config.enable_legacy_model_io()
flow.config.enable_model_io_v2(True)
......
numpy>=1.17.2
pandas>=1.0.4
pillow>=7.2.0
\ No newline at end of file