未验证 提交 21e61429 编写于 作者: littletomatodonkey's avatar littletomatodonkey 提交者: GitHub

[cherry-pick]fix gpu num check bug (#4431)

* fix gpu num check bug

* fix face_detection grammatical errors (#4405)
Co-authored-by: Guanghua Yu <742925032@qq.com>
上级 b175b5e4
......@@ -92,8 +92,8 @@ tar -xf vgg_ilsvrc_16_fc_reduced.tar.gz && rm -f vgg_ilsvrc_16_fc_reduced.tar.gz
```bash
python -u train.py --batch_size=16 --pretrained_model=vgg_ilsvrc_16_fc_reduced
```
- 可以通过设置 `export CUDA_VISIBLE_DEVICES=0,1,2,3` 指定想要使用的GPU数量,`batch_size`默认设置为12或16。
- **注意**: 在**Windows**机器上训练,需要设置 `--use_multiprocess=False`,因为在Windows上使用Python多进程加速训练时有错误
- 可以通过设置 `export CUDA_VISIBLE_DEVICES=0,1,2,3` 指定想要使用的GPU数量,`batch_size`默认设置为12或16,如果GPU显存较小,请适当调低`batch_size`
- **注意**: 在**Windows**机器上训练或者训练环境内存较小时,需要设置 `--use_multiprocess=False`,因为在Windows上使用Python多进程加速训练时有错误,内存不足时程序会崩溃
- 更多的可选参数见:
```bash
python train.py --help
......
......@@ -28,7 +28,7 @@ def draw_bbox(image, bbox):
bbox (np.array|list|tuple): (xmin, ymin, xmax, ymax).
"""
draw = ImageDraw.Draw(image)
xmin, ymin, xmax, ymax = box
xmin, ymin, xmax, ymax = bbox
(left, right, top, bottom) = (xmin, xmax, ymin, ymax)
draw.line(
[(left, top), (left, bottom), (right, bottom), (right, top),
......
......@@ -67,7 +67,7 @@ def infer(args, config):
image_path = args.image_path
image = Image.open(image_path)
if image.mode == 'L':
image = img.convert('RGB')
image = image.convert('RGB')
shrink, max_shrink = get_shrink(image.size[1], image.size[0])
det0 = detect_face(image, shrink)
......
......@@ -44,7 +44,7 @@ add_arg('test_batch_size', int, 50, "Minibatch size.")
add_arg('image_shape', str, "3,224,224", "input image size")
add_arg('class_dim', int, 11318 , "Class number.")
add_arg('lr', float, 0.01, "set learning rate.")
add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.")
add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.")
add_arg('lr_steps', str, "15000,25000", "step of lr")
add_arg('total_iter_num', int, 30000, "total_iter_num")
add_arg('display_iter_step', int, 10, "display_iter_step.")
......@@ -63,15 +63,15 @@ add_arg('enable_ce', bool, False, "If set True, enable continuous evaluation job
model_list = [m for m in dir(models) if "__" not in m]
def optimizer_setting(params):
ls = params["learning_strategy"]
assert ls["name"] == "piecewise_decay", \
"learning rate strategy must be {}, \
but got {}".format("piecewise_decay", lr["name"])
"learning rate strategy must be {}, but got {}".format("piecewise_decay", lr["name"])
bd = [int(e) for e in ls["lr_steps"].split(',')]
base_lr = params["lr"]
lr = [base_lr * (0.1 ** i) for i in range(len(bd) + 1)]
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
......@@ -81,30 +81,28 @@ def optimizer_setting(params):
def net_config(image, label, model, args, is_train):
assert args.model in model_list, "{} is not in lists: {}".format(
args.model, model_list)
assert args.model in model_list, "{} is not in lists: {}".format(args.model,
model_list)
out = model.net(input=image, embedding_size=args.embedding_size)
if not is_train:
return None, None, None, out
if args.loss_name == "softmax":
metricloss = SoftmaxLoss(
class_dim=args.class_dim,
)
metricloss = SoftmaxLoss(class_dim=args.class_dim, )
elif args.loss_name == "arcmargin":
metricloss = ArcMarginLoss(
class_dim = args.class_dim,
margin = args.arc_margin,
scale = args.arc_scale,
easy_margin = args.arc_easy_margin,
)
class_dim=args.class_dim,
margin=args.arc_margin,
scale=args.arc_scale,
easy_margin=args.arc_easy_margin, )
cost, logit = metricloss.loss(out, label)
avg_cost = fluid.layers.mean(x=cost)
acc_top1 = fluid.layers.accuracy(input=logit, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=logit, label=label, k=5)
return avg_cost, acc_top1, acc_top5, out
def build_program(is_train, main_prog, startup_prog, args):
image_shape = [int(m) for m in args.image_shape.split(",")]
model = models.__dict__[args.model]()
......@@ -119,11 +117,13 @@ def build_program(is_train, main_prog, startup_prog, args):
use_double_buffer=True)
image, label = fluid.layers.read_file(py_reader)
else:
image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
image = fluid.layers.data(
name='image', shape=image_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
with fluid.unique_name.guard():
avg_cost, acc_top1, acc_top5, out = net_config(image, label, model, args, is_train)
avg_cost, acc_top1, acc_top5, out = net_config(image, label, model,
args, is_train)
if is_train:
params = model.params
params["lr"] = args.lr
......@@ -138,7 +138,7 @@ def build_program(is_train, main_prog, startup_prog, args):
"""
if is_train:
return py_reader, avg_cost, acc_top1, acc_top5, global_lr
else:
else:
return out, image, label
......@@ -175,7 +175,9 @@ def train_async(args):
args=args)
test_prog = tmp_prog.clone(for_test=True)
train_fetch_list = [global_lr.name, train_cost.name, train_acc1.name, train_acc5.name]
train_fetch_list = [
global_lr.name, train_cost.name, train_acc1.name, train_acc5.name
]
test_fetch_list = [test_feas.name]
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
......@@ -196,13 +198,18 @@ def train_async(args):
fluid.io.load_vars(
exe, pretrained_model, main_program=train_prog, predicate=if_exist)
devicenum = get_gpu_num()
if args.use_gpu:
devicenum = get_gpu_num()
else:
devicenum = int(os.environ.get('CPU_NUM', 1))
assert (args.train_batch_size % devicenum) == 0
train_batch_size = args.train_batch_size // devicenum
test_batch_size = args.test_batch_size
train_reader = paddle.batch(reader.train(args), batch_size=train_batch_size, drop_last=True)
test_reader = paddle.batch(reader.test(args), batch_size=test_batch_size, drop_last=False)
train_reader = paddle.batch(
reader.train(args), batch_size=train_batch_size, drop_last=True)
test_reader = paddle.batch(
reader.test(args), batch_size=test_batch_size, drop_last=False)
test_feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
train_py_reader.decorate_paddle_reader(train_reader)
......@@ -239,12 +246,14 @@ def train_async(args):
train_info = [0, 0, 0, 0]
totalruntime += period
if iter_no % args.test_iter_step == 0 and iter_no != 0:
f, l = [], []
for batch_id, data in enumerate(test_reader()):
t1 = time.time()
[feas] = exe.run(test_prog, fetch_list = test_fetch_list, feed=test_feeder.feed(data))
[feas] = exe.run(test_prog,
fetch_list=test_fetch_list,
feed=test_feeder.feed(data))
label = np.asarray([x[1] for x in data])
f.append(feas)
l.append(label)
......@@ -285,10 +294,10 @@ def initlogging():
logging.basicConfig(
level=loglevel,
# logger.BASIC_FORMAT,
format=
"%(levelname)s:%(filename)s[%(lineno)s] %(name)s:%(funcName)s->%(message)s",
format="%(levelname)s:%(filename)s[%(lineno)s] %(name)s:%(funcName)s->%(message)s",
datefmt='%a, %d %b %Y %H:%M:%S')
def main():
args = parser.parse_args()
print_arguments(args)
......
......@@ -46,7 +46,7 @@ add_arg('test_batch_size', int, 50, "Minibatch size.")
add_arg('image_shape', str, "3,224,224", "input image size")
add_arg('class_dim', int, 11318, "Class number.")
add_arg('lr', float, 0.0001, "set learning rate.")
add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.")
add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.")
add_arg('lr_steps', str, "100000", "step of lr")
add_arg('total_iter_num', int, 100000, "total_iter_num")
add_arg('display_iter_step', int, 10, "display_iter_step.")
......@@ -64,15 +64,15 @@ add_arg('npairs_reg_lambda', float, 0.01, "npairs reg lambda.")
model_list = [m for m in dir(models) if "__" not in m]
def optimizer_setting(params):
ls = params["learning_strategy"]
assert ls["name"] == "piecewise_decay", \
"learning rate strategy must be {}, \
but got {}".format("piecewise_decay", lr["name"])
"learning rate strategy must be {}, but got {}".format("piecewise_decay", lr["name"])
bd = [int(e) for e in ls["lr_steps"].split(',')]
base_lr = params["lr"]
lr = [base_lr * (0.1 ** i) for i in range(len(bd) + 1)]
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
......@@ -82,38 +82,34 @@ def optimizer_setting(params):
def net_config(image, label, model, args, is_train):
assert args.model in model_list, "{} is not in lists: {}".format(
args.model, model_list)
assert args.model in model_list, "{} is not in lists: {}".format(args.model,
model_list)
out = model.net(input=image, embedding_size=args.embedding_size)
if not is_train:
return None, out
if args.loss_name == "triplet":
metricloss = TripletLoss(
margin=args.margin,
)
metricloss = TripletLoss(margin=args.margin, )
elif args.loss_name == "quadruplet":
metricloss = QuadrupletLoss(
train_batch_size = args.train_batch_size,
samples_each_class = args.samples_each_class,
margin=args.margin,
)
train_batch_size=args.train_batch_size,
samples_each_class=args.samples_each_class,
margin=args.margin, )
elif args.loss_name == "eml":
metricloss = EmlLoss(
train_batch_size = args.train_batch_size,
samples_each_class = args.samples_each_class,
)
train_batch_size=args.train_batch_size,
samples_each_class=args.samples_each_class, )
elif args.loss_name == "npairs":
metricloss = NpairsLoss(
train_batch_size = args.train_batch_size,
samples_each_class = args.samples_each_class,
reg_lambda = args.npairs_reg_lambda,
)
train_batch_size=args.train_batch_size,
samples_each_class=args.samples_each_class,
reg_lambda=args.npairs_reg_lambda, )
cost = metricloss.loss(out, label)
avg_cost = fluid.layers.mean(x=cost)
return avg_cost, out
def build_program(is_train, main_prog, startup_prog, args):
image_shape = [int(m) for m in args.image_shape.split(",")]
model = models.__dict__[args.model]()
......@@ -128,7 +124,8 @@ def build_program(is_train, main_prog, startup_prog, args):
use_double_buffer=True)
image, label = fluid.layers.read_file(py_reader)
else:
image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
image = fluid.layers.data(
name='image', shape=image_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
with fluid.unique_name.guard():
......@@ -147,7 +144,7 @@ def build_program(is_train, main_prog, startup_prog, args):
"""
if is_train:
return py_reader, avg_cost, global_lr, out, label
else:
else:
return out, image, label
......@@ -176,7 +173,9 @@ def train_async(args):
args=args)
test_prog = tmp_prog.clone(for_test=True)
train_fetch_list = [global_lr.name, train_cost.name, train_feas.name, train_label.name]
train_fetch_list = [
global_lr.name, train_cost.name, train_feas.name, train_label.name
]
test_fetch_list = [test_feas.name]
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
......@@ -197,13 +196,18 @@ def train_async(args):
fluid.io.load_vars(
exe, pretrained_model, main_program=train_prog, predicate=if_exist)
devicenum = get_gpu_num()
if args.use_gpu:
devicenum = get_gpu_num()
else:
devicenum = int(os.environ.get('CPU_NUM', 1))
assert (args.train_batch_size % devicenum) == 0
train_batch_size = args.train_batch_size / devicenum
test_batch_size = args.test_batch_size
train_reader = paddle.batch(reader.train(args), batch_size=train_batch_size, drop_last=True)
test_reader = paddle.batch(reader.test(args), batch_size=test_batch_size, drop_last=False)
train_reader = paddle.batch(
reader.train(args), batch_size=train_batch_size, drop_last=True)
test_reader = paddle.batch(
reader.test(args), batch_size=test_batch_size, drop_last=False)
test_feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
train_py_reader.decorate_paddle_reader(train_reader)
......@@ -238,12 +242,14 @@ def train_async(args):
train_info = [0, 0, 0]
totalruntime += period
if iter_no % args.test_iter_step == 0 and iter_no != 0:
f, l = [], []
for batch_id, data in enumerate(test_reader()):
t1 = time.time()
[feas] = exe.run(test_prog, fetch_list = test_fetch_list, feed=test_feeder.feed(data))
[feas] = exe.run(test_prog,
fetch_list=test_fetch_list,
feed=test_feeder.feed(data))
label = np.asarray([x[1] for x in data])
f.append(feas)
l.append(label)
......@@ -270,6 +276,7 @@ def train_async(args):
iter_no += 1
def initlogging():
for handler in logging.root.handlers[:]:
logging.root.removeHandler(handler)
......@@ -277,10 +284,10 @@ def initlogging():
logging.basicConfig(
level=loglevel,
# logger.BASIC_FORMAT,
format=
"%(levelname)s:%(filename)s[%(lineno)s] %(name)s:%(funcName)s->%(message)s",
format="%(levelname)s:%(filename)s[%(lineno)s] %(name)s:%(funcName)s->%(message)s",
datefmt='%a, %d %b %Y %H:%M:%S')
def main():
args = parser.parse_args()
print_arguments(args)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册