未验证 提交 2377b052 编写于 作者: littletomatodonkey's avatar littletomatodonkey 提交者: GitHub

fix gpu num check bug (#4406)

上级 a0a66616
...@@ -44,7 +44,7 @@ add_arg('test_batch_size', int, 50, "Minibatch size.") ...@@ -44,7 +44,7 @@ add_arg('test_batch_size', int, 50, "Minibatch size.")
add_arg('image_shape', str, "3,224,224", "input image size") add_arg('image_shape', str, "3,224,224", "input image size")
add_arg('class_dim', int, 11318 , "Class number.") add_arg('class_dim', int, 11318 , "Class number.")
add_arg('lr', float, 0.01, "set learning rate.") add_arg('lr', float, 0.01, "set learning rate.")
add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.") add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.")
add_arg('lr_steps', str, "15000,25000", "step of lr") add_arg('lr_steps', str, "15000,25000", "step of lr")
add_arg('total_iter_num', int, 30000, "total_iter_num") add_arg('total_iter_num', int, 30000, "total_iter_num")
add_arg('display_iter_step', int, 10, "display_iter_step.") add_arg('display_iter_step', int, 10, "display_iter_step.")
...@@ -63,15 +63,15 @@ add_arg('enable_ce', bool, False, "If set True, enable continuous evaluation job ...@@ -63,15 +63,15 @@ add_arg('enable_ce', bool, False, "If set True, enable continuous evaluation job
model_list = [m for m in dir(models) if "__" not in m] model_list = [m for m in dir(models) if "__" not in m]
def optimizer_setting(params): def optimizer_setting(params):
ls = params["learning_strategy"] ls = params["learning_strategy"]
assert ls["name"] == "piecewise_decay", \ assert ls["name"] == "piecewise_decay", \
"learning rate strategy must be {}, \ "learning rate strategy must be {}, but got {}".format("piecewise_decay", lr["name"])
but got {}".format("piecewise_decay", lr["name"])
bd = [int(e) for e in ls["lr_steps"].split(',')] bd = [int(e) for e in ls["lr_steps"].split(',')]
base_lr = params["lr"] base_lr = params["lr"]
lr = [base_lr * (0.1 ** i) for i in range(len(bd) + 1)] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.Momentum( optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay( learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr), boundaries=bd, values=lr),
...@@ -81,30 +81,28 @@ def optimizer_setting(params): ...@@ -81,30 +81,28 @@ def optimizer_setting(params):
def net_config(image, label, model, args, is_train): def net_config(image, label, model, args, is_train):
assert args.model in model_list, "{} is not in lists: {}".format( assert args.model in model_list, "{} is not in lists: {}".format(args.model,
args.model, model_list) model_list)
out = model.net(input=image, embedding_size=args.embedding_size) out = model.net(input=image, embedding_size=args.embedding_size)
if not is_train: if not is_train:
return None, None, None, out return None, None, None, out
if args.loss_name == "softmax": if args.loss_name == "softmax":
metricloss = SoftmaxLoss( metricloss = SoftmaxLoss(class_dim=args.class_dim, )
class_dim=args.class_dim,
)
elif args.loss_name == "arcmargin": elif args.loss_name == "arcmargin":
metricloss = ArcMarginLoss( metricloss = ArcMarginLoss(
class_dim = args.class_dim, class_dim=args.class_dim,
margin = args.arc_margin, margin=args.arc_margin,
scale = args.arc_scale, scale=args.arc_scale,
easy_margin = args.arc_easy_margin, easy_margin=args.arc_easy_margin, )
)
cost, logit = metricloss.loss(out, label) cost, logit = metricloss.loss(out, label)
avg_cost = fluid.layers.mean(x=cost) avg_cost = fluid.layers.mean(x=cost)
acc_top1 = fluid.layers.accuracy(input=logit, label=label, k=1) acc_top1 = fluid.layers.accuracy(input=logit, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=logit, label=label, k=5) acc_top5 = fluid.layers.accuracy(input=logit, label=label, k=5)
return avg_cost, acc_top1, acc_top5, out return avg_cost, acc_top1, acc_top5, out
def build_program(is_train, main_prog, startup_prog, args): def build_program(is_train, main_prog, startup_prog, args):
image_shape = [int(m) for m in args.image_shape.split(",")] image_shape = [int(m) for m in args.image_shape.split(",")]
model = models.__dict__[args.model]() model = models.__dict__[args.model]()
...@@ -119,11 +117,13 @@ def build_program(is_train, main_prog, startup_prog, args): ...@@ -119,11 +117,13 @@ def build_program(is_train, main_prog, startup_prog, args):
use_double_buffer=True) use_double_buffer=True)
image, label = fluid.layers.read_file(py_reader) image, label = fluid.layers.read_file(py_reader)
else: else:
image = fluid.layers.data(name='image', shape=image_shape, dtype='float32') image = fluid.layers.data(
name='image', shape=image_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64') label = fluid.layers.data(name='label', shape=[1], dtype='int64')
with fluid.unique_name.guard(): with fluid.unique_name.guard():
avg_cost, acc_top1, acc_top5, out = net_config(image, label, model, args, is_train) avg_cost, acc_top1, acc_top5, out = net_config(image, label, model,
args, is_train)
if is_train: if is_train:
params = model.params params = model.params
params["lr"] = args.lr params["lr"] = args.lr
...@@ -138,7 +138,7 @@ def build_program(is_train, main_prog, startup_prog, args): ...@@ -138,7 +138,7 @@ def build_program(is_train, main_prog, startup_prog, args):
""" """
if is_train: if is_train:
return py_reader, avg_cost, acc_top1, acc_top5, global_lr return py_reader, avg_cost, acc_top1, acc_top5, global_lr
else: else:
return out, image, label return out, image, label
...@@ -175,7 +175,9 @@ def train_async(args): ...@@ -175,7 +175,9 @@ def train_async(args):
args=args) args=args)
test_prog = tmp_prog.clone(for_test=True) test_prog = tmp_prog.clone(for_test=True)
train_fetch_list = [global_lr.name, train_cost.name, train_acc1.name, train_acc5.name] train_fetch_list = [
global_lr.name, train_cost.name, train_acc1.name, train_acc5.name
]
test_fetch_list = [test_feas.name] test_fetch_list = [test_feas.name]
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
...@@ -196,13 +198,18 @@ def train_async(args): ...@@ -196,13 +198,18 @@ def train_async(args):
fluid.io.load_vars( fluid.io.load_vars(
exe, pretrained_model, main_program=train_prog, predicate=if_exist) exe, pretrained_model, main_program=train_prog, predicate=if_exist)
devicenum = get_gpu_num() if args.use_gpu:
devicenum = get_gpu_num()
else:
devicenum = int(os.environ.get('CPU_NUM', 1))
assert (args.train_batch_size % devicenum) == 0 assert (args.train_batch_size % devicenum) == 0
train_batch_size = args.train_batch_size // devicenum train_batch_size = args.train_batch_size // devicenum
test_batch_size = args.test_batch_size test_batch_size = args.test_batch_size
train_reader = paddle.batch(reader.train(args), batch_size=train_batch_size, drop_last=True) train_reader = paddle.batch(
test_reader = paddle.batch(reader.test(args), batch_size=test_batch_size, drop_last=False) reader.train(args), batch_size=train_batch_size, drop_last=True)
test_reader = paddle.batch(
reader.test(args), batch_size=test_batch_size, drop_last=False)
test_feeder = fluid.DataFeeder(place=place, feed_list=[image, label]) test_feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
train_py_reader.decorate_paddle_reader(train_reader) train_py_reader.decorate_paddle_reader(train_reader)
...@@ -239,12 +246,14 @@ def train_async(args): ...@@ -239,12 +246,14 @@ def train_async(args):
train_info = [0, 0, 0, 0] train_info = [0, 0, 0, 0]
totalruntime += period totalruntime += period
if iter_no % args.test_iter_step == 0 and iter_no != 0: if iter_no % args.test_iter_step == 0 and iter_no != 0:
f, l = [], [] f, l = [], []
for batch_id, data in enumerate(test_reader()): for batch_id, data in enumerate(test_reader()):
t1 = time.time() t1 = time.time()
[feas] = exe.run(test_prog, fetch_list = test_fetch_list, feed=test_feeder.feed(data)) [feas] = exe.run(test_prog,
fetch_list=test_fetch_list,
feed=test_feeder.feed(data))
label = np.asarray([x[1] for x in data]) label = np.asarray([x[1] for x in data])
f.append(feas) f.append(feas)
l.append(label) l.append(label)
...@@ -285,10 +294,10 @@ def initlogging(): ...@@ -285,10 +294,10 @@ def initlogging():
logging.basicConfig( logging.basicConfig(
level=loglevel, level=loglevel,
# logger.BASIC_FORMAT, # logger.BASIC_FORMAT,
format= format="%(levelname)s:%(filename)s[%(lineno)s] %(name)s:%(funcName)s->%(message)s",
"%(levelname)s:%(filename)s[%(lineno)s] %(name)s:%(funcName)s->%(message)s",
datefmt='%a, %d %b %Y %H:%M:%S') datefmt='%a, %d %b %Y %H:%M:%S')
def main(): def main():
args = parser.parse_args() args = parser.parse_args()
print_arguments(args) print_arguments(args)
......
...@@ -46,7 +46,7 @@ add_arg('test_batch_size', int, 50, "Minibatch size.") ...@@ -46,7 +46,7 @@ add_arg('test_batch_size', int, 50, "Minibatch size.")
add_arg('image_shape', str, "3,224,224", "input image size") add_arg('image_shape', str, "3,224,224", "input image size")
add_arg('class_dim', int, 11318, "Class number.") add_arg('class_dim', int, 11318, "Class number.")
add_arg('lr', float, 0.0001, "set learning rate.") add_arg('lr', float, 0.0001, "set learning rate.")
add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.") add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.")
add_arg('lr_steps', str, "100000", "step of lr") add_arg('lr_steps', str, "100000", "step of lr")
add_arg('total_iter_num', int, 100000, "total_iter_num") add_arg('total_iter_num', int, 100000, "total_iter_num")
add_arg('display_iter_step', int, 10, "display_iter_step.") add_arg('display_iter_step', int, 10, "display_iter_step.")
...@@ -64,15 +64,15 @@ add_arg('npairs_reg_lambda', float, 0.01, "npairs reg lambda.") ...@@ -64,15 +64,15 @@ add_arg('npairs_reg_lambda', float, 0.01, "npairs reg lambda.")
model_list = [m for m in dir(models) if "__" not in m] model_list = [m for m in dir(models) if "__" not in m]
def optimizer_setting(params): def optimizer_setting(params):
ls = params["learning_strategy"] ls = params["learning_strategy"]
assert ls["name"] == "piecewise_decay", \ assert ls["name"] == "piecewise_decay", \
"learning rate strategy must be {}, \ "learning rate strategy must be {}, but got {}".format("piecewise_decay", lr["name"])
but got {}".format("piecewise_decay", lr["name"])
bd = [int(e) for e in ls["lr_steps"].split(',')] bd = [int(e) for e in ls["lr_steps"].split(',')]
base_lr = params["lr"] base_lr = params["lr"]
lr = [base_lr * (0.1 ** i) for i in range(len(bd) + 1)] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.Momentum( optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay( learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr), boundaries=bd, values=lr),
...@@ -82,38 +82,34 @@ def optimizer_setting(params): ...@@ -82,38 +82,34 @@ def optimizer_setting(params):
def net_config(image, label, model, args, is_train): def net_config(image, label, model, args, is_train):
assert args.model in model_list, "{} is not in lists: {}".format( assert args.model in model_list, "{} is not in lists: {}".format(args.model,
args.model, model_list) model_list)
out = model.net(input=image, embedding_size=args.embedding_size) out = model.net(input=image, embedding_size=args.embedding_size)
if not is_train: if not is_train:
return None, out return None, out
if args.loss_name == "triplet": if args.loss_name == "triplet":
metricloss = TripletLoss( metricloss = TripletLoss(margin=args.margin, )
margin=args.margin,
)
elif args.loss_name == "quadruplet": elif args.loss_name == "quadruplet":
metricloss = QuadrupletLoss( metricloss = QuadrupletLoss(
train_batch_size = args.train_batch_size, train_batch_size=args.train_batch_size,
samples_each_class = args.samples_each_class, samples_each_class=args.samples_each_class,
margin=args.margin, margin=args.margin, )
)
elif args.loss_name == "eml": elif args.loss_name == "eml":
metricloss = EmlLoss( metricloss = EmlLoss(
train_batch_size = args.train_batch_size, train_batch_size=args.train_batch_size,
samples_each_class = args.samples_each_class, samples_each_class=args.samples_each_class, )
)
elif args.loss_name == "npairs": elif args.loss_name == "npairs":
metricloss = NpairsLoss( metricloss = NpairsLoss(
train_batch_size = args.train_batch_size, train_batch_size=args.train_batch_size,
samples_each_class = args.samples_each_class, samples_each_class=args.samples_each_class,
reg_lambda = args.npairs_reg_lambda, reg_lambda=args.npairs_reg_lambda, )
)
cost = metricloss.loss(out, label) cost = metricloss.loss(out, label)
avg_cost = fluid.layers.mean(x=cost) avg_cost = fluid.layers.mean(x=cost)
return avg_cost, out return avg_cost, out
def build_program(is_train, main_prog, startup_prog, args): def build_program(is_train, main_prog, startup_prog, args):
image_shape = [int(m) for m in args.image_shape.split(",")] image_shape = [int(m) for m in args.image_shape.split(",")]
model = models.__dict__[args.model]() model = models.__dict__[args.model]()
...@@ -128,7 +124,8 @@ def build_program(is_train, main_prog, startup_prog, args): ...@@ -128,7 +124,8 @@ def build_program(is_train, main_prog, startup_prog, args):
use_double_buffer=True) use_double_buffer=True)
image, label = fluid.layers.read_file(py_reader) image, label = fluid.layers.read_file(py_reader)
else: else:
image = fluid.layers.data(name='image', shape=image_shape, dtype='float32') image = fluid.layers.data(
name='image', shape=image_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64') label = fluid.layers.data(name='label', shape=[1], dtype='int64')
with fluid.unique_name.guard(): with fluid.unique_name.guard():
...@@ -147,7 +144,7 @@ def build_program(is_train, main_prog, startup_prog, args): ...@@ -147,7 +144,7 @@ def build_program(is_train, main_prog, startup_prog, args):
""" """
if is_train: if is_train:
return py_reader, avg_cost, global_lr, out, label return py_reader, avg_cost, global_lr, out, label
else: else:
return out, image, label return out, image, label
...@@ -176,7 +173,9 @@ def train_async(args): ...@@ -176,7 +173,9 @@ def train_async(args):
args=args) args=args)
test_prog = tmp_prog.clone(for_test=True) test_prog = tmp_prog.clone(for_test=True)
train_fetch_list = [global_lr.name, train_cost.name, train_feas.name, train_label.name] train_fetch_list = [
global_lr.name, train_cost.name, train_feas.name, train_label.name
]
test_fetch_list = [test_feas.name] test_fetch_list = [test_feas.name]
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
...@@ -197,13 +196,18 @@ def train_async(args): ...@@ -197,13 +196,18 @@ def train_async(args):
fluid.io.load_vars( fluid.io.load_vars(
exe, pretrained_model, main_program=train_prog, predicate=if_exist) exe, pretrained_model, main_program=train_prog, predicate=if_exist)
devicenum = get_gpu_num() if args.use_gpu:
devicenum = get_gpu_num()
else:
devicenum = int(os.environ.get('CPU_NUM', 1))
assert (args.train_batch_size % devicenum) == 0 assert (args.train_batch_size % devicenum) == 0
train_batch_size = args.train_batch_size / devicenum train_batch_size = args.train_batch_size / devicenum
test_batch_size = args.test_batch_size test_batch_size = args.test_batch_size
train_reader = paddle.batch(reader.train(args), batch_size=train_batch_size, drop_last=True) train_reader = paddle.batch(
test_reader = paddle.batch(reader.test(args), batch_size=test_batch_size, drop_last=False) reader.train(args), batch_size=train_batch_size, drop_last=True)
test_reader = paddle.batch(
reader.test(args), batch_size=test_batch_size, drop_last=False)
test_feeder = fluid.DataFeeder(place=place, feed_list=[image, label]) test_feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
train_py_reader.decorate_paddle_reader(train_reader) train_py_reader.decorate_paddle_reader(train_reader)
...@@ -238,12 +242,14 @@ def train_async(args): ...@@ -238,12 +242,14 @@ def train_async(args):
train_info = [0, 0, 0] train_info = [0, 0, 0]
totalruntime += period totalruntime += period
if iter_no % args.test_iter_step == 0 and iter_no != 0: if iter_no % args.test_iter_step == 0 and iter_no != 0:
f, l = [], [] f, l = [], []
for batch_id, data in enumerate(test_reader()): for batch_id, data in enumerate(test_reader()):
t1 = time.time() t1 = time.time()
[feas] = exe.run(test_prog, fetch_list = test_fetch_list, feed=test_feeder.feed(data)) [feas] = exe.run(test_prog,
fetch_list=test_fetch_list,
feed=test_feeder.feed(data))
label = np.asarray([x[1] for x in data]) label = np.asarray([x[1] for x in data])
f.append(feas) f.append(feas)
l.append(label) l.append(label)
...@@ -270,6 +276,7 @@ def train_async(args): ...@@ -270,6 +276,7 @@ def train_async(args):
iter_no += 1 iter_no += 1
def initlogging(): def initlogging():
for handler in logging.root.handlers[:]: for handler in logging.root.handlers[:]:
logging.root.removeHandler(handler) logging.root.removeHandler(handler)
...@@ -277,10 +284,10 @@ def initlogging(): ...@@ -277,10 +284,10 @@ def initlogging():
logging.basicConfig( logging.basicConfig(
level=loglevel, level=loglevel,
# logger.BASIC_FORMAT, # logger.BASIC_FORMAT,
format= format="%(levelname)s:%(filename)s[%(lineno)s] %(name)s:%(funcName)s->%(message)s",
"%(levelname)s:%(filename)s[%(lineno)s] %(name)s:%(funcName)s->%(message)s",
datefmt='%a, %d %b %Y %H:%M:%S') datefmt='%a, %d %b %Y %H:%M:%S')
def main(): def main():
args = parser.parse_args() args = parser.parse_args()
print_arguments(args) print_arguments(args)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册