From 2377b0527277d16b1bd1dfb1200c8fab18d0b31a Mon Sep 17 00:00:00 2001
From: littletomatodonkey
Date: Thu, 12 Mar 2020 13:48:56 +0800
Subject: [PATCH] fix gpu num check bug (#4406)

---
 PaddleCV/metric_learning/train_elem.py | 61 +++++++++++++----------
 PaddleCV/metric_learning/train_pair.py | 69 ++++++++++++++------------
 2 files changed, 73 insertions(+), 57 deletions(-)

diff --git a/PaddleCV/metric_learning/train_elem.py b/PaddleCV/metric_learning/train_elem.py
index c49003c5..83d93e3e 100644
--- a/PaddleCV/metric_learning/train_elem.py
+++ b/PaddleCV/metric_learning/train_elem.py
@@ -44,7 +44,7 @@ add_arg('test_batch_size', int, 50, "Minibatch size.")
 add_arg('image_shape', str, "3,224,224", "input image size")
 add_arg('class_dim', int, 11318 , "Class number.")
 add_arg('lr', float, 0.01, "set learning rate.")
-add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.")
+add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.")
 add_arg('lr_steps', str, "15000,25000", "step of lr")
 add_arg('total_iter_num', int, 30000, "total_iter_num")
 add_arg('display_iter_step', int, 10, "display_iter_step.")
@@ -63,15 +63,15 @@ add_arg('enable_ce', bool, False, "If set True, enable continuous evaluation job
 
 model_list = [m for m in dir(models) if "__" not in m]
 
+
 def optimizer_setting(params):
     ls = params["learning_strategy"]
     assert ls["name"] == "piecewise_decay", \
-        "learning rate strategy must be {}, \
-        but got {}".format("piecewise_decay", lr["name"])
+        "learning rate strategy must be {}, but got {}".format("piecewise_decay", lr["name"])
 
     bd = [int(e) for e in ls["lr_steps"].split(',')]
     base_lr = params["lr"]
-    lr = [base_lr * (0.1 ** i) for i in range(len(bd) + 1)]
+    lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
     optimizer = fluid.optimizer.Momentum(
         learning_rate=fluid.layers.piecewise_decay(
             boundaries=bd, values=lr),
@@ -81,30 +81,28 @@ def optimizer_setting(params):
 
 
 def net_config(image, label, model, args, is_train):
-    assert args.model in model_list, "{} is not in lists: {}".format(
-        args.model, model_list)
+    assert args.model in model_list, "{} is not in lists: {}".format(args.model,
+                                                                     model_list)
 
     out = model.net(input=image, embedding_size=args.embedding_size)
     if not is_train:
         return None, None, None, out
 
     if args.loss_name == "softmax":
-        metricloss = SoftmaxLoss(
-                class_dim=args.class_dim,
-        )
+        metricloss = SoftmaxLoss(class_dim=args.class_dim, )
     elif args.loss_name == "arcmargin":
         metricloss = ArcMarginLoss(
-                class_dim = args.class_dim,
-                margin = args.arc_margin,
-                scale = args.arc_scale,
-                easy_margin = args.arc_easy_margin,
-        )
+            class_dim=args.class_dim,
+            margin=args.arc_margin,
+            scale=args.arc_scale,
+            easy_margin=args.arc_easy_margin, )
     cost, logit = metricloss.loss(out, label)
     avg_cost = fluid.layers.mean(x=cost)
     acc_top1 = fluid.layers.accuracy(input=logit, label=label, k=1)
     acc_top5 = fluid.layers.accuracy(input=logit, label=label, k=5)
     return avg_cost, acc_top1, acc_top5, out
 
+
 def build_program(is_train, main_prog, startup_prog, args):
     image_shape = [int(m) for m in args.image_shape.split(",")]
     model = models.__dict__[args.model]()
@@ -119,11 +117,13 @@ def build_program(is_train, main_prog, startup_prog, args):
                 use_double_buffer=True)
             image, label = fluid.layers.read_file(py_reader)
         else:
-            image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+            image = fluid.layers.data(
+                name='image', shape=image_shape, dtype='float32')
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
         with fluid.unique_name.guard():
-            avg_cost, acc_top1, acc_top5, out = net_config(image, label, model, args, is_train)
+            avg_cost, acc_top1, acc_top5, out = net_config(image, label, model,
+                                                           args, is_train)
             if is_train:
                 params = model.params
                 params["lr"] = args.lr
@@ -138,7 +138,7 @@ def build_program(is_train, main_prog, startup_prog, args):
     """
     if is_train:
         return py_reader, avg_cost, acc_top1, acc_top5, global_lr
-    else: 
+    else:
         return out, image, label
 
 
@@ -175,7 +175,9 @@ def train_async(args):
             args=args)
     test_prog = tmp_prog.clone(for_test=True)
 
-    train_fetch_list = [global_lr.name, train_cost.name, train_acc1.name, train_acc5.name]
+    train_fetch_list = [
+        global_lr.name, train_cost.name, train_acc1.name, train_acc5.name
+    ]
     test_fetch_list = [test_feas.name]
 
     place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
@@ -196,13 +198,18 @@ def train_async(args):
         fluid.io.load_vars(
             exe, pretrained_model, main_program=train_prog, predicate=if_exist)
 
-    devicenum = get_gpu_num()
+    if args.use_gpu:
+        devicenum = get_gpu_num()
+    else:
+        devicenum = int(os.environ.get('CPU_NUM', 1))
     assert (args.train_batch_size % devicenum) == 0
     train_batch_size = args.train_batch_size // devicenum
     test_batch_size = args.test_batch_size
-    
-    train_reader = paddle.batch(reader.train(args), batch_size=train_batch_size, drop_last=True)
-    test_reader = paddle.batch(reader.test(args), batch_size=test_batch_size, drop_last=False)
+
+    train_reader = paddle.batch(
+        reader.train(args), batch_size=train_batch_size, drop_last=True)
+    test_reader = paddle.batch(
+        reader.test(args), batch_size=test_batch_size, drop_last=False)
     test_feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
     train_py_reader.decorate_paddle_reader(train_reader)
 
@@ -239,12 +246,14 @@ def train_async(args):
             train_info = [0, 0, 0, 0]
 
         totalruntime += period
-        
+
         if iter_no % args.test_iter_step == 0 and iter_no != 0:
             f, l = [], []
             for batch_id, data in enumerate(test_reader()):
                 t1 = time.time()
-                [feas] = exe.run(test_prog, fetch_list = test_fetch_list, feed=test_feeder.feed(data))
+                [feas] = exe.run(test_prog,
+                                 fetch_list=test_fetch_list,
+                                 feed=test_feeder.feed(data))
                 label = np.asarray([x[1] for x in data])
                 f.append(feas)
                 l.append(label)
@@ -285,10 +294,10 @@ def initlogging():
     logging.basicConfig(
         level=loglevel,
         # logger.BASIC_FORMAT,
-        format=
-        "%(levelname)s:%(filename)s[%(lineno)s] %(name)s:%(funcName)s->%(message)s",
+        format="%(levelname)s:%(filename)s[%(lineno)s] %(name)s:%(funcName)s->%(message)s",
         datefmt='%a, %d %b %Y %H:%M:%S')
 
+
 def main():
     args = parser.parse_args()
     print_arguments(args)
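The substantive change above (the rest of the train_elem.py diff is mechanical reformatting) is the device-count branch: get_gpu_num() is now consulted only when use_gpu is set, and CPU runs take the device count from Paddle's CPU_NUM environment variable instead, defaulting to 1. Below is a minimal standalone sketch of the patched logic; the body of get_gpu_num is an assumption (the repo's real utility is not shown in this patch), written here as a count of CUDA_VISIBLE_DEVICES entries.

import os

def get_gpu_num():
    # Hypothetical stand-in for the repo's utility, which this patch does
    # not show: count the devices listed in CUDA_VISIBLE_DEVICES.
    visible = os.environ.get('CUDA_VISIBLE_DEVICES', '').strip()
    return len(visible.split(',')) if visible else 1

def get_device_num(use_gpu):
    # The patched check: query GPUs only when training on GPU; otherwise
    # fall back to the number of CPU devices (CPU_NUM), defaulting to 1.
    if use_gpu:
        return get_gpu_num()
    return int(os.environ.get('CPU_NUM', 1))

# As in train_elem.py: the global batch size must divide evenly across
# devices, and floor division keeps the per-device size an integer.
devicenum = get_device_num(use_gpu=False)
global_batch_size = 256
assert (global_batch_size % devicenum) == 0
per_device_batch_size = global_batch_size // devicenum

Note that train_elem.py computes the per-device batch size with integer division (//) while train_pair.py below keeps plain /, which yields a float under Python 3; a caller that needs an integer batch size would have to cast the result.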
diff --git a/PaddleCV/metric_learning/train_pair.py b/PaddleCV/metric_learning/train_pair.py
index 1b0c2e84..599f0322 100644
--- a/PaddleCV/metric_learning/train_pair.py
+++ b/PaddleCV/metric_learning/train_pair.py
@@ -46,7 +46,7 @@ add_arg('test_batch_size', int, 50, "Minibatch size.")
 add_arg('image_shape', str, "3,224,224", "input image size")
 add_arg('class_dim', int, 11318, "Class number.")
 add_arg('lr', float, 0.0001, "set learning rate.")
-add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.")
+add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.")
 add_arg('lr_steps', str, "100000", "step of lr")
 add_arg('total_iter_num', int, 100000, "total_iter_num")
 add_arg('display_iter_step', int, 10, "display_iter_step.")
@@ -64,15 +64,15 @@ add_arg('npairs_reg_lambda', float, 0.01, "npairs reg lambda.")
 
 model_list = [m for m in dir(models) if "__" not in m]
 
+
 def optimizer_setting(params):
     ls = params["learning_strategy"]
     assert ls["name"] == "piecewise_decay", \
-        "learning rate strategy must be {}, \
-        but got {}".format("piecewise_decay", lr["name"])
+        "learning rate strategy must be {}, but got {}".format("piecewise_decay", lr["name"])
 
     bd = [int(e) for e in ls["lr_steps"].split(',')]
     base_lr = params["lr"]
-    lr = [base_lr * (0.1 ** i) for i in range(len(bd) + 1)]
+    lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
     optimizer = fluid.optimizer.Momentum(
         learning_rate=fluid.layers.piecewise_decay(
             boundaries=bd, values=lr),
@@ -82,38 +82,34 @@ def optimizer_setting(params):
 
 
 def net_config(image, label, model, args, is_train):
-    assert args.model in model_list, "{} is not in lists: {}".format(
-        args.model, model_list)
+    assert args.model in model_list, "{} is not in lists: {}".format(args.model,
+                                                                     model_list)
 
     out = model.net(input=image, embedding_size=args.embedding_size)
     if not is_train:
         return None, out
 
     if args.loss_name == "triplet":
-        metricloss = TripletLoss(
-                margin=args.margin,
-        )
+        metricloss = TripletLoss(margin=args.margin, )
     elif args.loss_name == "quadruplet":
         metricloss = QuadrupletLoss(
-                train_batch_size = args.train_batch_size,
-                samples_each_class = args.samples_each_class,
-                margin=args.margin,
-        )
+            train_batch_size=args.train_batch_size,
+            samples_each_class=args.samples_each_class,
+            margin=args.margin, )
     elif args.loss_name == "eml":
         metricloss = EmlLoss(
-                train_batch_size = args.train_batch_size,
-                samples_each_class = args.samples_each_class,
-        )
+            train_batch_size=args.train_batch_size,
+            samples_each_class=args.samples_each_class, )
     elif args.loss_name == "npairs":
         metricloss = NpairsLoss(
-                train_batch_size = args.train_batch_size,
-                samples_each_class = args.samples_each_class,
-                reg_lambda = args.npairs_reg_lambda,
-        )
+            train_batch_size=args.train_batch_size,
+            samples_each_class=args.samples_each_class,
+            reg_lambda=args.npairs_reg_lambda, )
     cost = metricloss.loss(out, label)
     avg_cost = fluid.layers.mean(x=cost)
     return avg_cost, out
 
+
 def build_program(is_train, main_prog, startup_prog, args):
     image_shape = [int(m) for m in args.image_shape.split(",")]
     model = models.__dict__[args.model]()
@@ -128,7 +124,8 @@ def build_program(is_train, main_prog, startup_prog, args):
                 use_double_buffer=True)
             image, label = fluid.layers.read_file(py_reader)
         else:
-            image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+            image = fluid.layers.data(
+                name='image', shape=image_shape, dtype='float32')
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
         with fluid.unique_name.guard():
@@ -147,7 +144,7 @@ def build_program(is_train, main_prog, startup_prog, args):
     """
     if is_train:
         return py_reader, avg_cost, global_lr, out, label
-    else: 
+    else:
         return out, image, label
 
 
@@ -176,7 +173,9 @@ def train_async(args):
             args=args)
    test_prog = tmp_prog.clone(for_test=True)
 
-    train_fetch_list = [global_lr.name, train_cost.name, train_feas.name, train_label.name]
+    train_fetch_list = [
+        global_lr.name, train_cost.name, train_feas.name, train_label.name
+    ]
     test_fetch_list = [test_feas.name]
 
     place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
@@ -197,13 +196,18 @@ def train_async(args):
         fluid.io.load_vars(
             exe, pretrained_model, main_program=train_prog, predicate=if_exist)
 
-    devicenum = get_gpu_num()
+    if args.use_gpu:
+        devicenum = get_gpu_num()
+    else:
+        devicenum = int(os.environ.get('CPU_NUM', 1))
     assert (args.train_batch_size % devicenum) == 0
     train_batch_size = args.train_batch_size / devicenum
     test_batch_size = args.test_batch_size
-    
-    train_reader = paddle.batch(reader.train(args), batch_size=train_batch_size, drop_last=True)
-    test_reader = paddle.batch(reader.test(args), batch_size=test_batch_size, drop_last=False)
+
+    train_reader = paddle.batch(
+        reader.train(args), batch_size=train_batch_size, drop_last=True)
+    test_reader = paddle.batch(
+        reader.test(args), batch_size=test_batch_size, drop_last=False)
     test_feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
     train_py_reader.decorate_paddle_reader(train_reader)
 
@@ -238,12 +242,14 @@ def train_async(args):
             train_info = [0, 0, 0]
 
         totalruntime += period
-        
+
         if iter_no % args.test_iter_step == 0 and iter_no != 0:
             f, l = [], []
             for batch_id, data in enumerate(test_reader()):
                 t1 = time.time()
-                [feas] = exe.run(test_prog, fetch_list = test_fetch_list, feed=test_feeder.feed(data))
+                [feas] = exe.run(test_prog,
+                                 fetch_list=test_fetch_list,
+                                 feed=test_feeder.feed(data))
                 label = np.asarray([x[1] for x in data])
                 f.append(feas)
                 l.append(label)
@@ -270,6 +276,7 @@ def train_async(args):
 
         iter_no += 1
 
+
 def initlogging():
     for handler in logging.root.handlers[:]:
         logging.root.removeHandler(handler)
@@ -277,10 +284,10 @@ def initlogging():
     logging.basicConfig(
         level=loglevel,
         # logger.BASIC_FORMAT,
-        format=
-        "%(levelname)s:%(filename)s[%(lineno)s] %(name)s:%(funcName)s->%(message)s",
+        format="%(levelname)s:%(filename)s[%(lineno)s] %(name)s:%(funcName)s->%(message)s",
         datefmt='%a, %d %b %Y %H:%M:%S')
 
+
 def main():
     args = parser.parse_args()
     print_arguments(args)
-- 
GitLab
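For reference, both files share the optimizer_setting hunks reformatted above. The sketch below reproduces the piecewise-decay arithmetic without Paddle; it deliberately formats the assertion message from ls["name"], because the lr referenced in the patch's message is not assigned until two lines later and would raise a NameError if the assertion ever failed.

def piecewise_decay_values(params):
    # Mirrors optimizer_setting: boundaries come from the comma-separated
    # lr_steps string, and the learning rate drops 10x at each boundary,
    # giving one value per interval (len(bd) + 1 values in total).
    ls = params["learning_strategy"]
    assert ls["name"] == "piecewise_decay", \
        "learning rate strategy must be {}, but got {}".format(
            "piecewise_decay", ls["name"])
    bd = [int(e) for e in ls["lr_steps"].split(',')]
    lr = [params["lr"] * (0.1 ** i) for i in range(len(bd) + 1)]
    return bd, lr

# With the train_elem.py defaults (lr=0.01, lr_steps="15000,25000") this
# yields bd == [15000, 25000] and lr ~ [0.01, 0.001, 0.0001]: iterations
# 0-14999 run at 0.01, 15000-24999 at 0.001, and 25000 onward at 0.0001.
bd, lr = piecewise_decay_values({
    "lr": 0.01,
    "learning_strategy": {"name": "piecewise_decay",
                          "lr_steps": "15000,25000"},
})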