diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 399f5fb49c1f36b1a70657774770e2974dba2c00..c2771ba5db1204011d14ec9b49aba2bc1077a643 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -38,7 +38,10 @@ def parse_args(): default='resnet', help='The model to run benchmark with.') parser.add_argument( - '--batch_size', type=int, default=32, help='The minibatch size.') + '--batch_size', + type=int, + default=32, + help='The batch size on each gpu.') parser.add_argument( '--learning_rate', type=float, default=0.001, help='The learning rate.') parser.add_argument( @@ -229,27 +232,35 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, iters, num_samples, start_time = 0, 0, time.time() for pass_id in range(args.pass_num): train_losses = [] - reader_generator = train_reader() + if not args.use_reader_op: + reader_generator = train_reader() batch_id = 0 data = None while True: if not args.use_reader_op: data = next(reader_generator, None) - if iters == args.iterations or data == None: + if data == None: + break + if iters == args.iterations: break if iters == args.skip_batch_num: start_time = time.time() num_samples = 0 if args.use_reader_op: - loss = exe.run(train_prog, fetch_list=[avg_loss]) + try: + loss = exe.run(train_prog, fetch_list=[avg_loss]) + except fluid.core.EnforceNotMet as ex: + break else: loss = exe.run(train_prog, feed=feeder.feed(data), fetch_list=[avg_loss]) iters += 1 batch_id += 1 - # FIXME(wuyi): last batch size maybe different + # FIXME(wuyi): For use_reader_op, if the current + # pass is not the last, the last batch of this pass + # is also equal to args.batch_size. num_samples += len(args.batch_size) train_losses.append(loss) print("Pass: %d, Iter: %d, Loss: %f\n" % @@ -315,13 +326,16 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, num_samples = 0 iters = 0 start_time = time.time() - reader_generator = train_reader() + if not args.use_reader_op: + reader_generator = train_reader() batch_id = 0 data = None while True: if not args.use_reader_op: data = next(reader_generator, None) - if iters == args.iterations or data == None: + if data == None: + break + if iters == args.iterations: break if args.profile and pass_id == 0 and batch_id == 5: profiler.start_profiler("All") @@ -335,7 +349,10 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, if args.use_reader_op and iters >= args.iterations / args.gpus: break if args.use_fake_data or args.use_reader_op: - loss, = exe.run([avg_loss.name]) + try: + loss, = exe.run([avg_loss.name]) + except fluid.core.EnforceNotMet as ex: + break else: loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) if args.update_method == "pserver": diff --git a/benchmark/fluid/models/machine_translation.py b/benchmark/fluid/models/machine_translation.py index 3024882725dd8b02ca0991f3d70f13bf1d9808e1..69541adf6b7e53fcc1ac9d3c82b5a60ca0a72879 100644 --- a/benchmark/fluid/models/machine_translation.py +++ b/benchmark/fluid/models/machine_translation.py @@ -223,7 +223,7 @@ def get_model(args): train_batch_generator = paddle.batch( paddle.reader.shuffle( paddle.dataset.wmt14.train(dict_size), buf_size=1000), - batch_size=args.batch_size) + batch_size=args.batch_size * args.gpus) test_batch_generator = paddle.batch( paddle.reader.shuffle( diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py index 5d3da68dafa6c0cea6aafd678c5425f2f6a361aa..54206c252ca33385995a6119e3bbe4b594ffbc6f 100644 --- a/benchmark/fluid/models/mnist.py +++ b/benchmark/fluid/models/mnist.py @@ -103,7 +103,7 @@ def get_model(args): # Reader train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=args.batch_size) + paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus) test_reader = paddle.batch( paddle.dataset.mnist.test(), batch_size=args.batch_size) return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index 47d8d026edca6d0a99ab50458f7c79869dca0ff8..3c87076724bc6bd24523a0fa0f829a0c9860f6c6 100644 --- a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -184,7 +184,7 @@ def get_model(args): batched_train_reader = paddle.batch( paddle.reader.shuffle( train_reader, buf_size=5120), - batch_size=args.batch_size) + batch_size=args.batch_size * args.gpus) batched_test_reader = paddle.batch(train_reader, batch_size=args.batch_size) return avg_cost, inference_program, optimizer, batched_train_reader, batched_test_reader, batch_acc diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py index e2a8cf45ac3f2cb5c4023dba176d9d1670799ed3..211869af4e8d7180cb485811d3363c50d32f0f74 100644 --- a/benchmark/fluid/models/stacked_dynamic_lstm.py +++ b/benchmark/fluid/models/stacked_dynamic_lstm.py @@ -118,7 +118,7 @@ def get_model(args): train_reader = batch( paddle.reader.shuffle( crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000), - batch_size=args.batch_size) + batch_size=args.batch_size * args.gpus) test_reader = batch( paddle.reader.shuffle( crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000), diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py index b84e118a88e53ebef0cfe26b21c0a4edc5107d20..cb0dc977634429178c441b1d2777f865c749c2db 100644 --- a/benchmark/fluid/models/vgg.py +++ b/benchmark/fluid/models/vgg.py @@ -110,7 +110,7 @@ def get_model(args): paddle.dataset.cifar.train10() if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), buf_size=5120), - batch_size=args.batch_size) + batch_size=args.batch_size * args.gpus) test_reader = paddle.batch( paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),