refine benchmark

dcf40fd0 · Luo Tao · f7a60017 · dcf40fd0 · dcf40fd0 · dcf40fd0
隐藏空白更改
内联并排

Showing 3 changed files with 31 additions and 23 deletions

benchmark/.gitignore benchmark/.gitignore +3 -0

benchmark/fluid/fluid_benchmark.py benchmark/fluid/fluid_benchmark.py +17 -14

benchmark/fluid/run.sh benchmark/fluid/run.sh +11 -9

未找到文件。
--- a/benchmark/.gitignore
+++ b/benchmark/.gitignore
@@ -7,3 +7,6 @@ paddle/rnn/imdb.pkl
 caffe/image/logs
 tensorflow/image/logs
 tensorflow/rnn/logs
+fluid/models/*.pyc
+fluid/logs
+fluid/nohup.out
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -40,10 +40,7 @@ def parse_args():
    parser.add_argument(
        '--batch_size', type=int, default=32, help='The minibatch size.')
    parser.add_argument(
-        '--learning_rate',
+        '--learning_rate', type=float, default=0.001, help='The learning rate.')
-        type=float,
-        default=0.001,
-        help='The minibatch size.')
    # TODO(wuyi): add "--use_fake_data" option back.
    parser.add_argument(
        '--skip_batch_num',
@@ -72,6 +69,11 @@ def parse_args():
        type=int,
        default=1,
        help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
+    parser.add_argument(
+        '--cpus',
+        type=int,
+        default=1,
+        help='If cpus > 1, will use ParallelDo to run, else use Executor.')
    parser.add_argument(
        '--data_set',
        type=str,
@@ -231,10 +233,7 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
            train_losses.append(loss)
            print("Pass: %d, Iter: %d, Loss: %f\n" %
                  (pass_id, iters, np.mean(train_losses)))
-        train_elapsed = time.time() - start_time
+        print_train_time(start_time, time.time(), num_samples)
-        examples_per_sec = num_samples / train_elapsed
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
-              (num_samples, train_elapsed, examples_per_sec))
        print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses)))
        # evaluation
        if not args.no_test and batch_acc != None:
@@ -315,10 +314,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
            if batch_id % 1 == 0:
                print("Pass %d, batch %d, loss %s" %
                      (pass_id, batch_id, np.array(loss)))
-        train_elapsed = time.time() - start_time
+        print_train_time(start_time, time.time(), num_samples)
-        examples_per_sec = num_samples / train_elapsed
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
-              (num_samples, train_elapsed, examples_per_sec))
        if not args.no_test and batch_acc != None:
            test_acc = test(startup_exe, infer_prog, test_reader, feeder,
                            batch_acc)
@@ -329,12 +325,19 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
 def print_arguments(args):
+    """Print all parsed arguments, one ``name: value`` line each, sorted by name.
+
+    Side effect: ``use_nvprof`` is overwritten in ``vars(args)`` before
+    printing, so it only stays truthy when ``device`` equals ``'GPU'``.
+    """
    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
                                vars(args)['device'] == 'GPU')
-    print('----------- resnet Configuration Arguments -----------')
+    print('----------- Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')
+def print_train_time(start_time, end_time, num_samples):
+    """Print the example count, the elapsed time and the resulting throughput.
+
+    start_time and end_time are timestamps in seconds (the callers pass
+    time.time() values); num_samples is the number of examples processed
+    between them.
+    """
+    train_elapsed = end_time - start_time
+    examples_per_sec = num_samples / train_elapsed
+    # The unit is "examples/sec"; the "examples/sed" spelling was a typo
+    # carried over from the old train_parallel copy of this message.
+    print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+          (num_samples, train_elapsed, examples_per_sec))
 def main():
    args = parse_args()
    print_arguments(args)
@@ -342,7 +345,7 @@ def main():
    # the unique trainer id, starting from 0, needed by trainer
    # only
    nccl_id_var, num_trainers, trainer_id = (
-        None, 1, int(os.getenv("PADDLE_TRAINER_ID", "-1")))
+        None, 1, int(os.getenv("PADDLE_TRAINER_ID", "0")))
    if args.use_cprof:
        pr = cProfile.Profile()

--- a/benchmark/fluid/run.sh
+++ b/benchmark/fluid/run.sh
@@ -2,6 +2,7 @@
 # This script benchmarks PaddlePaddle Fluid on a
 # single thread and a single GPU.
+mkdir -p logs
 #export FLAGS_fraction_of_gpu_memory_to_use=0.0
 export CUDNN_PATH=/paddle/cudnn_v5
@@ -35,6 +36,7 @@ nohup stdbuf -oL nvidia-smi \
      --format=csv \
      --filename=mem.log  \
      -l 1 &
 # mnist
 # mnist gpu mnist 128
 FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
@@ -43,7 +45,7 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
               --batch_size=128 \
               --skip_batch_num=5 \
               --iterations=500 \
-               2>&1 | tee -a mnist_gpu_128.log
+               2>&1 | tee -a logs/mnist_gpu_128.log
 # vgg16
 # gpu cifar10 128
@@ -53,7 +55,7 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
               --batch_size=128 \
               --skip_batch_num=5 \
               --iterations=30 \
-               2>&1 | tee -a vgg16_gpu_128.log
+               2>&1 | tee -a logs/vgg16_gpu_128.log
 # flowers gpu  128
 FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
@@ -63,28 +65,28 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
               --data_set=flowers \
               --skip_batch_num=5 \
               --iterations=30 \
-               2>&1 | tee -a vgg16_gpu_flowers_32.log
+               2>&1 | tee -a logs/vgg16_gpu_flowers_32.log
 # resnet50
 # resnet50 gpu cifar10 128
 FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
-               --model=resnet50 \
+               --model=resnet \
               --device=GPU \
               --batch_size=128 \
               --data_set=cifar10 \
               --skip_batch_num=5 \
               --iterations=30 \
-               2>&1 | tee -a resnet50_gpu_128.log
+               2>&1 | tee -a logs/resnet50_gpu_128.log
 # resnet50 gpu flowers 64
 FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
-               --model=resnet50 \
+               --model=resnet \
               --device=GPU \
               --batch_size=64 \
               --data_set=flowers \
               --skip_batch_num=5 \
               --iterations=30 \
-               2>&1 | tee -a resnet50_gpu_flowers_64.log
+               2>&1 | tee -a logs/resnet50_gpu_flowers_64.log
 # lstm
 # lstm gpu imdb 32 # tensorflow only supports batch=32
@@ -94,7 +96,7 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
               --batch_size=32 \
               --skip_batch_num=5 \
               --iterations=30 \
-               2>&1 | tee -a lstm_gpu_32.log
+               2>&1 | tee -a logs/lstm_gpu_32.log
 # seq2seq
 # seq2seq gpu wmb 128
@@ -104,4 +106,4 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
               --batch_size=128 \
               --skip_batch_num=5 \
               --iterations=30 \
-               2>&1 | tee -a lstm_gpu_128.log
+               2>&1 | tee -a logs/lstm_gpu_128.log