From 55d3951bed724ec17830e9bbfd23ed25c9b429bc Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Wed, 23 May 2018 17:19:44 +0800 Subject: [PATCH] Benchmark/Integrate benchmark scripts (#10707) * wip integrate benchmark scripts * testing nlp models * k8s script to start dist benchmark job * update script * done support all models * add README.md * update by comment * clean up * follow comments --- benchmark/fluid/README.md | 60 +++ benchmark/fluid/fluid_benchmark.py | 351 ++++++++++++++++++ benchmark/fluid/kube_gen_job.py | 190 ++++++++++ benchmark/fluid/kube_templates/__init__.py | 58 +++ benchmark/fluid/kube_templates/pserver.py | 58 +++ benchmark/fluid/kube_templates/trainer.py | 70 ++++ benchmark/fluid/mnist.py | 228 ------------ benchmark/fluid/models/__init__.py | 17 + .../fluid/{ => models}/machine_translation.py | 183 +-------- benchmark/fluid/models/mnist.py | 94 +++++ benchmark/fluid/models/resnet.py | 161 ++++++++ .../{ => models}/stacked_dynamic_lstm.py | 119 +----- benchmark/fluid/models/vgg.py | 104 ++++++ benchmark/fluid/resnet.py | 317 ---------------- benchmark/fluid/vgg.py | 228 ------------ 15 files changed, 1192 insertions(+), 1046 deletions(-) create mode 100644 benchmark/fluid/README.md create mode 100644 benchmark/fluid/fluid_benchmark.py create mode 100644 benchmark/fluid/kube_gen_job.py create mode 100644 benchmark/fluid/kube_templates/__init__.py create mode 100644 benchmark/fluid/kube_templates/pserver.py create mode 100644 benchmark/fluid/kube_templates/trainer.py delete mode 100644 benchmark/fluid/mnist.py create mode 100644 benchmark/fluid/models/__init__.py rename benchmark/fluid/{ => models}/machine_translation.py (60%) create mode 100644 benchmark/fluid/models/mnist.py create mode 100644 benchmark/fluid/models/resnet.py rename benchmark/fluid/{ => models}/stacked_dynamic_lstm.py (52%) create mode 100644 benchmark/fluid/models/vgg.py delete mode 100644 benchmark/fluid/resnet.py delete mode 100644 benchmark/fluid/vgg.py diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md new file mode 100644 index 0000000000..0fc02b7043 --- /dev/null +++ b/benchmark/fluid/README.md @@ -0,0 +1,60 @@ +# Fluid Benchmark + +This directory contains several models configurations and tools that used to run +Fluid benchmarks for local and distributed training. + + +## Run the Benchmark + +To start, run the following command to get the full help message: + +```bash +python fluid_benchmark.py --help +``` + +Currently supported `--model` argument include: + +* mnist +* resnet + * you can chose to use different dataset using `--data_set cifar10` or + `--data_set flowers`. +* vgg +* stacked_dynamic_lstm +* machine_translation + +* Run the following command to start a benchmark job locally: + ```bash + python fluid_benchmark.py --model mnist --parallel 1 --device GPU --with_test + ``` + You can choose to use GPU/CPU training. With GPU training, you can specify + `--parallel 1` to run multi GPU training. +* Run distributed training with parameter servers: + * start parameter servers: + ```bash + PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method pserver + ``` + * start trainers: + ```bash + PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method pserver + ``` +* Run distributed training using NCCL2 + ```bash + PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method nccl2 + ``` + +## Run Distributed Benchmark on Kubernetes Cluster + +We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit +distributed benchmark jobs to your cluster. To generate a job yaml, just run: + +```bash +python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver --with_test" --disttype pserver +``` + +Then the yaml files are generated under directory `myjob`, you can run: + +```bash +kubectl create -f myjob/ +``` + +The job shall start. diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py new file mode 100644 index 0000000000..1d8f27440d --- /dev/null +++ b/benchmark/fluid/fluid_benchmark.py @@ -0,0 +1,351 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import cProfile +import time +import os + +import numpy as np + +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.profiler as profiler +import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler + +BENCHMARK_MODELS = [ + "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" +] + + +def parse_args(): + parser = argparse.ArgumentParser('Fluid model benchmarks.') + parser.add_argument( + '--model', + type=str, + choices=BENCHMARK_MODELS, + default='resnet', + help='The model to run benchmark with.') + parser.add_argument( + '--batch_size', type=int, default=32, help='The minibatch size.') + parser.add_argument( + '--learning_rate', + type=float, + default=0.001, + help='The minibatch size.') + # TODO(wuyi): add "--use_fake_data" option back. + parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test' + ) + parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') + parser.add_argument( + '--pass_num', type=int, default=100, help='The number of passes.') + parser.add_argument( + '--data_format', + type=str, + default='NCHW', + choices=['NCHW', 'NHWC'], + help='The data data_format, now only support NCHW.') + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help='The device type.') + parser.add_argument( + '--gpus', + type=int, + default=1, + help='If gpus > 1, will use ParallelExecutor to run, else use Executor.') + parser.add_argument( + '--data_set', + type=str, + default='flowers', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') + parser.add_argument( + '--infer_only', action='store_true', help='If set, run forward only.') + parser.add_argument( + '--use_cprof', action='store_true', help='If set, use cProfile.') + parser.add_argument( + '--use_nvprof', + action='store_true', + help='If set, use nvprof for CUDA.') + parser.add_argument( + '--no_test', + action='store_false', + help='If set, test the testset during training.') + parser.add_argument( + '--memory_optimize', + action='store_true', + help='If set, optimize runtime memory before start.') + parser.add_argument( + '--update_method', + type=str, + default='local', + choices=['local', 'pserver', 'nccl2'], + help='Choose parameter update method, can be local, pserver, nccl2.') + args = parser.parse_args() + return args + + +def append_nccl2_prepare(): + if os.getenv("PADDLE_TRAINER_ID", None) != None: + # append gen_nccl_id at the end of startup program + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + port = os.getenv("PADDLE_PSERVER_PORT") + worker_ips = os.getenv("PADDLE_TRAINER_IPS") + worker_endpoints = [] + for ip in worker_ips.split(","): + worker_endpoints.append(':'.join([ip, port])) + num_trainers = len(worker_endpoints) + current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port + worker_endpoints.remove(current_endpoint) + + nccl_id_var = fluid.default_startup_program().global_block().create_var( + name="NCCLID", + persistable=True, + type=fluid.core.VarDesc.VarType.RAW) + fluid.default_startup_program().global_block().append_op( + type="gen_nccl_id", + inputs={}, + outputs={"NCCLID": nccl_id_var}, + attrs={ + "endpoint": current_endpoint, + "endpoint_list": worker_endpoints, + "trainer_id": trainer_id + }) + return nccl_id_var, num_trainers, trainer_id + else: + raise Exception( + "must set PADDLE_TRAINER_ID env variables for dist train.") + + +def dist_transpile(): + if "PADDLE_TRAINING_ROLE" not in os.environ: + return None, None + + # the port of all pservers, needed by both trainer and pserver + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + # comma separated ips of all pservers, needed by trainer and + # pserver + pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "") + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) + # total number of workers/trainers in the job, needed by + # trainer and pserver + trainers = int(os.getenv("PADDLE_TRAINERS")) + # the IP of the local machine, needed by pserver only + current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port + # the unique trainer id, starting from 0, needed by trainer + # only + trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + # the role, should be either PSERVER or TRAINER + training_role = os.getenv("PADDLE_TRAINING_ROLE") + + t = distribute_transpiler.DistributeTranspiler() + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) + if training_role == "PSERVER": + pserver_program = t.get_pserver_program(current_endpoint) + pserver_startup_program = t.get_startup_program(current_endpoint, + pserver_program) + return pserver_program, pserver_startup_program + elif training_role == "TRAINER": + train_program = t.get_trainer_program() + return train_program, fluid.default_startup_program() + else: + raise ValueError( + 'TRAINING_ROLE environment variable must be either TRAINER or PSERVER' + ) + + +def test(exe, inference_program, test_reader, feeder, batch_acc): + accuracy_evaluator = fluid.metrics.Accuracy() + for batch_id, data in enumerate(test_reader()): + acc = exe.run(inference_program, + feed=feeder.feed(data), + fetch_list=[batch_acc]) + accuracy_evaluator.update(value=np.array(acc), weight=len(data)) + + return accuracy_evaluator.eval() + + +# TODO(wuyi): replace train, train_parallel, test functions with new trainer +# API once it is ready. +def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, + args, train_prog, startup_prog): + if os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER": + place = core.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + exe.run(train_prog) + return + + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(startup_prog) + feed_var_list = [ + var for var in train_prog.global_block().vars.itervalues() + if var.is_data + ] + feeder = fluid.DataFeeder(feed_var_list, place) + + iters, num_samples, start_time = 0, 0, time.time() + for pass_id in range(args.pass_num): + train_losses = [] + for batch_id, data in enumerate(train_reader()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break + loss = exe.run(train_prog, + feed=feeder.feed(data), + fetch_list=[avg_loss]) + iters += 1 + num_samples += len(data) + train_losses.append(loss) + print("Pass: %d, Iter: %d, Loss: %f\n" % + (pass_id, iters, np.mean(train_losses))) + train_elapsed = time.time() - start_time + examples_per_sec = num_samples / train_elapsed + print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' % + (num_samples, train_elapsed, examples_per_sec)) + print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))) + # evaluation + if not args.no_test and batch_acc != None: + pass_test_acc = test(exe, infer_prog, test_reader, feeder, + batch_acc) + print(", Test Accuracy: %f" % pass_test_acc) + print("\n") + # TODO(wuyi): add warmup passes to get better perf data. + exit(0) + + +# TODO(wuyi): replace train, train_parallel, test functions with new trainer +# API once it is ready. +def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, + batch_acc, args, train_prog, startup_prog, nccl_id_var, + num_trainers, trainer_id): + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + startup_exe = fluid.Executor(place) + startup_exe.run(startup_prog) + strategy = fluid.ExecutionStrategy() + strategy.num_threads = 1 + strategy.allow_op_delay = False + exe = fluid.ParallelExecutor( + True, + avg_loss.name, + exec_strategy=strategy, + num_trainers=num_trainers, + trainer_id=trainer_id) + feed_var_list = [ + var for var in train_prog.global_block().vars.itervalues() + if var.is_data + ] + feeder = fluid.DataFeeder(feed_var_list, place) + for pass_id in range(args.pass_num): + num_samples = 0 + iters = 0 + start_time = time.time() + for batch_id, data in enumerate(train_reader()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break + loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) + if args.update_method == "pserver": + exe.bcast_params() + num_samples += len(data) + iters += 1 + if batch_id % 1 == 0: + print("Pass %d, batch %d, loss %s" % + (pass_id, batch_id, np.array(loss))) + train_elapsed = time.time() - start_time + examples_per_sec = num_samples / train_elapsed + print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % + (num_samples, train_elapsed, examples_per_sec)) + if not args.no_test and batch_acc != None: + test_acc = test(startup_exe, infer_prog, test_reader, feeder, + batch_acc) + print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc)) + exit(0) + + +def print_arguments(args): + vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and + vars(args)['device'] == 'GPU') + print('----------- resnet Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +def main(): + args = parse_args() + print_arguments(args) + nccl_id_var, num_trainers, trainer_id = None, 1, 0 + + if args.use_cprof: + pr = cProfile.Profile() + pr.enable() + model_def = __import__("models.%s" % args.model, fromlist=["models"]) + train_args = list(model_def.get_model(args)) + train_args.append(args) + # Run optimizer.minimize(avg_loss) + train_args[2].minimize(train_args[0]) + if args.memory_optimize: + fluid.memory_optimize(fluid.default_main_program()) + + if args.update_method == "pserver": + train_prog, startup_prog = dist_transpile() + if not train_prog: + raise Exception( + "Must configure correct environments to run dist train.") + train_args.extend([train_prog, startup_prog]) + if args.gpus > 1 and os.getenv("PADDLE_TRAINING_ROLE") == "TRAINER": + train_args.extend([nccl_id_var, num_trainers, trainer_id]) + train_parallel(*train_args) + train(*train_args) + exit(0) + + # for other update methods, use default programs + train_args.append(fluid.default_main_program()) + train_args.append(fluid.default_startup_program()) + + if args.update_method == "nccl2": + nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare() + if args.gpus == 1: + # NOTE: parallel executor use profiler interanlly + if args.use_nvprof and args.device == 'GPU': + with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: + train(*train_args) + else: + train(*train_args) + else: + if args.device == "CPU": + raise Exception("Only support GPU perf with parallel exe") + train_args.extend([nccl_id_var, num_trainers, trainer_id]) + train_parallel(*train_args) + + +if __name__ == "__main__": + main() diff --git a/benchmark/fluid/kube_gen_job.py b/benchmark/fluid/kube_gen_job.py new file mode 100644 index 0000000000..3dbb4b8c5d --- /dev/null +++ b/benchmark/fluid/kube_gen_job.py @@ -0,0 +1,190 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import yaml +import copy +import argparse +import random +import os +from kube_templates import pserver, trainer, envs + + +def parse_args(): + parser = argparse.ArgumentParser(description='Generate dist job yamls.') + + parser.add_argument( + '--jobname', default="paddlejob", help='unique job name') + parser.add_argument( + '--cpu', default=1, type=int, help='CPU cores per trainer node') + parser.add_argument( + '--pscpu', default=1, type=int, help='CPU cores per pserver node') + parser.add_argument( + '--gpu', default=0, type=int, help='num of GPUs per node') + parser.add_argument( + '--image', + default="bootstrapper:5000/fluid_benchmark:gpu", + help='num of GPUs per node') + parser.add_argument( + '--pservers', default=1, type=int, help='num of pservers') + parser.add_argument( + '--trainers', default=1, type=int, help='num of trainers') + parser.add_argument('--memory', default=1, type=int, help='trainer memory') + parser.add_argument( + '--psmemory', default=1, type=int, help='pserver memory') + parser.add_argument( + '--port', default=30236, type=int, help='num of trainers') + parser.add_argument( + '--entry', default="python train.py", help='command to run') + parser.add_argument( + '--fluid', default=1, type=int, help='whether is fluid job') + parser.add_argument( + '--rdma', action='store_ture', help='whether mount rdma libs') + parser.add_argument( + '--disttype', + default="pserver", + type=str, + choices=['pserver', 'nccl2', 'local'], + help='pserver or nccl2 or local') + + args = parser.parse_args() + return args + + +def gen_job(): + ps = pserver + tn = trainer + args = parse_args() + + ps_container = ps["spec"]["template"]["spec"]["containers"][0] + tn_container = tn["spec"]["template"]["spec"]["containers"][0] + + if args.fluid == 1: + ps_container["command"] = \ + ["paddle_k8s", "start_fluid"] + tn_container["command"] = \ + ["paddle_k8s", "start_fluid"] + ps["metadata"]["name"] = args.jobname + "-pserver" + ps["spec"]["template"]["metadata"]["labels"][ + "paddle-job-pserver"] = args.jobname + tn["metadata"]["name"] = args.jobname + "-trainer" + tn["spec"]["template"]["metadata"]["labels"]["paddle-job"] = args.jobname + + ps_container["image"] = args.image + tn_container["image"] = args.image + + ps_container["resources"]["requests"]["cpu"] = str(args.pscpu) + ps_container["resources"]["requests"]["memory"] = str(args.psmemory) + "Gi" + ps_container["resources"]["limits"]["cpu"] = str(args.pscpu) + ps_container["resources"]["limits"]["memory"] = str(args.psmemory) + "Gi" + + tn_container["resources"]["requests"]["cpu"] = str(args.cpu) + tn_container["resources"]["requests"]["memory"] = str(args.memory) + "Gi" + tn_container["resources"]["limits"]["cpu"] = str(args.cpu) + tn_container["resources"]["limits"]["memory"] = str(args.memory) + "Gi" + if args.gpu > 0: + tn_container["resources"]["requests"][ + "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu) + tn_container["resources"]["limits"][ + "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu) + + ps["spec"]["replicas"] = int(args.pservers) + tn["spec"]["parallelism"] = int(args.trainers) + tn["spec"]["completions"] = int(args.trainers) + ps_container["ports"][0]["name"] = "jobport-" + str(args.port) + ps_container["ports"][0]["containerPort"] = args.port + spreadport = random.randint(40000, 60000) + tn_container["ports"][0]["name"] = "spr-" + str(spreadport) + tn_container["ports"][0]["containerPort"] = spreadport + + envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname}) + envs.append({"name": "TRAINERS", "value": str(args.trainers)}) + envs.append({"name": "PSERVERS", "value": str(args.pservers)}) + envs.append({"name": "ENTRY", "value": args.entry}) + envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)}) + # NOTE: these directories below are cluster specific, please modify + # this settings before you run on your own cluster. + envs.append({ + "name": "LD_LIBRARY_PATH", + "value": + "/usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind" + }) + + volumes = [{ + "name": "nvidia-driver", + "hostPath": { + "path": "/usr/local/nvidia/lib64" + } + }] + volumeMounts = [{ + "mountPath": "/usr/local/nvidia/lib64", + "name": "nvidia-driver" + }] + + if args.rdma: + volumes.extend([{ + "name": "ibetc", + "hostPath": { + "path": "/etc/libibverbs.d" + } + }, { + "name": "iblibs", + "hostPath": { + "path": "/usr/local/rdma" + } + }, { + "name": "valgrind", + "hostPath": { + "path": "/usr/lib64/mlnx_ofed/valgrind" + } + }]) + volumeMounts.extend([{ + "mountPath": "/etc/libibverbs.d", + "name": "ibetc" + }, { + "mountPath": "/usr/local/rdma", + "name": "iblibs" + }, { + "mountPath": "/usr/lib64/mlnx_ofed/valgrind", + "name": "valgrind" + }]) + # append shm for NCCL2 + volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}}) + volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"}) + + tn["spec"]["template"]["spec"]["volumes"] = volumes + tn_container["volumeMounts"] = volumeMounts + + ps_container["env"] = envs + ps_container["env"].append({"name": "TRAINING_ROLE", "value": "PSERVER"}) + tn_container["env"] = envs + if args.disttype == "pserver": + tn_container["env"].append({ + "name": "TRAINING_ROLE", + "value": "TRAINER" + }) + elif args.disttype == "nccl2" or args.disttype == "local": + # NCCL2 have no training role, set to plain WORKER + tn_container["env"].append({"name": "TRAINING_ROLE", "value": "WORKER"}) + + os.mkdir(args.jobname) + if args.disttype == "pserver": + with open("%s/pserver.yaml" % args.jobname, "w") as fn: + yaml.dump(ps, fn) + + with open("%s/trainer.yaml" % args.jobname, "w") as fn: + yaml.dump(tn, fn) + + +if __name__ == "__main__": + gen_job() diff --git a/benchmark/fluid/kube_templates/__init__.py b/benchmark/fluid/kube_templates/__init__.py new file mode 100644 index 0000000000..b64a7f78ff --- /dev/null +++ b/benchmark/fluid/kube_templates/__init__.py @@ -0,0 +1,58 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pserver import pserver +from trainer import trainer + +__all__ = ["pserver", "trainer", "envs"] + +envs = [ + # envs that don't need to change + { + "name": "GLOG_v", + "value": "0" + }, + { + "name": "GLOG_logtostderr", + "value": "1" + }, + { + "name": "TOPOLOGY", + "value": "" + }, + { + "name": "TRAINER_PACKAGE", + "value": "/workspace" + }, + { + "name": "PADDLE_INIT_NICS", + "value": "eth2" + }, + { + "name": "NAMESPACE", + "valueFrom": { + "fieldRef": { + "fieldPath": "metadata.namespace" + } + } + }, + { + "name": "POD_IP", + "valueFrom": { + "fieldRef": { + "fieldPath": "status.podIP" + } + } + } +] diff --git a/benchmark/fluid/kube_templates/pserver.py b/benchmark/fluid/kube_templates/pserver.py new file mode 100644 index 0000000000..b54982c806 --- /dev/null +++ b/benchmark/fluid/kube_templates/pserver.py @@ -0,0 +1,58 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +pserver = { + "apiVersion": "extensions/v1beta1", + "kind": "ReplicaSet", + "metadata": { + "name": "jobname-pserver" + }, + "spec": { + "replicas": 1, + "template": { + "metadata": { + "labels": { + "paddle-job-pserver": "jobname" + } + }, + "spec": { + "hostNetwork": True, + "imagePullSecrets": [{ + "name": "job-registry-secret" + }], + "containers": [{ + "name": "pserver", + "image": "", + "imagePullPolicy": "Always", + "ports": [{ + "name": "jobport-1", + "containerPort": 1 + }], + "env": [], + "command": ["paddle_k8s", "start_pserver"], + "resources": { + "requests": { + "memory": "10Gi", + "cpu": "4" + }, + "limits": { + "memory": "10Gi", + "cpu": "4" + } + } + }] + } + } + } +} diff --git a/benchmark/fluid/kube_templates/trainer.py b/benchmark/fluid/kube_templates/trainer.py new file mode 100644 index 0000000000..b915d31e37 --- /dev/null +++ b/benchmark/fluid/kube_templates/trainer.py @@ -0,0 +1,70 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +trainer = { + "apiVersion": "batch/v1", + "kind": "Job", + "metadata": { + "name": "jobname-pserver" + }, + "spec": { + "parallelism": 4, + "completions": 4, + "template": { + "metadata": { + "labels": { + "paddle-job": "jobname" + } + }, + "spec": { + "hostNetwork": True, + "imagePullSecrets": [{ + "name": "job-registry-secret" + }], + "restartPolicy": "Never", + "containers": [{ + "name": "trainer", + "image": "", + "imagePullPolicy": "Always", + # to let container set rlimit + "securityContext": { + "privileged": True + # TODO(wuyi): use below specific cap instead of privileged, + # using privileged will cause all GPU device are visible + # in the container. + # "capabilities": { + # "add": ["SYS_RESOURCE"] + # } + }, + "ports": [{ + "name": "jobport-1", + "containerPort": 1 + }], + "env": [], + "command": ["paddle_k8s", "start_trainer", "v2"], + "resources": { + "requests": { + "memory": "10Gi", + "cpu": "4", + }, + "limits": { + "memory": "10Gi", + "cpu": "4", + } + } + }] + } + } + } +} diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py deleted file mode 100644 index 400200c474..0000000000 --- a/benchmark/fluid/mnist.py +++ /dev/null @@ -1,228 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import argparse -import time - -import paddle -import paddle.fluid as fluid -import paddle.fluid.profiler as profiler - -SEED = 1 -DTYPE = "float32" - -# random seed must set before configuring the network. -# fluid.default_startup_program().random_seed = SEED - - -def parse_args(): - parser = argparse.ArgumentParser("mnist model benchmark.") - parser.add_argument( - '--batch_size', type=int, default=128, help='The minibatch size.') - parser.add_argument( - '--skip_batch_num', - type=int, - default=5, - help='The first num of minibatch num to skip, for better performance test' - ) - parser.add_argument( - '--iterations', type=int, default=35, help='The number of minibatches.') - parser.add_argument( - '--pass_num', type=int, default=5, help='The number of passes.') - parser.add_argument( - '--device', - type=str, - default='GPU', - choices=['CPU', 'GPU'], - help='The device type.') - parser.add_argument( - '--infer_only', action='store_true', help='If set, run forward only.') - parser.add_argument( - '--use_cprof', action='store_true', help='If set, use cProfile.') - parser.add_argument( - '--use_nvprof', - action='store_true', - help='If set, use nvprof for CUDA.') - parser.add_argument( - '--with_test', - action='store_true', - help='If set, test the testset during training.') - args = parser.parse_args() - return args - - -def cnn_model(data): - conv_pool_1 = fluid.nets.simple_img_conv_pool( - input=data, - filter_size=5, - num_filters=20, - pool_size=2, - pool_stride=2, - act="relu") - conv_pool_2 = fluid.nets.simple_img_conv_pool( - input=conv_pool_1, - filter_size=5, - num_filters=50, - pool_size=2, - pool_stride=2, - act="relu") - - # TODO(dzhwinter) : refine the initializer and random seed settting - SIZE = 10 - input_shape = conv_pool_2.shape - param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] - scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 - - predict = fluid.layers.fc( - input=conv_pool_2, - size=SIZE, - act="softmax", - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.NormalInitializer( - loc=0.0, scale=scale))) - return predict - - -def eval_test(exe, batch_acc, batch_size_tensor, inference_program): - test_reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=args.batch_size) - test_pass_acc = fluid.average.WeightedAverage() - for batch_id, data in enumerate(test_reader()): - img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]), - data)).astype(DTYPE) - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = y_data.reshape([len(y_data), 1]) - - acc, weight = exe.run(inference_program, - feed={"pixel": img_data, - "label": y_data}, - fetch_list=[batch_acc, batch_size_tensor]) - test_pass_acc.add(value=acc, weight=weight) - pass_acc = test_pass_acc.eval() - return pass_acc - - -def run_benchmark(model, args): - if args.use_cprof: - pr = cProfile.Profile() - pr.enable() - start_time = time.time() - # Input data - images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - - # Train program - predict = model(images) - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) - - # Evaluator - batch_size_tensor = fluid.layers.create_tensor(dtype='int64') - batch_acc = fluid.layers.accuracy( - input=predict, label=label, total=batch_size_tensor) - - # inference program - inference_program = fluid.default_main_program().clone() - - # Optimization - opt = fluid.optimizer.AdamOptimizer( - learning_rate=0.001, beta1=0.9, beta2=0.999) - opt.minimize(avg_cost) - - fluid.memory_optimize(fluid.default_main_program()) - - # Initialize executor - place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0) - exe = fluid.Executor(place) - - # Parameter initialization - exe.run(fluid.default_startup_program()) - - # Reader - train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=args.batch_size) - - accuracy = fluid.metrics.Accuracy() - train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name) - iters, num_samples, start_time = 0, 0, time.time() - for pass_id in range(args.pass_num): - accuracy.reset() - train_accs = [] - train_losses = [] - for batch_id, data in enumerate(train_reader()): - if iters == args.skip_batch_num: - start_time = time.time() - num_samples = 0 - if iters == args.iterations: - break - img_data = np.array( - map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE) - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = y_data.reshape([len(y_data), 1]) - - outs = train_exe.run( - feed={"pixel": img_data, - "label": y_data}, - fetch_list=[ - avg_cost.name, batch_acc.name, batch_size_tensor.name - ] - ) # The accuracy is the accumulation of batches, but not the current batch. - accuracy.update( - value=np.array(np.mean(outs[1])), - weight=np.mean(np.array(outs[2]))) - iters += 1 - num_samples += len(y_data) - loss = np.mean(np.array(outs[0])) - acc = np.mean(np.array(outs[1])) - train_losses.append(loss) - train_accs.append(acc) - print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" % - (pass_id, iters, loss, acc)) - - print("Pass: %d, Loss: %f, Train Accuray: %f\n" % - (pass_id, np.mean(train_losses), np.mean(train_accs))) - train_elapsed = time.time() - start_time - examples_per_sec = num_samples / train_elapsed - - print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % - (num_samples, train_elapsed, examples_per_sec)) - # evaluation - if args.with_test: - test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor, - inference_program) - exit(0) - - -def print_arguments(args): - vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and - vars(args)['device'] == 'GPU') - print('----------- mnist Configuration Arguments -----------') - for arg, value in sorted(vars(args).iteritems()): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -if __name__ == '__main__': - args = parse_args() - print_arguments(args) - if args.use_nvprof and args.device == 'GPU': - with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: - run_benchmark(cnn_model, args) - else: - run_benchmark(cnn_model, args) diff --git a/benchmark/fluid/models/__init__.py b/benchmark/fluid/models/__init__.py new file mode 100644 index 0000000000..1c3fcac8dd --- /dev/null +++ b/benchmark/fluid/models/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = [ + "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" +] diff --git a/benchmark/fluid/machine_translation.py b/benchmark/fluid/models/machine_translation.py similarity index 60% rename from benchmark/fluid/machine_translation.py rename to benchmark/fluid/models/machine_translation.py index adde5f21ac..635b3373dd 100644 --- a/benchmark/fluid/machine_translation.py +++ b/benchmark/fluid/models/machine_translation.py @@ -27,74 +27,6 @@ import paddle.fluid.core as core import paddle.fluid.framework as framework from paddle.fluid.executor import Executor -parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--embedding_dim", - type=int, - default=512, - help="The dimension of embedding table. (default: %(default)d)") -parser.add_argument( - "--encoder_size", - type=int, - default=512, - help="The size of encoder bi-rnn unit. (default: %(default)d)") -parser.add_argument( - "--decoder_size", - type=int, - default=512, - help="The size of decoder rnn unit. (default: %(default)d)") -parser.add_argument( - "--batch_size", - type=int, - default=16, - help="The sequence number of a mini-batch data. (default: %(default)d)") -parser.add_argument( - '--skip_batch_num', - type=int, - default=5, - help='The first num of minibatch num to skip, for better performance test') -parser.add_argument( - '--iterations', type=int, default=80, help='The number of minibatches.') -parser.add_argument( - "--dict_size", - type=int, - default=30000, - help="The dictionary capacity. Dictionaries of source sequence and " - "target dictionary have same capacity. (default: %(default)d)") -parser.add_argument( - "--pass_num", - type=int, - default=2, - help="The pass number to train. (default: %(default)d)") -parser.add_argument( - "--learning_rate", - type=float, - default=0.0002, - help="Learning rate used to train the model. (default: %(default)f)") -parser.add_argument( - "--infer_only", action='store_true', help="If set, run forward only.") -parser.add_argument( - "--beam_size", - type=int, - default=3, - help="The width for beam searching. (default: %(default)d)") -parser.add_argument( - '--device', - type=str, - default='GPU', - choices=['CPU', 'GPU'], - help="The device type.") -parser.add_argument( - "--max_length", - type=int, - default=250, - help="The maximum length of sequence when doing generation. " - "(default: %(default)d)") -parser.add_argument( - '--with_test', - action='store_true', - help='If set, test the testset during training.') - def lstm_step(x_t, hidden_t_prev, cell_t_prev, size): def linear(inputs): @@ -264,116 +196,37 @@ def lodtensor_to_ndarray(lod_tensor): return ndarray -def train(): +def get_model(args): + embedding_dim = 512 + encoder_size = 512 + decoder_size = 512 + dict_size = 30000 + beam_size = 3 + max_length = 250 avg_cost, feeding_list = seq_to_seq_net( - args.embedding_dim, - args.encoder_size, - args.decoder_size, - args.dict_size, - args.dict_size, + embedding_dim, + encoder_size, + decoder_size, + dict_size, + dict_size, False, - beam_size=args.beam_size, - max_length=args.max_length) + beam_size=beam_size, + max_length=max_length) # clone from default main program inference_program = fluid.default_main_program().clone() optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) - optimizer.minimize(avg_cost) - - fluid.memory_optimize(fluid.default_main_program()) train_batch_generator = paddle.batch( paddle.reader.shuffle( - paddle.dataset.wmt14.train(args.dict_size), buf_size=1000), + paddle.dataset.wmt14.train(dict_size), buf_size=1000), batch_size=args.batch_size) test_batch_generator = paddle.batch( paddle.reader.shuffle( - paddle.dataset.wmt14.test(args.dict_size), buf_size=1000), + paddle.dataset.wmt14.test(dict_size), buf_size=1000), batch_size=args.batch_size) - place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) - exe = Executor(place) - exe.run(framework.default_startup_program()) - - def do_validation(): - total_loss = 0.0 - count = 0 - for batch_id, data in enumerate(test_batch_generator()): - src_seq = to_lodtensor(map(lambda x: x[0], data), place)[0] - trg_seq = to_lodtensor(map(lambda x: x[1], data), place)[0] - lbl_seq = to_lodtensor(map(lambda x: x[2], data), place)[0] - - fetch_outs = exe.run(inference_program, - feed={ - feeding_list[0]: src_seq, - feeding_list[1]: trg_seq, - feeding_list[2]: lbl_seq - }, - fetch_list=[avg_cost], - return_numpy=False) - - total_loss += lodtensor_to_ndarray(fetch_outs[0])[0] - count += 1 - - return total_loss / count - - iters, num_samples, start_time = 0, 0, time.time() - for pass_id in xrange(args.pass_num): - train_accs = [] - train_losses = [] - for batch_id, data in enumerate(train_batch_generator()): - if iters == args.skip_batch_num: - start_time = time.time() - num_samples = 0 - if iters == args.iterations: - break - src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place) - num_samples += word_num - trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place) - num_samples += word_num - lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place) - - fetch_outs = exe.run(framework.default_main_program(), - feed={ - feeding_list[0]: src_seq, - feeding_list[1]: trg_seq, - feeding_list[2]: lbl_seq - }, - fetch_list=[avg_cost]) - - iters += 1 - loss = np.array(fetch_outs[0]) - print( - "Pass = %d, Iter = %d, Loss = %f" % (pass_id, iters, loss) - ) # The accuracy is the accumulation of batches, but not the current batch. - - train_elapsed = time.time() - start_time - examples_per_sec = num_samples / train_elapsed - print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % - (num_samples, train_elapsed, examples_per_sec)) - # evaluation - if args.with_test: - test_loss = do_validation() - exit(0) - - -def infer(): - pass - - -def print_arguments(args): - print('----------- seq2seq Configuration Arguments -----------') - for arg, value in sorted(vars(args).iteritems()): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -if __name__ == '__main__': - args = parser.parse_args() - print_arguments(args) - if args.infer_only: - infer() - else: - train() + return avg_cost, inference_program, optimizer, train_batch_generator, \ + test_batch_generator, None diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py new file mode 100644 index 0000000000..d264bfc12b --- /dev/null +++ b/benchmark/fluid/models/mnist.py @@ -0,0 +1,94 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import argparse +import time +import cProfile + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler + +SEED = 1 +DTYPE = "float32" + +# random seed must set before configuring the network. +# fluid.default_startup_program().random_seed = SEED + + +def cnn_model(data): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=data, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + + # TODO(dzhwinter) : refine the initializer and random seed settting + SIZE = 10 + input_shape = conv_pool_2.shape + param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] + scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 + + predict = fluid.layers.fc( + input=conv_pool_2, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale))) + return predict + + +def get_model(args): + # Input data + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + predict = cnn_model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + # inference program + inference_program = fluid.default_main_program().clone() + + # Optimization + opt = fluid.optimizer.AdamOptimizer( + learning_rate=0.001, beta1=0.9, beta2=0.999) + + # Reader + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=args.batch_size) + return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py new file mode 100644 index 0000000000..9dec8911ed --- /dev/null +++ b/benchmark/fluid/models/resnet.py @@ -0,0 +1,161 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import numpy as np +import time + +import cProfile, pstats, StringIO + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.profiler as profiler + + +def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): + conv1 = fluid.layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=False) + return fluid.layers.batch_norm(input=conv1, act=act) + + +def shortcut(input, ch_out, stride): + ch_in = input.shape[1] # if args.data_format == 'NCHW' else input.shape[-1] + if ch_in != ch_out: + return conv_bn_layer(input, ch_out, 1, stride, 0, None) + else: + return input + + +def basicblock(input, ch_out, stride): + short = shortcut(input, ch_out, stride) + conv1 = conv_bn_layer(input, ch_out, 3, stride, 1) + conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None) + return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') + + +def bottleneck(input, ch_out, stride): + short = shortcut(input, ch_out * 4, stride) + conv1 = conv_bn_layer(input, ch_out, 1, stride, 0) + conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1) + conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None) + return fluid.layers.elementwise_add(x=short, y=conv3, act='relu') + + +def layer_warp(block_func, input, ch_out, count, stride): + res_out = block_func(input, ch_out, stride) + for i in range(1, count): + res_out = block_func(res_out, ch_out, 1) + return res_out + + +def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'): + + cfg = { + 18: ([2, 2, 2, 1], basicblock), + 34: ([3, 4, 6, 3], basicblock), + 50: ([3, 4, 6, 3], bottleneck), + 101: ([3, 4, 23, 3], bottleneck), + 152: ([3, 8, 36, 3], bottleneck) + } + stages, block_func = cfg[depth] + conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3) + pool1 = fluid.layers.pool2d( + input=conv1, pool_type='avg', pool_size=3, pool_stride=2) + res1 = layer_warp(block_func, pool1, 64, stages[0], 1) + res2 = layer_warp(block_func, res1, 128, stages[1], 2) + res3 = layer_warp(block_func, res2, 256, stages[2], 2) + res4 = layer_warp(block_func, res3, 512, stages[3], 2) + pool2 = fluid.layers.pool2d( + input=res4, + pool_size=7, + pool_type='avg', + pool_stride=1, + global_pooling=True) + out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax') + return out + + +def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'): + assert (depth - 2) % 6 == 0 + + n = (depth - 2) // 6 + + conv1 = conv_bn_layer( + input=input, ch_out=16, filter_size=3, stride=1, padding=1) + res1 = layer_warp(basicblock, conv1, 16, n, 1) + res2 = layer_warp(basicblock, res1, 32, n, 2) + res3 = layer_warp(basicblock, res2, 64, n, 2) + pool = fluid.layers.pool2d( + input=res3, pool_size=8, pool_type='avg', pool_stride=1) + out = fluid.layers.fc(input=pool, size=class_dim, act='softmax') + return out + + +def get_model(args): + model = resnet_cifar10 + if args.data_set == "cifar10": + class_dim = 10 + if args.data_format == 'NCHW': + dshape = [3, 32, 32] + else: + dshape = [32, 32, 3] + model = resnet_cifar10 + else: + class_dim = 102 + if args.data_format == 'NCHW': + dshape = [3, 224, 224] + else: + dshape = [224, 224, 3] + model = resnet_imagenet + + input = fluid.layers.data(name='data', shape=dshape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + predict = model(input, class_dim) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + batch_size=args.batch_size) + + return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc diff --git a/benchmark/fluid/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py similarity index 52% rename from benchmark/fluid/stacked_dynamic_lstm.py rename to benchmark/fluid/models/stacked_dynamic_lstm.py index 73bcc47b4d..81a28b5f3a 100644 --- a/benchmark/fluid/stacked_dynamic_lstm.py +++ b/benchmark/fluid/models/stacked_dynamic_lstm.py @@ -29,57 +29,6 @@ import paddle.fluid as fluid import paddle.batch as batch import paddle.fluid.profiler as profiler - -def parse_args(): - parser = argparse.ArgumentParser("Understand Sentiment by Dynamic RNN.") - parser.add_argument( - '--batch_size', - type=int, - default=32, - help='The sequence number of a batch data. (default: %(default)d)') - parser.add_argument( - '--skip_batch_num', - type=int, - default=5, - help='The first num of minibatch num to skip, for better performance test' - ) - parser.add_argument( - '--iterations', type=int, default=80, help='The number of minibatches.') - parser.add_argument( - '--emb_dim', - type=int, - default=512, - help='Dimension of embedding table. (default: %(default)d)') - parser.add_argument( - '--hidden_dim', - type=int, - default=512, - help='Hidden size of lstm unit. (default: %(default)d)') - parser.add_argument( - '--pass_num', - type=int, - default=100, - help='Epoch number to train. (default: %(default)d)') - parser.add_argument( - '--device', - type=str, - default='CPU', - choices=['CPU', 'GPU'], - help='The device type.') - parser.add_argument( - '--crop_size', - type=int, - default=int(os.environ.get('CROP_SIZE', '1500')), - help='The max sentence length of input. Since this model use plain RNN,' - ' Gradient could be explored if sentence is too long') - parser.add_argument( - '--with_test', - action='store_true', - help='If set, test the testset during training.') - args = parser.parse_args() - return args - - word_dict = imdb.word_dict() @@ -94,14 +43,15 @@ def crop_sentence(reader, crop_size): return __impl__ -def main(): - args = parse_args() - lstm_size = args.hidden_dim +def get_model(args): + lstm_size = 512 + emb_dim = 512 + crop_size = 1500 data = fluid.layers.data( name="words", shape=[1], lod_level=1, dtype='int64') sentence = fluid.layers.embedding( - input=data, size=[len(word_dict), args.emb_dim]) + input=data, size=[len(word_dict), emb_dim]) sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh') @@ -161,51 +111,17 @@ def main(): target_vars=[batch_acc, batch_size_tensor]) adam = fluid.optimizer.Adam() - adam.minimize(loss) - - fluid.memory_optimize(fluid.default_main_program()) - - place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) train_reader = batch( paddle.reader.shuffle( - crop_sentence(imdb.train(word_dict), args.crop_size), - buf_size=25000), + crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000), + batch_size=args.batch_size) + test_reader = batch( + paddle.reader.shuffle( + crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000), batch_size=args.batch_size) - iters, num_samples, start_time = 0, 0, time.time() - for pass_id in range(args.pass_num): - train_accs = [] - train_losses = [] - for batch_id, data in enumerate(train_reader()): - if iters == args.skip_batch_num: - start_time = time.time() - num_samples = 0 - if iters == args.iterations: - break - tensor_words = to_lodtensor([x[0] for x in data], place) - label = numpy.array([x[1] for x in data]).astype("int64") - label = label.reshape((-1, 1)) - loss_np, acc, weight = exe.run( - fluid.default_main_program(), - feed={"words": tensor_words, - "label": label}, - fetch_list=[loss, batch_acc, batch_size_tensor]) - iters += 1 - for x in data: - num_samples += len(x[0]) - print( - "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" % - (pass_id, iters, loss_np, acc) - ) # The accuracy is the accumulation of batches, but not the current batch. - - train_elapsed = time.time() - start_time - examples_per_sec = num_samples / train_elapsed - print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % - (num_samples, train_elapsed, examples_per_sec)) - exit(0) + return loss, inference_program, adam, train_reader, test_reader, batch_acc def to_lodtensor(data, place): @@ -221,16 +137,3 @@ def to_lodtensor(data, place): res.set(flattened_data, place) res.set_lod([lod]) return res - - -def print_arguments(args): - print('----------- lstm Configuration Arguments -----------') - for arg, value in sorted(vars(args).iteritems()): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -if __name__ == '__main__': - args = parse_args() - print_arguments(args) - main() diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py new file mode 100644 index 0000000000..53856c5f7a --- /dev/null +++ b/benchmark/fluid/models/vgg.py @@ -0,0 +1,104 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""VGG16 benchmark in Fluid""" +from __future__ import print_function + +import sys +import time +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import argparse +import functools + + +def vgg16_bn_drop(input): + def conv_block(input, num_filter, groups, dropouts): + return fluid.nets.img_conv_group( + input=input, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act='relu', + conv_with_batchnorm=True, + conv_batchnorm_drop_rate=dropouts, + pool_type='max') + + conv1 = conv_block(input, 64, 2, [0.3, 0]) + conv2 = conv_block(conv1, 128, 2, [0.4, 0]) + conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) + conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) + conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) + + drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) + fc1 = fluid.layers.fc(input=drop, size=512, act=None) + bn = fluid.layers.batch_norm(input=fc1, act='relu') + drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) + fc2 = fluid.layers.fc(input=drop2, size=512, act=None) + return fc2 + + +def get_model(args): + if args.data_set == "cifar10": + classdim = 10 + if args.data_format == 'NCHW': + data_shape = [3, 32, 32] + else: + data_shape = [32, 32, 3] + else: + classdim = 102 + if args.data_format == 'NCHW': + data_shape = [3, 224, 224] + else: + data_shape = [224, 224, 3] + + # Input data + images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + net = vgg16_bn_drop(images) + predict = fluid.layers.fc(input=net, size=classdim, act='softmax') + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + # inference program + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + # Optimization + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + + # data reader + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + batch_size=args.batch_size) + + return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc diff --git a/benchmark/fluid/resnet.py b/benchmark/fluid/resnet.py deleted file mode 100644 index 0fd7258a80..0000000000 --- a/benchmark/fluid/resnet.py +++ /dev/null @@ -1,317 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import functools -import numpy as np -import time - -import cProfile, pstats, StringIO - -import paddle -import paddle.fluid as fluid -import paddle.fluid.core as core -import paddle.fluid.profiler as profiler - - -def parse_args(): - parser = argparse.ArgumentParser('Convolution model benchmark.') - parser.add_argument( - '--model', - type=str, - choices=['resnet_imagenet', 'resnet_cifar10'], - default='resnet_imagenet', - help='The model architecture.') - parser.add_argument( - '--batch_size', type=int, default=32, help='The minibatch size.') - parser.add_argument( - '--use_fake_data', - action='store_true', - help='use real data or fake data') - parser.add_argument( - '--skip_batch_num', - type=int, - default=5, - help='The first num of minibatch num to skip, for better performance test' - ) - parser.add_argument( - '--iterations', type=int, default=80, help='The number of minibatches.') - parser.add_argument( - '--pass_num', type=int, default=100, help='The number of passes.') - parser.add_argument( - '--data_format', - type=str, - default='NCHW', - choices=['NCHW', 'NHWC'], - help='The data data_format, now only support NCHW.') - parser.add_argument( - '--device', - type=str, - default='GPU', - choices=['CPU', 'GPU'], - help='The device type.') - parser.add_argument( - '--data_set', - type=str, - default='flowers', - choices=['cifar10', 'flowers'], - help='Optional dataset for benchmark.') - parser.add_argument( - '--infer_only', action='store_true', help='If set, run forward only.') - parser.add_argument( - '--use_cprof', action='store_true', help='If set, use cProfile.') - parser.add_argument( - '--use_nvprof', - action='store_true', - help='If set, use nvprof for CUDA.') - parser.add_argument( - '--with_test', - action='store_true', - help='If set, test the testset during training.') - args = parser.parse_args() - return args - - -def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): - conv1 = fluid.layers.conv2d( - input=input, - filter_size=filter_size, - num_filters=ch_out, - stride=stride, - padding=padding, - act=None, - bias_attr=False) - return fluid.layers.batch_norm(input=conv1, act=act) - - -def shortcut(input, ch_out, stride): - ch_in = input.shape[1] if args.data_format == 'NCHW' else input.shape[-1] - if ch_in != ch_out: - return conv_bn_layer(input, ch_out, 1, stride, 0, None) - else: - return input - - -def basicblock(input, ch_out, stride): - short = shortcut(input, ch_out, stride) - conv1 = conv_bn_layer(input, ch_out, 3, stride, 1) - conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None) - return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') - - -def bottleneck(input, ch_out, stride): - short = shortcut(input, ch_out * 4, stride) - conv1 = conv_bn_layer(input, ch_out, 1, stride, 0) - conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1) - conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None) - return fluid.layers.elementwise_add(x=short, y=conv3, act='relu') - - -def layer_warp(block_func, input, ch_out, count, stride): - res_out = block_func(input, ch_out, stride) - for i in range(1, count): - res_out = block_func(res_out, ch_out, 1) - return res_out - - -def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'): - - cfg = { - 18: ([2, 2, 2, 1], basicblock), - 34: ([3, 4, 6, 3], basicblock), - 50: ([3, 4, 6, 3], bottleneck), - 101: ([3, 4, 23, 3], bottleneck), - 152: ([3, 8, 36, 3], bottleneck) - } - stages, block_func = cfg[depth] - conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3) - pool1 = fluid.layers.pool2d( - input=conv1, pool_type='avg', pool_size=3, pool_stride=2) - res1 = layer_warp(block_func, pool1, 64, stages[0], 1) - res2 = layer_warp(block_func, res1, 128, stages[1], 2) - res3 = layer_warp(block_func, res2, 256, stages[2], 2) - res4 = layer_warp(block_func, res3, 512, stages[3], 2) - pool2 = fluid.layers.pool2d( - input=res4, - pool_size=7, - pool_type='avg', - pool_stride=1, - global_pooling=True) - out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax') - return out - - -def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'): - assert (depth - 2) % 6 == 0 - - n = (depth - 2) // 6 - - conv1 = conv_bn_layer( - input=input, ch_out=16, filter_size=3, stride=1, padding=1) - res1 = layer_warp(basicblock, conv1, 16, n, 1) - res2 = layer_warp(basicblock, res1, 32, n, 2) - res3 = layer_warp(basicblock, res2, 64, n, 2) - pool = fluid.layers.pool2d( - input=res3, pool_size=8, pool_type='avg', pool_stride=1) - out = fluid.layers.fc(input=pool, size=class_dim, act='softmax') - return out - - -def run_benchmark(model, args): - if args.use_cprof: - pr = cProfile.Profile() - pr.enable() - - if args.data_set == "cifar10": - class_dim = 10 - if args.data_format == 'NCHW': - dshape = [3, 32, 32] - else: - dshape = [32, 32, 3] - else: - class_dim = 102 - if args.data_format == 'NCHW': - dshape = [3, 224, 224] - else: - dshape = [224, 224, 3] - - input = fluid.layers.data(name='data', shape=dshape, dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - predict = model(input, class_dim) - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) - - batch_size_tensor = fluid.layers.create_tensor(dtype='int64') - batch_acc = fluid.layers.accuracy( - input=predict, label=label, total=batch_size_tensor) - - inference_program = fluid.default_main_program().clone() - with fluid.program_guard(inference_program): - inference_program = fluid.io.get_inference_program( - target_vars=[batch_acc, batch_size_tensor]) - - optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) - opts = optimizer.minimize(avg_cost) - - fluid.memory_optimize(fluid.default_main_program()) - - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.cifar.train10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), - buf_size=5120), - batch_size=args.batch_size) - test_reader = paddle.batch( - paddle.dataset.cifar.test10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), - batch_size=args.batch_size) - - def test(exe): - test_accuracy = fluid.average.WeightedAverage() - for batch_id, data in enumerate(test_reader()): - img_data = np.array(map(lambda x: x[0].reshape(dshape), - data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = y_data.reshape([-1, 1]) - - acc, weight = exe.run(inference_program, - feed={"data": img_data, - "label": y_data}, - fetch_list=[batch_acc, batch_size_tensor]) - test_accuracy.add(value=acc, weight=weight) - - return test_accuracy.eval() - - place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - accuracy = fluid.average.WeightedAverage() - train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name) - if args.use_fake_data: - data = train_reader().next() - image = np.array(map(lambda x: x[0].reshape(dshape), data)).astype( - 'float32') - label = np.array(map(lambda x: x[1], data)).astype('int64') - label = label.reshape([-1, 1]) - - iters, num_samples, start_time = 0, 0, time.time() - for pass_id in range(args.pass_num): - accuracy.reset() - train_accs = [] - train_losses = [] - for batch_id, data in enumerate(train_reader()): - if iters == args.skip_batch_num: - start_time = time.time() - num_samples = 0 - if iters == args.iterations: - break - if not args.use_fake_data: - image = np.array(map(lambda x: x[0].reshape(dshape), - data)).astype('float32') - label = np.array(map(lambda x: x[1], data)).astype('int64') - label = label.reshape([-1, 1]) - loss, acc, weight = train_exe.run( - feed={'data': image, - 'label': label}, - fetch_list=[ - avg_cost.name, batch_acc.name, batch_size_tensor.name - ]) - iters += 1 - num_samples += len(label) - accuracy.add(value=np.array(np.mean(acc)), weight=np.mean(weight)) - loss = np.mean(np.array(loss)) - acc = np.mean(np.array(acc)) - train_losses.append(loss) - train_accs.append(acc) - print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" % - (pass_id, iters, loss, acc)) - print("Pass: %d, Loss: %f, Train Accuray: %f\n" % - (pass_id, np.mean(train_losses), np.mean(train_accs))) - train_elapsed = time.time() - start_time - examples_per_sec = num_samples / train_elapsed - print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % - (num_samples, train_elapsed, examples_per_sec)) - # evaluation - if args.with_test: - pass_test_acc = test(exe) - exit(0) - - -def print_arguments(args): - vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and - vars(args)['device'] == 'GPU') - print('----------- resnet Configuration Arguments -----------') - for arg, value in sorted(vars(args).iteritems()): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -if __name__ == '__main__': - model_map = { - 'resnet_imagenet': resnet_imagenet, - 'resnet_cifar10': resnet_cifar10 - } - args = parse_args() - print_arguments(args) - if args.data_format == 'NHWC': - raise ValueError('Only support NCHW data_format now.') - if args.use_nvprof and args.device == 'GPU': - with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: - run_benchmark(model_map[args.model], args) - else: - run_benchmark(model_map[args.model], args) diff --git a/benchmark/fluid/vgg.py b/benchmark/fluid/vgg.py deleted file mode 100644 index 2a9566a45c..0000000000 --- a/benchmark/fluid/vgg.py +++ /dev/null @@ -1,228 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""VGG16 benchmark in Fluid""" -from __future__ import print_function - -import sys -import time -import numpy as np -import paddle -import paddle.fluid as fluid -import paddle.fluid.core as core -import argparse -import functools - -parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - '--batch_size', type=int, default=128, help="Batch size for training.") -parser.add_argument( - '--skip_batch_num', - type=int, - default=5, - help='The first num of minibatch num to skip, for better performance test') -parser.add_argument( - '--iterations', type=int, default=80, help='The number of minibatches.') -parser.add_argument( - '--learning_rate', - type=float, - default=1e-3, - help="Learning rate for training.") -parser.add_argument('--pass_num', type=int, default=50, help="No. of passes.") -parser.add_argument( - '--device', - type=str, - default='GPU', - choices=['CPU', 'GPU'], - help="The device type.") -parser.add_argument( - '--data_format', - type=str, - default='NCHW', - choices=['NCHW', 'NHWC'], - help='The data order, now only support NCHW.') -parser.add_argument( - '--data_set', - type=str, - default='cifar10', - choices=['cifar10', 'flowers'], - help='Optional dataset for benchmark.') -parser.add_argument( - '--with_test', - action='store_true', - help='If set, test the testset during training.') -args = parser.parse_args() - - -def vgg16_bn_drop(input): - def conv_block(input, num_filter, groups, dropouts): - return fluid.nets.img_conv_group( - input=input, - pool_size=2, - pool_stride=2, - conv_num_filter=[num_filter] * groups, - conv_filter_size=3, - conv_act='relu', - conv_with_batchnorm=True, - conv_batchnorm_drop_rate=dropouts, - pool_type='max') - - conv1 = conv_block(input, 64, 2, [0.3, 0]) - conv2 = conv_block(conv1, 128, 2, [0.4, 0]) - conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) - conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) - conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) - - drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) - fc1 = fluid.layers.fc(input=drop, size=512, act=None) - bn = fluid.layers.batch_norm(input=fc1, act='relu') - drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) - fc2 = fluid.layers.fc(input=drop2, size=512, act=None) - return fc2 - - -def main(): - if args.data_set == "cifar10": - classdim = 10 - if args.data_format == 'NCHW': - data_shape = [3, 32, 32] - else: - data_shape = [32, 32, 3] - else: - classdim = 102 - if args.data_format == 'NCHW': - data_shape = [3, 224, 224] - else: - data_shape = [224, 224, 3] - - # Input data - images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - - # Train program - net = vgg16_bn_drop(images) - predict = fluid.layers.fc(input=net, size=classdim, act='softmax') - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) - - # Evaluator - batch_size_tensor = fluid.layers.create_tensor(dtype='int64') - batch_acc = fluid.layers.accuracy( - input=predict, label=label, total=batch_size_tensor) - - # inference program - inference_program = fluid.default_main_program().clone() - with fluid.program_guard(inference_program): - inference_program = fluid.io.get_inference_program( - target_vars=[batch_acc, batch_size_tensor]) - - # Optimization - optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) - opts = optimizer.minimize(avg_cost) - - fluid.memory_optimize(fluid.default_main_program()) - - # Initialize executor - place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) - exe = fluid.Executor(place) - - # Parameter initialization - exe.run(fluid.default_startup_program()) - - # data reader - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.cifar.train10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), - buf_size=5120), - batch_size=args.batch_size) - test_reader = paddle.batch( - paddle.dataset.cifar.test10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), - batch_size=args.batch_size) - - # test - def test(exe): - test_accuracy = fluid.average.WeightedAverage() - for batch_id, data in enumerate(test_reader()): - img_data = np.array(map(lambda x: x[0].reshape(data_shape), - data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = y_data.reshape([-1, 1]) - - acc, weight = exe.run(inference_program, - feed={"pixel": img_data, - "label": y_data}, - fetch_list=[batch_acc, batch_size_tensor]) - test_accuracy.add(value=acc, weight=weight) - return test_accuracy.eval() - - iters, num_samples, start_time = 0, 0, time.time() - accuracy = fluid.average.WeightedAverage() - train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name) - for pass_id in range(args.pass_num): - accuracy.reset() - train_accs = [] - train_losses = [] - for batch_id, data in enumerate(train_reader()): - if iters == args.skip_batch_num: - start_time = time.time() - num_samples = 0 - if iters == args.iterations: - break - img_data = np.array(map(lambda x: x[0].reshape(data_shape), - data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = y_data.reshape([-1, 1]) - - loss, acc, weight = train_exe.run( - feed={"pixel": img_data, - "label": y_data}, - fetch_list=[ - avg_cost.name, batch_acc.name, batch_size_tensor.name - ]) - accuracy.add(value=np.array(np.mean(acc)), weight=np.mean(weight)) - iters += 1 - num_samples += len(y_data) - loss = np.mean(np.array(loss)) - acc = np.mean(np.array(acc)) - print( - "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" % - (pass_id, iters, loss, acc) - ) # The accuracy is the accumulation of batches, but not the current batch. - - # pass_train_acc = accuracy.eval() - train_losses.append(loss) - train_accs.append(acc) - print("Pass: %d, Loss: %f, Train Accuray: %f\n" % - (pass_id, np.mean(train_losses), np.mean(train_accs))) - train_elapsed = time.time() - start_time - examples_per_sec = num_samples / train_elapsed - print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % - (num_samples, train_elapsed, examples_per_sec)) - # evaluation - if args.with_test: - pass_test_acc = test(exe) - exit(0) - - -def print_arguments(): - print('----------- vgg Configuration Arguments -----------') - for arg, value in sorted(vars(args).iteritems()): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -if __name__ == "__main__": - print_arguments() - main() -- GitLab