diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ca9bc51ccd7f30eb170a2ae5fdd5f55e0e74364..710b4774ca021c2e916460e7253d4fbf979a38cc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,7 +41,6 @@ option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FO option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) -option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check" ON) option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF) option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF) @@ -59,7 +58,6 @@ option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) -option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) @@ -100,6 +98,9 @@ endif() set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING "A path setting third party libraries download & build directories.") +set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING + "A path setting fluid shared and static libraries") + if (WITH_C_API AND WITH_PYTHON) message(WARNING "It is suggest not embedded a python interpreter in Paddle " "when using C-API. It will give an unpredictable behavior when using a " @@ -153,7 +154,6 @@ include(cupti) include(configure) # add paddle env configuration include(generic) # simplify cmake module include(package) # set paddle packages -include(cpplint) # set paddle c++ style include(ccache) # set ccache for compilation include(util) # set unittest and link libs include(rdma) # set rdma libraries diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py index 05b5f3977cbed2f08df73c6d8ba2fff687db3313..e9360ab4c79d23bdf9f84d0c0d407af6d39bde3e 100644 --- a/benchmark/cluster/vgg16/vgg16_fluid.py +++ b/benchmark/cluster/vgg16/vgg16_fluid.py @@ -38,7 +38,7 @@ def str2bool(v): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( - '--batch_size', type=int, default=128, help="Batch size for training.") + '--batch_size', type=int, default=16, help="Batch size for training.") parser.add_argument( '--learning_rate', type=float, @@ -61,7 +61,7 @@ parser.add_argument( parser.add_argument( '--data_set', type=str, - default='cifar10', + default='flowers', choices=['cifar10', 'flowers'], help='Optional dataset for benchmark.') parser.add_argument( @@ -200,26 +200,30 @@ def main(): fetch_list=[avg_cost, batch_acc, batch_size]) return loss, acc, b_size - if args.profile and args.task_index == 0: - # warmup. - for batch_id, data in enumerate(train_reader()): - if batch_id > 5: break - run_step(batch_id, data) - with profiler.profiler('All', 'total', '/tmp/profile_vgg'): + if args.profile: + with profiler.profiler('All', 'total', + '/tmp/profile_vgg_%d' % args.task_index): for batch_id, data in enumerate(train_reader()): if batch_id > 5: break run_step(batch_id, data) + total_time = 0.0 + count = 0 for batch_id, data in enumerate(train_reader()): ts = time.time() loss, acc, b_size = run_step(batch_id, data) iters += 1 num_samples += len(data) train_pass_acc.add(value=acc, weight=b_size) + + duration = time.time() - ts + total_time += duration + count += len(data) print( "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, " - "Speed = %.2f img/s" % (pass_id, iters, loss, acc, - len(data) / (time.time() - ts)) + "Speed = %.2f (%.2f) img/s" % (pass_id, iters, loss, acc, + len(data) / duration, + count / total_time) ) # The accuracy is the accumulation of batches, but not the current batch. pass_elapsed = time.time() - start_time diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0fc02b704362f79f2219252538b4b3195e665b2c --- /dev/null +++ b/benchmark/fluid/README.md @@ -0,0 +1,60 @@ +# Fluid Benchmark + +This directory contains several models configurations and tools that used to run +Fluid benchmarks for local and distributed training. + + +## Run the Benchmark + +To start, run the following command to get the full help message: + +```bash +python fluid_benchmark.py --help +``` + +Currently supported `--model` argument include: + +* mnist +* resnet + * you can chose to use different dataset using `--data_set cifar10` or + `--data_set flowers`. +* vgg +* stacked_dynamic_lstm +* machine_translation + +* Run the following command to start a benchmark job locally: + ```bash + python fluid_benchmark.py --model mnist --parallel 1 --device GPU --with_test + ``` + You can choose to use GPU/CPU training. With GPU training, you can specify + `--parallel 1` to run multi GPU training. +* Run distributed training with parameter servers: + * start parameter servers: + ```bash + PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method pserver + ``` + * start trainers: + ```bash + PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method pserver + ``` +* Run distributed training using NCCL2 + ```bash + PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method nccl2 + ``` + +## Run Distributed Benchmark on Kubernetes Cluster + +We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit +distributed benchmark jobs to your cluster. To generate a job yaml, just run: + +```bash +python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver --with_test" --disttype pserver +``` + +Then the yaml files are generated under directory `myjob`, you can run: + +```bash +kubectl create -f myjob/ +``` + +The job shall start. diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..1d8f27440d0f1438e0520684ee3e90e8a5891a17 --- /dev/null +++ b/benchmark/fluid/fluid_benchmark.py @@ -0,0 +1,351 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import cProfile +import time +import os + +import numpy as np + +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.profiler as profiler +import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler + +BENCHMARK_MODELS = [ + "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" +] + + +def parse_args(): + parser = argparse.ArgumentParser('Fluid model benchmarks.') + parser.add_argument( + '--model', + type=str, + choices=BENCHMARK_MODELS, + default='resnet', + help='The model to run benchmark with.') + parser.add_argument( + '--batch_size', type=int, default=32, help='The minibatch size.') + parser.add_argument( + '--learning_rate', + type=float, + default=0.001, + help='The minibatch size.') + # TODO(wuyi): add "--use_fake_data" option back. + parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test' + ) + parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') + parser.add_argument( + '--pass_num', type=int, default=100, help='The number of passes.') + parser.add_argument( + '--data_format', + type=str, + default='NCHW', + choices=['NCHW', 'NHWC'], + help='The data data_format, now only support NCHW.') + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help='The device type.') + parser.add_argument( + '--gpus', + type=int, + default=1, + help='If gpus > 1, will use ParallelExecutor to run, else use Executor.') + parser.add_argument( + '--data_set', + type=str, + default='flowers', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') + parser.add_argument( + '--infer_only', action='store_true', help='If set, run forward only.') + parser.add_argument( + '--use_cprof', action='store_true', help='If set, use cProfile.') + parser.add_argument( + '--use_nvprof', + action='store_true', + help='If set, use nvprof for CUDA.') + parser.add_argument( + '--no_test', + action='store_false', + help='If set, test the testset during training.') + parser.add_argument( + '--memory_optimize', + action='store_true', + help='If set, optimize runtime memory before start.') + parser.add_argument( + '--update_method', + type=str, + default='local', + choices=['local', 'pserver', 'nccl2'], + help='Choose parameter update method, can be local, pserver, nccl2.') + args = parser.parse_args() + return args + + +def append_nccl2_prepare(): + if os.getenv("PADDLE_TRAINER_ID", None) != None: + # append gen_nccl_id at the end of startup program + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + port = os.getenv("PADDLE_PSERVER_PORT") + worker_ips = os.getenv("PADDLE_TRAINER_IPS") + worker_endpoints = [] + for ip in worker_ips.split(","): + worker_endpoints.append(':'.join([ip, port])) + num_trainers = len(worker_endpoints) + current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port + worker_endpoints.remove(current_endpoint) + + nccl_id_var = fluid.default_startup_program().global_block().create_var( + name="NCCLID", + persistable=True, + type=fluid.core.VarDesc.VarType.RAW) + fluid.default_startup_program().global_block().append_op( + type="gen_nccl_id", + inputs={}, + outputs={"NCCLID": nccl_id_var}, + attrs={ + "endpoint": current_endpoint, + "endpoint_list": worker_endpoints, + "trainer_id": trainer_id + }) + return nccl_id_var, num_trainers, trainer_id + else: + raise Exception( + "must set PADDLE_TRAINER_ID env variables for dist train.") + + +def dist_transpile(): + if "PADDLE_TRAINING_ROLE" not in os.environ: + return None, None + + # the port of all pservers, needed by both trainer and pserver + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + # comma separated ips of all pservers, needed by trainer and + # pserver + pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "") + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) + # total number of workers/trainers in the job, needed by + # trainer and pserver + trainers = int(os.getenv("PADDLE_TRAINERS")) + # the IP of the local machine, needed by pserver only + current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port + # the unique trainer id, starting from 0, needed by trainer + # only + trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + # the role, should be either PSERVER or TRAINER + training_role = os.getenv("PADDLE_TRAINING_ROLE") + + t = distribute_transpiler.DistributeTranspiler() + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) + if training_role == "PSERVER": + pserver_program = t.get_pserver_program(current_endpoint) + pserver_startup_program = t.get_startup_program(current_endpoint, + pserver_program) + return pserver_program, pserver_startup_program + elif training_role == "TRAINER": + train_program = t.get_trainer_program() + return train_program, fluid.default_startup_program() + else: + raise ValueError( + 'TRAINING_ROLE environment variable must be either TRAINER or PSERVER' + ) + + +def test(exe, inference_program, test_reader, feeder, batch_acc): + accuracy_evaluator = fluid.metrics.Accuracy() + for batch_id, data in enumerate(test_reader()): + acc = exe.run(inference_program, + feed=feeder.feed(data), + fetch_list=[batch_acc]) + accuracy_evaluator.update(value=np.array(acc), weight=len(data)) + + return accuracy_evaluator.eval() + + +# TODO(wuyi): replace train, train_parallel, test functions with new trainer +# API once it is ready. +def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, + args, train_prog, startup_prog): + if os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER": + place = core.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + exe.run(train_prog) + return + + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(startup_prog) + feed_var_list = [ + var for var in train_prog.global_block().vars.itervalues() + if var.is_data + ] + feeder = fluid.DataFeeder(feed_var_list, place) + + iters, num_samples, start_time = 0, 0, time.time() + for pass_id in range(args.pass_num): + train_losses = [] + for batch_id, data in enumerate(train_reader()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break + loss = exe.run(train_prog, + feed=feeder.feed(data), + fetch_list=[avg_loss]) + iters += 1 + num_samples += len(data) + train_losses.append(loss) + print("Pass: %d, Iter: %d, Loss: %f\n" % + (pass_id, iters, np.mean(train_losses))) + train_elapsed = time.time() - start_time + examples_per_sec = num_samples / train_elapsed + print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' % + (num_samples, train_elapsed, examples_per_sec)) + print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))) + # evaluation + if not args.no_test and batch_acc != None: + pass_test_acc = test(exe, infer_prog, test_reader, feeder, + batch_acc) + print(", Test Accuracy: %f" % pass_test_acc) + print("\n") + # TODO(wuyi): add warmup passes to get better perf data. + exit(0) + + +# TODO(wuyi): replace train, train_parallel, test functions with new trainer +# API once it is ready. +def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, + batch_acc, args, train_prog, startup_prog, nccl_id_var, + num_trainers, trainer_id): + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + startup_exe = fluid.Executor(place) + startup_exe.run(startup_prog) + strategy = fluid.ExecutionStrategy() + strategy.num_threads = 1 + strategy.allow_op_delay = False + exe = fluid.ParallelExecutor( + True, + avg_loss.name, + exec_strategy=strategy, + num_trainers=num_trainers, + trainer_id=trainer_id) + feed_var_list = [ + var for var in train_prog.global_block().vars.itervalues() + if var.is_data + ] + feeder = fluid.DataFeeder(feed_var_list, place) + for pass_id in range(args.pass_num): + num_samples = 0 + iters = 0 + start_time = time.time() + for batch_id, data in enumerate(train_reader()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break + loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) + if args.update_method == "pserver": + exe.bcast_params() + num_samples += len(data) + iters += 1 + if batch_id % 1 == 0: + print("Pass %d, batch %d, loss %s" % + (pass_id, batch_id, np.array(loss))) + train_elapsed = time.time() - start_time + examples_per_sec = num_samples / train_elapsed + print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % + (num_samples, train_elapsed, examples_per_sec)) + if not args.no_test and batch_acc != None: + test_acc = test(startup_exe, infer_prog, test_reader, feeder, + batch_acc) + print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc)) + exit(0) + + +def print_arguments(args): + vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and + vars(args)['device'] == 'GPU') + print('----------- resnet Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +def main(): + args = parse_args() + print_arguments(args) + nccl_id_var, num_trainers, trainer_id = None, 1, 0 + + if args.use_cprof: + pr = cProfile.Profile() + pr.enable() + model_def = __import__("models.%s" % args.model, fromlist=["models"]) + train_args = list(model_def.get_model(args)) + train_args.append(args) + # Run optimizer.minimize(avg_loss) + train_args[2].minimize(train_args[0]) + if args.memory_optimize: + fluid.memory_optimize(fluid.default_main_program()) + + if args.update_method == "pserver": + train_prog, startup_prog = dist_transpile() + if not train_prog: + raise Exception( + "Must configure correct environments to run dist train.") + train_args.extend([train_prog, startup_prog]) + if args.gpus > 1 and os.getenv("PADDLE_TRAINING_ROLE") == "TRAINER": + train_args.extend([nccl_id_var, num_trainers, trainer_id]) + train_parallel(*train_args) + train(*train_args) + exit(0) + + # for other update methods, use default programs + train_args.append(fluid.default_main_program()) + train_args.append(fluid.default_startup_program()) + + if args.update_method == "nccl2": + nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare() + if args.gpus == 1: + # NOTE: parallel executor use profiler interanlly + if args.use_nvprof and args.device == 'GPU': + with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: + train(*train_args) + else: + train(*train_args) + else: + if args.device == "CPU": + raise Exception("Only support GPU perf with parallel exe") + train_args.extend([nccl_id_var, num_trainers, trainer_id]) + train_parallel(*train_args) + + +if __name__ == "__main__": + main() diff --git a/benchmark/fluid/kube_gen_job.py b/benchmark/fluid/kube_gen_job.py new file mode 100644 index 0000000000000000000000000000000000000000..3dbb4b8c5dd13657f8d1853003b321ad047e1349 --- /dev/null +++ b/benchmark/fluid/kube_gen_job.py @@ -0,0 +1,190 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import yaml +import copy +import argparse +import random +import os +from kube_templates import pserver, trainer, envs + + +def parse_args(): + parser = argparse.ArgumentParser(description='Generate dist job yamls.') + + parser.add_argument( + '--jobname', default="paddlejob", help='unique job name') + parser.add_argument( + '--cpu', default=1, type=int, help='CPU cores per trainer node') + parser.add_argument( + '--pscpu', default=1, type=int, help='CPU cores per pserver node') + parser.add_argument( + '--gpu', default=0, type=int, help='num of GPUs per node') + parser.add_argument( + '--image', + default="bootstrapper:5000/fluid_benchmark:gpu", + help='num of GPUs per node') + parser.add_argument( + '--pservers', default=1, type=int, help='num of pservers') + parser.add_argument( + '--trainers', default=1, type=int, help='num of trainers') + parser.add_argument('--memory', default=1, type=int, help='trainer memory') + parser.add_argument( + '--psmemory', default=1, type=int, help='pserver memory') + parser.add_argument( + '--port', default=30236, type=int, help='num of trainers') + parser.add_argument( + '--entry', default="python train.py", help='command to run') + parser.add_argument( + '--fluid', default=1, type=int, help='whether is fluid job') + parser.add_argument( + '--rdma', action='store_ture', help='whether mount rdma libs') + parser.add_argument( + '--disttype', + default="pserver", + type=str, + choices=['pserver', 'nccl2', 'local'], + help='pserver or nccl2 or local') + + args = parser.parse_args() + return args + + +def gen_job(): + ps = pserver + tn = trainer + args = parse_args() + + ps_container = ps["spec"]["template"]["spec"]["containers"][0] + tn_container = tn["spec"]["template"]["spec"]["containers"][0] + + if args.fluid == 1: + ps_container["command"] = \ + ["paddle_k8s", "start_fluid"] + tn_container["command"] = \ + ["paddle_k8s", "start_fluid"] + ps["metadata"]["name"] = args.jobname + "-pserver" + ps["spec"]["template"]["metadata"]["labels"][ + "paddle-job-pserver"] = args.jobname + tn["metadata"]["name"] = args.jobname + "-trainer" + tn["spec"]["template"]["metadata"]["labels"]["paddle-job"] = args.jobname + + ps_container["image"] = args.image + tn_container["image"] = args.image + + ps_container["resources"]["requests"]["cpu"] = str(args.pscpu) + ps_container["resources"]["requests"]["memory"] = str(args.psmemory) + "Gi" + ps_container["resources"]["limits"]["cpu"] = str(args.pscpu) + ps_container["resources"]["limits"]["memory"] = str(args.psmemory) + "Gi" + + tn_container["resources"]["requests"]["cpu"] = str(args.cpu) + tn_container["resources"]["requests"]["memory"] = str(args.memory) + "Gi" + tn_container["resources"]["limits"]["cpu"] = str(args.cpu) + tn_container["resources"]["limits"]["memory"] = str(args.memory) + "Gi" + if args.gpu > 0: + tn_container["resources"]["requests"][ + "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu) + tn_container["resources"]["limits"][ + "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu) + + ps["spec"]["replicas"] = int(args.pservers) + tn["spec"]["parallelism"] = int(args.trainers) + tn["spec"]["completions"] = int(args.trainers) + ps_container["ports"][0]["name"] = "jobport-" + str(args.port) + ps_container["ports"][0]["containerPort"] = args.port + spreadport = random.randint(40000, 60000) + tn_container["ports"][0]["name"] = "spr-" + str(spreadport) + tn_container["ports"][0]["containerPort"] = spreadport + + envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname}) + envs.append({"name": "TRAINERS", "value": str(args.trainers)}) + envs.append({"name": "PSERVERS", "value": str(args.pservers)}) + envs.append({"name": "ENTRY", "value": args.entry}) + envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)}) + # NOTE: these directories below are cluster specific, please modify + # this settings before you run on your own cluster. + envs.append({ + "name": "LD_LIBRARY_PATH", + "value": + "/usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind" + }) + + volumes = [{ + "name": "nvidia-driver", + "hostPath": { + "path": "/usr/local/nvidia/lib64" + } + }] + volumeMounts = [{ + "mountPath": "/usr/local/nvidia/lib64", + "name": "nvidia-driver" + }] + + if args.rdma: + volumes.extend([{ + "name": "ibetc", + "hostPath": { + "path": "/etc/libibverbs.d" + } + }, { + "name": "iblibs", + "hostPath": { + "path": "/usr/local/rdma" + } + }, { + "name": "valgrind", + "hostPath": { + "path": "/usr/lib64/mlnx_ofed/valgrind" + } + }]) + volumeMounts.extend([{ + "mountPath": "/etc/libibverbs.d", + "name": "ibetc" + }, { + "mountPath": "/usr/local/rdma", + "name": "iblibs" + }, { + "mountPath": "/usr/lib64/mlnx_ofed/valgrind", + "name": "valgrind" + }]) + # append shm for NCCL2 + volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}}) + volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"}) + + tn["spec"]["template"]["spec"]["volumes"] = volumes + tn_container["volumeMounts"] = volumeMounts + + ps_container["env"] = envs + ps_container["env"].append({"name": "TRAINING_ROLE", "value": "PSERVER"}) + tn_container["env"] = envs + if args.disttype == "pserver": + tn_container["env"].append({ + "name": "TRAINING_ROLE", + "value": "TRAINER" + }) + elif args.disttype == "nccl2" or args.disttype == "local": + # NCCL2 have no training role, set to plain WORKER + tn_container["env"].append({"name": "TRAINING_ROLE", "value": "WORKER"}) + + os.mkdir(args.jobname) + if args.disttype == "pserver": + with open("%s/pserver.yaml" % args.jobname, "w") as fn: + yaml.dump(ps, fn) + + with open("%s/trainer.yaml" % args.jobname, "w") as fn: + yaml.dump(tn, fn) + + +if __name__ == "__main__": + gen_job() diff --git a/benchmark/fluid/kube_templates/__init__.py b/benchmark/fluid/kube_templates/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b64a7f78ff10d03987ea4a8c13a0e34bb433f64c --- /dev/null +++ b/benchmark/fluid/kube_templates/__init__.py @@ -0,0 +1,58 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pserver import pserver +from trainer import trainer + +__all__ = ["pserver", "trainer", "envs"] + +envs = [ + # envs that don't need to change + { + "name": "GLOG_v", + "value": "0" + }, + { + "name": "GLOG_logtostderr", + "value": "1" + }, + { + "name": "TOPOLOGY", + "value": "" + }, + { + "name": "TRAINER_PACKAGE", + "value": "/workspace" + }, + { + "name": "PADDLE_INIT_NICS", + "value": "eth2" + }, + { + "name": "NAMESPACE", + "valueFrom": { + "fieldRef": { + "fieldPath": "metadata.namespace" + } + } + }, + { + "name": "POD_IP", + "valueFrom": { + "fieldRef": { + "fieldPath": "status.podIP" + } + } + } +] diff --git a/benchmark/fluid/kube_templates/pserver.py b/benchmark/fluid/kube_templates/pserver.py new file mode 100644 index 0000000000000000000000000000000000000000..b54982c806ad4229fbd4bd7edf82a4e7eb4c5ad1 --- /dev/null +++ b/benchmark/fluid/kube_templates/pserver.py @@ -0,0 +1,58 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +pserver = { + "apiVersion": "extensions/v1beta1", + "kind": "ReplicaSet", + "metadata": { + "name": "jobname-pserver" + }, + "spec": { + "replicas": 1, + "template": { + "metadata": { + "labels": { + "paddle-job-pserver": "jobname" + } + }, + "spec": { + "hostNetwork": True, + "imagePullSecrets": [{ + "name": "job-registry-secret" + }], + "containers": [{ + "name": "pserver", + "image": "", + "imagePullPolicy": "Always", + "ports": [{ + "name": "jobport-1", + "containerPort": 1 + }], + "env": [], + "command": ["paddle_k8s", "start_pserver"], + "resources": { + "requests": { + "memory": "10Gi", + "cpu": "4" + }, + "limits": { + "memory": "10Gi", + "cpu": "4" + } + } + }] + } + } + } +} diff --git a/benchmark/fluid/kube_templates/trainer.py b/benchmark/fluid/kube_templates/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..b915d31e371d9d787ff64d705e32baf301e16abe --- /dev/null +++ b/benchmark/fluid/kube_templates/trainer.py @@ -0,0 +1,70 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +trainer = { + "apiVersion": "batch/v1", + "kind": "Job", + "metadata": { + "name": "jobname-pserver" + }, + "spec": { + "parallelism": 4, + "completions": 4, + "template": { + "metadata": { + "labels": { + "paddle-job": "jobname" + } + }, + "spec": { + "hostNetwork": True, + "imagePullSecrets": [{ + "name": "job-registry-secret" + }], + "restartPolicy": "Never", + "containers": [{ + "name": "trainer", + "image": "", + "imagePullPolicy": "Always", + # to let container set rlimit + "securityContext": { + "privileged": True + # TODO(wuyi): use below specific cap instead of privileged, + # using privileged will cause all GPU device are visible + # in the container. + # "capabilities": { + # "add": ["SYS_RESOURCE"] + # } + }, + "ports": [{ + "name": "jobport-1", + "containerPort": 1 + }], + "env": [], + "command": ["paddle_k8s", "start_trainer", "v2"], + "resources": { + "requests": { + "memory": "10Gi", + "cpu": "4", + }, + "limits": { + "memory": "10Gi", + "cpu": "4", + } + } + }] + } + } + } +} diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py deleted file mode 100644 index 400200c4745017bd9d160bb9e415fde041c0a6c8..0000000000000000000000000000000000000000 --- a/benchmark/fluid/mnist.py +++ /dev/null @@ -1,228 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import argparse -import time - -import paddle -import paddle.fluid as fluid -import paddle.fluid.profiler as profiler - -SEED = 1 -DTYPE = "float32" - -# random seed must set before configuring the network. -# fluid.default_startup_program().random_seed = SEED - - -def parse_args(): - parser = argparse.ArgumentParser("mnist model benchmark.") - parser.add_argument( - '--batch_size', type=int, default=128, help='The minibatch size.') - parser.add_argument( - '--skip_batch_num', - type=int, - default=5, - help='The first num of minibatch num to skip, for better performance test' - ) - parser.add_argument( - '--iterations', type=int, default=35, help='The number of minibatches.') - parser.add_argument( - '--pass_num', type=int, default=5, help='The number of passes.') - parser.add_argument( - '--device', - type=str, - default='GPU', - choices=['CPU', 'GPU'], - help='The device type.') - parser.add_argument( - '--infer_only', action='store_true', help='If set, run forward only.') - parser.add_argument( - '--use_cprof', action='store_true', help='If set, use cProfile.') - parser.add_argument( - '--use_nvprof', - action='store_true', - help='If set, use nvprof for CUDA.') - parser.add_argument( - '--with_test', - action='store_true', - help='If set, test the testset during training.') - args = parser.parse_args() - return args - - -def cnn_model(data): - conv_pool_1 = fluid.nets.simple_img_conv_pool( - input=data, - filter_size=5, - num_filters=20, - pool_size=2, - pool_stride=2, - act="relu") - conv_pool_2 = fluid.nets.simple_img_conv_pool( - input=conv_pool_1, - filter_size=5, - num_filters=50, - pool_size=2, - pool_stride=2, - act="relu") - - # TODO(dzhwinter) : refine the initializer and random seed settting - SIZE = 10 - input_shape = conv_pool_2.shape - param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] - scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 - - predict = fluid.layers.fc( - input=conv_pool_2, - size=SIZE, - act="softmax", - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.NormalInitializer( - loc=0.0, scale=scale))) - return predict - - -def eval_test(exe, batch_acc, batch_size_tensor, inference_program): - test_reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=args.batch_size) - test_pass_acc = fluid.average.WeightedAverage() - for batch_id, data in enumerate(test_reader()): - img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]), - data)).astype(DTYPE) - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = y_data.reshape([len(y_data), 1]) - - acc, weight = exe.run(inference_program, - feed={"pixel": img_data, - "label": y_data}, - fetch_list=[batch_acc, batch_size_tensor]) - test_pass_acc.add(value=acc, weight=weight) - pass_acc = test_pass_acc.eval() - return pass_acc - - -def run_benchmark(model, args): - if args.use_cprof: - pr = cProfile.Profile() - pr.enable() - start_time = time.time() - # Input data - images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - - # Train program - predict = model(images) - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) - - # Evaluator - batch_size_tensor = fluid.layers.create_tensor(dtype='int64') - batch_acc = fluid.layers.accuracy( - input=predict, label=label, total=batch_size_tensor) - - # inference program - inference_program = fluid.default_main_program().clone() - - # Optimization - opt = fluid.optimizer.AdamOptimizer( - learning_rate=0.001, beta1=0.9, beta2=0.999) - opt.minimize(avg_cost) - - fluid.memory_optimize(fluid.default_main_program()) - - # Initialize executor - place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0) - exe = fluid.Executor(place) - - # Parameter initialization - exe.run(fluid.default_startup_program()) - - # Reader - train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=args.batch_size) - - accuracy = fluid.metrics.Accuracy() - train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name) - iters, num_samples, start_time = 0, 0, time.time() - for pass_id in range(args.pass_num): - accuracy.reset() - train_accs = [] - train_losses = [] - for batch_id, data in enumerate(train_reader()): - if iters == args.skip_batch_num: - start_time = time.time() - num_samples = 0 - if iters == args.iterations: - break - img_data = np.array( - map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE) - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = y_data.reshape([len(y_data), 1]) - - outs = train_exe.run( - feed={"pixel": img_data, - "label": y_data}, - fetch_list=[ - avg_cost.name, batch_acc.name, batch_size_tensor.name - ] - ) # The accuracy is the accumulation of batches, but not the current batch. - accuracy.update( - value=np.array(np.mean(outs[1])), - weight=np.mean(np.array(outs[2]))) - iters += 1 - num_samples += len(y_data) - loss = np.mean(np.array(outs[0])) - acc = np.mean(np.array(outs[1])) - train_losses.append(loss) - train_accs.append(acc) - print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" % - (pass_id, iters, loss, acc)) - - print("Pass: %d, Loss: %f, Train Accuray: %f\n" % - (pass_id, np.mean(train_losses), np.mean(train_accs))) - train_elapsed = time.time() - start_time - examples_per_sec = num_samples / train_elapsed - - print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % - (num_samples, train_elapsed, examples_per_sec)) - # evaluation - if args.with_test: - test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor, - inference_program) - exit(0) - - -def print_arguments(args): - vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and - vars(args)['device'] == 'GPU') - print('----------- mnist Configuration Arguments -----------') - for arg, value in sorted(vars(args).iteritems()): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -if __name__ == '__main__': - args = parse_args() - print_arguments(args) - if args.use_nvprof and args.device == 'GPU': - with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: - run_benchmark(cnn_model, args) - else: - run_benchmark(cnn_model, args) diff --git a/benchmark/fluid/models/__init__.py b/benchmark/fluid/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1c3fcac8dd4a1ba0496ef013bd4eb468a0075125 --- /dev/null +++ b/benchmark/fluid/models/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = [ + "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" +] diff --git a/benchmark/fluid/machine_translation.py b/benchmark/fluid/models/machine_translation.py similarity index 60% rename from benchmark/fluid/machine_translation.py rename to benchmark/fluid/models/machine_translation.py index adde5f21acd4e77d58a453d6868abeccfca4bb5a..635b3373dd27b21f83afae10b1d24833b81d57eb 100644 --- a/benchmark/fluid/machine_translation.py +++ b/benchmark/fluid/models/machine_translation.py @@ -27,74 +27,6 @@ import paddle.fluid.core as core import paddle.fluid.framework as framework from paddle.fluid.executor import Executor -parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--embedding_dim", - type=int, - default=512, - help="The dimension of embedding table. (default: %(default)d)") -parser.add_argument( - "--encoder_size", - type=int, - default=512, - help="The size of encoder bi-rnn unit. (default: %(default)d)") -parser.add_argument( - "--decoder_size", - type=int, - default=512, - help="The size of decoder rnn unit. (default: %(default)d)") -parser.add_argument( - "--batch_size", - type=int, - default=16, - help="The sequence number of a mini-batch data. (default: %(default)d)") -parser.add_argument( - '--skip_batch_num', - type=int, - default=5, - help='The first num of minibatch num to skip, for better performance test') -parser.add_argument( - '--iterations', type=int, default=80, help='The number of minibatches.') -parser.add_argument( - "--dict_size", - type=int, - default=30000, - help="The dictionary capacity. Dictionaries of source sequence and " - "target dictionary have same capacity. (default: %(default)d)") -parser.add_argument( - "--pass_num", - type=int, - default=2, - help="The pass number to train. (default: %(default)d)") -parser.add_argument( - "--learning_rate", - type=float, - default=0.0002, - help="Learning rate used to train the model. (default: %(default)f)") -parser.add_argument( - "--infer_only", action='store_true', help="If set, run forward only.") -parser.add_argument( - "--beam_size", - type=int, - default=3, - help="The width for beam searching. (default: %(default)d)") -parser.add_argument( - '--device', - type=str, - default='GPU', - choices=['CPU', 'GPU'], - help="The device type.") -parser.add_argument( - "--max_length", - type=int, - default=250, - help="The maximum length of sequence when doing generation. " - "(default: %(default)d)") -parser.add_argument( - '--with_test', - action='store_true', - help='If set, test the testset during training.') - def lstm_step(x_t, hidden_t_prev, cell_t_prev, size): def linear(inputs): @@ -264,116 +196,37 @@ def lodtensor_to_ndarray(lod_tensor): return ndarray -def train(): +def get_model(args): + embedding_dim = 512 + encoder_size = 512 + decoder_size = 512 + dict_size = 30000 + beam_size = 3 + max_length = 250 avg_cost, feeding_list = seq_to_seq_net( - args.embedding_dim, - args.encoder_size, - args.decoder_size, - args.dict_size, - args.dict_size, + embedding_dim, + encoder_size, + decoder_size, + dict_size, + dict_size, False, - beam_size=args.beam_size, - max_length=args.max_length) + beam_size=beam_size, + max_length=max_length) # clone from default main program inference_program = fluid.default_main_program().clone() optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) - optimizer.minimize(avg_cost) - - fluid.memory_optimize(fluid.default_main_program()) train_batch_generator = paddle.batch( paddle.reader.shuffle( - paddle.dataset.wmt14.train(args.dict_size), buf_size=1000), + paddle.dataset.wmt14.train(dict_size), buf_size=1000), batch_size=args.batch_size) test_batch_generator = paddle.batch( paddle.reader.shuffle( - paddle.dataset.wmt14.test(args.dict_size), buf_size=1000), + paddle.dataset.wmt14.test(dict_size), buf_size=1000), batch_size=args.batch_size) - place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) - exe = Executor(place) - exe.run(framework.default_startup_program()) - - def do_validation(): - total_loss = 0.0 - count = 0 - for batch_id, data in enumerate(test_batch_generator()): - src_seq = to_lodtensor(map(lambda x: x[0], data), place)[0] - trg_seq = to_lodtensor(map(lambda x: x[1], data), place)[0] - lbl_seq = to_lodtensor(map(lambda x: x[2], data), place)[0] - - fetch_outs = exe.run(inference_program, - feed={ - feeding_list[0]: src_seq, - feeding_list[1]: trg_seq, - feeding_list[2]: lbl_seq - }, - fetch_list=[avg_cost], - return_numpy=False) - - total_loss += lodtensor_to_ndarray(fetch_outs[0])[0] - count += 1 - - return total_loss / count - - iters, num_samples, start_time = 0, 0, time.time() - for pass_id in xrange(args.pass_num): - train_accs = [] - train_losses = [] - for batch_id, data in enumerate(train_batch_generator()): - if iters == args.skip_batch_num: - start_time = time.time() - num_samples = 0 - if iters == args.iterations: - break - src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place) - num_samples += word_num - trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place) - num_samples += word_num - lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place) - - fetch_outs = exe.run(framework.default_main_program(), - feed={ - feeding_list[0]: src_seq, - feeding_list[1]: trg_seq, - feeding_list[2]: lbl_seq - }, - fetch_list=[avg_cost]) - - iters += 1 - loss = np.array(fetch_outs[0]) - print( - "Pass = %d, Iter = %d, Loss = %f" % (pass_id, iters, loss) - ) # The accuracy is the accumulation of batches, but not the current batch. - - train_elapsed = time.time() - start_time - examples_per_sec = num_samples / train_elapsed - print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % - (num_samples, train_elapsed, examples_per_sec)) - # evaluation - if args.with_test: - test_loss = do_validation() - exit(0) - - -def infer(): - pass - - -def print_arguments(args): - print('----------- seq2seq Configuration Arguments -----------') - for arg, value in sorted(vars(args).iteritems()): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -if __name__ == '__main__': - args = parser.parse_args() - print_arguments(args) - if args.infer_only: - infer() - else: - train() + return avg_cost, inference_program, optimizer, train_batch_generator, \ + test_batch_generator, None diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..d264bfc12bdb159c06dae81db4949b9ee17268e2 --- /dev/null +++ b/benchmark/fluid/models/mnist.py @@ -0,0 +1,94 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import argparse +import time +import cProfile + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler + +SEED = 1 +DTYPE = "float32" + +# random seed must set before configuring the network. +# fluid.default_startup_program().random_seed = SEED + + +def cnn_model(data): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=data, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + + # TODO(dzhwinter) : refine the initializer and random seed settting + SIZE = 10 + input_shape = conv_pool_2.shape + param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] + scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 + + predict = fluid.layers.fc( + input=conv_pool_2, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale))) + return predict + + +def get_model(args): + # Input data + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + predict = cnn_model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + # inference program + inference_program = fluid.default_main_program().clone() + + # Optimization + opt = fluid.optimizer.AdamOptimizer( + learning_rate=0.001, beta1=0.9, beta2=0.999) + + # Reader + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=args.batch_size) + return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..9dec8911ed64e09285fb461c4a12adb601535316 --- /dev/null +++ b/benchmark/fluid/models/resnet.py @@ -0,0 +1,161 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import numpy as np +import time + +import cProfile, pstats, StringIO + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.profiler as profiler + + +def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): + conv1 = fluid.layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=False) + return fluid.layers.batch_norm(input=conv1, act=act) + + +def shortcut(input, ch_out, stride): + ch_in = input.shape[1] # if args.data_format == 'NCHW' else input.shape[-1] + if ch_in != ch_out: + return conv_bn_layer(input, ch_out, 1, stride, 0, None) + else: + return input + + +def basicblock(input, ch_out, stride): + short = shortcut(input, ch_out, stride) + conv1 = conv_bn_layer(input, ch_out, 3, stride, 1) + conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None) + return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') + + +def bottleneck(input, ch_out, stride): + short = shortcut(input, ch_out * 4, stride) + conv1 = conv_bn_layer(input, ch_out, 1, stride, 0) + conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1) + conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None) + return fluid.layers.elementwise_add(x=short, y=conv3, act='relu') + + +def layer_warp(block_func, input, ch_out, count, stride): + res_out = block_func(input, ch_out, stride) + for i in range(1, count): + res_out = block_func(res_out, ch_out, 1) + return res_out + + +def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'): + + cfg = { + 18: ([2, 2, 2, 1], basicblock), + 34: ([3, 4, 6, 3], basicblock), + 50: ([3, 4, 6, 3], bottleneck), + 101: ([3, 4, 23, 3], bottleneck), + 152: ([3, 8, 36, 3], bottleneck) + } + stages, block_func = cfg[depth] + conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3) + pool1 = fluid.layers.pool2d( + input=conv1, pool_type='avg', pool_size=3, pool_stride=2) + res1 = layer_warp(block_func, pool1, 64, stages[0], 1) + res2 = layer_warp(block_func, res1, 128, stages[1], 2) + res3 = layer_warp(block_func, res2, 256, stages[2], 2) + res4 = layer_warp(block_func, res3, 512, stages[3], 2) + pool2 = fluid.layers.pool2d( + input=res4, + pool_size=7, + pool_type='avg', + pool_stride=1, + global_pooling=True) + out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax') + return out + + +def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'): + assert (depth - 2) % 6 == 0 + + n = (depth - 2) // 6 + + conv1 = conv_bn_layer( + input=input, ch_out=16, filter_size=3, stride=1, padding=1) + res1 = layer_warp(basicblock, conv1, 16, n, 1) + res2 = layer_warp(basicblock, res1, 32, n, 2) + res3 = layer_warp(basicblock, res2, 64, n, 2) + pool = fluid.layers.pool2d( + input=res3, pool_size=8, pool_type='avg', pool_stride=1) + out = fluid.layers.fc(input=pool, size=class_dim, act='softmax') + return out + + +def get_model(args): + model = resnet_cifar10 + if args.data_set == "cifar10": + class_dim = 10 + if args.data_format == 'NCHW': + dshape = [3, 32, 32] + else: + dshape = [32, 32, 3] + model = resnet_cifar10 + else: + class_dim = 102 + if args.data_format == 'NCHW': + dshape = [3, 224, 224] + else: + dshape = [224, 224, 3] + model = resnet_imagenet + + input = fluid.layers.data(name='data', shape=dshape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + predict = model(input, class_dim) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + batch_size=args.batch_size) + + return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc diff --git a/benchmark/fluid/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py similarity index 52% rename from benchmark/fluid/stacked_dynamic_lstm.py rename to benchmark/fluid/models/stacked_dynamic_lstm.py index 73bcc47b4d404af2c01d61ca3dfb11971bbcfe9c..81a28b5f3aed0c325398b909d700c23df545824a 100644 --- a/benchmark/fluid/stacked_dynamic_lstm.py +++ b/benchmark/fluid/models/stacked_dynamic_lstm.py @@ -29,57 +29,6 @@ import paddle.fluid as fluid import paddle.batch as batch import paddle.fluid.profiler as profiler - -def parse_args(): - parser = argparse.ArgumentParser("Understand Sentiment by Dynamic RNN.") - parser.add_argument( - '--batch_size', - type=int, - default=32, - help='The sequence number of a batch data. (default: %(default)d)') - parser.add_argument( - '--skip_batch_num', - type=int, - default=5, - help='The first num of minibatch num to skip, for better performance test' - ) - parser.add_argument( - '--iterations', type=int, default=80, help='The number of minibatches.') - parser.add_argument( - '--emb_dim', - type=int, - default=512, - help='Dimension of embedding table. (default: %(default)d)') - parser.add_argument( - '--hidden_dim', - type=int, - default=512, - help='Hidden size of lstm unit. (default: %(default)d)') - parser.add_argument( - '--pass_num', - type=int, - default=100, - help='Epoch number to train. (default: %(default)d)') - parser.add_argument( - '--device', - type=str, - default='CPU', - choices=['CPU', 'GPU'], - help='The device type.') - parser.add_argument( - '--crop_size', - type=int, - default=int(os.environ.get('CROP_SIZE', '1500')), - help='The max sentence length of input. Since this model use plain RNN,' - ' Gradient could be explored if sentence is too long') - parser.add_argument( - '--with_test', - action='store_true', - help='If set, test the testset during training.') - args = parser.parse_args() - return args - - word_dict = imdb.word_dict() @@ -94,14 +43,15 @@ def crop_sentence(reader, crop_size): return __impl__ -def main(): - args = parse_args() - lstm_size = args.hidden_dim +def get_model(args): + lstm_size = 512 + emb_dim = 512 + crop_size = 1500 data = fluid.layers.data( name="words", shape=[1], lod_level=1, dtype='int64') sentence = fluid.layers.embedding( - input=data, size=[len(word_dict), args.emb_dim]) + input=data, size=[len(word_dict), emb_dim]) sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh') @@ -161,51 +111,17 @@ def main(): target_vars=[batch_acc, batch_size_tensor]) adam = fluid.optimizer.Adam() - adam.minimize(loss) - - fluid.memory_optimize(fluid.default_main_program()) - - place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) train_reader = batch( paddle.reader.shuffle( - crop_sentence(imdb.train(word_dict), args.crop_size), - buf_size=25000), + crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000), + batch_size=args.batch_size) + test_reader = batch( + paddle.reader.shuffle( + crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000), batch_size=args.batch_size) - iters, num_samples, start_time = 0, 0, time.time() - for pass_id in range(args.pass_num): - train_accs = [] - train_losses = [] - for batch_id, data in enumerate(train_reader()): - if iters == args.skip_batch_num: - start_time = time.time() - num_samples = 0 - if iters == args.iterations: - break - tensor_words = to_lodtensor([x[0] for x in data], place) - label = numpy.array([x[1] for x in data]).astype("int64") - label = label.reshape((-1, 1)) - loss_np, acc, weight = exe.run( - fluid.default_main_program(), - feed={"words": tensor_words, - "label": label}, - fetch_list=[loss, batch_acc, batch_size_tensor]) - iters += 1 - for x in data: - num_samples += len(x[0]) - print( - "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" % - (pass_id, iters, loss_np, acc) - ) # The accuracy is the accumulation of batches, but not the current batch. - - train_elapsed = time.time() - start_time - examples_per_sec = num_samples / train_elapsed - print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % - (num_samples, train_elapsed, examples_per_sec)) - exit(0) + return loss, inference_program, adam, train_reader, test_reader, batch_acc def to_lodtensor(data, place): @@ -221,16 +137,3 @@ def to_lodtensor(data, place): res.set(flattened_data, place) res.set_lod([lod]) return res - - -def print_arguments(args): - print('----------- lstm Configuration Arguments -----------') - for arg, value in sorted(vars(args).iteritems()): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -if __name__ == '__main__': - args = parse_args() - print_arguments(args) - main() diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py new file mode 100644 index 0000000000000000000000000000000000000000..53856c5f7acd3a4e1476ec57154a880bb6f984c9 --- /dev/null +++ b/benchmark/fluid/models/vgg.py @@ -0,0 +1,104 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""VGG16 benchmark in Fluid""" +from __future__ import print_function + +import sys +import time +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import argparse +import functools + + +def vgg16_bn_drop(input): + def conv_block(input, num_filter, groups, dropouts): + return fluid.nets.img_conv_group( + input=input, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act='relu', + conv_with_batchnorm=True, + conv_batchnorm_drop_rate=dropouts, + pool_type='max') + + conv1 = conv_block(input, 64, 2, [0.3, 0]) + conv2 = conv_block(conv1, 128, 2, [0.4, 0]) + conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) + conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) + conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) + + drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) + fc1 = fluid.layers.fc(input=drop, size=512, act=None) + bn = fluid.layers.batch_norm(input=fc1, act='relu') + drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) + fc2 = fluid.layers.fc(input=drop2, size=512, act=None) + return fc2 + + +def get_model(args): + if args.data_set == "cifar10": + classdim = 10 + if args.data_format == 'NCHW': + data_shape = [3, 32, 32] + else: + data_shape = [32, 32, 3] + else: + classdim = 102 + if args.data_format == 'NCHW': + data_shape = [3, 224, 224] + else: + data_shape = [224, 224, 3] + + # Input data + images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + net = vgg16_bn_drop(images) + predict = fluid.layers.fc(input=net, size=classdim, act='softmax') + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + # inference program + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + # Optimization + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + + # data reader + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + batch_size=args.batch_size) + + return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc diff --git a/benchmark/fluid/resnet.py b/benchmark/fluid/resnet.py deleted file mode 100644 index 0fd7258a804e7c93b0b03da140140394bf90004a..0000000000000000000000000000000000000000 --- a/benchmark/fluid/resnet.py +++ /dev/null @@ -1,317 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import functools -import numpy as np -import time - -import cProfile, pstats, StringIO - -import paddle -import paddle.fluid as fluid -import paddle.fluid.core as core -import paddle.fluid.profiler as profiler - - -def parse_args(): - parser = argparse.ArgumentParser('Convolution model benchmark.') - parser.add_argument( - '--model', - type=str, - choices=['resnet_imagenet', 'resnet_cifar10'], - default='resnet_imagenet', - help='The model architecture.') - parser.add_argument( - '--batch_size', type=int, default=32, help='The minibatch size.') - parser.add_argument( - '--use_fake_data', - action='store_true', - help='use real data or fake data') - parser.add_argument( - '--skip_batch_num', - type=int, - default=5, - help='The first num of minibatch num to skip, for better performance test' - ) - parser.add_argument( - '--iterations', type=int, default=80, help='The number of minibatches.') - parser.add_argument( - '--pass_num', type=int, default=100, help='The number of passes.') - parser.add_argument( - '--data_format', - type=str, - default='NCHW', - choices=['NCHW', 'NHWC'], - help='The data data_format, now only support NCHW.') - parser.add_argument( - '--device', - type=str, - default='GPU', - choices=['CPU', 'GPU'], - help='The device type.') - parser.add_argument( - '--data_set', - type=str, - default='flowers', - choices=['cifar10', 'flowers'], - help='Optional dataset for benchmark.') - parser.add_argument( - '--infer_only', action='store_true', help='If set, run forward only.') - parser.add_argument( - '--use_cprof', action='store_true', help='If set, use cProfile.') - parser.add_argument( - '--use_nvprof', - action='store_true', - help='If set, use nvprof for CUDA.') - parser.add_argument( - '--with_test', - action='store_true', - help='If set, test the testset during training.') - args = parser.parse_args() - return args - - -def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): - conv1 = fluid.layers.conv2d( - input=input, - filter_size=filter_size, - num_filters=ch_out, - stride=stride, - padding=padding, - act=None, - bias_attr=False) - return fluid.layers.batch_norm(input=conv1, act=act) - - -def shortcut(input, ch_out, stride): - ch_in = input.shape[1] if args.data_format == 'NCHW' else input.shape[-1] - if ch_in != ch_out: - return conv_bn_layer(input, ch_out, 1, stride, 0, None) - else: - return input - - -def basicblock(input, ch_out, stride): - short = shortcut(input, ch_out, stride) - conv1 = conv_bn_layer(input, ch_out, 3, stride, 1) - conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None) - return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') - - -def bottleneck(input, ch_out, stride): - short = shortcut(input, ch_out * 4, stride) - conv1 = conv_bn_layer(input, ch_out, 1, stride, 0) - conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1) - conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None) - return fluid.layers.elementwise_add(x=short, y=conv3, act='relu') - - -def layer_warp(block_func, input, ch_out, count, stride): - res_out = block_func(input, ch_out, stride) - for i in range(1, count): - res_out = block_func(res_out, ch_out, 1) - return res_out - - -def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'): - - cfg = { - 18: ([2, 2, 2, 1], basicblock), - 34: ([3, 4, 6, 3], basicblock), - 50: ([3, 4, 6, 3], bottleneck), - 101: ([3, 4, 23, 3], bottleneck), - 152: ([3, 8, 36, 3], bottleneck) - } - stages, block_func = cfg[depth] - conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3) - pool1 = fluid.layers.pool2d( - input=conv1, pool_type='avg', pool_size=3, pool_stride=2) - res1 = layer_warp(block_func, pool1, 64, stages[0], 1) - res2 = layer_warp(block_func, res1, 128, stages[1], 2) - res3 = layer_warp(block_func, res2, 256, stages[2], 2) - res4 = layer_warp(block_func, res3, 512, stages[3], 2) - pool2 = fluid.layers.pool2d( - input=res4, - pool_size=7, - pool_type='avg', - pool_stride=1, - global_pooling=True) - out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax') - return out - - -def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'): - assert (depth - 2) % 6 == 0 - - n = (depth - 2) // 6 - - conv1 = conv_bn_layer( - input=input, ch_out=16, filter_size=3, stride=1, padding=1) - res1 = layer_warp(basicblock, conv1, 16, n, 1) - res2 = layer_warp(basicblock, res1, 32, n, 2) - res3 = layer_warp(basicblock, res2, 64, n, 2) - pool = fluid.layers.pool2d( - input=res3, pool_size=8, pool_type='avg', pool_stride=1) - out = fluid.layers.fc(input=pool, size=class_dim, act='softmax') - return out - - -def run_benchmark(model, args): - if args.use_cprof: - pr = cProfile.Profile() - pr.enable() - - if args.data_set == "cifar10": - class_dim = 10 - if args.data_format == 'NCHW': - dshape = [3, 32, 32] - else: - dshape = [32, 32, 3] - else: - class_dim = 102 - if args.data_format == 'NCHW': - dshape = [3, 224, 224] - else: - dshape = [224, 224, 3] - - input = fluid.layers.data(name='data', shape=dshape, dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - predict = model(input, class_dim) - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) - - batch_size_tensor = fluid.layers.create_tensor(dtype='int64') - batch_acc = fluid.layers.accuracy( - input=predict, label=label, total=batch_size_tensor) - - inference_program = fluid.default_main_program().clone() - with fluid.program_guard(inference_program): - inference_program = fluid.io.get_inference_program( - target_vars=[batch_acc, batch_size_tensor]) - - optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) - opts = optimizer.minimize(avg_cost) - - fluid.memory_optimize(fluid.default_main_program()) - - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.cifar.train10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), - buf_size=5120), - batch_size=args.batch_size) - test_reader = paddle.batch( - paddle.dataset.cifar.test10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), - batch_size=args.batch_size) - - def test(exe): - test_accuracy = fluid.average.WeightedAverage() - for batch_id, data in enumerate(test_reader()): - img_data = np.array(map(lambda x: x[0].reshape(dshape), - data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = y_data.reshape([-1, 1]) - - acc, weight = exe.run(inference_program, - feed={"data": img_data, - "label": y_data}, - fetch_list=[batch_acc, batch_size_tensor]) - test_accuracy.add(value=acc, weight=weight) - - return test_accuracy.eval() - - place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - accuracy = fluid.average.WeightedAverage() - train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name) - if args.use_fake_data: - data = train_reader().next() - image = np.array(map(lambda x: x[0].reshape(dshape), data)).astype( - 'float32') - label = np.array(map(lambda x: x[1], data)).astype('int64') - label = label.reshape([-1, 1]) - - iters, num_samples, start_time = 0, 0, time.time() - for pass_id in range(args.pass_num): - accuracy.reset() - train_accs = [] - train_losses = [] - for batch_id, data in enumerate(train_reader()): - if iters == args.skip_batch_num: - start_time = time.time() - num_samples = 0 - if iters == args.iterations: - break - if not args.use_fake_data: - image = np.array(map(lambda x: x[0].reshape(dshape), - data)).astype('float32') - label = np.array(map(lambda x: x[1], data)).astype('int64') - label = label.reshape([-1, 1]) - loss, acc, weight = train_exe.run( - feed={'data': image, - 'label': label}, - fetch_list=[ - avg_cost.name, batch_acc.name, batch_size_tensor.name - ]) - iters += 1 - num_samples += len(label) - accuracy.add(value=np.array(np.mean(acc)), weight=np.mean(weight)) - loss = np.mean(np.array(loss)) - acc = np.mean(np.array(acc)) - train_losses.append(loss) - train_accs.append(acc) - print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" % - (pass_id, iters, loss, acc)) - print("Pass: %d, Loss: %f, Train Accuray: %f\n" % - (pass_id, np.mean(train_losses), np.mean(train_accs))) - train_elapsed = time.time() - start_time - examples_per_sec = num_samples / train_elapsed - print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % - (num_samples, train_elapsed, examples_per_sec)) - # evaluation - if args.with_test: - pass_test_acc = test(exe) - exit(0) - - -def print_arguments(args): - vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and - vars(args)['device'] == 'GPU') - print('----------- resnet Configuration Arguments -----------') - for arg, value in sorted(vars(args).iteritems()): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -if __name__ == '__main__': - model_map = { - 'resnet_imagenet': resnet_imagenet, - 'resnet_cifar10': resnet_cifar10 - } - args = parse_args() - print_arguments(args) - if args.data_format == 'NHWC': - raise ValueError('Only support NCHW data_format now.') - if args.use_nvprof and args.device == 'GPU': - with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: - run_benchmark(model_map[args.model], args) - else: - run_benchmark(model_map[args.model], args) diff --git a/benchmark/fluid/vgg.py b/benchmark/fluid/vgg.py deleted file mode 100644 index 2a9566a45c3804183e05db9298cec4f670225a6f..0000000000000000000000000000000000000000 --- a/benchmark/fluid/vgg.py +++ /dev/null @@ -1,228 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""VGG16 benchmark in Fluid""" -from __future__ import print_function - -import sys -import time -import numpy as np -import paddle -import paddle.fluid as fluid -import paddle.fluid.core as core -import argparse -import functools - -parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - '--batch_size', type=int, default=128, help="Batch size for training.") -parser.add_argument( - '--skip_batch_num', - type=int, - default=5, - help='The first num of minibatch num to skip, for better performance test') -parser.add_argument( - '--iterations', type=int, default=80, help='The number of minibatches.') -parser.add_argument( - '--learning_rate', - type=float, - default=1e-3, - help="Learning rate for training.") -parser.add_argument('--pass_num', type=int, default=50, help="No. of passes.") -parser.add_argument( - '--device', - type=str, - default='GPU', - choices=['CPU', 'GPU'], - help="The device type.") -parser.add_argument( - '--data_format', - type=str, - default='NCHW', - choices=['NCHW', 'NHWC'], - help='The data order, now only support NCHW.') -parser.add_argument( - '--data_set', - type=str, - default='cifar10', - choices=['cifar10', 'flowers'], - help='Optional dataset for benchmark.') -parser.add_argument( - '--with_test', - action='store_true', - help='If set, test the testset during training.') -args = parser.parse_args() - - -def vgg16_bn_drop(input): - def conv_block(input, num_filter, groups, dropouts): - return fluid.nets.img_conv_group( - input=input, - pool_size=2, - pool_stride=2, - conv_num_filter=[num_filter] * groups, - conv_filter_size=3, - conv_act='relu', - conv_with_batchnorm=True, - conv_batchnorm_drop_rate=dropouts, - pool_type='max') - - conv1 = conv_block(input, 64, 2, [0.3, 0]) - conv2 = conv_block(conv1, 128, 2, [0.4, 0]) - conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) - conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) - conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) - - drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) - fc1 = fluid.layers.fc(input=drop, size=512, act=None) - bn = fluid.layers.batch_norm(input=fc1, act='relu') - drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) - fc2 = fluid.layers.fc(input=drop2, size=512, act=None) - return fc2 - - -def main(): - if args.data_set == "cifar10": - classdim = 10 - if args.data_format == 'NCHW': - data_shape = [3, 32, 32] - else: - data_shape = [32, 32, 3] - else: - classdim = 102 - if args.data_format == 'NCHW': - data_shape = [3, 224, 224] - else: - data_shape = [224, 224, 3] - - # Input data - images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - - # Train program - net = vgg16_bn_drop(images) - predict = fluid.layers.fc(input=net, size=classdim, act='softmax') - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) - - # Evaluator - batch_size_tensor = fluid.layers.create_tensor(dtype='int64') - batch_acc = fluid.layers.accuracy( - input=predict, label=label, total=batch_size_tensor) - - # inference program - inference_program = fluid.default_main_program().clone() - with fluid.program_guard(inference_program): - inference_program = fluid.io.get_inference_program( - target_vars=[batch_acc, batch_size_tensor]) - - # Optimization - optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) - opts = optimizer.minimize(avg_cost) - - fluid.memory_optimize(fluid.default_main_program()) - - # Initialize executor - place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) - exe = fluid.Executor(place) - - # Parameter initialization - exe.run(fluid.default_startup_program()) - - # data reader - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.cifar.train10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), - buf_size=5120), - batch_size=args.batch_size) - test_reader = paddle.batch( - paddle.dataset.cifar.test10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), - batch_size=args.batch_size) - - # test - def test(exe): - test_accuracy = fluid.average.WeightedAverage() - for batch_id, data in enumerate(test_reader()): - img_data = np.array(map(lambda x: x[0].reshape(data_shape), - data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = y_data.reshape([-1, 1]) - - acc, weight = exe.run(inference_program, - feed={"pixel": img_data, - "label": y_data}, - fetch_list=[batch_acc, batch_size_tensor]) - test_accuracy.add(value=acc, weight=weight) - return test_accuracy.eval() - - iters, num_samples, start_time = 0, 0, time.time() - accuracy = fluid.average.WeightedAverage() - train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name) - for pass_id in range(args.pass_num): - accuracy.reset() - train_accs = [] - train_losses = [] - for batch_id, data in enumerate(train_reader()): - if iters == args.skip_batch_num: - start_time = time.time() - num_samples = 0 - if iters == args.iterations: - break - img_data = np.array(map(lambda x: x[0].reshape(data_shape), - data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = y_data.reshape([-1, 1]) - - loss, acc, weight = train_exe.run( - feed={"pixel": img_data, - "label": y_data}, - fetch_list=[ - avg_cost.name, batch_acc.name, batch_size_tensor.name - ]) - accuracy.add(value=np.array(np.mean(acc)), weight=np.mean(weight)) - iters += 1 - num_samples += len(y_data) - loss = np.mean(np.array(loss)) - acc = np.mean(np.array(acc)) - print( - "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" % - (pass_id, iters, loss, acc) - ) # The accuracy is the accumulation of batches, but not the current batch. - - # pass_train_acc = accuracy.eval() - train_losses.append(loss) - train_accs.append(acc) - print("Pass: %d, Loss: %f, Train Accuray: %f\n" % - (pass_id, np.mean(train_losses), np.mean(train_accs))) - train_elapsed = time.time() - start_time - examples_per_sec = num_samples / train_elapsed - print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % - (num_samples, train_elapsed, examples_per_sec)) - # evaluation - if args.with_test: - pass_test_acc = test(exe) - exit(0) - - -def print_arguments(): - print('----------- vgg Configuration Arguments -----------') - for arg, value in sorted(vars(args).iteritems()): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -if __name__ == "__main__": - print_arguments() - main() diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake deleted file mode 100644 index 4823dc3e91390002aefac70f7931b4197db05789..0000000000000000000000000000000000000000 --- a/cmake/cpplint.cmake +++ /dev/null @@ -1,62 +0,0 @@ -# util to check C++ file style -# * it basically use google cpplint.py. -# * It provide "add_style_check_target" for cmake. -# Usage see add_style_check_target's document -# -# TODO(yuyang18): Add python style check. - -set(STYLE_FILTER) - -# diable unwanted filters - -# paddle do not indent public/potected/private in class -set(STYLE_FILTER "${STYLE_FILTER}-whitespace/indent,") -# paddle use mutable reference. BUT IT IS NOT RECOMMANDED -set(STYLE_FILTER "${STYLE_FILTER}-runtime/references,") -# paddle use relative path for include. -set(STYLE_FILTER "${STYLE_FILTER}-build/include,") -# paddle use , , etc. -set(STYLE_FILTER "${STYLE_FILTER}-build/c++11,") -# paddle use c style casting. BUT IT IS NOT RECOMMANDED -set(STYLE_FILTER "${STYLE_FILTER}-readability/casting") - - -# IGNORE SOME FILES -set(IGNORE_PATTERN - .*ImportanceSampler.* - .*cblas\\.h.* - .*\\.pb\\.txt - .*MultiDataProvider.* - .*pb.* - .*pybind.h) - -# add_style_check_target -# -# attach check code style step for target. -# -# first argument: target name to attach -# rest arguments: source list to check code style. -# -# NOTE: If WITH_STYLE_CHECK is OFF, then this macro just do nothing. -macro(add_style_check_target TARGET_NAME) - if(WITH_STYLE_CHECK) - set(SOURCES_LIST ${ARGN}) - list(REMOVE_DUPLICATES SOURCES_LIST) - foreach(filename ${SOURCES_LIST}) - foreach(pattern ${IGNORE_PATTERN}) - if(filename MATCHES ${pattern}) - list(REMOVE_ITEM SOURCES_LIST ${filename}) - endif() - endforeach() - endforeach() - - if(SOURCES_LIST) - add_custom_command(TARGET ${TARGET_NAME} POST_BUILD - COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/scripts/cpplint.py" - "--filter=${STYLE_FILTER}" - ${SOURCES_LIST} - COMMENT "cpplint: Checking source code style" - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) - endif() - endif() -endmacro() diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake index e90948782bb5e333bbdb47ef9d61c1e37e3cf9e4..9459f1ddfe85f5607880d3fdd968b494d6af592a 100644 --- a/cmake/external/grpc.cmake +++ b/cmake/external/grpc.cmake @@ -23,17 +23,20 @@ SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc) SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc) SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE) SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE) + +include(ProcessorCount) +ProcessorCount(NUM_OF_PROCESSOR) + IF(APPLE) - SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin | sed "s/-Werror//g" | sh) + SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin | sed "s/-Werror//g" | sh) ELSE() - SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin) + SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin) ENDIF() ExternalProject_Add( extern_grpc DEPENDS protobuf zlib - GIT_REPOSITORY "https://github.com/grpc/grpc.git" - GIT_TAG "v1.10.x" + URL "http://paddlepaddledeps.bj.bcebos.com/grpc.tar.xz" PREFIX ${GRPC_SOURCES_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 1d3e2ade6d393c6e4c37eea0dc1064cdb18808a5..9ddd05b3d9404df29ca1bf634105314b7e6a5b70 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -206,8 +206,6 @@ function(cc_library TARGET_NAME) list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) endif() endforeach() - add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS}) - else(cc_library_SRCS) if(cc_library_DEPS) merge_static_libs(${TARGET_NAME} ${cc_library_DEPS}) @@ -231,7 +229,7 @@ endfunction(cc_binary) function(cc_test TARGET_NAME) if(WITH_TESTING) - set(options "") + set(options SERIAL) set(oneValueArgs "") set(multiValueArgs SRCS DEPS ARGS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -241,6 +239,9 @@ function(cc_test TARGET_NAME) add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} ${cc_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + if (${cc_test_SERIAL}) + set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1) + endif() endif() endfunction(cc_test) @@ -268,7 +269,6 @@ function(nv_library TARGET_NAME) list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) endif() endforeach() - add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS}) else(nv_library_SRCS) if (nv_library_DEPS) merge_static_libs(${TARGET_NAME} ${nv_library_DEPS}) @@ -295,7 +295,7 @@ endfunction(nv_binary) function(nv_test TARGET_NAME) if (WITH_GPU AND WITH_TESTING) - set(options "") + set(options SERIAL) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -303,6 +303,9 @@ function(nv_test TARGET_NAME) target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog) add_test(${TARGET_NAME} ${TARGET_NAME}) + if (nv_test_SERIAL) + set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1) + endif() endif() endfunction(nv_test) @@ -338,7 +341,6 @@ function(hip_library TARGET_NAME) list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) endif() endforeach() - add_style_check_target(${TARGET_NAME} ${hip_library_SRCS} ${hip_library_HEADERS}) else(hip_library_SRCS) if (hip_library_DEPS) merge_static_libs(${TARGET_NAME} ${hip_library_DEPS}) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 7117a3a4f31c88b3c4a81e611146123903659ad5..3b13b2150514bd615667241272d287c7e55d4e74 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -52,32 +52,32 @@ function(copy TARGET) endfunction() # third party -set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/eigen3") +set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3") copy(eigen3_lib SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported ) -set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/gflags") +set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/gflags") copy(gflags_lib SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES} DSTS ${dst_dir} ${dst_dir}/lib ) -set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/glog") +set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/glog") copy(glog_lib SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} DSTS ${dst_dir} ${dst_dir}/lib ) -set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/boost/") +set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost/") copy(boost_lib SRCS ${BOOST_INCLUDE_DIR}/boost DSTS ${dst_dir} ) if(NOT PROTOBUF_FOUND) - set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf") + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf") copy(protobuf_lib SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY} DSTS ${dst_dir} ${dst_dir}/lib @@ -85,13 +85,13 @@ if(NOT PROTOBUF_FOUND) endif() if(NOT CBLAS_FOUND) - set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/openblas") + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas") copy(openblas_lib SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include DSTS ${dst_dir} ${dst_dir} ) elseif (WITH_MKLML) - set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mklml") + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml") copy(mklml_lib SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR} DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir} @@ -99,7 +99,7 @@ elseif (WITH_MKLML) endif() if(WITH_MKLDNN) - set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mkldnn") + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn") copy(mkldnn_lib SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} DSTS ${dst_dir} ${dst_dir}/lib @@ -107,17 +107,17 @@ if(WITH_MKLDNN) endif() if(NOT MOBILE_INFERENCE AND NOT RPI) - set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappy") + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") copy(snappy_lib SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} DSTS ${dst_dir} ${dst_dir}/lib) - set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappystream") + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") copy(snappystream_lib SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} DSTS ${dst_dir} ${dst_dir}/lib) - set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/zlib") + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") copy(zlib_lib SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} DSTS ${dst_dir} ${dst_dir}/lib) @@ -125,7 +125,7 @@ endif() # paddle fluid module set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") -set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle/fluid") +set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid") set(module "framework") copy(framework_lib DEPS framework_py_proto SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h @@ -165,15 +165,16 @@ copy(pybind_lib # CMakeCache Info copy(cmake_cache SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt - DSTS ${CMAKE_INSTALL_PREFIX}) + DSTS ${FLUID_INSTALL_DIR}) add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep}) # paddle fluid version execute_process( COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1 + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} OUTPUT_VARIABLE PADDLE_GIT_COMMIT) -set(version_file ${CMAKE_INSTALL_PREFIX}/version.txt) +set(version_file ${FLUID_INSTALL_DIR}/version.txt) file(WRITE ${version_file} "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" "WITH_MKL: ${WITH_MKL}\n" diff --git a/doc/fluid/CMakeLists.txt b/doc/fluid/CMakeLists.txt index 8086507bb4b7e870ad6d6091945ed07a00b5100b..be92af3902769a65c77953c9f3cb1f3aa3738d79 100644 --- a/doc/fluid/CMakeLists.txt +++ b/doc/fluid/CMakeLists.txt @@ -15,6 +15,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees") # HTML output director set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html") +set(IMPORT_PADDLE_STRING "") +set(IMPORT_PADDLEV2_STRING "") + configure_file( "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in" "${BINARY_BUILD_DIR_EN}/conf.py" @@ -27,8 +30,6 @@ sphinx_add_target(paddle_fluid_docs ${CMAKE_CURRENT_SOURCE_DIR} ${SPHINX_HTML_DIR_EN}) -add_dependencies(paddle_fluid_docs gen_proto_py paddle_python) - # configured documentation tools and intermediate build results set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build") @@ -50,6 +51,4 @@ sphinx_add_target(paddle_fluid_docs_cn ${CMAKE_CURRENT_SOURCE_DIR} ${SPHINX_HTML_DIR_CN}) -add_dependencies(paddle_fluid_docs_cn gen_proto_py paddle_python) - add_subdirectory(api) diff --git a/doc/fluid/api/CMakeLists.txt b/doc/fluid/api/CMakeLists.txt index 48b396f0786adad1ba6cd41f72497f853e54bc38..435d6e10fb02e9b2a8147f37da33e8848cc9b98a 100644 --- a/doc/fluid/api/CMakeLists.txt +++ b/doc/fluid/api/CMakeLists.txt @@ -7,6 +7,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees") # HTML output director set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html") +set(IMPORT_PADDLE_STRING "import paddle") +set(IMPORT_PADDLEV2_STRING "import paddle.v2") + configure_file( "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in" "${BINARY_BUILD_DIR_EN}/conf.py" diff --git a/doc/fluid/api/clip.rst b/doc/fluid/api/clip.rst new file mode 100644 index 0000000000000000000000000000000000000000..3ba096388fc87dda3096a9030fe5749e61112c06 --- /dev/null +++ b/doc/fluid/api/clip.rst @@ -0,0 +1,47 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +==== +clip +==== + +ErrorClipByValue +---------------- + +.. autoclass:: paddle.fluid.clip.ErrorClipByValue + :members: + :noindex: + +GradientClipByValue +------------------- + +.. autoclass:: paddle.fluid.clip.GradientClipByValue + :members: + :noindex: + +GradientClipByNorm +------------------ + +.. autoclass:: paddle.fluid.clip.GradientClipByNorm + :members: + :noindex: + +GradientClipByGlobalNorm +------------------------ + +.. autoclass:: paddle.fluid.clip.GradientClipByGlobalNorm + :members: + :noindex: + +append_gradient_clip_ops +------------------------ + +.. autofunction:: paddle.fluid.clip.append_gradient_clip_ops + :noindex: + +error_clip_callback +------------------- + +.. autofunction:: paddle.fluid.clip.error_clip_callback + :noindex: + diff --git a/doc/fluid/api/evaluator.rst b/doc/fluid/api/evaluator.rst index f80b87c7d2704a144c02028c4925530a67d11289..c0dc9a0d1d9f2f70948dc3c905dca25d7dd43742 100644 --- a/doc/fluid/api/evaluator.rst +++ b/doc/fluid/api/evaluator.rst @@ -5,24 +5,3 @@ evaluator ========= -ChunkEvaluator --------------- - -.. autoclass:: paddle.fluid.evaluator.ChunkEvaluator - :members: - :noindex: - -EditDistance --------------- - -.. autoclass:: paddle.fluid.evaluator.EditDistance - :members: - :noindex: - -DetectionMAP --------------- - -.. autoclass:: paddle.fluid.evaluator.DetectionMAP - :members: - :noindex: - diff --git a/doc/fluid/api/executor.rst b/doc/fluid/api/executor.rst index a9cdf264e49691afc4b9425b7bfe54f8157ae6c2..f67a14c49f372e67d18ec8e6f87da01109376d22 100644 --- a/doc/fluid/api/executor.rst +++ b/doc/fluid/api/executor.rst @@ -30,3 +30,9 @@ switch_scope .. autofunction:: paddle.fluid.executor.switch_scope :noindex: +fetch_var +--------- + +.. autofunction:: paddle.fluid.executor.fetch_var + :noindex: + diff --git a/doc/fluid/api/gen_doc.sh b/doc/fluid/api/gen_doc.sh index ba7b7ba8e51399deb852b0a7c8ddd3128f521e85..0f0539355559446fd91f659d61b636db214b5a40 100755 --- a/doc/fluid/api/gen_doc.sh +++ b/doc/fluid/api/gen_doc.sh @@ -1,7 +1,7 @@ #!/bin/bash python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst -for module in io data_feeder evaluator executor initializer io nets optimizer param_attr profiler regularizer +for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer do python gen_doc.py ${module} > ${module}.rst done diff --git a/doc/fluid/api/index_en.rst b/doc/fluid/api/index_en.rst index 06c686d9508635abd41571983e00be174e94743e..29cea9c68221b921939e8e09072d87f9f604e21b 100644 --- a/doc/fluid/api/index_en.rst +++ b/doc/fluid/api/index_en.rst @@ -9,8 +9,9 @@ Fluid data_feeder.rst executor.rst initializer.rst - evaluator.rst + metrics.rst nets.rst + clip.rst optimizer.rst param_attr.rst profiler.rst diff --git a/doc/fluid/api/initializer.rst b/doc/fluid/api/initializer.rst index 2f02c5de097945a45a3e053427104bd17bea1279..c49a98c744cdf907630ea8c74791ff2021d996e8 100644 --- a/doc/fluid/api/initializer.rst +++ b/doc/fluid/api/initializer.rst @@ -33,11 +33,16 @@ Xavier :members: :noindex: -MSRA ------- +force_init_on_cpu +----------------- -.. autoclass:: paddle.fluid.initializer.MSRA - :members: +.. autofunction:: paddle.fluid.initializer.force_init_on_cpu + :noindex: + +init_on_cpu +----------- + +.. autofunction:: paddle.fluid.initializer.init_on_cpu :noindex: ConstantInitializer @@ -68,9 +73,3 @@ XavierInitializer :members: :noindex: - -MSRAInitializer ------------------ -.. autoclass:: paddle.fluid.initializer.MSRAInitializer - :members: - :noindex: diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst index ff3c9346a2cd777a5294d536911f39de9032fe52..91449042fcdfd48c95f3dd3babf958c5d572e747 100644 --- a/doc/fluid/api/layers.rst +++ b/doc/fluid/api/layers.rst @@ -55,6 +55,13 @@ While :members: :noindex: +Switch +------ + +.. autoclass:: paddle.fluid.layers.Switch + :members: + :noindex: + lod_rank_table -------------- @@ -67,12 +74,6 @@ max_sequence_len .. autofunction:: paddle.fluid.layers.max_sequence_len :noindex: -topk ----- - -.. autofunction:: paddle.fluid.layers.topk - :noindex: - lod_tensor_to_array ------------------- @@ -109,6 +110,12 @@ less_than .. autofunction:: paddle.fluid.layers.less_than :noindex: +equal +----- + +.. autofunction:: paddle.fluid.layers.equal + :noindex: + array_read ---------- @@ -212,6 +219,42 @@ Send .. autofunction:: paddle.fluid.layers.Send :noindex: +open_recordio_file +------------------ + +.. autofunction:: paddle.fluid.layers.open_recordio_file + :noindex: + +open_files +---------- + +.. autofunction:: paddle.fluid.layers.open_files + :noindex: + +read_file +--------- + +.. autofunction:: paddle.fluid.layers.read_file + :noindex: + +shuffle +------- + +.. autofunction:: paddle.fluid.layers.shuffle + :noindex: + +batch +----- + +.. autofunction:: paddle.fluid.layers.batch + :noindex: + +double_buffer +------------- + +.. autofunction:: paddle.fluid.layers.double_buffer + :noindex: + nn == @@ -281,12 +324,6 @@ square_error_cost .. autofunction:: paddle.fluid.layers.square_error_cost :noindex: -accuracy --------- - -.. autofunction:: paddle.fluid.layers.accuracy - :noindex: - chunk_eval ---------- @@ -311,6 +348,18 @@ sequence_pool .. autofunction:: paddle.fluid.layers.sequence_pool :noindex: +sequence_softmax +---------------- + +.. autofunction:: paddle.fluid.layers.sequence_softmax + :noindex: + +softmax +------- + +.. autofunction:: paddle.fluid.layers.softmax + :noindex: + pool2d ------ @@ -323,12 +372,6 @@ batch_norm .. autofunction:: paddle.fluid.layers.batch_norm :noindex: -layer_norm ----------- - -.. autofunction:: paddle.fluid.layers.layer_norm - :noindex: - beam_search_decode ------------------ @@ -377,6 +420,12 @@ reduce_min .. autofunction:: paddle.fluid.layers.reduce_min :noindex: +reduce_prod +----------- + +.. autofunction:: paddle.fluid.layers.reduce_prod + :noindex: + sequence_first_step ------------------- @@ -425,6 +474,12 @@ matmul .. autofunction:: paddle.fluid.layers.matmul :noindex: +topk +---- + +.. autofunction:: paddle.fluid.layers.topk + :noindex: + warpctc ------- @@ -473,6 +528,60 @@ multiplex .. autofunction:: paddle.fluid.layers.multiplex :noindex: +layer_norm +---------- + +.. autofunction:: paddle.fluid.layers.layer_norm + :noindex: + +softmax_with_cross_entropy +-------------------------- + +.. autofunction:: paddle.fluid.layers.softmax_with_cross_entropy + :noindex: + +smooth_l1 +--------- + +.. autofunction:: paddle.fluid.layers.smooth_l1 + :noindex: + +one_hot +------- + +.. autofunction:: paddle.fluid.layers.one_hot + :noindex: + +autoincreased_step_counter +-------------------------- + +.. autofunction:: paddle.fluid.layers.autoincreased_step_counter + :noindex: + +reshape +------- + +.. autofunction:: paddle.fluid.layers.reshape + :noindex: + +lod_reset +--------- + +.. autofunction:: paddle.fluid.layers.lod_reset + :noindex: + +lrn +--- + +.. autofunction:: paddle.fluid.layers.lrn + :noindex: + +pad +--- + +.. autofunction:: paddle.fluid.layers.pad + :noindex: + label_smooth ------------ @@ -480,12 +589,12 @@ label_smooth :noindex: roi_pool ---------- +-------- .. autofunction:: paddle.fluid.layers.roi_pool :noindex: - + ops === @@ -501,18 +610,6 @@ mul .. autofunction:: paddle.fluid.layers.mul :noindex: -reshape -------- - -.. autofunction:: paddle.fluid.layers.reshape - :noindex: - -pad ---- - -.. autofunction:: paddle.fluid.layers.pad - :noindex: - scale ----- @@ -579,10 +676,70 @@ clip_by_norm .. autofunction:: paddle.fluid.layers.clip_by_norm :noindex: -sequence_softmax ----------------- +logical_and +----------- -.. autofunction:: paddle.fluid.layers.sequence_softmax +.. autofunction:: paddle.fluid.layers.logical_and + :noindex: + +logical_or +---------- + +.. autofunction:: paddle.fluid.layers.logical_or + :noindex: + +logical_xor +----------- + +.. autofunction:: paddle.fluid.layers.logical_xor + :noindex: + +logical_not +----------- + +.. autofunction:: paddle.fluid.layers.logical_not + :noindex: + +uniform_random +-------------- + +.. autofunction:: paddle.fluid.layers.uniform_random + :noindex: + +uniform_random_batch_size_like +------------------------------ + +.. autofunction:: paddle.fluid.layers.uniform_random_batch_size_like + :noindex: + +gaussian_random +--------------- + +.. autofunction:: paddle.fluid.layers.gaussian_random + :noindex: + +gaussian_random_batch_size_like +------------------------------- + +.. autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like + :noindex: + +cumsum +------ + +.. autofunction:: paddle.fluid.layers.cumsum + :noindex: + +scatter +------- + +.. autofunction:: paddle.fluid.layers.scatter + :noindex: + +sum +--- + +.. autofunction:: paddle.fluid.layers.sum :noindex: sigmoid @@ -651,6 +808,18 @@ floor .. autofunction:: paddle.fluid.layers.floor :noindex: +cos +--- + +.. autofunction:: paddle.fluid.layers.cos + :noindex: + +sin +--- + +.. autofunction:: paddle.fluid.layers.sin + :noindex: + round ----- @@ -828,4 +997,15 @@ topk .. autofunction:: paddle.fluid.layers.topk :noindex: +dice_loss +---- + +.. autofunction:: paddle.fluid.layers.dice_loss + :noindex: + +bilinear_interp +____ + +.. autofunction:: paddle.fluid.layers.bilinear_interp + :noindex: diff --git a/doc/fluid/api/metrics.rst b/doc/fluid/api/metrics.rst new file mode 100644 index 0000000000000000000000000000000000000000..ddf07775d7ea293acd421b8549d03b277ff0611d --- /dev/null +++ b/doc/fluid/api/metrics.rst @@ -0,0 +1,56 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +======= +metrics +======= + +MetricBase +---------- + +.. autoclass:: paddle.fluid.metrics.MetricBase + :members: + :noindex: + +CompositeMetric +--------------- + +.. autoclass:: paddle.fluid.metrics.CompositeMetric + :members: + :noindex: + +Accuracy +-------- + +.. autoclass:: paddle.fluid.metrics.Accuracy + :members: + :noindex: + +ChunkEvaluator +-------------- + +.. autoclass:: paddle.fluid.metrics.ChunkEvaluator + :members: + :noindex: + +EditDistance +------------ + +.. autoclass:: paddle.fluid.metrics.EditDistance + :members: + :noindex: + +DetectionMAP +------------ + +.. autoclass:: paddle.fluid.metrics.DetectionMAP + :members: + :noindex: + +Auc +--- + +.. autoclass:: paddle.fluid.metrics.Auc + :members: + :noindex: + diff --git a/doc/fluid/api/optimizer.rst b/doc/fluid/api/optimizer.rst index 7a92caf9b7139cf091eff834dbed3586b23ac3af..df2bd2eace52e78805433bea320f5de95d45bfc7 100644 --- a/doc/fluid/api/optimizer.rst +++ b/doc/fluid/api/optimizer.rst @@ -47,6 +47,28 @@ DecayedAdagrad :members: :noindex: +Adadelta +----------------- + +.. autoclass:: paddle.fluid.optimizer.Adadelta + :members: + :noindex: + +RMSProp +----------------- + +.. autoclass:: paddle.fluid.optimizer.RMSProp + :members: + :noindex: + +ModelAverage +----------------- + +.. autoclass:: paddle.fluid.optimizer.ModelAverage + :members: + :noindex: + + SGDOptimizer ------------ @@ -89,9 +111,25 @@ DecayedAdagradOptimizer :members: :noindex: -Adadelta --------------- + +AdadeltaOptimizer +----------------- .. autoclass:: paddle.fluid.optimizer.AdadeltaOptimizer :members: :noindex: + + +RMSPropOptimizer +----------------- + +.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer + :members: + :noindex: + +Optimizer +--------- + +.. autoclass:: paddle.fluid.optimizer.Optimizer + :members: + :noindex: diff --git a/doc/fluid/api/regularizer.rst b/doc/fluid/api/regularizer.rst index 837c67111c6e98e6a3859be802addc20a1c64f2b..756bc53baa0625aef48dad0c35e7ae57421a70d0 100644 --- a/doc/fluid/api/regularizer.rst +++ b/doc/fluid/api/regularizer.rst @@ -11,6 +11,13 @@ append_regularization_ops .. autofunction:: paddle.fluid.regularizer.append_regularization_ops :noindex: +WeightDecayRegularizer +---------------------- + +.. autoclass:: paddle.fluid.regularizer.WeightDecayRegularizer + :members: + :noindex: + L1Decay ------- @@ -26,15 +33,16 @@ L2Decay :noindex: L1DecayRegularizer ---------------------- +------------------ .. autoclass:: paddle.fluid.regularizer.L1DecayRegularizer :members: :noindex: L2DecayRegularizer ---------------------- +------------------ .. autoclass:: paddle.fluid.regularizer.L2DecayRegularizer :members: :noindex: + diff --git a/doc/fluid/howto/index_cn.rst b/doc/fluid/howto/index_cn.rst index 97aeaf167d329529f2b120b5a3d4085e0510fe16..b7c620179724ebe97a0a47b75a57b376b21ccf90 100644 --- a/doc/fluid/howto/index_cn.rst +++ b/doc/fluid/howto/index_cn.rst @@ -3,5 +3,6 @@ .. toctree:: :maxdepth: 1 - + optimization/index_cn.rst + inference/inference_support_in_fluid.md diff --git a/doc/fluid/howto/index_en.rst b/doc/fluid/howto/index_en.rst index fd21e167ce3a46da167db1e9d7013804f730e047..f3ca41cdbf1d40ec8afaf045233a38755d8a777a 100644 --- a/doc/fluid/howto/index_en.rst +++ b/doc/fluid/howto/index_en.rst @@ -5,3 +5,4 @@ HOW TO :maxdepth: 1 optimization/index_en.rst + inference/inference_support_in_fluid.md diff --git a/doc/fluid/howto/inference/inference_support_in_fluid.md b/doc/fluid/howto/inference/inference_support_in_fluid.md new file mode 100644 index 0000000000000000000000000000000000000000..d272cd3e3bdac49b9ed1a21531de1b0be03d881e --- /dev/null +++ b/doc/fluid/howto/inference/inference_support_in_fluid.md @@ -0,0 +1,361 @@ +# Fluid Inference使用指南 + +## 目录: + +- Python Inference API +- 编译Fluid Inference库 +- Inference C++ API +- Inference实例 +- Inference计算优化 + +## Python Inference API **[改进中]** +- 保存Inference模型 ([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/io.py#L295)) + + ```python + def save_inference_model(dirname, + feeded_var_names, + target_vars, + executor, + main_program=None, + model_filename=None, + params_filename=None): + ``` + Inference模型和参数将会保存到`dirname`目录下: + - 序列化的模型 + - `model_filename`为`None`,保存到`dirname/__model__` + - `model_filename`非`None`,保存到`dirname/model_filename` + - 参数 + - `params_filename`为`None`,单独保存到各个独立的文件,各文件以参数变量的名字命名 + - `params_filename`非`None`,保存到`dirname/params_filename` + +- 两种存储格式 + - 参数保存到各个独立的文件 + - 如,设置`model_filename`为`None`、`params_filename`为`None` + + ```bash + $ cd recognize_digits_conv.inference.model + $ ls + $ __model__ batch_norm_1.w_0 batch_norm_1.w_2 conv2d_2.w_0 conv2d_3.w_0 fc_1.w_0 batch_norm_1.b_0 batch_norm_1.w_1 conv2d_2.b_0 conv2d_3.b_0 fc_1.b_0 + ``` + - 参数保存到同一个文件 + - 如,设置`model_filename`为`None`、`params_filename`为`__params__` + + ```bash + $ cd recognize_digits_conv.inference.model + $ ls + $ __model__ __params__ + ``` +- 加载Inference模型([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/io.py#L380)) + ```python + def load_inference_model(dirname, + executor, + model_filename=None, + params_filename=None): + ... + return [program, feed_target_names, fetch_targets] + ``` + + +## 编译Fluid Inference库 + + - **不需要额外的CMake选项** + - 1、 配置CMake命令,更多配置请参考[源码编译PaddlePaddle](http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/build_from_source_cn.html) + ```bash + $ git clone https://github.com/PaddlePaddle/Paddle.git + $ cd Paddle + $ mkdir build + $ cd build + $ cmake -DCMAKE_INSTALL_PREFIX=your/path/to/paddle_inference_lib \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_PYTHON=ON \ + -DWITH_MKL=OFF \ + -DWITH_GPU=OFF \ + .. + ``` + + - 2、 编译PaddlePaddle + ```bash + $ make + ``` + + - 3、 部署。执行如下命令将PaddlePaddle Fluid Inference库部署到`your/path/to/paddle_inference_lib`目录。 + ```bash + $ make inference_lib_dist + ``` + +- 目录结构 + + ```bash + $ cd your/path/to/paddle_inference_lib + $ tree + . + |-- paddle + | `-- fluid + | |-- framework + | |-- inference + | | |-- io.h + | | `-- libpaddle_fluid.so + | |-- memory + | |-- platform + | `-- string + |-- third_party + | |-- eigen3 + | `-- install + | |-- gflags + | |-- glog + | `-- protobuf + `-- ... + ``` + + 假设`PADDLE_ROOT=your/path/to/paddle_inference_lib`。 + + + +## 链接Fluid Inference库 +- 示例项目([链接](https://github.com/luotao1/fluid_inference_example.git)) + + - GCC配置 + ```bash + $ g++ -o a.out -std=c++11 main.cc \ + -I${PADDLE_ROOT}/ \ + -I${PADDLE_ROOT}/third_party/install/gflags/include \ + -I${PADDLE_ROOT}/third_party/install/glog/include \ + -I${PADDLE_ROOT}/third_party/install/protobuf/include \ + -I${PADDLE_ROOT}/third_party/eigen3 \ + -L${PADDLE_ROOT}/paddle/fluid/inference -lpaddle_fluid \ + -lrt -ldl -lpthread + ``` + + - CMake配置 + ```cmake + include_directories(${PADDLE_ROOT}/) + include_directories(${PADDLE_ROOT}/third_party/install/gflags/include) + include_directories(${PADDLE_ROOT}/third_party/install/glog/include) + include_directories(${PADDLE_ROOT}/third_party/install/protobuf/include) + include_directories(${PADDLE_ROOT}/third_party/eigen3) + target_link_libraries(${TARGET_NAME} + ${PADDLE_ROOT}/paddle/fluid/inference/libpaddle_fluid.so + -lrt -ldl -lpthread) + ``` + + - 设置环境变量: + `export LD_LIBRARY_PATH=${PADDLE_ROOT}/paddle/fluid/inference:$LD_LIBRARY_PATH` + + + +## C++ Inference API + +- 推断流程([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/test_helper.h#L91)) + + - 1、 初始化设备 + ```cpp + #include "paddle/fluid/framework/init.h" + paddle::framework::InitDevices(false); + ``` + + - 2、 定义place,executor,scope + ```cpp + auto place = paddle::platform::CPUPlace(); + auto executor = paddle::framework::Executor(place); + auto* scope = new paddle::framework::Scope(); + ``` + + - 3、 加载模型 + ```cpp + #include "paddle/fluid/inference/io.h" + auto inference_program = paddle::inference::Load(executor, *scope, dirname); + // or + auto inference_program = paddle::inference::Load(executor, + *scope, + dirname + "/" + model_filename, + dirname + "/" + params_filename); + ``` + + - 4、 获取`feed_target_names`和`fetch_target_names` + ```cpp + const std::vector& feed_target_names = inference_program->GetFeedTargetNames(); + const std::vector& fetch_target_names = inference_program->GetFetchTargetNames(); + ``` + + - 5、 准备`feed`数据 + ```cpp + #include "paddle/fluid/framework/lod_tensor.h" + std::vector cpu_feeds; + ... + std::map feed_targets; + for (size_t i = 0; i < feed_target_names.size(); ++i) { + // Please make sure that cpu_feeds[i] is right for feed_target_names[i] + feed_targets[feed_target_names[i]] = cpu_feeds[i]; + } + ``` + + - 6、 定义`Tensor`来`fetch`结果 + ```cpp + std::vector cpu_fetchs; + std::map fetch_targets; + for (size_t i = 0; i < fetch_target_names.size(); ++i) { + fetch_targets[fetch_target_names[i]] = cpu_fetchs[i]; + } + ``` + + - 7、 执行`inference_program` + ```cpp + executor.Run(*inference_program, scope, feed_targets, fetch_targets); + ``` + + - 8、 使用`fetch`数据 + ```cpp + for (size_t i = 0; i < cpu_fetchs.size(); ++i) { + std::cout << "lod_i: " << cpu_fetchs[i]->lod(); + std::cout << "dims_i: " << cpu_fetchs[i]->dims(); + std::cout << "result:"; + float* output_ptr = cpu_fetchs[i]->data(); + for (int j = 0; j < cpu_fetchs[i]->numel(); ++j) { + std::cout << " " << output_ptr[j]; + } + std::cout << std::endl; + } + ``` + 针对不同的数据,4. - 8.可执行多次。 + + - 9、 释放内存 + ```cpp + delete scope; + ``` + + +- 接口说明 + + ```cpp + void Run(const ProgramDesc& program, Scope* scope, + std::map& feed_targets, + std::map& fetch_targets, + bool create_vars = true, + const std::string& feed_holder_name = "feed", + const std::string& fetch_holder_name = "fetch"); + ``` + - 使用Python API `save_inference_model`保存的`program`里面包含了`feed_op`和`fetch_op`,用户提供的`feed_targets`、`fetch_targets`必须和`inference_program`中的`feed_op`、`fetch_op`保持一致。 + - 用户提供的`feed_holder_name`和`fetch_holder_name`也必须和`inference_program`中`feed_op`、`fetch_op`保持一致,可使用`SetFeedHolderName`和`SetFetchHolderName`接口重新设置`inferece_program` + - 默认情况下,除了`persistable`属性设置为`True`的`Variable`之外,每次执行`executor.Run`会创建一个局部`Scope`,并且在这个局部`Scope`中创建和销毁所有的`Variable`,以最小化空闲时的内存占用。 + - `persistable`属性为`True`的`Variable`有: + - Operators的参数`w`、`b`等 + - `feed_op`的输入变量 + - `fetch_op`的输出变量 + + +- **不在每次执行时创建和销毁变量 + ([PR](https://github.com/PaddlePaddle/Paddle/pull/9301))** + - 执行`inference_program` + ```cpp + // Call once + executor.CreateVariables(*inference_program, scope, 0); + // Call as many times as you like + executor.Run( + *inference_program, scope, feed_targets, fetch_targets, false); + ``` + - **优点** + - 节省了频繁创建、销毁变量的时间(约占每次`Run`总时间的1% ~ 12%) + - 执行结束后可获取所有Operators的计算结果 + - **缺点** + - 空闲时也会占用大量的内存 + - 在同一个`Scope`中,相同的变量名是公用同一块内存的,容易引起意想不到的错误 + + +- **不在每次执行时创建Op([PR](https://github.com/PaddlePaddle/Paddle/pull/9630))** + - 执行`inference_program` + ```cpp + // Call once + auto ctx = executor.Prepare(*inference_program, 0); + // Call as many times as you like if you have no need to change the inference_program + executor.RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets); + ``` + - **优点** + - 节省了频繁创建、销毁Op的时间 + - **缺点** + - 一旦修改了`inference_program`,则需要重新创建`ctx` + + +- **多线程共享Parameters([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/test_multi_thread_helper.h))** + - 主线程 + - 1、 初始化设备 + - 2、 定义`place`,`executor`,`scope` + - 3、 加载模型,得到`inference_program` + - 从线程 + - **复制`inference_program`得到`copy_program`,修改`copy_program`的`feed_holder_name`和`fetch_holder_name`** + ```cpp + auto copy_program = std::unique_ptr( + new paddle::framework::ProgramDesc(*inference_program)); + std::string feed_holder_name = "feed_" + paddle::string::to_string(thread_id); + std::string fetch_holder_name = "fetch_" + paddle::string::to_string(thread_id); + copy_program->SetFeedHolderName(feed_holder_name); + copy_program->SetFetchHolderName(fetch_holder_name); + ``` + - 4、 获取`copy_program`的`feed_target_names`和`fetch_target_names` + - 5、 准备feed数据,定义Tensor来fetch结果 + - 6、 执行`copy_program` + ```cpp + executor->Run(*copy_program, scope, feed_targets, fetch_targets, true, feed_holder_name, fetch_holder_name); + ``` + - 7、 使用fetch数据 + - 主线程 + - 8、 释放资源 + + +- 基本概念 + - 数据相关: + - [Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/tensor.md),一个N维数组,数据可以是任意类型(int,float,double等) + - [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md),带LoD(Level-of-Detail)即序列信息的Tensor + - [Scope](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md),记录了变量Variable + - 执行相关: + - [Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md),无状态执行器,只跟设备相关 + - Place + - CPUPlace,CPU设备 + - CUDAPlace,CUDA GPU设备 + - 神经网络表示: + - [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md). + + 详细介绍请参考[**Paddle Fluid开发者指南**](https://github.com/lcy-seso/learning_notes/blob/master/Fluid/developer's_guid_for_Fluid/Developer's_Guide_to_Paddle_Fluid.md) + + + +## Inference实例 + + 1. fit a line: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc) + 1. image classification: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_image_classification.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_image_classification.cc) + 1. label semantic roles: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_label_semantic_roles.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc) + 1. recognize digits: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc) + 1. recommender system: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recommender_system.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc) + 1. understand sentiment: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_understand_sentiment.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc) + 1. word2vec: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_word2vec.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_word2vec.cc) + + +## Inference计算优化 +- 使用Python推理优化工具([inference_transpiler](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/inference_transpiler.py)) + ```python + class InferenceTranspiler: + def transpile(self, program, place, scope=None): + ... + if scope is None: + scope = global_scope() + ... + ``` + - 使用`InferenceTranspiler`将会直接修改`program`。 + - 使用`InferenceTranspiler`会修改参数的值,请确保`program`的参数在`scope`内。 +- 支持的优化 + - 融合batch_norm op的计算 +- 使用示例([链接](https://github.com/Xreki/Xreki.github.io/blob/master/fluid/inference/inference_transpiler.py)) + ```python + import paddle.fluid as fluid + # NOTE: Applying the inference transpiler will change the inference_program. + t = fluid.InferenceTranspiler() + t.transpile(inference_program, place, inference_scope) + ``` + + + + +## 内存使用优化 +- 使用Python内存优化工具([memory_optimization_transipiler](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/memory_optimization_transpiler.py)) + ```python + fluid.memory_optimize(inference_program) + ``` diff --git a/doc/mobile/CMakeLists.txt b/doc/mobile/CMakeLists.txt index b104a6318d474d6531670b8ac3569448774850c7..7b34ba8d0768427802b11614c6962f3c3f6ef4e3 100644 --- a/doc/mobile/CMakeLists.txt +++ b/doc/mobile/CMakeLists.txt @@ -15,6 +15,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees") # HTML output director set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html") +set(IMPORT_PADDLE_STRING "") +set(IMPORT_PADDLEV2_STRING "") + configure_file( "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in" "${BINARY_BUILD_DIR_EN}/conf.py" @@ -27,8 +30,6 @@ sphinx_add_target(paddle_mobile_docs ${CMAKE_CURRENT_SOURCE_DIR} ${SPHINX_HTML_DIR_EN}) -add_dependencies(paddle_mobile_docs gen_proto_py paddle_python) - # configured documentation tools and intermediate build results set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build") @@ -49,5 +50,3 @@ sphinx_add_target(paddle_mobile_docs_cn ${SPHINX_CACHE_DIR_CN} ${CMAKE_CURRENT_SOURCE_DIR} ${SPHINX_HTML_DIR_CN}) - -add_dependencies(paddle_mobile_docs_cn gen_proto_py paddle_python) diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst index 8297316e8fbb2b8f41954030293feadbcd81295e..56d1515005f6e40b084c6b2184c6a0b3e3a00496 100644 --- a/doc/mobile/index_cn.rst +++ b/doc/mobile/index_cn.rst @@ -1,9 +1,9 @@ 移动端 -===== +====== .. toctree:: :maxdepth: 1 cross_compiling_for_android_cn.md cross_compiling_for_ios_cn.md - cross_compiling_for_raspberry_cn.md \ No newline at end of file + cross_compiling_for_raspberry_cn.md diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in index 76b82fd97f1ed642696c4414676b694ebda9ad81..890f70615538af23cd05b9ffd685e870a5644cdb 100644 --- a/doc/templates/conf.py.cn.in +++ b/doc/templates/conf.py.cn.in @@ -16,8 +16,8 @@ import os, subprocess sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python')) import shlex from recommonmark import parser, transform -import paddle -import paddle.v2 +@IMPORT_PADDLE_STRING@ +@IMPORT_PADDLEV2_STRING@ MarkdownParser = parser.CommonMarkParser AutoStructify = transform.AutoStructify diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in index 5aa5c1381fa3fad4ebc181c7868da03ae0138016..5b09464cb991f96127edec40f7dbbc97a8d82582 100644 --- a/doc/templates/conf.py.en.in +++ b/doc/templates/conf.py.en.in @@ -16,8 +16,8 @@ import os, subprocess sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python')) import shlex from recommonmark import parser, transform -import paddle -import paddle.v2 +@IMPORT_PADDLE_STRING@ +@IMPORT_PADDLEV2_STRING@ MarkdownParser = parser.CommonMarkParser diff --git a/doc/v2/CMakeLists.txt b/doc/v2/CMakeLists.txt index be957d37b14c618e9346251b3bd3dbaf1541773f..d230a1b9217eea6740419822f350096e361a4435 100644 --- a/doc/v2/CMakeLists.txt +++ b/doc/v2/CMakeLists.txt @@ -15,6 +15,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees") # HTML output director set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html") +set(IMPORT_PADDLE_STRING "") +set(IMPORT_PADDLEV2_STRING "") + configure_file( "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in" "${BINARY_BUILD_DIR_EN}/conf.py" @@ -27,8 +30,6 @@ sphinx_add_target(paddle_v2_docs ${CMAKE_CURRENT_SOURCE_DIR} ${SPHINX_HTML_DIR_EN}) -add_dependencies(paddle_v2_docs gen_proto_py paddle_python) - # configured documentation tools and intermediate build results set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build") @@ -50,6 +51,4 @@ sphinx_add_target(paddle_v2_docs_cn ${CMAKE_CURRENT_SOURCE_DIR} ${SPHINX_HTML_DIR_CN}) -add_dependencies(paddle_v2_docs_cn gen_proto_py paddle_python) - add_subdirectory(api) diff --git a/doc/v2/api/CMakeLists.txt b/doc/v2/api/CMakeLists.txt index 2670a21a227546ffcee4f10f395feef3c58df9b4..0c74522cb089b17c8419e9058f76631b0fe0df93 100644 --- a/doc/v2/api/CMakeLists.txt +++ b/doc/v2/api/CMakeLists.txt @@ -7,6 +7,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees") # HTML output director set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html") +set(IMPORT_PADDLE_STRING "import paddle") +set(IMPORT_PADDLEV2_STRING "import paddle.v2") + configure_file( "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in" "${BINARY_BUILD_DIR_EN}/conf.py" diff --git a/doc/v2/build_and_install/build_from_source_cn.rst b/doc/v2/build_and_install/build_from_source_cn.rst index f846928954dd3a05e11054ce2ff2ff839fbefd4b..077f5e9b189269f9f6c9cf68310e2bfd43d8cb67 100644 --- a/doc/v2/build_and_install/build_from_source_cn.rst +++ b/doc/v2/build_and_install/build_from_source_cn.rst @@ -19,8 +19,8 @@ ---------------- PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境Docker镜像 -可以在 `这里 `_ 找到,您也可以 -在 `这里 `_ 找到 paddle_manylinux_devel +可以在 `这里 `__ 找到,您也可以 +在 `这里 `__ 找到 paddle_manylinux_devel 镜像的编译以及使用方法。或者参考下述可选步骤,从源码中构建用于编译PaddlePaddle的Docker镜像。 如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。 @@ -35,7 +35,7 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 # 2. 可选步骤:源码中构建用于编译PaddlePaddle的Docker镜像 docker build -t paddle:dev . # 3. 执行下面的命令编译CPU-Only的二进制 - docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build # 4. 或者也可以使用为上述可选步骤构建的镜像(必须先执行第2步) docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev @@ -116,11 +116,10 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行 - ```emacs - (global-set-key "\C-cc" 'compile) - (setq compile-command - "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev") - ``` + .. code-block:: emacs + + (global-set-key "\C-cc" 'compile) + (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev") 就可以按 `Ctrl-C` 和 `c` 键来启动编译了。 diff --git a/doc/v2/build_and_install/build_from_source_en.rst b/doc/v2/build_and_install/build_from_source_en.rst index d1b5b88dff81d4c5cee3dd13a7dccbc333ab6a17..545e61ce9602240807d515e9eae971dfca9ddd7f 100644 --- a/doc/v2/build_and_install/build_from_source_en.rst +++ b/doc/v2/build_and_install/build_from_source_en.rst @@ -23,7 +23,7 @@ You need to use Docker to build PaddlePaddle to avoid installing dependencies by yourself. We have several pre-built Docker images `here `_ , you can also find how to build and use paddle_manylinux_devel Docker image from -`here `_ +`here `__ Or you can build your own image from source as the optional step below: .. code-block:: bash @@ -34,7 +34,7 @@ Or you can build your own image from source as the optional step below: # 2. Optional: build development docker image from source docker build -t paddle:dev . # 3. Run the following command to build a CPU-Only binaries - docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build # 4. Or, use your built Docker image to build PaddlePaddle (must run step 2) docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev @@ -88,7 +88,7 @@ If you wish to run only one unit test, like :code:`test_sum_op`: .. _faq_docker: Frequently Asked Questions ----------------- +--------------------------- - What is Docker? @@ -118,11 +118,10 @@ Frequently Asked Questions Many PaddlePaddle developers are using Emacs. They add the following few lines into their `~/.emacs` configure file: - ```emacs - (global-set-key "\C-cc" 'compile) - (setq compile-command - "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev") - ``` + .. code-block:: emacs + + (global-set-key "\C-cc" 'compile) + (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev") so they could type `Ctrl-C` and `c` to build PaddlePaddle from source. @@ -145,7 +144,7 @@ Frequently Asked Questions .. _compile_deps: Appendix: Compile Dependencies ----------------- +------------------------------- PaddlePaddle need the following dependencies when compiling, other dependencies will be downloaded automatically. @@ -166,11 +165,11 @@ will be downloaded automatically. .. _build_options: Appendix: Build Options ----------------- +------------------------- Build options include whether build binaries for CPU or GPU, which BLAS library to use etc. You may pass these settings when running cmake. -For detailed cmake tutorial please refer to `here `_ 。 +For detailed cmake tutorial please refer to `here `__ 。 You can add :code:`-D` argument to pass such options, like: @@ -219,7 +218,7 @@ keep on with latest cuDNN versions. Be sure to run with the same version of cuDN you built. Pass Compile Options -++++++++++++++ +++++++++++++++++++++++ You can pass compile options to use intended BLAS/CUDA/Cudnn libraries. When running cmake command, it will search system paths like diff --git a/doc/v2/build_and_install/docker_install_cn.rst b/doc/v2/build_and_install/docker_install_cn.rst index 79d214635a069a739060e0b79424729f6ff90387..da876b03e384a8175b27f78756af648c80fc6784 100644 --- a/doc/v2/build_and_install/docker_install_cn.rst +++ b/doc/v2/build_and_install/docker_install_cn.rst @@ -73,6 +73,7 @@ 当然,您也可以进入到Docker容器中,以交互式的方式执行或调试您的代码: .. code-block:: bash + docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash cd /work python train.py diff --git a/doc/v2/build_and_install/docker_install_en.rst b/doc/v2/build_and_install/docker_install_en.rst index e0e0559fb858a093db96a9b4ec1c5a45d6c71a38..5dbdedc4cb064ef415e8d19f00727a16d1c175c6 100644 --- a/doc/v2/build_and_install/docker_install_en.rst +++ b/doc/v2/build_and_install/docker_install_en.rst @@ -80,6 +80,7 @@ Also, you can go into the container shell, run or debug your code interactively: .. code-block:: bash + docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash cd /work python train.py diff --git a/doc/v2/build_and_install/index_cn.rst b/doc/v2/build_and_install/index_cn.rst index e079bb661f3a5141a09dfbc6893d1bf945697bc9..1a9305ac4b6578c14a962f223c647a71e3b8a72b 100644 --- a/doc/v2/build_and_install/index_cn.rst +++ b/doc/v2/build_and_install/index_cn.rst @@ -6,7 +6,7 @@ PaddlePaddle针对不同的用户群体提供了多种安装方式。 专注深度学习模型开发 ------------------ +-------------------- PaddlePaddle提供了多种python wheel包,可通过pip一键安装: @@ -18,7 +18,7 @@ PaddlePaddle提供了多种python wheel包,可通过pip一键安装: 这是最便捷的安装方式,请根据机器配置和系统选择对应的安装包。 关注底层框架 ----------- +------------- PaddlePaddle提供了基于Docker的安装方式,请参照以下教程: @@ -45,7 +45,7 @@ PaddlePaddle提供了基于Docker的安装方式,请参照以下教程: 常见问题汇总 ------------ +-------------- 如果在安装过程中遇到了问题,请先尝试在下面的页面寻找答案: diff --git a/doc/v2/build_and_install/index_en.rst b/doc/v2/build_and_install/index_en.rst index 5b3de0f8c3e5496060646b5ddb080d0d338a8bfa..7990bacbd6966e88e8763e9c5709e410f7e9fed4 100644 --- a/doc/v2/build_and_install/index_en.rst +++ b/doc/v2/build_and_install/index_en.rst @@ -1,12 +1,12 @@ install and Compile -========== +====================== .. _install_steps: PaddlePaddle provides various methods of installation for many different users Focus on Deep Learning Model Development ------------------ +---------------------------------------- PaddlePaddle provides lots of packages of python wheel , that pip can install: @@ -18,7 +18,7 @@ PaddlePaddle provides lots of packages of python wheel , that pip can install: This is the most convenient way of installation. Please choose the right installation package with machine configure and system. Follow the Bottom Frame ----------- +------------------------ PaddlePaddle also supports installation using Docker. Please refer to the tutorial below: diff --git a/doc/v2/build_and_install/pip_install_cn.rst b/doc/v2/build_and_install/pip_install_cn.rst index 9b84bb6425af1eeb94a4f2f5d6c2b1e28c62e3c8..853bdb21bbcf07ae1742d2196dbcfe4668828b7b 100644 --- a/doc/v2/build_and_install/pip_install_cn.rst +++ b/doc/v2/build_and_install/pip_install_cn.rst @@ -55,11 +55,11 @@ paddlepaddle-gpu==0.11.0 使用CUDA 7.5和cuDNN 5编译的0.11.0版 :header: "版本说明", "cp27-cp27mu", "cp27-cp27m" :widths: 1, 3, 3 - "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" - "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" - "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" - "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `_" - "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `_" + "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" + "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" + "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" .. _pip_dependency: diff --git a/doc/v2/build_and_install/pip_install_en.rst b/doc/v2/build_and_install/pip_install_en.rst index fcac76d6a24eb4905a20f797d614db8f743342d7..fecf6d3712feac3265100a6121901ba784f7d5cc 100644 --- a/doc/v2/build_and_install/pip_install_en.rst +++ b/doc/v2/build_and_install/pip_install_en.rst @@ -58,11 +58,11 @@ If the links below shows up the login form, just click "Log in as guest" to star :header: "version", "cp27-cp27mu", "cp27-cp27m" :widths: 1, 3, 3 - "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" - "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" - "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" - "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `_" - "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `_" + "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" + "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" + "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" .. _pip_dependency: diff --git a/doc/v2/howto/capi/workflow_of_capi_cn.md b/doc/v2/howto/capi/workflow_of_capi_cn.md index 1968c1099ac5734cd68b437f2f7aa428d7b5265e..3acdbae28e9b35f8a9104a89c9a5799f8c892334 100644 --- a/doc/v2/howto/capi/workflow_of_capi_cn.md +++ b/doc/v2/howto/capi/workflow_of_capi_cn.md @@ -59,7 +59,7 @@ 代码示例如下: ```python - from paddle.utils.merge_model import merge_v2_modelss + from paddle.utils.merge_model import merge_v2_model from mnist_v2 import network net = network(is_infer=True) diff --git a/go/pserver/client/c/test/CMakeLists.txt b/go/pserver/client/c/test/CMakeLists.txt index 411dc50332672143d7a1f7bd0556ae86dc37f6f3..4500b1f288372ed0e2d9d383234df97ae976c60b 100644 --- a/go/pserver/client/c/test/CMakeLists.txt +++ b/go/pserver/client/c/test/CMakeLists.txt @@ -13,4 +13,3 @@ # limitations under the License. # cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer) -add_style_check_target(test_cclient test_cclient.c) diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt index e06e9a2b363d1ffc6876b98bcb7304b0a54dbcaa..957b1a3e6b07b058a76605992da387b43657146a 100644 --- a/paddle/capi/CMakeLists.txt +++ b/paddle/capi/CMakeLists.txt @@ -33,9 +33,6 @@ add_library(paddle_capi STATIC ${CAPI_HEADERS} ${CAPI_PRIVATE_HEADER} target_include_directories(paddle_capi PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) -add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER} - ${CAPI_PRIVATE_HEADER}) - add_dependencies(paddle_capi paddle_proto paddle_gserver) # TODO: paddle_capi_whole will be removed. diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt index efd1b7a73e1655f95eb83a5e2f59e82cbf7eba16..9bbb8de78e09829d24faf42c360811084981578f 100755 --- a/paddle/cuda/CMakeLists.txt +++ b/paddle/cuda/CMakeLists.txt @@ -87,8 +87,3 @@ else() endif() add_dependencies(paddle_cuda paddle_proto ${external_project_dependencies}) - -add_style_check_target(paddle_cuda - ${CUDA_SOURCES} - ${CUDA_HEADERS} - ${CUDA_CXX_SOURCES}) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 9de44beafbb69b3510b97afcc43d4b489a029c35..b69de2ced03569d5e9ffe313527ab776ee798496 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -36,5 +36,5 @@ cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_ha device_context broadcast_op_handle) cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory device_context gather_op_handle) -cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory - device_context reduce_op_handle ) +#cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory +# device_context reduce_op_handle ) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 6b0c0a6b9fb29e641449f0c21109611cccd4e5a9..35d23d68c0dd26a05544a72316d5764129aa8d40 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" #include "paddle/fluid/framework/details/send_op_handle.h" +#include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/scope.h" #ifdef PADDLE_WITH_CUDA @@ -159,25 +160,39 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( if (!is_forwarding && places_.size() > 1) { // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. - for (auto &og : op->OutputArgumentNames()) { - if (IsParameterGradientOnce(og, &og_has_been_broadcast)) { - switch (strategy_.reduce_) { - case BuildStrategy::ReduceStrategy::kReduce: - CreateReduceOp(&result, og, cur_device_id); - var_name_on_devices[cur_device_id].emplace(og); - bcast_var_name_set[cur_device_id].emplace( - og.substr(0, og.size() - strlen(kGradVarSuffix))); - cur_device_id = (cur_device_id + 1) % places_.size(); - break; - case BuildStrategy::ReduceStrategy::kAllReduce: - if (IsSparseGradient(var_types, og)) { - CreateReduceOp(&result, og, 0); - CreateBroadcastOp(&result, og, 0); - } else { - InsertNCCLAllReduceOp(&result, og); - } - break; + if (static_cast(boost::get(op->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward))) { + try { + auto backward_vars = + boost::get>(op->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &p_name = backward_vars[i]; + auto &g_name = backward_vars[i + 1]; + VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; + + switch (strategy_.reduce_) { + case BuildStrategy::ReduceStrategy::kReduce: + CreateReduceOp(&result, g_name, cur_device_id); + var_name_on_devices[cur_device_id].emplace(g_name); + bcast_var_name_set[cur_device_id].emplace(p_name); + cur_device_id = (cur_device_id + 1) % places_.size(); + break; + case BuildStrategy::ReduceStrategy::kAllReduce: + if (IsSparseGradient(var_types, g_name)) { + CreateReduceOp(&result, g_name, 0); + CreateBroadcastOp(&result, g_name, 0); + } else { + InsertNCCLAllReduceOp(&result, g_name); + } + break; + } } + } catch (boost::bad_get e) { } } } @@ -398,11 +413,12 @@ void MultiDevSSAGraphBuilder::CreateSendOp(SSAGraph *result, } bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const { - // FIXME(yy): Do not hard code like this - return op.OutputArgumentNames().size() == 1 && - op.OutputArgumentNames()[0] == GradVarName(loss_var_name_); + return boost::get( + op.GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == + (static_cast(OpRole::kBackward) | + static_cast(OpRole::kLoss)) && + !loss_var_name_.empty(); // If loss_var is empty. This is test mode } - } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index 1c4b059cd0aeff803ca7436d3f198e97a06cd012..eea7e712f8f6e187cdceedce77cc76d1d4ca2101 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -96,10 +96,7 @@ struct OpInfoFiller { info->proto_ = new proto::OpProto; info->checker_ = new OpAttrChecker(); T maker; - maker.SetProto(info->proto_); - maker.SetChecker(info->checker_); - maker.Make(); - maker.Validate(); + maker(info->proto_, info->checker_); info->proto_->set_type(op_type); PADDLE_ENFORCE( info->proto_->IsInitialized(), diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 076c45713015797f86a3611dd333132bae40044d..1b9c685866763ed126a1bf5d7fdd851c38ac1c63 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include #include "glog/logging.h" #include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/shape_inference.h" @@ -222,6 +223,15 @@ Attribute OpDesc::GetAttr(const std::string &name) const { return it->second; } +Attribute OpDesc::GetNullableAttr(const std::string &name) const { + auto it = attrs_.find(name); + if (it != attrs_.end()) { + return it->second; + } else { + return Attribute(); + } +} + int OpDesc::GetBlockAttr(const std::string &name) const { auto it = attrs_.find(name); PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); @@ -249,6 +259,13 @@ void OpDesc::RenameOutput(const std::string &old_name, std::replace(output.second.begin(), output.second.end(), old_name, new_name); } + + auto it = attrs_.find(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName()); + if (it != attrs_.end()) { + auto &op_vars = boost::get>(it->second); + std::replace(op_vars.begin(), op_vars.end(), old_name, new_name); + } + need_update_ = true; } diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 3ee36a47c156da67a9ff70852665fbbd464bea17..1a330db7cc5555a939950043ac90a321573b292d 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -78,6 +78,8 @@ class OpDesc { Attribute GetAttr(const std::string &name) const; + Attribute GetNullableAttr(const std::string &name) const; + int GetBlockAttr(const std::string &name) const; void Rename(const std::string &old_name, const std::string &new_name); diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index c479d7617cfa34cd381d84d15d5e214d57af52d0..5a4380a83a2e5bf492098032cd9de7bf274fe47e 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -13,6 +13,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_proto_maker.h" #include +#include namespace paddle { namespace framework { @@ -55,5 +56,28 @@ void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() { } } +void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, + OpAttrChecker* attr_checker) { + proto_ = proto; + op_checker_ = attr_checker; + Make(); + + AddAttr(OpRoleAttrName(), "The role of this operator") + .InEnum( + {static_cast(OpRole::kForward), + static_cast(OpRole::kBackward), + static_cast(OpRole::kOptimize), + static_cast(OpRole::kLoss) | static_cast(OpRole::kForward), + static_cast(OpRole::kLoss) | + static_cast(OpRole::kBackward), + static_cast(OpRole::kNotSpecified)}) + .SetDefault(static_cast(OpRole::kNotSpecified)); + AddAttr>(OpRoleVarAttrName(), + "Optimized for variable") + .SetDefault({}); + + Validate(); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index b01a520bba19c1be32363a1a5c381666c82e6afc..9bd6ca6ea32734707a5c37b3ecfe449436c04c8c 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -20,21 +20,31 @@ limitations under the License. */ namespace paddle { namespace framework { +enum class OpRole { + kForward = 0x0000, + kBackward = 0x0001, + kOptimize = 0x0002, + + kLoss = 0x0100, + // The default value of op's role. This should be only used for unittests and + // CreateOp inside a operator. + kNotSpecified = 0x1000, +}; + // this class not only make proto but also init attribute checkers. class OpProtoAndCheckerMaker { public: + static const char *OpRoleAttrName() { return "op_role"; } + static const char *OpRoleVarAttrName() { return "op_role_var"; } + + void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker); + virtual void Make() = 0; virtual ~OpProtoAndCheckerMaker() { CHECK(validated_) << "should call Validate after build"; } - void SetProto(proto::OpProto *proto) { proto_ = proto; } - - void SetChecker(OpAttrChecker *attr_checker) { op_checker_ = attr_checker; } - - void Validate(); - protected: struct VariableBuilder { proto::OpProto::Var *var_; @@ -76,6 +86,7 @@ class OpProtoAndCheckerMaker { private: void CheckNoDuplicatedInOutAttrs(); + void Validate(); proto::OpProto *proto_; OpAttrChecker *op_checker_; diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc index 9b5badbc81f9ddf083c81f57f5355e07a8e5e4a2..a8030d377fdb4d4aef74b315e21792dad10fac96 100644 --- a/paddle/fluid/framework/op_proto_maker_test.cc +++ b/paddle/fluid/framework/op_proto_maker_test.cc @@ -28,10 +28,8 @@ TEST(ProtoMaker, DuplicatedAttr) { paddle::framework::proto::OpProto op_proto; paddle::framework::OpAttrChecker op_checker; TestAttrProtoMaker proto_maker; - proto_maker.SetProto(&op_proto); - proto_maker.SetChecker(&op_checker); - proto_maker.Make(); - ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); + ASSERT_THROW(proto_maker(&op_proto, &op_checker), + paddle::platform::EnforceNotMet); } class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { @@ -46,8 +44,6 @@ TEST(ProtoMaker, DuplicatedInOut) { paddle::framework::proto::OpProto op_proto; paddle::framework::OpAttrChecker op_checker; TestAttrProtoMaker proto_maker; - proto_maker.SetProto(&op_proto); - proto_maker.SetChecker(&op_checker); - proto_maker.Make(); - ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); + ASSERT_THROW(proto_maker(&op_proto, &op_checker), + paddle::platform::EnforceNotMet); } diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 47929ef7490e5edb246625cb0b3ba507039df27a..9faf5bb3036775a2ba0c08d3d6ea17ffa73753c6 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,2 +1,17 @@ -cc_library(analysis SRCS dot.cc node.cc node.h) +set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init) +cc_library(analysis SRCS dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc fluid_to_data_flow_graph_pass.cc + DEPS paddle_fluid) cc_test(test_node SRCS node_tester.cc DEPS analysis) +cc_test(test_dot SRCS dot_tester.cc DEPS analysis) + +set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) + +cc_test(test_data_flow_graph SRCS data_flow_graph_tester.cc DEPS analysis ${FLUID_CORE_MODULES} paddle_fluid + ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model) +set_tests_properties(test_data_flow_graph PROPERTIES DEPENDS test_word2vec) + +cc_test(test_subgraph_splitter + SRCS subgraph_splitter_tester.cc + DEPS analysis paddle_fluid tensor + ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model) +set_tests_properties(test_subgraph_splitter PROPERTIES DEPENDS test_word2vec) diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc new file mode 100644 index 0000000000000000000000000000000000000000..4220451e3caee62caa51af5bc33d6dd3fd891018 --- /dev/null +++ b/paddle/fluid/inference/analysis/data_flow_graph.cc @@ -0,0 +1,205 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/analysis/data_flow_graph.h" +#include "paddle/fluid/inference/analysis/dot.h" + +namespace paddle { +namespace inference { +namespace analysis { + +// It is a better idea that the inputs and outputs of this graph is set manully +// before, but there must be a Pass that helps to prune the unnecessary ops that +// do not contribute to the given targets, so in this pass, analysis and get the +// inputs and outputs is OK. +void DataFlowGraph::Build() { + inputs.clear(); + outputs.clear(); + std::unordered_set ins; + std::unordered_set outs; + for (auto &node : nodes.nodes()) { + for (auto *in : node->inlinks) { + ins.insert(in); + } + for (auto *out : node->outlinks) { + outs.insert(out); + } + } + + // The nodes that in ins but not in outs is the graph's inputs + // similarly, the nodes that in outs but not in ins is the graphs' outputs + for (auto *in : ins) { + if (!outs.count(in)) { + inputs.push_back(in); + } + } + for (auto *out : outs) { + if (!outs.count(out)) { + outputs.push_back(out); + } + } +} + +std::string DataFlowGraph::DotString() const { + Dot dot; + + // Add nodes + for (size_t i = 0; i < nodes.size(); i++) { + const Node &node = nodes.Get(i); + switch (node.type()) { + case Node::Type::kValue: + dot.AddNode(node.repr(), node.dot_attrs()); + break; + case Node::Type::kFunction: + dot.AddNode(node.repr(), node.dot_attrs()); + break; + case Node::Type::kFunctionBlock: + dot.AddNode(node.repr(), node.dot_attrs()); + break; + default: + PADDLE_THROW("unsupported Node type %d", static_cast(node.type())); + } + } + + // Add edges + for (size_t i = 0; i < nodes.size(); i++) { + const Node &node = nodes.Get(i); + for (auto &in : node.inlinks) { + dot.AddEdge(in->repr(), node.repr(), {}); + } + } + return dot.Build(); +} + +// +// NodesBFSIterator +// + +GraphTraits::NodesBFSIterator::NodesBFSIterator( + const std::vector &source) + : queue_(source.begin(), source.end()) {} + +// GraphTraits::NodesBFSIterator::NodesBFSIterator( +// GraphTraits::NodesBFSIterator &&other) noexcept +// : queue_(std::move(other.queue_)), +// visited_(std::move(other.visited_)) {} + +GraphTraits::NodesBFSIterator::NodesBFSIterator( + const GraphTraits::NodesBFSIterator &other) + : queue_(other.queue_), visited_(other.visited_) {} + +Node &GraphTraits::NodesBFSIterator::operator*() { + PADDLE_ENFORCE(!queue_.empty()); + return *queue_.front(); +} + +Node *GraphTraits::NodesBFSIterator::operator->() { + PADDLE_ENFORCE(!queue_.empty()); + return queue_.front(); +} + +GraphTraits::NodesBFSIterator & +GraphTraits::NodesBFSIterator::operator=( + const GraphTraits::NodesBFSIterator &other) { + queue_ = other.queue_; + visited_ = other.visited_; + return *this; +} + +GraphTraits::NodesBFSIterator + &GraphTraits::NodesBFSIterator::operator++() { + PADDLE_ENFORCE(!queue_.empty()); + auto *cur = queue_.front(); + visited_.insert(cur); + queue_.pop_front(); + for (auto *output : cur->outlinks) { + if (!visited_.count(output)) { + queue_.push_back(output); + visited_.insert(output); + } + } + return *this; +} + +bool GraphTraits::NodesBFSIterator::operator==( + const GraphTraits::NodesBFSIterator &other) { + if (queue_.empty()) return other.queue_.empty(); + if ((!queue_.empty()) && (!other.queue_.empty())) { + return queue_.front() == other.queue_.front() && + visited_.size() == other.visited_.size(); // here need to check the + // equality of queue and + // visited. Just a light but week implementation. + } + return false; +} + +// +// NodesDFSIterator +// +GraphTraits::NodesDFSIterator::NodesDFSIterator( + const std::vector &source) { + for (auto *x : source) stack_.push(x); +} + +// GraphTraits::NodesDFSIterator::NodesDFSIterator( +// GraphTraits::NodesDFSIterator &&other) noexcept +// : stack_(std::move(other.stack_)), +// visited_(std::move(other.visited_)) {} + +GraphTraits::NodesDFSIterator::NodesDFSIterator( + const GraphTraits::NodesDFSIterator &other) + : stack_(other.stack_), visited_(other.visited_) {} + +Node &GraphTraits::NodesDFSIterator::operator*() { + PADDLE_ENFORCE(!stack_.empty()); + return *stack_.top(); +} + +GraphTraits::NodesDFSIterator + &GraphTraits::NodesDFSIterator::operator++() { + if (stack_.empty()) return *this; + visited_.insert(stack_.top()); + auto *cur = stack_.top(); + stack_.pop(); + for (auto *x : cur->outlinks) { + if (!visited_.count(x)) { + stack_.push(x); + visited_.insert(x); + } + } + return *this; +} +bool GraphTraits::NodesDFSIterator::operator==( + const GraphTraits::NodesDFSIterator &other) { + if (stack_.empty()) return other.stack_.empty(); + if ((!stack_.empty()) && (!other.stack_.empty())) { + return stack_.top() == other.stack_.top(); + } + return false; +} + +GraphTraits::NodesDFSIterator & +GraphTraits::NodesDFSIterator::operator=( + const GraphTraits::NodesDFSIterator &other) { + stack_ = other.stack_; + visited_ = other.visited_; + return *this; +} +Node *GraphTraits::NodesDFSIterator::operator->() { + return stack_.top(); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h new file mode 100644 index 0000000000000000000000000000000000000000..9f6ce40ede25248a4f779b379c132806a4ec06ba --- /dev/null +++ b/paddle/fluid/inference/analysis/data_flow_graph.h @@ -0,0 +1,159 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * Data flow graph is an pass that build the basic graph. It contains a graph + * and the iterators that enable the iteration over the graph. + */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/inference/analysis/graph_traits.h" +#include "paddle/fluid/inference/analysis/node.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * DataFlowGraph - A container of Value and Function Nodes. + */ +struct DataFlowGraph { + NodeMap nodes; + std::vector inputs; + std::vector outputs; + + // Extract inputs and outputs of the graph. + void Build(); + + // Output a DOT graph file for debug. + std::string DotString() const; +}; + +/* + * An graph trait help to traverse the graph using BFS. + * The BFS start from a graph's inputs, the graph should be fully-connected, so + * that the iterator can reach the end. + */ +template <> +struct GraphTraits { + // BFS iterator on nodes. + struct NodesBFSIterator + : public std::iterator { + NodesBFSIterator() = default; + explicit NodesBFSIterator(const std::vector &source); + // NodesBFSIterator(NodesBFSIterator &&other) noexcept; + // NOTE Heavy to use. + NodesBFSIterator(const NodesBFSIterator &other); + + Node &operator*(); + NodesBFSIterator &operator++(); + Node *operator->(); + // TODO(Superjomn) current implementation just compare the first + // element, need to compare the graph and all the elements in the queue and + // set. + NodesBFSIterator &operator=(const NodesBFSIterator &other); + bool operator==(const NodesBFSIterator &other); + bool operator!=(const NodesBFSIterator &other) { return !(*this == other); } + + private: + std::deque queue_; + std::unordered_set visited_; + }; + + // DFS iterator on nodes. + struct NodesDFSIterator + : public std::iterator { + NodesDFSIterator() = default; + explicit NodesDFSIterator(const std::vector &source); + // NodesDFSIterator(NodesDFSIterator &&other) noexcept; + NodesDFSIterator(const NodesDFSIterator &other); + + Node &operator*(); + NodesDFSIterator &operator++(); + // TODO(Superjomn) current implementation just compare the first + // element, need to compare the graph and all the elements in the queue and + // set. + NodesDFSIterator &operator=(const NodesDFSIterator &other); + bool operator==(const NodesDFSIterator &other); + bool operator!=(const NodesDFSIterator &other) { return !(*this == other); } + Node *operator->(); + + private: + std::stack stack_; + std::unordered_set visited_; + }; + + explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {} + + // default use BFS to visit the nodes. + iterator_range nodes() { + return iterator_range(nodes_bfs_begin(), nodes_bfs_end()); + } + iterator_range nodes_in_BFS() { + return iterator_range(nodes_bfs_begin(), nodes_bfs_end()); + } + iterator_range nodes_in_DFS() { + return iterator_range(nodes_dfs_begin(), nodes_dfs_end()); + } + + private: + NodesBFSIterator nodes_bfs_begin() { + return NodesBFSIterator(graph_->inputs); + } + NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); } + NodesDFSIterator nodes_dfs_begin() { + return NodesDFSIterator(graph_->inputs); + } + NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); } + + private: + DataFlowGraph *graph_; +}; + +// Extract the inputs and outputs of a graph. The inputs and outputs of a +// sub-graph is the inputs nodes and output nodes that doesn't inside the +// sub-graph. +std::pair< + std::vector, + std::vector< + Node *>> static ExtractInputAndOutputOfSubGraph(std::vector + &graph) { + std::unordered_set nodes(graph.begin(), graph.end()); + std::unordered_set inputs; + std::unordered_set outputs; + for (auto &node : graph) { + for (auto *in : node->inlinks) { + if (!nodes.count(in) && in->type() == Node::Type::kValue) { + inputs.insert(in); + } + } + for (auto *out : node->outlinks) { + if (!nodes.count(out) && out->type() == Node::Type::kValue) { + outputs.insert(out); + } + } + } + return std::make_pair(std::vector(inputs.begin(), inputs.end()), + std::vector(outputs.begin(), outputs.end())); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..51d38d6251d853fa8a02a4e22f819cfc44294453 --- /dev/null +++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc @@ -0,0 +1,62 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/analysis/data_flow_graph.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(DataFlowGraph, BFS) { + auto desc = LoadProgramDesc(); + auto dfg = ProgramDescToDFG(desc); + dfg.Build(); + + for (auto* in : dfg.inputs) { + LOG(INFO) << "inputs: " << in->name() << " " + << static_cast(in->type()); + } + for (auto* out : dfg.outputs) { + LOG(INFO) << "outputs: " << out->name() << " " + << static_cast(out->type()); + } + + GraphTraits trait(&dfg); + auto nodes = trait.nodes(); + int count = 0; + for (auto it = nodes.begin(); it != nodes.end(); ++it) { + LOG(INFO) << "visiting " << it->name(); + ++count; + } + ASSERT_EQ(count, dfg.nodes.size()); +} + +TEST(DataFlowGraph, DFS) { + auto desc = LoadProgramDesc(); + auto dfg = ProgramDescToDFG(desc); + dfg.Build(); + GraphTraits trait(&dfg); + auto nodes = trait.nodes_in_DFS(); + int count = 0; + for (auto it = nodes.begin(); it != nodes.end(); ++it) { + LOG(INFO) << "visiting " << it->name(); + ++count; + } + ASSERT_EQ(count, dfg.nodes.size()); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..60f159da9140516284449a0274906df004b23ac5 --- /dev/null +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" + +#include +#include +#include +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/io.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST_F(DFG_Tester, Test) { + framework::proto::ProgramDesc new_desc; + DataFlowGraph graph; + + FluidToDataFlowGraphPass pass0; + DataFlowGraphToFluidPass pass1; + pass0.Initialize(desc); + pass1.Initialize(&new_desc); + + pass0.Run(&graph); + pass1.Run(&graph); + + pass0.Finalize(); + pass1.Finalize(); + + LOG(INFO) << graph.nodes.size(); +} + +} // analysis +} // inference +} // paddle diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..f848a7d1add79c3032da7defc34a406dccf29d2e --- /dev/null +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include + +namespace paddle { +namespace inference { +namespace analysis { + +FluidToDataFlowGraphPass::FluidToDataFlowGraphPass() {} + +bool FluidToDataFlowGraphPass::Initialize() { return Pass::Initialize(); } + +bool FluidToDataFlowGraphPass::Initialize( + const framework::proto::ProgramDesc &desc) { + desc_ = &desc; + return true; +} + +bool FluidToDataFlowGraphPass::Finalize() { return Pass::Finalize(); } + +void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { + // insert vars + std::unordered_map var2id; + auto &main_block = desc_->blocks(framework::kRootBlockIndex); + for (int i = 0; i < main_block.vars_size(); i++) { + const auto &var = main_block.vars(i); + auto *v = graph->nodes.Create(Node::Type::kValue); + v->SetName(var.name()); + v->SetExtraInfo(const_cast(static_cast(&var))); + var2id[var.name()] = v->id(); + } + for (int i = 0; i < main_block.ops_size(); i++) { + const auto &op = main_block.ops(i); + auto *o = graph->nodes.Create(Node::Type::kFunction); + o->SetName(op.type()); + static_cast(o)->SetFuncType(op.type()); + // Link to the original protobuf message's memory, make it easier to + // generate from a data flow graph to fluid ProgramDesc. + o->SetExtraInfo(const_cast(static_cast(&op))); + // set inputs and outputs + // TODO(Superjomn) make sure the InputNames is the real variable name. + for (int j = 0; j < op.inputs_size(); j++) { + auto &in_var = op.inputs(j); + for (int k = 0; k < in_var.arguments_size(); k++) { + auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k))); + in->outlinks.push_back(o); + o->inlinks.push_back(in); + } + } + for (int j = 0; j < op.outputs_size(); j++) { + auto &out_var = op.outputs(j); + for (int k = 0; k < out_var.arguments_size(); k++) { + auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]); + out->inlinks.push_back(o); + o->outlinks.push_back(out); + } + } + } + // Analysis and extract the inputs and outputs of this graph. + graph->Build(); +} + +Pass *FluidToDataFlowGraphPass::CreatePrinterPass( + std::ostream &os, const std::string &banner) const { + return nullptr; +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..cd0d4fabaafe844bcc5bb8bfc2586971197d9167 --- /dev/null +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +/* + * This file implements the transformation from data flow graph to fluid + * ProgramDesc. + */ + +#pragma once + +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/inference/analysis/data_flow_graph.h" +#include "paddle/fluid/inference/analysis/pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * Transform a FluidDesc to a data flow graph. + */ +class FluidToDataFlowGraphPass final : public DataFlowGraphPass { + public: + FluidToDataFlowGraphPass(); + bool Initialize() override; + bool Initialize(const framework::proto::ProgramDesc &desc) override; + bool Finalize() override; + + void Run(DataFlowGraph *graph) override; + + Pass *CreatePrinterPass(std::ostream &os, + const std::string &banner) const override; + + private: + framework::proto::ProgramDesc const *desc_; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..851c98bef305fa9e20dced5f7c26e9d1b6ddf4f2 --- /dev/null +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" + +#include +#include "paddle/fluid/inference/analysis/ut_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST_F(DFG_Tester, Init) { + FluidToDataFlowGraphPass pass; + pass.Initialize(); + pass.Initialize(desc); + DataFlowGraph graph; + pass.Run(&graph); + ASSERT_GT(graph.nodes.size(), 0); + pass.Finalize(); + LOG(INFO) << '\n' << graph.DotString(); +} + +} // analysis +} // inference +} // paddle diff --git a/paddle/fluid/inference/analysis/graph_traits.cc b/paddle/fluid/inference/analysis/graph_traits.cc new file mode 100644 index 0000000000000000000000000000000000000000..2ea70a1d2060e03769d67060dc6f008207342b52 --- /dev/null +++ b/paddle/fluid/inference/analysis/graph_traits.cc @@ -0,0 +1,15 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/analysis/graph_traits.h" diff --git a/paddle/fluid/inference/analysis/graph_traits.h b/paddle/fluid/inference/analysis/graph_traits.h new file mode 100644 index 0000000000000000000000000000000000000000..aed2b1e8e27d94b430201d70ecf09d4acc33c8fa --- /dev/null +++ b/paddle/fluid/inference/analysis/graph_traits.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file defines the GraphTraits template class that should be specified + * by classes that want to be iteratable by generic graph iterators. + * + * This file also defines the marker class Inverse that is used to iterate over + * graphs in a graph defined, inverse ordering... + */ + +#pragma once + +#include "paddle/fluid/inference/analysis/helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * This class should be specialized by different graph types... + * That's why the base class is empty. + */ +template +struct GraphTraits { + // using NodesBFSIterator = xxx + + // NodesBFSIterator nodes_begin(); + // NodesBFSIterator nodes_end(); +}; + +/* + * Inverse - This class is used as a marker class to tell the graph iterator to + * iterate in a graph defined Inverse order. + */ +template +struct Inverse { + const GraphType &graph; + + explicit Inverse(const GraphType &graph) : graph(graph) {} +}; + +/* + * Provide a partial specialization of GraphTraits so that the inverse of an + * inverse turns into the original graph. + */ +template +struct GraphTraits>> : GraphTraits {}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index b2d06c5d63ff139186710cd963e07b4ba245f9f3..ea39ba4ddb5e8d5d6cce9b116ab968764e578c26 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -1,74 +1,107 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace inference { -namespace analysis { - -template -class iterator_range { - IteratorT begin_, end_; - - public: - template - explicit iterator_range(Container &&c) : begin_(c.begin()), end_(c.end()) {} - - iterator_range(const IteratorT &begin, const IteratorT &end) - : begin_(begin), end_(end) {} - - const IteratorT &begin() const { return begin_; } - const IteratorT &end() const { return end_; } -}; - -/* - * An registry helper class, with its records keeps the order they registers. - */ -template -class OrderedRegistry { - public: - T *Register(const std::string &name, T *x) { - PADDLE_ENFORCE(!dic_.count(name)); - dic_[name] = data_.size(); - data_.emplace_back(std::unique_ptr(x)); - return data_.back().get(); - } - - T *Lookup(const std::string &name) { - auto it = dic_.find(name); - if (it == dic_.end()) return nullptr; - return data_[it->second].get(); - } - - protected: - std::unordered_map dic_; - std::vector> data_; -}; - -} // namespace analysis -} // namespace inference -} // namespace paddle - -#define PADDLE_DISALLOW_COPY_AND_ASSIGN(type__) \ - \ - type__(const type__ &) = delete; \ - \ - void operator=(const type__ &) = delete; +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace analysis { + +#define SET_TYPE(type__) dic_[typeid(type__).hash_code()] = #type__; +/* + * Map typeid to representation. + */ +struct DataTypeNamer { + static const DataTypeNamer &Global() { + static auto *x = new DataTypeNamer(); + return *x; + } + + template + const std::string &repr() const { + auto x = typeid(T).hash_code(); + PADDLE_ENFORCE(dic_.count(x), "unknown type for representation"); + return dic_.at(x); + } + + const std::string &repr(size_t &hash) const { + PADDLE_ENFORCE(dic_.count(hash), "unknown type for representation"); + return dic_.at(hash); + } + + private: + DataTypeNamer() { + SET_TYPE(int); + SET_TYPE(bool); + SET_TYPE(float); + } + + std::unordered_map dic_; +}; +#undef SET_TYPE + +template +class iterator_range { + IteratorT begin_, end_; + + public: + template + explicit iterator_range(Container &&c) : begin_(c.begin()), end_(c.end()) {} + + iterator_range(const IteratorT &begin, const IteratorT &end) + : begin_(begin), end_(end) {} + + const IteratorT &begin() const { return begin_; } + const IteratorT &end() const { return end_; } +}; + +/* + * An registry helper class, with its records keeps the order they registers. + */ +template +class OrderedRegistry { + public: + T *Register(const std::string &name, T *x) { + PADDLE_ENFORCE(!dic_.count(name)); + dic_[name] = data_.size(); + data_.emplace_back(std::unique_ptr(x)); + return data_.back().get(); + } + + T *Lookup(const std::string &name) { + auto it = dic_.find(name); + if (it == dic_.end()) return nullptr; + return data_[it->second].get(); + } + + protected: + std::unordered_map dic_; + std::vector> data_; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle + +#define PADDLE_DISALLOW_COPY_AND_ASSIGN(type__) \ + \ + type__(const type__ &) = delete; \ + \ + void operator=(const type__ &) = delete; diff --git a/paddle/fluid/inference/analysis/node.h b/paddle/fluid/inference/analysis/node.h index 07cb7669f98237399c4165947a03c67ce2a86aa8..7972ca25c92186a8c55a76de645f4fdbb089e8d3 100644 --- a/paddle/fluid/inference/analysis/node.h +++ b/paddle/fluid/inference/analysis/node.h @@ -117,7 +117,10 @@ class Node { type_hash_ = typeid(T).hash_code(); data_.resize(sizeof(T)); } - PADDLE_ENFORCE(type_hash_ == typeid(T).hash_code(), "type not matched"); + PADDLE_ENFORCE(type_hash_ == typeid(T).hash_code(), + "type not matched, origin is %s, want %s", + DataTypeNamer::Global().repr(type_hash_), + DataTypeNamer::Global().repr()); PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error"); return *reinterpret_cast(&data_[0]); } @@ -127,6 +130,10 @@ class Node { size_t type_hash_{std::numeric_limits::max()}; }; + bool IsFunction() const { return type_ == Node::Type::kFunction; } + bool IsValue() const { return type_ == Node::Type::kValue; } + bool IsFunctionBlock() const { return type_ == Node::Type::kFunctionBlock; } + virtual ~Node() {} friend class NodeMap; diff --git a/paddle/fluid/inference/analysis/node_tester.cc b/paddle/fluid/inference/analysis/node_tester.cc index 47fea0fdff808c930ca73edb25f5b16fef397e9a..ea832a3a7e47758be9b6bd59a4325ddb576ec446 100644 --- a/paddle/fluid/inference/analysis/node_tester.cc +++ b/paddle/fluid/inference/analysis/node_tester.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 +http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/fluid/inference/analysis/node.h" diff --git a/paddle/fluid/inference/analysis/pass.cc b/paddle/fluid/inference/analysis/pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..121b72c0a0aa9a0c568b04f7ee9a5bc5c1d6f5f8 --- /dev/null +++ b/paddle/fluid/inference/analysis/pass.cc @@ -0,0 +1,15 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/pass.h" diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h new file mode 100644 index 0000000000000000000000000000000000000000..5c89b1304d84abc9a4942f12da46b4bfe76f44f5 --- /dev/null +++ b/paddle/fluid/inference/analysis/pass.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/inference/analysis/data_flow_graph.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/analysis/node.h" + +namespace paddle { +namespace inference { +namespace analysis { + +class Pass { + public: + Pass() = default; + virtual ~Pass() {} + // Virtual method overridden by subclasses to do only necessary initialization + // before any pass is run. + virtual bool Initialize() { return false; } + // There is some passes such as FlowToDataFlowGraphPass that needs a + // ProgramDesc. Here use the native ProgramDesc ProtoBuf message, so that it + // only couple with the proto file. + virtual bool Initialize(const framework::proto::ProgramDesc &desc) { + return false; + } + // There are some Passes such as DataFlowGraphToFluidPass that will output a + // ProgramDesc. + virtual bool Initialize(framework::proto::ProgramDesc *desc) { return false; } + + // Virtual method overriden by subclasses to do any necessary clean up after + // all passes have run. + virtual bool Finalize() { return false; } + + // Get a Pass appropriate to print the Node this pass operates on. + virtual Pass *CreatePrinterPass(std::ostream &os, + const std::string &banner) const = 0; + + // Run on a single Node. + virtual void Run(Node *x) { LOG(FATAL) << "not valid"; } + // Run on a single Function. + virtual void Run(Function *x) { LOG(FATAL) << "not valid"; } + // Run on a single FunctionBlock. + virtual void Run(FunctionBlock *x) { LOG(FATAL) << "not valid"; } + // Run on a single DataFlowGraph. + virtual void Run(DataFlowGraph *x) { LOG(FATAL) << "not valid"; } +}; + +// NodePass process on any Node types. +class NodePass : public Pass { + public: + virtual void Run(Node *node) = 0; +}; + +// NodePass process on any Function node types. +class FunctionPass : public Pass { + public: + virtual void Run(Function *node) = 0; +}; + +// NodePass process on any FunctionBlock node types. +class FunctionBlockPass : public Pass { + public: + virtual void Run(FunctionBlock *node) = 0; +}; + +// GraphPass processes on any GraphType. +class DataFlowGraphPass : public Pass { + public: + virtual void Run(DataFlowGraph *graph) = 0; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc new file mode 100644 index 0000000000000000000000000000000000000000..43ccac96c84e987ad1f494af3e314c810fc1ffe3 --- /dev/null +++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc @@ -0,0 +1,154 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/analysis/subgraph_splitter.h" + +namespace paddle { +namespace inference { +namespace analysis { + +const char *SubGraphSplitter::kMarkerAttrName = + "_sub_graph_splitter_inside_sub_graph"; + +std::vector> SubGraphSplitter::operator()() { + MarkNodesInsideSubGraph(); + return ExtractSubGraphs(); +} + +// Mark the output variables inside a subgraph with the func. +inline void MarkOutLinksInSubGraph(const Function *func) { + for (auto *var : func->outlinks) { + var->attr(SubGraphSplitter::kMarkerAttrName).Bool() = true; + } +} + +void SubGraphSplitter::MarkNodesInsideSubGraph() { + for (auto &node : GraphTraits(graph_).nodes()) { + if (node_inside_subgraph_teller_(&node)) { + node.attr(kMarkerAttrName).Bool() = true; + if (node.type() == Node::Type::kFunction) { + // If a function is inside the sub-graph, mark all the output variables + // to be inside too, so that two marked functions will be inside a same + // sub-graph, lets take a example: A_function->var->B_function, if + // A_function is marked, var should also be marked, so that B_function + // will be in the same sub-graph with A_function if B_function is + // marked. + MarkOutLinksInSubGraph(static_cast(&node)); + } + } + } +} + +const char *kUnionFindParent = "_sub_graph_splitter_union_find_parent_"; + +// Use the Union Find(UF) algorithm to find fully connected sub-graphs, if node +// a's output is node b, that is a and b is in the same sub-graph. The UF +// algorithm will group them to the same cluster. +using node_map_t = std::unordered_map; +// Find the ancestor id of a node. +int UnionFindGetAncestor(const node_map_t &node_map, size_t id) { + int tmp = id; + do { + tmp = node_map.at(tmp)->attr(kUnionFindParent).Int32(); + } while (node_map.at(tmp)->attr(kUnionFindParent).Int32() != tmp); + return tmp; +} +// Make this two node share the same ancestor. +// TODO(Superjom) bad performance, make a balanced tree latter. +void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) { + int a_ancestor = UnionFindGetAncestor(node_map, a); + int b_ancestor = UnionFindGetAncestor(node_map, b); + node_map.at(b_ancestor)->attr(kUnionFindParent).Int32() = a_ancestor; + node_map.at(a)->attr(kUnionFindParent).Int32() = a_ancestor; + node_map.at(b)->attr(kUnionFindParent).Int32() = a_ancestor; +} + +std::vector> SubGraphSplitter::ExtractSubGraphs() { + std::vector marked_nodes; + for (auto &node : GraphTraits(graph_).nodes()) { + if (node.attr(kMarkerAttrName).Bool()) { + marked_nodes.push_back(&node); + } + } + // extract sub-graphs in the marked node set, use Union Find algorithm. + node_map_t node_map; // id to ptr + for (auto *n : marked_nodes) { + // n's parent == n.id means it is the ancestor + n->attr(kUnionFindParent).Int32() = n->id(); + node_map[n->id()] = n; + } + std::unordered_set visited; + for (auto *n : marked_nodes) { + for (auto *out : n->outlinks) { + if (node_map.count(out->id())) { + UnionFindCombine(node_map, n->id(), out->id()); + } + } + } + + std::unordered_map> clusters; + for (auto *n : marked_nodes) { + if (n->type() == Node::Type::kFunction) { + clusters[UnionFindGetAncestor(node_map, + n->attr(kUnionFindParent).Int32())] + .push_back(n); + } + } + std::vector> result; + std::for_each(clusters.begin(), clusters.end(), + [&](const decltype(clusters)::value_type &it) { + result.push_back(it.second); + }); + + return result; +} + +void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); } + +void SubGraphFuse::ReplaceNodesWithSubGraphs() { + auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)(); + for (auto &subgraph : subgraphs) { + // replace this sub-graph with the first node. Two steps: 1. Create a Block + // Node that contains this subgraph 2. Mark the nodes inside the sub-graph + // as deleted. 3. Replace the deleted node with the new Block Node. + auto *block_node = graph_->nodes.Create(Node::Type::kFunctionBlock); + auto io = ExtractInputAndOutputOfSubGraph(subgraph); + block_node->inlinks = std::move(io.first); + block_node->outlinks = std::move(io.second); + for (auto *node : subgraph) { + // TODO(Superjomn) need a unified mechanism to treat deleted node in each + // pass. + node->SetDeleted(); + } + + std::unordered_map + delelte_node_map; // deleted node to BlockNode + for (auto *n : block_node->inlinks) { + n->inlinks.clear(); + } + for (auto *n : block_node->outlinks) { + n->outlinks.clear(); + } + for (auto *n : block_node->inlinks) { + n->outlinks.push_back(block_node); + } + for (auto *n : block_node->outlinks) { + n->inlinks.push_back(n); + } + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.h b/paddle/fluid/inference/analysis/subgraph_splitter.h new file mode 100644 index 0000000000000000000000000000000000000000..ed90a0dcf31e154c4d82be08ce35e2f11d11c139 --- /dev/null +++ b/paddle/fluid/inference/analysis/subgraph_splitter.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file defines the the class to partition a graph. + */ + +#pragma once + +#include "paddle/fluid/inference/analysis/data_flow_graph.h" +#include "paddle/fluid/inference/analysis/node.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * Detect the nodes in a sub-graph that meet some conditions. This class doesn't + * modify the graph. + */ +class SubGraphSplitter { + public: + static const char *kMarkerAttrName; + // Tell whether a node is inside a sub-graph. + using NodeInsideSubgraphTeller = std::function; + + SubGraphSplitter(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller) + : graph_(graph), node_inside_subgraph_teller_(teller) {} + + std::vector> operator()(); + + protected: + // Mark the nodes inside the accepted sub-graph using + // node_inside_subgraph_teller. + void MarkNodesInsideSubGraph(); + + // Merge the marked nodes into sub-graphs and return the sub-graphs. + std::vector> ExtractSubGraphs(); + + private: + DataFlowGraph *graph_; + NodeInsideSubgraphTeller node_inside_subgraph_teller_; +}; + +/* + * SubGraphFuse - Replace some nodes with the sub-graph node they are inside. To + * some extent, the TensorRT engine is just a fusion op for a model. + */ +class SubGraphFuse { + public: + using NodeInsideSubgraphTeller = SubGraphSplitter::NodeInsideSubgraphTeller; + + SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller) + : graph_(graph), node_inside_subgraph_teller_(teller) {} + + // The main method which run all the logic. + void operator()(); + + protected: + // Remove the nodes inside sub-graphs and replace with the SubGraphNode. + void ReplaceNodesWithSubGraphs(); + + private: + DataFlowGraph *graph_; + NodeInsideSubgraphTeller node_inside_subgraph_teller_; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..0644c0db12e3daabba76dbaad33847f5624b157a --- /dev/null +++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc @@ -0,0 +1,67 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/analysis/subgraph_splitter.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST_F(DFG_Tester, Split) { + auto desc = LoadProgramDesc(); + auto dfg = ProgramDescToDFG(desc); + LOG(INFO) << "spliter\n" << dfg.DotString(); + + SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) { + if (node->type() != Node::Type::kFunction) return false; + const auto* func = static_cast(node); + if (func->func_type() == "elementwise_add" || func->func_type() == "relu" || + func->func_type() == "conv2d" || func->func_type() == "mul" || + func->func_type() == "sigmoid" || func->func_type() == "softmax") { + LOG(INFO) << "sub-graph marked " << node->repr(); + return true; + } + return false; + }; + ASSERT_GT(dfg.nodes.size(), 5UL); + + auto subgraphs = SubGraphSplitter(&dfg, teller)(); + + // Check the number of the marked nodes. + int marked_nodes = 0; + for (auto& node : dfg.nodes.nodes()) { + if (node->IsFunction() && + node->attr(SubGraphSplitter::kMarkerAttrName).Bool()) { + ++marked_nodes; + } + } + EXPECT_EQ(marked_nodes, 6); + + // For human debug. + for (auto& subgraph : subgraphs) { + LOG(INFO) << "subgraph size " << subgraph.size(); + for (auto* node : subgraph) { + LOG(INFO) << "node " << node->repr(); + } + } + + ASSERT_EQ(subgraphs.size(), 1UL); + // The last sub-graph has 5 Functions. + ASSERT_EQ(subgraphs.back().size(), 6UL); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..c86083d12153921672e15c172b874f77a8b46cde --- /dev/null +++ b/paddle/fluid/inference/analysis/ut_helper.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/inference/analysis/data_flow_graph.h" +#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/io.h" + +namespace paddle { +namespace inference { +namespace analysis { + +DEFINE_string(inference_model_dir, "", "inference test model dir"); + +static framework::proto::ProgramDesc LoadProgramDesc( + const std::string& model_dir = FLAGS_inference_model_dir) { + paddle::platform::CPUPlace place; + paddle::framework::Executor executor(place); + paddle::framework::Scope scope; + auto program = Load(&executor, &scope, model_dir); + return *program->Proto(); +} + +static DataFlowGraph ProgramDescToDFG( + const framework::proto::ProgramDesc& desc) { + DataFlowGraph graph; + FluidToDataFlowGraphPass pass; + pass.Initialize(desc); + pass.Run(&graph); + pass.Finalize(); + return graph; +} + +class DFG_Tester : public ::testing::Test { + protected: + void SetUp() override { desc = LoadProgramDesc(FLAGS_inference_model_dir); } + + framework::proto::ProgramDesc desc; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 4fb4511d99179e4ea14cde66feb13bc9e114581a..79b1a248a0acfded0d2fcfadc041a6ad2a92ff3d 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,4 +1,5 @@ nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES}) nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc io_converter.cc - DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine) + DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine + SERIAL) nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 7fce138e3f47e0eb485afb4d5a665eb41f68e286..f72997ca24ed837f761b52cbecdc05998424a675 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -201,11 +201,13 @@ if(WITH_DISTRIBUTE) set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op listen_and_serv_op sum_op executor) + #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op + # listen_and_serv_op sum_op executor SERIAL) if(WITH_GPU) set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS send_op listen_and_serv_op executor) + cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS send_op + listen_and_serv_op executor SERIAL) op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc) set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc index 901682edbb01c563be6ea407228336b14f942778..038ea8999072f562104c5386ed18b6b275816345 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc @@ -44,6 +44,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { std::vector paddings = ctx.Attr>("paddings"); // cudnn v5 does not support dilations std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); int user_workspace_size = ctx.Attr("workspace_size_MB"); const T* input_data = input->data(); @@ -64,13 +65,13 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { // (N, M, H, W) or (N, M, D, H, W) cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, framework::vectorize2int(input->dims())); + layout, framework::vectorize2int(input->dims()), groups); // (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w) cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, framework::vectorize2int(output->dims())); + layout, framework::vectorize2int(output->dims()), groups); // (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w) cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( - layout, framework::vectorize2int(filter->dims())); + layout, framework::vectorize2int(filter->dims()), groups); cudnnConvolutionDescriptor_t cudnn_conv_desc = conv_desc.descriptor(paddings, strides, dilations); @@ -104,11 +105,17 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv transpose forward --------------------- + int input_offset = input->numel() / input->dims()[0] / groups; + int output_offset = output->numel() / output->dims()[0] / groups; + int filter_offset = filter->numel() / groups; T alpha = 1.0f, beta = 0.0f; - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, cudnn_filter_desc, filter_data, cudnn_input_desc, - input_data, cudnn_conv_desc, algo, cudnn_workspace, - workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); + for (int g = 0; g < groups; g++) { + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, + cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, + algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_output_desc, output_data + output_offset * g)); + } // Release the cudnn workspace paddle::memory::Free(gpu, cudnn_workspace); @@ -134,6 +141,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { std::vector paddings = ctx.Attr>("paddings"); // cudnn v5 does not support dilations std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); int user_workspace_size = ctx.Attr("workspace_size_MB"); // ------------------- cudnn descriptors --------------------- @@ -145,13 +153,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { // Input: (N, M, H, W) or (N, M, D, H, W) cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, framework::vectorize2int(input->dims())); + layout, framework::vectorize2int(input->dims()), groups); // Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w) cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, framework::vectorize2int(output_grad->dims())); + layout, framework::vectorize2int(output_grad->dims()), groups); // Filter (M, C, K_h, K_w) or (M, C, K_d K_h, K_w) cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( - layout, framework::vectorize2int(filter->dims())); + layout, framework::vectorize2int(filter->dims()), groups); cudnnConvolutionDescriptor_t cudnn_conv_desc = conv_desc.descriptor(paddings, strides, dilations); @@ -205,15 +213,22 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv backward data --------------------- // FIXME(typhoonzero): template type T may not be the same as cudnn call. + int input_offset = input->numel() / input->dims()[0] / groups; + int output_grad_offset = + output_grad->numel() / output_grad->dims()[0] / groups; + int filter_offset = filter->numel() / groups; T alpha = 1.0f, beta = 0.0f; if (input_grad) { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_output_desc, output_grad_data, - cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, - input_grad_data)); + for (int g = 0; g < groups; g++) { + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_output_desc, + output_grad_data + output_grad_offset * g, cudnn_filter_desc, + filter_data + filter_offset * g, cudnn_conv_desc, data_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, + input_grad_data + input_offset * g)); + } } // ------------------- cudnn conv backward filter --------------------- @@ -221,11 +236,16 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset filter_grad. // Gradient with respect to the filter - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc, - input_data, cudnn_conv_desc, filter_algo, cudnn_workspace, - workspace_size_in_bytes, &beta, cudnn_filter_desc, filter_grad_data)); + for (int g = 0; g < groups; g++) { + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_output_desc, + output_grad_data + output_grad_offset * g, cudnn_input_desc, + input_data + input_offset * g, cudnn_conv_desc, filter_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc, + filter_grad_data + filter_offset * g)); + } } + // Release the cudnn workspace paddle::memory::Free(gpu, cudnn_workspace); } diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index c27c8e273168407d3aacb05cd6628887cc5760ad..0b363f5c43f9fc191790e5cca629ffc46eb9388c 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -32,6 +32,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); std::vector dilations = ctx->Attrs().Get>("dilations"); + int groups = ctx->Attrs().Get("groups"); PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, "ConvTransposeOp intput should be 4-D or 5-D tensor."); @@ -48,10 +49,10 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { "ConvTransposeOp paddings dimension and dilations " "dimension should be the same."); PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0], - "In ConvTransposeOp, The input channel should be the same " - "as the number of filters."); + "In ConvTransposeOp, The number of input channels should " + "be equal to the number of filter's channels."); - std::vector output_shape({in_dims[0], filter_dims[1]}); + std::vector output_shape({in_dims[0], filter_dims[1] * groups}); for (size_t i = 0; i < strides.size(); ++i) { auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] + @@ -102,7 +103,10 @@ void Conv2DTransposeOpMaker::Make() { AddOutput("Output", "(Tensor) The output tensor of convolution transpose operator. " "The format of output tensor is also NCHW."); - + AddAttr("groups", + "(int default:1), the groups number of the convolution " + "transpose operator. ") + .SetDefault(1); AddAttr>("dilations", "(vector default:{1, 1}), the " "dilations(h_dilation, w_dilation) of convolution " @@ -204,6 +208,10 @@ void Conv3DTransposeOpMaker::Make() { "(vector default:{0, 0, 0}), paddings(d_pad, " "h_pad, w_pad) of convolution transpose operator.") .SetDefault({0, 0, 0}); + AddAttr("groups", + "(int default:1), the groups number of the convolution3d " + "transpose operator. ") + .SetDefault(1); AddAttr( "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index f9d205a5b5c4cff74d02a6c89b83f7584e4a6824..1dcfc651fdd79aed50736d05d38ec8576b183d41 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -70,7 +70,7 @@ class GemmConvTransposeKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); std::vector dilations = context.Attr>("dilations"); - // groups will alway be disabled in conv2dtranspose. + int groups = context.Attr("groups"); const int batch_size = static_cast(input->dims()[0]); @@ -81,10 +81,10 @@ class GemmConvTransposeKernel : public framework::OpKernel { // use col_shape in the im2col and col2im (or vol2col and col2vol) // calculation - // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w} + // col_shape_vec: {c/g, k_h, k_w, h, w} or {c/g, k_d, k_h, k_w, d, h, w} size_t data_dim = filter_shape_vec.size() - 2; std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = output->dims()[1]; + col_shape_vec[0] = output->dims()[1] / groups; for (size_t j = 0; j < data_dim; ++j) { col_shape_vec[j + 1] = filter_shape_vec[j + 2]; col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2]; @@ -92,7 +92,7 @@ class GemmConvTransposeKernel : public framework::OpKernel { DDim col_shape(framework::make_ddim(col_shape_vec)); // use col_matrix_shape in the gemm calculation - // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w) + // size: (c/g * k_h * k_w, h * w) or (c/g * k_d * k_h * k_w, d * h * w) DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1); Tensor col; @@ -111,7 +111,7 @@ class GemmConvTransposeKernel : public framework::OpKernel { // input matrix size: (m, h * w) or (m, d * h * w) DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]}; - // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w) + // filter size: (m, c/g * k_h * k_w) or (m, c/g * k_d * k_h * k_w) DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]}; filter.Resize(filter_matrix_shape); @@ -121,6 +121,8 @@ class GemmConvTransposeKernel : public framework::OpKernel { auto blas = math::GetBlas(dev_ctx); set_zero(dev_ctx, output, static_cast(0)); + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; math::Col2ImFunctor col2im; math::Col2VolFunctor col2vol; @@ -133,22 +135,29 @@ class GemmConvTransposeKernel : public framework::OpKernel { // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w) Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape); - // col_matrix = filter * input_batch - // of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w) - blas.MatMul(filter, true, input_batch, false, static_cast(1.0), - &col_matrix, static_cast(0.0)); - - if (data_dim == 2U) { - // col2im: col_matrix -> dy - // from (c * k_h * k_w, h * w) to (c, o_h, o_w) - col2im(dev_ctx, col, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &output_batch); - } else if (data_dim == 3U) { - // col2vol: col_matrix -> dy - // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w) - col2vol(dev_ctx, col, dilations, strides, paddings, &output_batch); + for (int g = 0; g < groups; g++) { + Tensor in_slice = input_batch.Slice(g * in_step, (g + 1) * in_step); + Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step); + Tensor out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step); + + // col_matrix = filter_slice * input_slice + // of shape (c/g * k_h * k_w, h * w) + // or (c/g * k_d * k_h * k_w, d * h * w) + blas.MatMul(filter_slice, true, in_slice, false, static_cast(1.0), + &col_matrix, static_cast(0.0)); + + if (data_dim == 2U) { + // col2im: col_matrix -> dy + // from (c/g * k_h * k_w, h * w) to (c/g, o_h, o_w) + col2im(dev_ctx, col, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &out_slice); + } else if (data_dim == 3U) { + // col2vol: col_matrix -> dy + // from (c/g * k_d * k_h * k_w, d * h * w) to (c/g, o_d, o_h, o_w) + col2vol(dev_ctx, col, dilations, strides, paddings, &out_slice); + } } } } @@ -174,6 +183,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); std::vector dilations = context.Attr>("dilations"); + int groups = context.Attr("groups"); const int batch_size = static_cast(input->dims()[0]); @@ -205,9 +215,11 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { // input matrix size: (m, h * w) or (m, d * h * w) DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]}; - // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w) - DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]}; + // filter size: (m, c/g * k_h * k_w) or (m, c/g * k_d * k_h * k_w) + DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0] / groups}; filter.Resize(filter_matrix_shape); + int in_step = static_cast(input->dims()[1]) / groups; + int col_step = static_cast(col_matrix_shape[0]) / groups; // convolution transpose grad on input: // im2col + gemm (similar to conv-forward) @@ -233,7 +245,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { if (input_grad) { input_grad->mutable_data(context.GetPlace()); } - if (filter_grad) { // filter size (m, c, k_h, k_w) + if (filter_grad) { // filter size (m, c/g, k_h, k_w) filter_grad->mutable_data(context.GetPlace()); set_zero(dev_ctx, filter_grad, static_cast(0)); filter_grad_ = *filter_grad; @@ -268,8 +280,17 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { // or // (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m, // d, h, w) - blas.MatMul(filter, false, col_matrix, false, static_cast(1.0), - &input_grad_batch, static_cast(0.0)); + for (int g = 0; g < groups; g++) { + Tensor input_grad_slice = + input_grad_batch.Slice(g * in_step, (g + 1) * in_step); + Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step); + Tensor col_matrix_slice = + col_matrix.Slice(g * col_step, (g + 1) * col_step); + + blas.MatMul(filter_slice, false, col_matrix_slice, false, + static_cast(1.0), &input_grad_slice, + static_cast(0.0)); + } } if (filter_grad) { // input batch @@ -279,8 +300,17 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { // or // (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d * // k_h * k_w) - blas.MatMul(in_batch, false, col_matrix, true, static_cast(1.0), - &filter_grad_, static_cast(1.0)); + for (int g = 0; g < groups; g++) { + Tensor in_batch_slice = + in_batch.Slice(g * in_step, (g + 1) * in_step); + Tensor filter_grad_slice = + filter_grad_.Slice(g * in_step, (g + 1) * in_step); + Tensor col_matrix_slice = + col_matrix.Slice(g * col_step, (g + 1) * col_step); + blas.MatMul(in_batch_slice, false, col_matrix_slice, true, + static_cast(1.0), &filter_grad_slice, + static_cast(1.0)); + } } } } diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt index 719a7465b8d58ef8588ff1e83c2b971eb6fbb00f..b9a66474c9afc27462f9c47af1a0465e2cec70bc 100644 --- a/paddle/fluid/operators/detail/CMakeLists.txt +++ b/paddle/fluid/operators/detail/CMakeLists.txt @@ -4,6 +4,8 @@ if(WITH_DISTRIBUTE) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr - cares zlib protobuf sendrecvop_grpc) - cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op) + cares zlib protobuf sendrecvop_grpc SERIAL) + cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc + grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor + proto_desc lookup_table_op SERIAL) endif() diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index ae60ab15325ef101feb7270a4f5d840cb2112be0..47892b1bcc073d24ea617ea1c680138a88925177 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { @@ -196,9 +197,14 @@ bool RPCClient::Wait() { const size_t kReqCnt = req_count_; bool a[kReqCnt]; std::vector> waits(req_count_); + std::mutex mu; for (int i = 0; i < req_count_; i++) { - waits[i] = framework::AsyncIO([i, &a, this] { a[i] = Proceed(); }); + waits[i] = framework::AsyncIO([i, &a, &mu, this] { + bool ret = Proceed(); + std::lock_guard l(mu); + a[i] = ret; + }); } for (int i = 0; i < req_count_; i++) { diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index eb114a47d99541402f748bfffcf6b10fde3e78e2..58faead2bdf9a89749e08207d964836bbf5cb68e 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -19,10 +19,16 @@ limitations under the License. */ using ::grpc::ServerAsyncResponseWriter; +DEFINE_int32(rpc_server_handle_send_threads, 20, + "Number of threads used to handle send at rpc server."); +DEFINE_int32(rpc_server_handle_get_threads, 20, + "Number of threads used to handle get at rpc server."); +DEFINE_int32(rpc_server_handle_prefetch_threads, 1, + "Number of threads used to handle prefetch at rpc server."); + namespace paddle { namespace operators { namespace detail { - enum CallStatus { PROCESS = 0, FINISH }; // reference: @@ -63,18 +69,20 @@ class RequestSend final : public RequestBase { explicit RequestSend(GrpcService::AsyncService* service, ::grpc::ServerCompletionQueue* cq, bool sync_mode, framework::Scope* scope, ReceivedQueue* queue, - const platform::DeviceContext* dev_ctx) + const platform::DeviceContext* dev_ctx, int req_id) : RequestBase(service, cq, sync_mode, dev_ctx), queue_(queue), - responder_(&ctx_) { + responder_(&ctx_), + req_id_(req_id) { if (sync_mode_) { request_.reset(new VariableResponse(scope, dev_ctx_, false)); } else { request_.reset(new VariableResponse(scope, dev_ctx_, true)); } int method_id = static_cast(detail::GrpcMethod::kSendVariable); - service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_, - cq_, cq_, this); + service_->RequestAsyncUnary( + method_id, &ctx_, request_.get(), &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); } virtual ~RequestSend() {} @@ -86,15 +94,17 @@ class RequestSend final : public RequestBase { VLOG(3) << "RequestSend " << var_name; queue_->Push(std::make_pair(var_name, request_)); - sendrecv::VoidMessage reply; - responder_.Finish(reply, ::grpc::Status::OK, this); status_ = FINISH; + responder_.Finish(reply_, ::grpc::Status::OK, + reinterpret_cast(static_cast(req_id_))); } protected: + sendrecv::VoidMessage reply_; std::shared_ptr request_; ReceivedQueue* queue_; ServerAsyncResponseWriter responder_; + int req_id_; }; class RequestGet final : public RequestBase { @@ -103,14 +113,17 @@ class RequestGet final : public RequestBase { ::grpc::ServerCompletionQueue* cq, bool sync_mode, framework::Scope* scope, const platform::DeviceContext* dev_ctx, - framework::BlockingQueue* queue) + framework::BlockingQueue* queue, + int req_id) : RequestBase(service, cq, sync_mode, dev_ctx), responder_(&ctx_), scope_(scope), - queue_(queue) { + queue_(queue), + req_id_(req_id) { auto method_id = static_cast(detail::GrpcMethod::kGetVariable); - service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_, - cq_, this); + service_->RequestAsyncUnary( + method_id, &ctx_, &request_, &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id_))); } virtual ~RequestGet() {} @@ -123,13 +136,13 @@ class RequestGet final : public RequestBase { VLOG(3) << "RequestGet " << var_name; auto* var = scope_->FindVar(var_name); - ::grpc::ByteBuffer reply; if (var_name != FETCH_BARRIER_MESSAGE) { - SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply); + SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply_); } - responder_.Finish(reply, ::grpc::Status::OK, this); status_ = FINISH; + responder_.Finish(reply_, ::grpc::Status::OK, + reinterpret_cast(static_cast(req_id_))); if (var_name == FETCH_BARRIER_MESSAGE) { sendrecv::VariableMessage msg; @@ -140,9 +153,11 @@ class RequestGet final : public RequestBase { protected: sendrecv::VariableMessage request_; + ::grpc::ByteBuffer reply_; ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; framework::Scope* scope_; framework::BlockingQueue* queue_; + int req_id_; }; class RequestPrefetch final : public RequestBase { @@ -153,21 +168,24 @@ class RequestPrefetch final : public RequestBase { const platform::DeviceContext* dev_ctx, framework::Executor* executor, framework::ProgramDesc* program, - framework::ExecutorPrepareContext* prefetch_ctx) + framework::ExecutorPrepareContext* prefetch_ctx, + int req_id) : RequestBase(service, cq, sync_mode, dev_ctx), responder_(&ctx_), scope_(scope), executor_(executor), program_(program), - prefetch_ctx_(prefetch_ctx) { + prefetch_ctx_(prefetch_ctx), + req_id_(req_id) { if (sync_mode_) { request_.reset(new VariableResponse(scope, dev_ctx_, false)); } else { request_.reset(new VariableResponse(scope, dev_ctx_, true)); } int method_id = static_cast(detail::GrpcMethod::kPrefetchVariable); - service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_, - cq_, cq_, this); + service_->RequestAsyncUnary( + method_id, &ctx_, request_.get(), &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id_))); } virtual ~RequestPrefetch() {} @@ -176,7 +194,6 @@ class RequestPrefetch final : public RequestBase { virtual void Process() { // prefetch process... - ::grpc::ByteBuffer reply; std::string var_name = request_->OutVarname(); VLOG(3) << "RequestPrefetch " << var_name; @@ -186,19 +203,22 @@ class RequestPrefetch final : public RequestBase { InitializeVariable(var, var_desc->GetType()); executor_->RunPreparedContext(prefetch_ctx_, scope_); - SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply); + SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply_); - responder_.Finish(reply, ::grpc::Status::OK, this); status_ = FINISH; + responder_.Finish(reply_, ::grpc::Status::OK, + reinterpret_cast(static_cast(req_id_))); } protected: std::shared_ptr request_; + ::grpc::ByteBuffer reply_; ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; framework::Scope* scope_; framework::Executor* executor_; framework::ProgramDesc* program_; framework::ExecutorPrepareContext* prefetch_ctx_; + int req_id_; }; void AsyncGRPCServer::WaitClientGet(int count) { @@ -232,24 +252,39 @@ void AsyncGRPCServer::RunSyncUpdate() { LOG(INFO) << "Server listening on " << address_ << " selected port: " << selected_port_; - std::function send_register = - std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this); - std::function get_register = - std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this); - std::function prefetch_register = - std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this); - - // TODO(wuyi): Run these "HandleRequest" in thread pool - t_send_.reset( - new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, - cq_send_.get(), "cq_send", send_register))); - t_get_.reset( - new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, - cq_get_.get(), "cq_get", get_register))); - t_prefetch_.reset(new std::thread( - std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(), - "cq_prefetch", prefetch_register))); + std::function send_register = std::bind( + &AsyncGRPCServer::TryToRegisterNewSendOne, this, std::placeholders::_1); + std::function get_register = std::bind( + &AsyncGRPCServer::TryToRegisterNewGetOne, this, std::placeholders::_1); + std::function prefetch_register = + std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this, + std::placeholders::_1); + for (int i = 0; i < kSendReqsBufSize; ++i) { + TryToRegisterNewSendOne(i); + } + for (int i = 0; i < kGetReqsBufSize; ++i) { + TryToRegisterNewGetOne(i); + } + for (int i = 0; i < kPrefetchReqsBufSize; ++i) { + TryToRegisterNewPrefetchOne(i); + } + + for (int i = 0; i < FLAGS_rpc_server_handle_send_threads; ++i) { + t_sends_.emplace_back( + new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, + cq_send_.get(), "cq_send", send_register))); + } + for (int i = 0; i < FLAGS_rpc_server_handle_get_threads; ++i) { + t_gets_.emplace_back( + new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, + cq_get_.get(), "cq_get", get_register))); + } + for (int i = 0; i < FLAGS_rpc_server_handle_prefetch_threads; ++i) { + t_prefetchs_.emplace_back(new std::thread( + std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(), + "cq_prefetch", prefetch_register))); + } { std::lock_guard lock(this->mutex_ready_); ready_ = 1; @@ -257,9 +292,15 @@ void AsyncGRPCServer::RunSyncUpdate() { condition_ready_.notify_all(); // wait server server_->Wait(); - t_send_->join(); - t_get_->join(); - t_prefetch_->join(); + for (int i = 0; i < FLAGS_rpc_server_handle_send_threads; ++i) { + t_sends_[i]->join(); + } + for (int i = 0; i < FLAGS_rpc_server_handle_get_threads; ++i) { + t_gets_[i]->join(); + } + for (int i = 0; i < FLAGS_rpc_server_handle_prefetch_threads; ++i) { + t_prefetchs_[i]->join(); + } } void AsyncGRPCServer::ShutdownQueue() { @@ -276,47 +317,48 @@ void AsyncGRPCServer::ShutDown() { server_->Shutdown(); } -void AsyncGRPCServer::TryToRegisterNewSendOne() { +void AsyncGRPCServer::TryToRegisterNewSendOne(int i) { std::unique_lock lock(cq_mutex_); if (is_shut_down_) { VLOG(3) << "shutdown, do not TryToRegisterNewSendOne"; return; } RequestSend* send = new RequestSend(&service_, cq_send_.get(), sync_mode_, - scope_, &var_recv_queue_, dev_ctx_); + scope_, &var_recv_queue_, dev_ctx_, i); + send_reqs_[i] = static_cast(send); VLOG(4) << "Create RequestSend status:" << send->Status(); } -void AsyncGRPCServer::TryToRegisterNewGetOne() { +void AsyncGRPCServer::TryToRegisterNewGetOne(int req_id) { std::unique_lock lock(cq_mutex_); if (is_shut_down_) { VLOG(3) << "shutdown, do not TryToRegisterNewGetOne"; return; } RequestGet* get = new RequestGet(&service_, cq_get_.get(), sync_mode_, scope_, - dev_ctx_, &var_get_queue_); + dev_ctx_, &var_get_queue_, req_id); + get_reqs_[req_id] = static_cast(get); VLOG(4) << "Create RequestGet status:" << get->Status(); } -void AsyncGRPCServer::TryToRegisterNewPrefetchOne() { +void AsyncGRPCServer::TryToRegisterNewPrefetchOne(int req_id) { std::unique_lock lock(cq_mutex_); if (is_shut_down_) { VLOG(3) << "shutdown, do not TryToRegisterNewPrefetchOne"; return; } - RequestPrefetch* prefetch = - new RequestPrefetch(&service_, cq_prefetch_.get(), sync_mode_, scope_, - dev_ctx_, executor_, program_, prefetch_ctx_.get()); + RequestPrefetch* prefetch = new RequestPrefetch( + &service_, cq_prefetch_.get(), sync_mode_, scope_, dev_ctx_, executor_, + program_, prefetch_ctx_.get(), req_id); + prefetch_reqs_[req_id] = static_cast(prefetch); VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status(); } // FIXME(typhoonzero): change cq_name to enum. -void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq, - const std::string& cq_name, - std::function TryToRegisterNewOne) { - TryToRegisterNewOne(); - +void AsyncGRPCServer::HandleRequest( + ::grpc::ServerCompletionQueue* cq, const std::string& cq_name, + std::function TryToRegisterNewOne) { void* tag = NULL; bool ok = false; @@ -327,8 +369,7 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq, break; } VLOG(3) << "HandleRequest for " << cq_name << " get Next"; - - PADDLE_ENFORCE(tag); + int req_id = static_cast(reinterpret_cast(tag)); if (sync_mode_) { // FIXME(typhoonzero): de-couple the barriers with recv_op @@ -337,7 +378,17 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq, VLOG(3) << "HandleRequest for " << cq_name << " after WaitCond"; } - RequestBase* base = reinterpret_cast(tag); + RequestBase* base = nullptr; + { + std::lock_guard l(cq_mutex_); + if (cq_name == "cq_get") { + base = get_reqs_[req_id]; + } else if (cq_name == "cq_send") { + base = send_reqs_[req_id]; + } else if (cq_name == "cq_prefetch") { + base = prefetch_reqs_[req_id]; + } + } // reference: // https://github.com/tensorflow/tensorflow/issues/5596 // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM @@ -345,19 +396,19 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq, if (!ok) { LOG(WARNING) << cq_name << " recv no regular event:argument name[" << base->GetReqName() << "]"; - TryToRegisterNewOne(); + TryToRegisterNewOne(req_id); delete base; continue; } switch (base->Status()) { case PROCESS: { - TryToRegisterNewOne(); base->Process(); VLOG(4) << cq_name << " PROCESS status:" << base->Status(); break; } case FINISH: { + TryToRegisterNewOne(req_id); VLOG(4) << cq_name << " FINISH status:" << base->Status(); delete base; break; diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index 238aaa29634a7eff65429c27aa3538a185723eb2..bdff9801a928699f8391bfb68c1c7bd2d75aa642 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include // NOLINT #include +#include #include "grpc++/grpc++.h" #include "paddle/fluid/framework/blocking_queue.h" @@ -30,6 +31,7 @@ limitations under the License. */ #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" #include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { @@ -82,19 +84,27 @@ class AsyncGRPCServer final { protected: void HandleRequest(::grpc::ServerCompletionQueue *cq, const std::string &cq_name, - std::function TryToRegisterNewOne); - void TryToRegisterNewSendOne(); - void TryToRegisterNewGetOne(); - void TryToRegisterNewPrefetchOne(); + std::function TryToRegisterNewOne); + void TryToRegisterNewSendOne(int req_id); + void TryToRegisterNewGetOne(int req_id); + void TryToRegisterNewPrefetchOne(int req_id); void ShutdownQueue(); private: + static const int kSendReqsBufSize = 100; + static const int kGetReqsBufSize = 100; + static const int kPrefetchReqsBufSize = 10; + std::mutex cq_mutex_; volatile bool is_shut_down_ = false; std::unique_ptr<::grpc::ServerCompletionQueue> cq_send_; std::unique_ptr<::grpc::ServerCompletionQueue> cq_get_; std::unique_ptr<::grpc::ServerCompletionQueue> cq_prefetch_; + RequestBase *send_reqs_[kSendReqsBufSize]; + RequestBase *get_reqs_[kGetReqsBufSize]; + RequestBase *prefetch_reqs_[kPrefetchReqsBufSize]; + GrpcService::AsyncService service_; std::unique_ptr<::grpc::Server> server_; @@ -113,8 +123,10 @@ class AsyncGRPCServer final { mutable int barrier_cond_step_; std::condition_variable barrier_condition_; - std::unique_ptr t_send_; - std::unique_ptr t_get_; + std::vector> t_sends_; + std::vector> t_gets_; + std::vector> t_prefetchs_; + std::unique_ptr t_prefetch_; std::unique_ptr prefetch_ctx_; diff --git a/paddle/fluid/operators/detail/grpc_server_test.cc b/paddle/fluid/operators/detail/grpc_server_test.cc index b8db0ad987cdfaec1fc9236c3f26e88891376dce..73e75c9087fef756840c76db249f8996253ced64 100644 --- a/paddle/fluid/operators/detail/grpc_server_test.cc +++ b/paddle/fluid/operators/detail/grpc_server_test.cc @@ -108,7 +108,7 @@ void StartServer(const std::string& endpoint) { rpc_service_->RunSyncUpdate(); } -TEST(PREFETCH, CPU) { +TEST(PREFETCH, DISABLED_CPU) { // start up a server instance backend std::thread server_thread(StartServer, "127.0.0.1:8889"); sleep(2); diff --git a/paddle/fluid/operators/detail/grpc_service.h b/paddle/fluid/operators/detail/grpc_service.h index e6dab2f5a3a4280f3979417c3ca2d884a0b8ff2f..e0505c2b9d0903837713d7e0032b01ab091c2e04 100644 --- a/paddle/fluid/operators/detail/grpc_service.h +++ b/paddle/fluid/operators/detail/grpc_service.h @@ -25,6 +25,8 @@ #include #include "paddle/fluid/operators/detail/variable_response.h" +#include "paddle/fluid/platform/profiler.h" + // NOTE: This method was originally created by tensorflow // (https://github.com/tensorflow/tensorflow/) we borrow this // method and did some modifications so that we can parse gRPC diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto index 9478c5702bcbf99fc88207b8c4843dbccf8a5925..a244afc46f3247c7e6e8481b09b5c729a2a569f7 100644 --- a/paddle/fluid/operators/detail/send_recv.proto +++ b/paddle/fluid/operators/detail/send_recv.proto @@ -70,10 +70,10 @@ message VariableMessage { bytes rows = 9; // Look up table block execution output variable name. string out_varname = 10; - // If true, the ps server will start profiling, the ps + // If 1, the ps server will start profiling, the ps // server stops profiling and generates a profile to /tmp/profile_ps_* - // when profile switches from true to false. - bool profile = 11; + // when profile switches from 1 to 2. + int64 profile = 11; } message VoidMessage {} diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc index 07c43554bc6a0d71d688a5a5772d0ab3d2de319a..3bae56532d655a1725e18276e09e0cade47b5c68 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.cc +++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc @@ -58,12 +58,13 @@ void GetTensorPayload(framework::Variable* var, if (platform::is_gpu_place(ctx.GetPlace())) { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE(platform::is_gpu_place(tensor.place())); - platform::CPUPlace cpu; + platform::CUDAPinnedPlace cuda_pinned; auto& gpu_dev_ctx = static_cast(ctx); auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type()); - *payload = memory::Alloc(cpu, copy_size); + *payload = memory::Alloc(cuda_pinned, copy_size); - memory::Copy(cpu, *payload, boost::get(tensor.place()), + memory::Copy(cuda_pinned, *payload, + boost::get(tensor.place()), reinterpret_cast(tensor.data()), copy_size, gpu_dev_ctx.stream()); ctx.Wait(); @@ -90,11 +91,11 @@ void GetSelectedRowsPayload(framework::Variable* var, auto* tensor = slr->mutable_value(); if (platform::is_gpu_place(ctx.GetPlace())) { #ifdef PADDLE_WITH_CUDA - platform::CPUPlace cpu; + platform::CUDAPinnedPlace cuda_pinned; auto& gpu_dev_ctx = static_cast(ctx); auto copy_size = tensor->numel() * framework::SizeOfType(tensor->type()); - *payload = memory::Alloc(cpu, copy_size); - memory::Copy(cpu, *payload, + *payload = memory::Alloc(cuda_pinned, copy_size); + memory::Copy(cuda_pinned, *payload, boost::get(tensor->place()), reinterpret_cast(tensor->data()), copy_size, gpu_dev_ctx.stream()); @@ -122,7 +123,13 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, // 1 trainer returns true for ShouldSendProfileState(). It tells PS // servers the trainer's profiling state so that PS can follow the // trainer. - request.set_profile(platform::IsProfileEnabled()); + if (platform::ShouldSendProfileState()) { + if (platform::IsProfileEnabled()) { + request.set_profile(platform::kEnableProfiler); + } else { + request.set_profile(platform::kDisableProfiler); + } + } if (!out_name.empty()) { request.set_out_varname(out_name); } @@ -145,8 +152,8 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, // GPU data is copied to CPU buffer when sending, // free the buffer when possible. destroy_callback = [](void* backing) { - platform::CPUPlace cpu; - memory::Free(cpu, backing); + platform::CUDAPinnedPlace cuda_pinned; + memory::Free(cuda_pinned, backing); }; } diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc index 462e303096e609c6797ca8cc16266ec3621623fc..24cb91a3bb820a0e5d51aaa49154434919080f69 100644 --- a/paddle/fluid/operators/detail/variable_response.cc +++ b/paddle/fluid/operators/detail/variable_response.cc @@ -449,8 +449,8 @@ int VariableResponse::Parse(Source* source) { break; } case sendrecv::VariableMessage::kProfileFieldNumber: { - bool profiling; - if (!input.ReadRaw(reinterpret_cast(&profiling), 1)) { + uint64_t profiling = 0; + if (!input.ReadVarint64(&profiling)) { return tag; } meta_.set_profile(profiling); @@ -458,9 +458,11 @@ int VariableResponse::Parse(Source* source) { if (listener_id <= 0) { break; } - if (profiling && !platform::IsProfileEnabled()) { + if (profiling == platform::kEnableProfiler && + !platform::IsProfileEnabled()) { platform::EnableProfiler(platform::ProfilerState::kCPU); - } else if (!profiling && platform::IsProfileEnabled()) { + } else if (profiling == platform::kDisableProfiler && + platform::IsProfileEnabled()) { // TODO(panyx0718): Should we allow to customize file dir. platform::DisableProfiler( platform::EventSortingKey::kDefault, diff --git a/paddle/fluid/operators/elementwise_add_op.h b/paddle/fluid/operators/elementwise_add_op.h index 253964562c8d34e0fda3b4760761206895f749aa..baf04c30b17cb333fc8a6544afd6c479442f835b 100644 --- a/paddle/fluid/operators/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise_add_op.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/elementwise_op_function.h" +#include "paddle/fluid/operators/math/blas.h" namespace paddle { namespace operators { @@ -24,19 +26,57 @@ struct AddFunctor { inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } }; +template +void default_elementwise_add(const framework::ExecutionContext& ctx, + const framework::Tensor* x, + const framework::Tensor* y, framework::Tensor* z) { + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + AddFunctor(), z); +} + +template +typename std::enable_if< + std::is_floating_point::value && + std::is_same::value>::type +elementwise_add(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + framework::Tensor* z) { + auto eigen_x = framework::EigenVector::Flatten(*x); + auto eigen_y = framework::EigenVector::Flatten(*y); + auto eigen_z = framework::EigenVector::Flatten(*z); + + auto blas = math::GetBlas(ctx); + blas.VADD(x->numel(), eigen_x.data(), eigen_y.data(), eigen_z.data()); +} + +template +typename std::enable_if< + !std::is_floating_point::value || + !std::is_same::value>::type +elementwise_add(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + framework::Tensor* z) { + default_elementwise_add(ctx, x, y, z); +} + template class ElementwiseAddKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { using Tensor = framework::Tensor; - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); + const auto x = ctx.Input("X"); + const auto y = ctx.Input("Y"); + auto z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - AddFunctor(), z); + + auto dims_equal = x->dims() == y->dims(); + if (dims_equal) { + elementwise_add(ctx, x, y, z); + } else { + default_elementwise_add(ctx, x, y, z); + } } }; @@ -45,6 +85,55 @@ struct IdentityGrad { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } }; +template +void default_elementwise_add_grad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, + const framework::Tensor* y, + const framework::Tensor* out, + const framework::Tensor* dout, + framework::Tensor* dx, + framework::Tensor* dy) { + int axis = ctx.Attr("axis"); + + ElemwiseGradCompute, IdentityGrad>( + ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad(), + IdentityGrad()); +} + +template +typename std::enable_if< + std::is_floating_point::value && + std::is_same::value>::type +elementwise_add_grad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + const framework::Tensor* out, + const framework::Tensor* dout, framework::Tensor* dx, + framework::Tensor* dy) { + auto blas = math::GetBlas(ctx); + + if (dx) { + blas.VCOPY(dout->numel(), dout->data(), + dx->mutable_data(ctx.GetPlace())); + } + + if (dy) { + blas.VCOPY(dout->numel(), dout->data(), + dy->mutable_data(ctx.GetPlace())); + } +} + +template +typename std::enable_if< + !std::is_floating_point::value || + !std::is_same::value>::type +elementwise_add_grad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + const framework::Tensor* out, + const framework::Tensor* dout, framework::Tensor* dx, + framework::Tensor* dy) { + default_elementwise_add_grad(ctx, x, y, out, dout, dx, dy); +} + template class ElementwiseAddGradKernel : public framework::OpKernel { public: @@ -57,10 +146,13 @@ class ElementwiseAddGradKernel : public framework::OpKernel { auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, IdentityGrad>( - ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad(), - IdentityGrad()); + + if (platform::is_cpu_place(ctx.GetPlace()) && (x->dims() == y->dims())) { + elementwise_add_grad(ctx, x, y, out, dout, dx, dy); + } else { + default_elementwise_add_grad(ctx, x, y, out, dout, dx, + dy); + } } }; diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h index d5b57cc2524efcdee112b2ce41cdcd4697fb79e6..f4cec8ad971abebe8d6dff1a384c8414269148a5 100644 --- a/paddle/fluid/operators/elementwise_op.h +++ b/paddle/fluid/operators/elementwise_op.h @@ -46,9 +46,11 @@ class ElementwiseOpInferVarType : public framework::VarTypeInference { public: void operator()(const framework::OpDesc& op_desc, framework::BlockDesc* block) const override { - auto x_var = op_desc.Input("X")[0]; - auto out_var = op_desc.Output("Out")[0]; - block->Var(out_var)->SetType(block->Var(x_var)->GetType()); + auto x_name = op_desc.Input("X")[0]; + auto out_name = op_desc.Output("Out")[0]; + auto& x = block->FindRecursiveOrCreateVar(x_name); + auto& out = block->FindRecursiveOrCreateVar(out_name); + out.SetType(x.GetType()); } }; diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 57eb5d9a0e73a51d9e2cef7ad7539c1b9da2c4ea..3e693ed7170530c5ca5cf8820e469146c2eb0c02 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include // for removing the port file #include #include #include // NOLINT @@ -77,12 +78,14 @@ ListenAndServOp::ListenAndServOp(const std::string &type, void ListenAndServOp::Stop() { rpc_service_->Push(LISTEN_TERMINATE_MESSAGE); server_thread_->join(); + auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid()); + remove(file_path.c_str()); } -void ListenAndServOp::SavePort(const std::string &file_path) const { +void ListenAndServOp::SavePort() const { // NOTE: default write file to /tmp/paddle.selected_port selected_port_ = rpc_service_->GetSelectedPort(); - + auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid()); std::ofstream port_file; port_file.open(file_path); port_file << selected_port_.load(); @@ -187,6 +190,7 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, for (auto &var : sparse_vars) { var->GetMutable()->mutable_rows()->clear(); } + rpc_service_->SetCond(1); // FIXME(typhoonzero): use another condition to sync wait clients get. rpc_service_->WaitClientGet(fan_in); @@ -331,7 +335,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, // Write to a file of server selected port for python use. std::string file_path = string::Sprintf("/tmp/paddle.%d.selected_port", static_cast(::getpid())); - SavePort(file_path); + SavePort(); if (sync_mode) { RunSyncLoop(&executor, program, &recv_scope, prefetch_block); } else { diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index f52a55c5c2d6902df6cb7e0a0d7242c6e86dc786..8af061eaf2bec4a9edd264c8c77ac69e228b0669 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -48,8 +48,7 @@ class ListenAndServOp : public framework::OperatorBase { void RunAsyncLoop(framework::Executor* executor, framework::ProgramDesc* program) const; - void SavePort( - const std::string& file_path = "/tmp/paddle.selected_port") const; + void SavePort() const; void WaitServerReady(); diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index dabde43850db770d286b13cacd32bee181328d5c..1a37cb39d56066b8380338b9710a441e41518c39 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -125,6 +125,12 @@ class Blas { template void AXPY(int n, T alpha, const T* x, T* y) const; + template + void VADD(int n, const T* x, const T* y, T* z) const; + + template + void VCOPY(int n, const T* x, T* y) const; + template void GEMV(bool trans_a, int M, int N, T alpha, const T* A, const T* B, T beta, T* C) const; @@ -163,6 +169,16 @@ class BlasT : private Blas { Base()->template AXPY(args...); } + template + void VADD(ARGS... args) const { + Base()->template VADD(args...); + } + + template + void VCOPY(ARGS... args) const { + Base()->template VCOPY(args...); + } + template void GEMV(ARGS... args) const { Base()->template GEMV(args...); diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index 14b3624b420cb883b36268c0a5a9e8692dbb5b43..ae20406bc21d5e08359be8295cd98495dda7813b 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -34,6 +34,18 @@ struct CBlas { cblas_saxpy(args...); } +#ifdef PADDLE_WITH_MKLML + template + static void VADD(ARGS... args) { + vsAdd(args...); + } +#endif + + template + static void VCOPY(ARGS... args) { + cblas_scopy(args...); + } + template static void GEMV(ARGS... args) { cblas_sgemv(args...); @@ -59,6 +71,18 @@ struct CBlas { cblas_daxpy(args...); } +#ifdef PADDLE_WITH_MKLML + template + static void VADD(ARGS... args) { + vdAdd(args...); + } +#endif + + template + static void VCOPY(ARGS... args) { + cblas_dcopy(args...); + } + template static void GEMV(ARGS... args) { cblas_dgemv(args...); @@ -139,6 +163,24 @@ void Blas::AXPY(int n, T alpha, const T *x, CBlas::AXPY(n, alpha, x, 1, y, 1); } +template <> +template +void Blas::VCOPY(int n, const T *x, T *y) const { + CBlas::VCOPY(n, x, 1, y, 1); +} + +template <> +template +void Blas::VADD(int n, const T *x, const T *y, + T *z) const { +#ifdef PADDLE_WITH_MKLML + CBlas::VADD(n, x, y, z); +#else + this->template VCOPY(n, y, z); + this->template AXPY(n, 1., x, z); +#endif +} + template <> template void Blas::GEMV(bool trans_a, int M, int N, T alpha, diff --git a/paddle/fluid/operators/reader/create_random_data_generator_op.cc b/paddle/fluid/operators/reader/create_random_data_generator_op.cc index 55bb9739e0239d31f63c3d8703bcf1d18bf459dc..5b7e8a063a034f0be056065826fca0fe807bc9a7 100644 --- a/paddle/fluid/operators/reader/create_random_data_generator_op.cc +++ b/paddle/fluid/operators/reader/create_random_data_generator_op.cc @@ -21,14 +21,15 @@ namespace reader { template class RandomDataGenerator : public framework::ReaderBase { public: - RandomDataGenerator(const std::vector& shapes, float min, - float max) - : framework::ReaderBase(), min_(min), max_(max), shapes_(shapes) { - PADDLE_ENFORCE_LE( - min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max); + RandomDataGenerator(const std::vector& shapes, float low, + float high) + : framework::ReaderBase(), low_(low), high_(high), shapes_(shapes) { + PADDLE_ENFORCE_LE(low, high, + "'low' shouldn't be greater than 'high'.(%f vs %f)", low, + high); unsigned int seed = std::random_device()(); engine_.seed(seed); - dist_ = std::uniform_real_distribution(min_, max_); + dist_ = std::uniform_real_distribution(low_, high_); } void ReadNext(std::vector* out) override { @@ -53,8 +54,8 @@ class RandomDataGenerator : public framework::ReaderBase { void ReInit() override { return; } private: - float min_; - float max_; + float low_; + float high_; std::minstd_rand engine_; std::uniform_real_distribution dist_; std::vector shapes_; @@ -78,22 +79,22 @@ class CreateRandomDataGeneratorOp : public framework::OperatorBase { std::vector shapes = RestoreShapes(shape_concat, ranks); auto* out = scope.FindVar(Output("Out")) ->template GetMutable(); - out->Reset(new RandomDataGenerator(shapes, Attr("min"), - Attr("max"))); + out->Reset(new RandomDataGenerator(shapes, Attr("low"), + Attr("high"))); } }; class CreateRandomDataGeneratorOpMaker : public FileReaderMakerBase { protected: void Apply() override { - AddAttr("min", "The lower bound of reader's uniform distribution."); - AddAttr("max", "The upper bound of reader's uniform distribution."); + AddAttr("low", "The lower bound of reader's uniform distribution."); + AddAttr("high", "The upper bound of reader's uniform distribution."); AddComment(R"DOC( CreateRandomDataGenerator Operator This Op creates a random reader. The reader generates random data instead of really reading from files. - Generated data follow an uniform distribution between 'min' and 'max'. + Generated data follow an uniform distribution between 'low' and 'high'. )DOC"); } }; diff --git a/paddle/fluid/operators/reduce_op.cc b/paddle/fluid/operators/reduce_op.cc index eb8c21179db690e20db29c21892fd6258dd75579..e293fd5e410b2a34b3c71ea674607ba9d7654535 100644 --- a/paddle/fluid/operators/reduce_op.cc +++ b/paddle/fluid/operators/reduce_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/reduce_op.h" +#include #include #include @@ -34,11 +35,14 @@ class ReduceOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); auto x_rank = x_dims.size(); PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); - int dim = ctx->Attrs().Get("dim"); - if (dim < 0) dim = x_rank + dim; - PADDLE_ENFORCE_LT( - dim, x_rank, - "The dim should be in the range [-rank(input), rank(input))."); + auto dims = ctx->Attrs().Get>("dim"); + for (size_t i = 0; i < dims.size(); ++i) { + if (dims[i] < 0) dims[i] = x_rank + dims[i]; + PADDLE_ENFORCE_LT( + dims[i], x_rank, + "The dim should be in the range [-rank(input), rank(input))."); + } + sort(dims.begin(), dims.end()); bool reduce_all = ctx->Attrs().Get("reduce_all"); bool keep_dim = ctx->Attrs().Get("keep_dim"); if (reduce_all) { @@ -49,14 +53,22 @@ class ReduceOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", {1}); } else { auto dims_vector = vectorize(x_dims); - if (keep_dim || x_rank == 1) { - dims_vector[dim] = 1; + if (keep_dim) { + for (size_t i = 0; i < dims.size(); ++i) { + dims_vector[dims[i]] = 1; + } } else { - dims_vector.erase(dims_vector.begin() + dim); + const int kDelFlag = -2; + for (size_t i = 0; i < dims.size(); ++i) { + dims_vector[dims[i]] = kDelFlag; + } + dims_vector.erase( + remove(dims_vector.begin(), dims_vector.end(), kDelFlag), + dims_vector.end()); } auto out_dims = framework::make_ddim(dims_vector); ctx->SetOutputDim("Out", out_dims); - if (dim != 0) { + if (dims[0] != 0) { // Only pass LoD when not reducing on the first dim. ctx->ShareLoD("X", /*->*/ "Out"); } @@ -75,11 +87,14 @@ class ReduceGradOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); auto x_rank = x_dims.size(); PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); - int dim = ctx->Attrs().Get("dim"); - if (dim < 0) dim = x_rank + dim; - PADDLE_ENFORCE_LT( - dim, x_rank, - "The dim should be in the range [-rank(input), rank(input))."); + auto dims = ctx->Attrs().Get>("dim"); + for (size_t i = 0; i < dims.size(); ++i) { + if (dims[i] < 0) dims[i] = x_rank + dims[i]; + PADDLE_ENFORCE_LT( + dims[i], x_rank, + "The dim should be in the range [-rank(input), rank(input))."); + } + sort(dims.begin(), dims.end()); auto x_grad_name = framework::GradVarName("X"); if (ctx->HasOutput(x_grad_name)) { ctx->SetOutputDim(x_grad_name, x_dims); @@ -95,13 +110,13 @@ class ReduceOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) The input tensor. Tensors with rank at most 6 are " "supported."); AddOutput("Out", "(Tensor) The result tensor."); - AddAttr( + AddAttr>( "dim", - "(int, default 0) The dimension to reduce. " + "(list, default {0}) The dimensions to reduce. " "Must be in the range [-rank(input), rank(input)). " - "If `dim < 0`, the dim to reduce is `rank + dim`. " + "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. " "Note that reducing on the first dim will make the LoD info lost.") - .SetDefault(0); + .SetDefault({0}); AddAttr("keep_dim", "(bool, default false) " "If true, retain the reduced dimension with length 1.") diff --git a/paddle/fluid/operators/reduce_op.h b/paddle/fluid/operators/reduce_op.h index e42b4bfe42df05346020d4f48519fecf39aa37d2..cd19cc1460a6b4d4201f21f6f27f988c1547b88a 100644 --- a/paddle/fluid/operators/reduce_op.h +++ b/paddle/fluid/operators/reduce_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include "glog/logging.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -109,6 +110,11 @@ struct ProdGradFunctor { } }; +#define HANDLE_DIM(NDIM, RDIM) \ + if (ndim == NDIM && rdim == RDIM) { \ + ReduceCompute(context); \ + } + template class ReduceKernel : public framework::OpKernel { public: @@ -127,32 +133,29 @@ class ReduceKernel : public framework::OpKernel { Functor functor; functor(place, &x, &out, reduce_dim); } else { - int rank = context.Input("X")->dims().size(); - switch (rank) { - case 1: - ReduceCompute<1>(context); - break; - case 2: - ReduceCompute<2>(context); - break; - case 3: - ReduceCompute<3>(context); - break; - case 4: - ReduceCompute<4>(context); - break; - case 5: - ReduceCompute<5>(context); - break; - case 6: - ReduceCompute<6>(context); - break; - } + int ndim = context.Input("X")->dims().size(); + int rdim = context.Attr>("dim").size(); + HANDLE_DIM(6, 5); + HANDLE_DIM(6, 4); + HANDLE_DIM(6, 3); + HANDLE_DIM(6, 2); + HANDLE_DIM(6, 1); + HANDLE_DIM(5, 4); + HANDLE_DIM(5, 3); + HANDLE_DIM(5, 2); + HANDLE_DIM(5, 1); + HANDLE_DIM(4, 3); + HANDLE_DIM(4, 2); + HANDLE_DIM(4, 1); + HANDLE_DIM(3, 2); + HANDLE_DIM(3, 1); + HANDLE_DIM(2, 1); + HANDLE_DIM(1, 1); } } private: - template + template void ReduceCompute(const framework::ExecutionContext& context) const { auto* input = context.Input("X"); auto* output = context.Output("Out"); @@ -160,18 +163,26 @@ class ReduceKernel : public framework::OpKernel { auto x = EigenTensor::From(*input); auto x_rank = static_cast(x.dimensions().size()); - int dim = static_cast(context.Attr("dim")); - if (dim < 0) dim = x_rank + dim; - auto reduce_dim = Eigen::array({{dim}}); + auto dims = context.Attr>("dim"); + auto reduce_dim = Eigen::array(); + for (size_t i = 0; i < dims.size(); ++i) { + if (dims[i] < 0) dims[i] = x_rank + dims[i]; + reduce_dim[i] = dims[i]; + } // construct the squeezed output tensor bool keep_dim = context.Attr("keep_dim"); - DDim dims = output->dims(); - auto dims_vector = vectorize(dims); + DDim out_dims = output->dims(); if (keep_dim && x_rank > 1) { - dims_vector.erase(dims_vector.begin() + dim); - dims = framework::make_ddim(dims_vector); + const int kDelFlag = -2; + auto dims_vector = vectorize(out_dims); + for (size_t i = 0; i < dims.size(); ++i) { + dims_vector[dims[i]] = kDelFlag; + } + dims_vector.erase( + remove(dims_vector.begin(), dims_vector.end(), kDelFlag), + dims_vector.end()); + out_dims = framework::make_ddim(dims_vector); } - auto& place = *context.template device_context().eigen_device(); Functor functor; @@ -180,7 +191,7 @@ class ReduceKernel : public framework::OpKernel { auto out = EigenScalar::From(*output); functor(place, &x, &out, reduce_dim); } else { - auto out = EigenTensor::From(*output, dims); + auto out = EigenTensor::From(*output, out_dims); functor(place, &x, &out, reduce_dim); } } @@ -245,21 +256,29 @@ class ReduceGradKernel : public framework::OpKernel { auto x = EigenTensor::From(*input0); auto x_grad = EigenTensor::From(*output); auto x_rank = static_cast(x.dimensions().size()); - int dim = static_cast(context.Attr("dim")); - if (dim < 0) dim = x_rank + dim; - DDim dims = input0->dims(); - dims[dim] = 1; - auto x_reduce = EigenTensor::From(*input1, dims); - auto x_reduce_grad = EigenTensor::From(*input2, dims); - + auto dims = context.Attr>("dim"); + auto x_dims = input0->dims(); + auto reduced_dims_v = vectorize(x_dims); Eigen::array broadcast_dim; for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; - broadcast_dim[dim] = input0->dims()[dim]; + + int broad_cats_times = 1; + for (size_t i = 0; i < dims.size(); ++i) { + if (dims[i] < 0) dims[i] = x_rank + dims[i]; + reduced_dims_v[dims[i]] = 1; + broadcast_dim[dims[i]] = x_dims[dims[i]]; + broad_cats_times *= x_dims[dims[i]]; + } + auto reduced_dims = framework::make_ddim(reduced_dims_v); + auto x_reduce = EigenTensor::From(*input1, reduced_dims); + auto x_reduce_grad = EigenTensor::From(*input2, reduced_dims); + auto& place = *context.template device_context().eigen_device(); + Functor functor; functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, - broadcast_dim[dim]); + broad_cats_times); } }; diff --git a/paddle/fluid/operators/test_send_nccl_id.cc b/paddle/fluid/operators/test_send_nccl_id.cc index bbae1d54aa3524fd45cb8ab13c86df8d54b8e643..719f039a0f5fcd7445bf1589a683f122e6d62ba0 100644 --- a/paddle/fluid/operators/test_send_nccl_id.cc +++ b/paddle/fluid/operators/test_send_nccl_id.cc @@ -63,7 +63,7 @@ void StartServer(std::atomic* initialized) { server_thread.join(); } -TEST(SendNcclId, Normal) { +TEST(SendNcclId, DISABLED_Normal) { std::atomic initialized{false}; std::thread server_thread(StartServer, &initialized); while (!initialized) { diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h index 705cc894c06b207f4e4e45fc771c04fa3cbdf6d5..ab70c1f0592d122ba248a101db487e64c0bdae6f 100644 --- a/paddle/fluid/operators/warpctc_op.h +++ b/paddle/fluid/operators/warpctc_op.h @@ -186,8 +186,7 @@ class WarpCTCKernel : public framework::OpKernel { // warpctc accesses labels in CPU memory Tensor warpctc_label; - TensorCopy(*label, platform::CPUPlace(), ctx.device_context(), - &warpctc_label); + TensorCopySync(*label, platform::CPUPlace(), &warpctc_label); const int* warpctc_label_data = warpctc_label.data(); // warpctc stores loss in CPU memory Tensor warpctc_loss; diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index c9e10631680a6ea3876f555a3a6e6c12f79b39d5..1a9be044e024e4b1dda5ef7d515c65f3a7513710 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -245,7 +245,6 @@ class DeviceTracerImpl : public DeviceTracer { void Enable() { std::lock_guard l(trace_mu_); if (enabled_) { - fprintf(stderr, "DeviceTracer already enabled\n"); return; } EnableActivity(); diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 643bb6183d144ec11a4890d9ea1ca970acb08b4c..bf43925373a12cd9ff2155d68c42d0266ba4df60 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -116,6 +116,8 @@ void ResetProfiler(); void DisableProfiler(EventSortingKey sorted_key, const std::string& profile_path); +const int kEnableProfiler = 1; +const int kDisableProfiler = 2; // Test if the profiler is currently enabled. bool IsProfileEnabled(); // Whether the trainer should send profiling state to PS. diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index 3f28e616494ad1322708ad6403aaf50b22d724e6..9111abca5aac97e9d5c7b00ce5173f08e49cda12 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/pybind/const_value.h" +#include #include "paddle/fluid/framework/operator.h" namespace paddle { @@ -23,6 +24,21 @@ void BindConstValue(pybind11::module* m) { m->def("kTempVarName", [] { return framework::kTempVarName; }); m->def("kGradVarSuffix", [] { return framework::kGradVarSuffix; }); m->def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; }); + + auto op_proto_and_checker_maker = + m->def_submodule("op_proto_and_checker_maker"); + + pybind11::enum_(op_proto_and_checker_maker, "OpRole") + .value("Forward", framework::OpRole::kForward) + .value("Backward", framework::OpRole::kBackward) + .value("Optimize", framework::OpRole::kOptimize) + .value("Loss", framework::OpRole::kLoss); + + op_proto_and_checker_maker.def( + "kOpRoleAttrName", framework::OpProtoAndCheckerMaker::OpRoleAttrName); + op_proto_and_checker_maker.def( + "kOpRoleVarAttrName", + framework::OpProtoAndCheckerMaker::OpRoleVarAttrName); } } // namespace pybind diff --git a/paddle/fluid/train/demo/README.md b/paddle/fluid/train/demo/README.md index fd80a77b02e60c15ae6c58486ed7cbbb6ffefabc..41b01d33828f750f67bba5f82cb7ed6fe4d4ea0a 100644 --- a/paddle/fluid/train/demo/README.md +++ b/paddle/fluid/train/demo/README.md @@ -7,7 +7,7 @@ # WITH_MKLDNN=ON|OFF PADDLE_LIB=/paddle/lib/dir -cmake .. -DCMAKE_INSTALL_PREFIX=$PADDLE_LIB \ +cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \ -DCMAKE_BUILD_TYPE=Release \ -DWITH_FLUID_ONLY=ON \ -DWITH_GPU=OFF \ @@ -42,7 +42,7 @@ cd build # WITH_MKLDNN=ON|OFF PADDLE_LIB=/paddle/lib/dir -# PADDLE_LIB is the same with CMAKE_INSTALL_PREFIX when building the lib +# PADDLE_LIB is the same with FLUID_INSTALL_DIR when building the lib cmake .. -DPADDLE_LIB=$PADDLE_LIB \ -DWITH_MKLDNN=OFF \ -DWITH_MKL=OFF diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt index 9b2779b42cad324253dadf27dbff20fd8e8c8e16..29b4ac098e21ee315d5c9b2f2499521d1aa1c322 100644 --- a/paddle/function/CMakeLists.txt +++ b/paddle/function/CMakeLists.txt @@ -52,9 +52,3 @@ add_simple_unittest(Im2ColTest) add_simple_unittest(GemmConvOpTest) add_simple_unittest(DepthwiseConvOpTest) endif() - -add_style_check_target(paddle_function ${h_files}) -add_style_check_target(paddle_function ${cpp_files}) -if(WITH_GPU) - add_style_check_target(paddle_function ${cu_files}) -endif() diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt index 3d6ced713f00bd72622d8aeed3967642b6774ffe..6dc877dd90ee2ae3d99406299a9244eb3e3d7b53 100644 --- a/paddle/gserver/CMakeLists.txt +++ b/paddle/gserver/CMakeLists.txt @@ -146,8 +146,6 @@ else() ${GSERVER_SOURCES}) endif() -add_style_check_target(paddle_gserver ${GSERVER_SOURCES}) -add_style_check_target(paddle_gserver ${GSERVER_HEADER}) add_dependencies(paddle_gserver paddle_proto ${external_project_dependencies}) if(WITH_TESTING) add_subdirectory(tests) diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt index 922fb5172273da24f9c48786961a6d850b1ed7c5..3c897b5f3e09cd53ddd5b767333ce4759250da71 100644 --- a/paddle/math/CMakeLists.txt +++ b/paddle/math/CMakeLists.txt @@ -51,10 +51,6 @@ else() endif() - -add_style_check_target(paddle_math ${MATH_SOURCES}) -add_style_check_target(paddle_math ${MATH_HEADERS}) - add_dependencies(paddle_math paddle_proto ${external_project_dependencies}) # depends if(WITH_TESTING) add_subdirectory(tests) diff --git a/paddle/parameter/CMakeLists.txt b/paddle/parameter/CMakeLists.txt index d2ae1c16c6b7316f1a6facdef4b933693d6ba818..19ae07e077e2b8f55ce4050566c9cf6aaa0efa0a 100644 --- a/paddle/parameter/CMakeLists.txt +++ b/paddle/parameter/CMakeLists.txt @@ -5,8 +5,6 @@ file(GLOB PARAMETERS_SOURCES . *.cpp) add_library(paddle_parameter STATIC ${PARAMETERS_SOURCES}) -add_style_check_target(paddle_parameter ${PARAMETERS_SOURCES}) -add_style_check_target(paddle_parameter ${PARAMETERS_HEADERS}) add_dependencies(paddle_parameter paddle_proto ${external_project_dependencies}) if(WITH_TESTING) add_subdirectory(tests) diff --git a/paddle/pserver/CMakeLists.txt b/paddle/pserver/CMakeLists.txt index f75475a88f7224ee3889827795088c8aa920b63b..0ae9c6ef6afc6ec5a99a685b08883def0db51cf1 100644 --- a/paddle/pserver/CMakeLists.txt +++ b/paddle/pserver/CMakeLists.txt @@ -14,9 +14,6 @@ set(NETWORK_HEADERS add_library(paddle_network STATIC ${NETWORK_SOURCES}) -add_style_check_target(paddle_network ${NETWORK_SOURCES}) -add_style_check_target(paddle_network ${NETWORK_HEADERS}) - add_dependencies(paddle_network paddle_proto ${external_project_dependencies}) ################### paddle_pserver ###################### @@ -37,9 +34,6 @@ set(PSERVER_HEADERS add_library(paddle_pserver STATIC ${PSERVER_SOURCES}) -add_style_check_target(paddle_pserver ${PSERVER_SOURCES}) -add_style_check_target(paddle_pserver ${PSERVER_HEADERS}) - add_dependencies(paddle_pserver paddle_proto ${external_project_dependencies}) set(PSERVER_MAIN_SOURCES diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 92b8b90880bc78dbc281a959a7472c2822f76fc3..baff7628ea01caa0248af82c6eed2c3b546cdb35 100755 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -48,7 +48,6 @@ function cmake_gen() { -DWITH_PYTHON=${WITH_PYTHON:-ON} -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} -DCUDNN_ROOT=/usr/ - -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} -DWITH_TESTING=${WITH_TESTING:-ON} -DWITH_FAST_BUNDLE_TEST=ON -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake @@ -75,7 +74,6 @@ EOF -DWITH_C_API=${WITH_C_API:-OFF} \ -DWITH_PYTHON=${WITH_PYTHON:-ON} \ -DCUDNN_ROOT=/usr/ \ - -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \ -DWITH_TESTING=${WITH_TESTING:-ON} \ -DWITH_FAST_BUNDLE_TEST=ON \ -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ @@ -125,8 +123,7 @@ EOF -DWITH_DOC=ON \ -DWITH_GPU=OFF \ -DWITH_AVX=${WITH_AVX:-ON} \ - -DWITH_SWIG_PY=ON \ - -DWITH_STYLE_CHECK=OFF + -DWITH_SWIG_PY=ON make -j `nproc` paddle_docs paddle_apis popd diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh index 3d5e775fafb6b94a3429dbf3368a8949bca3d612..7e60079ebf086d0f06219de1e85bdd495105c7b0 100644 --- a/paddle/scripts/docker/build_android.sh +++ b/paddle/scripts/docker/build_android.sh @@ -47,7 +47,6 @@ if [ $ANDROID_ABI == "armeabi-v7a" ]; then -DUSE_EIGEN_FOR_BLAS=ON \ -DWITH_C_API=ON \ -DWITH_SWIG_PY=OFF \ - -DWITH_STYLE_CHECK=OFF \ .. elif [ $ANDROID_ABI == "arm64-v8a" ]; then cmake -DCMAKE_SYSTEM_NAME=Android \ @@ -61,7 +60,6 @@ elif [ $ANDROID_ABI == "arm64-v8a" ]; then -DUSE_EIGEN_FOR_BLAS=OFF \ -DWITH_C_API=ON \ -DWITH_SWIG_PY=OFF \ - -DWITH_STYLE_CHECK=OFF \ .. elif [ $ANDROID_ABI == "armeabi" ]; then cmake -DCMAKE_SYSTEM_NAME=Android \ @@ -74,7 +72,6 @@ elif [ $ANDROID_ABI == "armeabi" ]; then -DCMAKE_BUILD_TYPE=MinSizeRel \ -DWITH_C_API=ON \ -DWITH_SWIG_PY=OFF \ - -DWITH_STYLE_CHECK=OFF \ .. else echo "Invalid ANDROID_ABI: $ANDROID_ABI" diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index fbe219a1c9cf85f19ae2ab991ae7e4207858f204..900ddfd1128da4c2d4f7d23a16c833352379fab2 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -99,7 +99,6 @@ function cmake_gen() { -DWITH_PYTHON=${WITH_PYTHON:-ON} -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} -DCUDNN_ROOT=/usr/ - -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} -DWITH_TESTING=${WITH_TESTING:-ON} -DWITH_FAST_BUNDLE_TEST=ON -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake @@ -126,7 +125,6 @@ EOF -DWITH_C_API=${WITH_C_API:-OFF} \ -DWITH_PYTHON=${WITH_PYTHON:-ON} \ -DCUDNN_ROOT=/usr/ \ - -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \ -DWITH_TESTING=${WITH_TESTING:-ON} \ -DWITH_FAST_BUNDLE_TEST=ON \ -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ @@ -231,7 +229,6 @@ EOF -DUSE_EIGEN_FOR_BLAS=ON \ -DWITH_C_API=ON \ -DWITH_SWIG_PY=OFF \ - -DWITH_STYLE_CHECK=OFF \ .. elif [ $ANDROID_ABI == "arm64-v8a" ]; then cmake -DCMAKE_SYSTEM_NAME=Android \ @@ -245,7 +242,6 @@ EOF -DUSE_EIGEN_FOR_BLAS=OFF \ -DWITH_C_API=ON \ -DWITH_SWIG_PY=OFF \ - -DWITH_STYLE_CHECK=OFF \ .. elif [ $ANDROID_ABI == "armeabi" ]; then cmake -DCMAKE_SYSTEM_NAME=Android \ @@ -258,7 +254,6 @@ EOF -DCMAKE_BUILD_TYPE=MinSizeRel \ -DWITH_C_API=ON \ -DWITH_SWIG_PY=OFF \ - -DWITH_STYLE_CHECK=OFF \ .. else echo "Invalid ANDROID_ABI: $ANDROID_ABI" @@ -287,7 +282,6 @@ function build_ios() { -DUSE_EIGEN_FOR_BLAS=ON \ -DWITH_TESTING=OFF \ -DWITH_SWIG_PY=OFF \ - -DWITH_STYLE_CHECK=OFF \ -DCMAKE_BUILD_TYPE=Release make -j 2 @@ -375,8 +369,7 @@ EOF -DCMAKE_BUILD_TYPE=Release \ -DWITH_DOC=ON \ -DWITH_GPU=OFF \ - -DWITH_MKL=OFF \ - -DWITH_STYLE_CHECK=OFF + -DWITH_MKL=OFF make -j `nproc` paddle_docs paddle_apis @@ -415,9 +408,11 @@ function gen_dockerfile() { DOCKERFILE_GPU_ENV="" DOCKERFILE_CUDNN_DSO="" + DOCKERFILE_CUBLAS_DSO="" if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}" - DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.${CUDNN_MAJOR} /usr/lib/x86_64-linux-gnu/libcudnn.so" + DOCKERFILE_CUDNN_DSO="RUN ln -sf /usr/lib/x86_64-linux-gnu/libcudnn.so.${CUDNN_MAJOR} /usr/lib/x86_64-linux-gnu/libcudnn.so" + DOCKERFILE_CUBLAS_DSO="RUN ln -sf /usr/local/cuda/targets/x86_64-linux/lib/libcublas.so.${CUDA_MAJOR} /usr/lib/x86_64-linux-gnu/libcublas.so" fi cat <= 0 and not _interval_secs_exceed( + _get_serial_dir(serial, checkpoint_dir), save_interval_secs): + return + + serial += 1 + cur_dir = _get_serial_dir(serial, checkpoint_dir) + + save_vars( + executor, + dirname=cur_dir, + main_program=main_program, + vars=None, + predicate=_is_checkpoint_var, + filename=None) + _write_success(cur_dir) + _lru_delete(checkpoint_dir, max_num_checkpoints) + + +def load_checkpoint(executor, checkpoint_dir=None, main_program=None): + """ + Load checkpoint from a directory by executor, + it will find the most recent saved checkpoint file and load it auto. + + :param executor + :param checkpoint_dir + :param main_program + """ + + if checkpoint_dir is None: + checkpoint_dir = os.getcwd() + + serial = _get_lastest_checkpoint_dir(checkpoint_dir) + + if serial < 0: + return + + cur_dir = _get_serial_dir(serial, checkpoint_dir) + + load_vars( + executor, + dirname=cur_dir, + main_program=main_program, + predicate=_is_checkpoint_var, + filename=None) + + +def clean_checkpoint(checkpoint_dir, delete_dir=False): + """ + clean the checkpoint dir, when the train exits normally, the trainer will call clean_checkpoint to delete checkpoint directory saved before. + delete_dir only works when the directory is empty, otherwise, OSError is raised. + """ + if checkpoint_dir is None: + checkpoint_dir = os.getcwd() + _lru_delete(checkpoint_dir, max_num_checkpoints=0) + + if delete_dir and not os.listdir(checkpoint_dir): + os.rmdir(checkpoint_dir) + + +def _get_serial_dir(serial, checkpoint_dir): + serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) + return os.path.join(checkpoint_dir, serial_folder) + + +def _is_checkpoint_var(var): + """ + the checkpoint will not save or load all the variables. + var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded. + + :param var + """ + if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ + var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ + var.desc.type() == core.VarDesc.VarType.RAW: + return False + + if var.name.endswith("@GRAD"): + return False + + return var.persistable + + +def _interval_secs_exceed(dirname, save_interval_secs): + dir_time = os.path.getmtime(dirname) + if save_interval_secs > (time.time() - dir_time): + return False + return True + + +def _lru_delete(dirname, max_num_checkpoints=3): + dirs = os.listdir(dirname) + serials = [] + for serial in dirs: + try: + serials.append(int(serial)) + except ValueError: + continue + + if len(serials) <= max_num_checkpoints: + return + + serials.sort(reverse=True) + serials = serials[max_num_checkpoints:] + for serial in serials: + cur_dir = os.path.join(dirname, str(serial)) + shutil.rmtree(cur_dir) + + +def _write_success(dirname): + """ + write an empty file named "_SUCCESS" in checkpoint dir, indicate this checkpoint is correct. + + :param dirname + """ + success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME) + with open(success_file, 'a') as f: + now = time.ctime() + f.write(now) + + +def _get_lastest_checkpoint_dir(checkpoint_dir): + """ + get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory + + :param checkpoint_dir + """ + if not checkpoint_dir.strip(): + return -1 + + def has_success(checkpoint_dir, cur_dir): + """ + is _SUCCESS in this dir + """ + _, serial = cur_dir.split(CHECKPOINT_SEPARATOR) + + try: + int(serial) + except ValueError: + return -1 + + if not os.path.isdir(os.path.join(checkpoint_dir, cur_dir)): + return -1 + + success_path = os.path.join( + _get_serial_dir(serial, checkpoint_dir), SUCCESS_MARK_FILENAME) + if os.path.isfile(success_path): + return int(serial) + + if not os.path.isdir(checkpoint_dir): + return -1 + + current_dir = -1 + dirs = os.listdir(checkpoint_dir) + for cur_dir in dirs: + success_num = has_success(checkpoint_dir, cur_dir) + if success_num > current_dir: + current_dir = success_num + return current_dir diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index dee41448081cbfcd8224ce2abbf3ba7b7b97eb7c..d1ea9f148566d20988a43f4c9d421c4452697ef1 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -1098,7 +1098,7 @@ class ConditionalBlock(object): input_set = set([ipt.name for ipt in self.inputs]) param_list = [ - parent_block.var(each_name) for each_name in params + parent_block.var_recursive(each_name) for each_name in params if each_name not in input_set ] diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 4d6ee3c51b7cccdaa3303b5a4cd8e7219b753ccb..1470f8c2e50004abb08e75980decd9485c22dece 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -321,7 +321,7 @@ def open_recordio_file(filename, dtypes=['float32', 'int64']) # Via the reader, we can use 'read_file' layer to get data: - image, label = fluid.layers.read_file(reader) + image, label = fluid.layers.io.read_file(reader) """ dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] shape_concat = [] @@ -359,6 +359,73 @@ def open_recordio_file(filename, return monkey_patch_reader_methods(main_prog_var) +def random_data_generator(low, high, shapes, lod_levels, for_parallel=True): + """ + Create a uniform random data generator + + This layer returns a Reader Variable. + Instead of opening a file and reading data from it, this + Reader Variable generates float uniform random data by itself. + It can be used as a dummy reader to test a network without + opening a real file. + + Args: + low(float): The lower bound of data's uniform distribution. + high(float): The upper bound of data's uniform distribution. + shapes(list): List of tuples which declaring data shapes. + lod_levels(list): List of ints which declaring data lod_level. + for_parallel(Bool): Set it as True if you are going to run + subsequent operators in parallel. + + Returns: + Variable: A Reader Variable from which we can get random data. + + Examples: + .. code-block:: python + + reader = fluid.layers.io.random_data_generator( + low=0.0, + high=1.0, + shapes=[(3,224,224), (1)], + lod_levels=[0, 0]) + + # Via the reader, we can use 'read_file' layer to get data: + image, label = fluid.layers.io.read_file(reader) + """ + dtypes = [core.VarDesc.VarType.FP32] * len(shapes) + shape_concat = [] + ranks = [] + + for shape in shapes: + shape_concat.extend(shape) + ranks.append(len(shape)) + + var_name = unique_name('random_data_generator') + + startup_blk = default_startup_program().current_block() + startup_var = startup_blk.create_var(name=var_name) + startup_blk.append_op( + type='create_random_data_generator', + outputs={'Out': [startup_var]}, + attrs={ + 'low': low, + 'high': high, + 'shape_concat': shape_concat, + 'lod_levels': lod_levels, + 'ranks': ranks + }) + + startup_var.desc.set_dtypes(dtypes) + startup_var.persistable = True + main_prog_var = _copy_reader_var_(default_main_program().current_block(), + startup_var) + + if for_parallel: + main_prog_var = parallel(reader=main_prog_var) + + return monkey_patch_reader_methods(main_prog_var) + + def open_files(filenames, shapes, lod_levels, diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 561c8bd42f90911bf5a0c898fe01412d42d2c9b1..04ee8ac9aee92a0e161e83bf1bb34d3ce727a0fb 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -80,6 +80,8 @@ __all__ = [ 'pad', 'label_smooth', 'roi_pool', + 'dice_loss', + 'bilinear_interp', ] @@ -699,8 +701,8 @@ def dynamic_gru(input, def gru_unit(input, hidden, size, - weight=None, - bias=None, + param_attr=None, + bias_attr=None, activation='tanh', gate_activation='sigmoid'): """ @@ -731,8 +733,8 @@ def gru_unit(input, input (Variable): The fc transformed input value of current step. hidden (Variable): The hidden value of lstm unit from previous step. size (integer): The input dimension value. - weight (ParamAttr): The weight parameters for gru unit. Default: None - bias (ParamAttr): The bias parameters for gru unit. Default: None + param_attr (ParamAttr): The weight parameters for gru unit. Default: None + bias_attr (ParamAttr): The bias parameters for gru unit. Default: None activation (string): The activation type for cell (actNode). Default: 'tanh' gate_activation (string): The activation type for gates (actGate). @@ -764,34 +766,31 @@ def gru_unit(input, size = size / 3 # create weight - if weight is None: - weight = helper.create_parameter( - attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype) + weight = helper.create_parameter( + attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype) + gate = helper.create_tmp_variable(dtype) + reset_hidden_pre = helper.create_tmp_variable(dtype) + updated_hidden = helper.create_tmp_variable(dtype) + inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': weight} # create bias - - if bias is None: + if helper.bias_attr: bias_size = [1, 3 * size] bias = helper.create_parameter( attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) - - gate = helper.create_tmp_variable(dtype) - reset_hidden_pre = helper.create_tmp_variable(dtype) - updated_hidden = helper.create_tmp_variable(dtype) + inputs['Bias'] = bias helper.append_op( type='gru_unit', - inputs={'Input': input, - 'HiddenPrev': hidden, - 'Weight': weight}, + inputs=inputs, outputs={ 'Gate': gate, 'ResetHiddenPrev': reset_hidden_pre, 'Hidden': updated_hidden, }, attrs={ - 'activation': 0, - 'gate_activation': 1, + 'activation': 2, # tanh + 'gate_activation': 1, # sigmoid }) return updated_hidden, reset_hidden_pre, gate @@ -1710,6 +1709,7 @@ def conv2d_transpose(input, padding=0, stride=1, dilation=1, + groups=None, param_attr=None, bias_attr=None, use_cudnn=True, @@ -1780,6 +1780,12 @@ def conv2d_transpose(input, dilation(int|tuple): The dilation size. If dilation is a tuple, it must contain two integers, (dilation_H, dilation_W). Otherwise, the dilation_H = dilation_W = dilation. Default: dilation = 1. + groups(int): The groups number of the Conv2d transpose layer. Inspired by + grouped convolution in Alex Krizhevsky's Deep CNN paper, in which + when group=2, the first half of the filters is only connected to the + first half of the input channels, while the second half of the + filters is only connected to the second half of the input channels. + Default: groups=1 param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer. Default: None bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None @@ -1834,7 +1840,8 @@ def conv2d_transpose(input, filter_size = utils.convert_to_list(filter_size, 2, 'conv2d_transpose.filter_size') - filter_shape = [input_channel, num_filters] + filter_size + groups = 1 if groups is None else groups + filter_shape = [input_channel, num_filters / groups] + filter_size img_filter = helper.create_parameter( dtype=input.dtype, shape=filter_shape, attr=helper.param_attr) @@ -2084,11 +2091,11 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None): Args: input (Variable): The input variable which is a Tensor or LoDTensor. - dim (int|None): The dimension along which the sum is performed. If + dim (list|int|None): The dimensions along which the sum is performed. If :attr:`None`, sum all elements of :attr:`input` and return a Tensor variable with a single element, otherwise must be in the - range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`, - the dimension to reduce is :math:`rank + dim`. + range :math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`, + the dimension to reduce is :math:`rank + dim[i]`. keep_dim (bool|False): Whether to reserve the reduced dimension in the output Tensor. The result tensor will have one fewer dimension than the :attr:`input` unless :attr:`keep_dim` is true. @@ -2109,15 +2116,25 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None): fluid.layers.reduce_sum(x, dim=0) # [0.3, 0.5, 1.1, 1.6] fluid.layers.reduce_sum(x, dim=-1) # [1.9, 1.6] fluid.layers.reduce_sum(x, dim=1, keep_dim=True) # [[1.9], [1.6]] + + # x is a Tensor variable with shape [2, 2, 2] and elements as below: + # [[[1, 2], [3, 4]], + # [[5, 6], [7, 8]]] + # Each example is followed by the correspending output tensor. + fluid.layers.reduce_sum(x, dim=[1, 2]) # [10, 26] + fluid.layers.reduce_sum(x, dim=[0, 1]) # [16, 20] + """ helper = LayerHelper('reduce_sum', **locals()) out = helper.create_tmp_variable(dtype=helper.input_dtype()) + if dim is not None and not isinstance(dim, list): + dim = [dim] helper.append_op( type='reduce_sum', inputs={'X': input}, outputs={'Out': out}, attrs={ - 'dim': dim if dim != None else 0, + 'dim': dim if dim != None else [0], 'keep_dim': keep_dim, 'reduce_all': True if dim == None else False }) @@ -2130,11 +2147,11 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None): Args: input (Variable): The input variable which is a Tensor or LoDTensor. - dim (int|None): The dimension along which the mean is computed. If + dim (list|int|None): The dimensions along which the mean is computed. If :attr:`None`, compute the mean over all elements of :attr:`input` and return a Tensor variable with a single element, otherwise must be in the range :math:`[-rank(input), rank(input))`. If - :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`. + :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`. keep_dim (bool): Whether to reserve the reduced dimension in the output Tensor. The result tensor will have one fewer dimension than the :attr:`input` unless :attr:`keep_dim` is true. @@ -2155,15 +2172,24 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None): fluid.layers.reduce_mean(x, dim=0) # [0.15, 0.25, 0.55, 0.8] fluid.layers.reduce_mean(x, dim=-1) # [0.475, 0.4] fluid.layers.reduce_mean(x, dim=1, keep_dim=True) # [[0.475], [0.4]] + + # x is a Tensor variable with shape [2, 2, 2] and elements as below: + # [[[1.0, 2.0], [3.0, 4.0]], + # [[5.0, 6.0], [7.0, 8.0]]] + # Each example is followed by the correspending output tensor. + fluid.layers.reduce_mean(x, dim=[1, 2]) # [2.5, 6.5] + fluid.layers.reduce_mean(x, dim=[0, 1]) # [4.0, 5.0] """ helper = LayerHelper('reduce_mean', **locals()) out = helper.create_tmp_variable(dtype=helper.input_dtype()) + if dim is not None and not isinstance(dim, list): + dim = [dim] helper.append_op( type='reduce_mean', inputs={'X': input}, outputs={'Out': out}, attrs={ - 'dim': dim if dim != None else 0, + 'dim': dim if dim != None else [0], 'keep_dim': keep_dim, 'reduce_all': True if dim == None else False }) @@ -2176,11 +2202,11 @@ def reduce_max(input, dim=None, keep_dim=False, name=None): Args: input (Variable): The input variable which is a Tensor or LoDTensor. - dim (int|None): The dimension along which the maximum is computed. + dim (list|int|None): The dimension along which the maximum is computed. If :attr:`None`, compute the maximum over all elements of :attr:`input` and return a Tensor variable with a single element, otherwise must be in the range :math:`[-rank(input), rank(input))`. - If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`. + If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`. keep_dim (bool): Whether to reserve the reduced dimension in the output Tensor. The result tensor will have one fewer dimension than the :attr:`input` unless :attr:`keep_dim` is true. @@ -2201,15 +2227,24 @@ def reduce_max(input, dim=None, keep_dim=False, name=None): fluid.layers.reduce_max(x, dim=0) # [0.2, 0.3, 0.6, 0.9] fluid.layers.reduce_max(x, dim=-1) # [0.9, 0.7] fluid.layers.reduce_max(x, dim=1, keep_dim=True) # [[0.9], [0.7]] + + # x is a Tensor variable with shape [2, 2, 2] and elements as below: + # [[[1.0, 2.0], [3.0, 4.0]], + # [[5.0, 6.0], [7.0, 8.0]]] + # Each example is followed by the correspending output tensor. + fluid.layers.reduce_max(x, dim=[1, 2]) # [4.0, 8.0] + fluid.layers.reduce_max(x, dim=[0, 1]) # [7.0, 8.0] """ helper = LayerHelper('reduce_max', **locals()) out = helper.create_tmp_variable(dtype=helper.input_dtype()) + if dim is not None and not isinstance(dim, list): + dim = [dim] helper.append_op( type='reduce_max', inputs={'X': input}, outputs={'Out': out}, attrs={ - 'dim': dim if dim != None else 0, + 'dim': dim if dim != None else [0], 'keep_dim': keep_dim, 'reduce_all': True if dim == None else False }) @@ -2222,11 +2257,11 @@ def reduce_min(input, dim=None, keep_dim=False, name=None): Args: input (Variable): The input variable which is a Tensor or LoDTensor. - dim (int|None): The dimension along which the minimum is computed. + dim (list|int|None): The dimensions along which the minimum is computed. If :attr:`None`, compute the minimum over all elements of :attr:`input` and return a Tensor variable with a single element, otherwise must be in the range :math:`[-rank(input), rank(input))`. - If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`. + If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`. keep_dim (bool): Whether to reserve the reduced dimension in the output Tensor. The result tensor will have one fewer dimension than the :attr:`input` unless :attr:`keep_dim` is true. @@ -2247,15 +2282,24 @@ def reduce_min(input, dim=None, keep_dim=False, name=None): fluid.layers.reduce_min(x, dim=0) # [0.1, 0.2, 0.5, 0.7] fluid.layers.reduce_min(x, dim=-1) # [0.2, 0.1] fluid.layers.reduce_min(x, dim=1, keep_dim=True) # [[0.2], [0.1]] + + # x is a Tensor variable with shape [2, 2, 2] and elements as below: + # [[[1.0, 2.0], [3.0, 4.0]], + # [[5.0, 6.0], [7.0, 8.0]]] + # Each example is followed by the correspending output tensor. + fluid.layers.reduce_min(x, dim=[1, 2]) # [1.0, 5.0] + fluid.layers.reduce_min(x, dim=[0, 1]) # [1.0, 2.0] """ helper = LayerHelper('reduce_min', **locals()) out = helper.create_tmp_variable(dtype=helper.input_dtype()) + if dim is not None and not isinstance(dim, list): + dim = [dim] helper.append_op( type='reduce_min', inputs={'X': input}, outputs={'Out': out}, attrs={ - 'dim': dim if dim != None else 0, + 'dim': dim if dim != None else [0], 'keep_dim': keep_dim, 'reduce_all': True if dim == None else False }) @@ -2268,11 +2312,11 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): Args: input (Variable): The input variable which is a Tensor or LoDTensor. - dim (int|None): The dimension along which the product is performed. If + dim (list|int|None): The dimensions along which the product is performed. If :attr:`None`, multipy all elements of :attr:`input` and return a Tensor variable with a single element, otherwise must be in the - range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`, - the dimension to reduce is :math:`rank + dim`. + range :math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`, + the dimension to reduce is :math:`rank + dim[i]`. keep_dim (bool|False): Whether to reserve the reduced dimension in the output Tensor. The result tensor will have one fewer dimension than the :attr:`input` unless :attr:`keep_dim` is true. @@ -2294,15 +2338,24 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): fluid.layers.reduce_prod(x, dim=-1) # [0.027, 0.0084] fluid.layers.reduce_prod(x, dim=1, keep_dim=True) # [[0.027], [0.0084]] + + # x is a Tensor variable with shape [2, 2, 2] and elements as below: + # [[[1.0, 2.0], [3.0, 4.0]], + # [[5.0, 6.0], [7.0, 8.0]]] + # Each example is followed by the correspending output tensor. + fluid.layers.reduce_prod(x, dim=[1, 2]) # [24.0, 1680.0] + fluid.layers.reduce_prod(x, dim=[0, 1]) # [105.0, 384.0] """ helper = LayerHelper('reduce_prod', **locals()) out = helper.create_tmp_variable(dtype=helper.input_dtype()) + if dim is not None and not isinstance(dim, list): + dim = [dim] helper.append_op( type='reduce_prod', inputs={'X': input}, outputs={'Out': out}, attrs={ - 'dim': dim if dim != None else 0, + 'dim': dim if dim != None else [0], 'keep_dim': keep_dim, 'reduce_all': True if dim == None else False }) @@ -2405,7 +2458,6 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): if len(x.shape) == 1: axis = 0 - helper = LayerHelper("l2_normalize", **locals()) square = helper.create_tmp_variable(dtype=x.dtype) @@ -2417,7 +2469,7 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): inputs={"X": square}, outputs={"Out": reduced_sum}, attrs={ - "dim": 1 if axis is None else axis, + "dim": [1] if axis is None else [axis], "keep_dim": True, "reduce_all": False }) @@ -3801,6 +3853,8 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0): (num_rois, channels, pooled_h, pooled_w). Examples: + .. code-block:: python + pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0) """ helper = LayerHelper('roi_pool', **locals()) @@ -3819,3 +3873,84 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0): "spatial_scale": spatial_scale }) return pool_out + + +def dice_loss(input, label, epsilon=0.00001): + """ + **Dice loss Layer** + Dice loss for comparing the similarity of two batch of data, + usually is used for binary image segmentation i.e. labels are binary. + The dice loss can be defined as below equation: + + .. math:: + + dice\_loss &= 1 - \\frac{2 * intersection\_area}{total\_area} \\\\ + &= \\frac{(total\_area - intersection\_area) - intersection\_area}{total\_area} \\\\ + &= \\frac{(union\_area - intersection\_area)}{total\_area} + + + Args: + input (Variable): The predictions with rank>=2. The first dimension is batch size, + and the last dimension is class number. + label (Variable): The groud truth with the same rank with input. The first dimension + is batch size, and the last dimension is 1. + epsilon (float): The epsilon will be added to the numerator and denominator. + If both input and label are empty, it makes sure dice is 1. + Default: 0.00001 + + Returns: + dice_loss (Variable): The dice loss with shape [1]. + + Examples: + .. code-block:: python + + predictions = fluid.layers.softmax(x) + loss = fluid.layers.dice_loss(input=predictions, label=label, 2) + """ + label = one_hot(label, depth=input.shape[-1]) + reduce_dim = range(1, len(input.shape)) + inse = reduce_sum(input * label, dim=reduce_dim) + dice_denominator = reduce_sum( + input, dim=reduce_dim) + reduce_sum( + label, dim=reduce_dim) + dice_score = 1 - inse * 2 / (dice_denominator + epsilon) + return reduce_mean(dice_score) + + +def bilinear_interp(input, out_h, out_w, name=None): + """ + Bilinear interpolation is an extension of linear interpolation for + interpolating functions of two variables (e.g. H-direction and + W-direction in this layer) on a rectilinear 2D grid. + + For details, please refer to Wikipedia: + https://en.wikipedia.org/wiki/Bilinear_interpolation + + Args: + input (Variable): The input tensor of bilinear interpolation, + This is a 4-D tensor of the shape + (num_batches, channels, in_h, in_w). + out_h (int): output height of bilinear interpolation layer. + out_w (int): output width of bilinear interpolation layer. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + out (Variable): The output is a 4-D tensor of the shape + (num_batches, channls, out_h, out_w). + + Examples: + .. code-block:: python + + out = fluid.layers.bilinear_interp(input, out_h=12, out_w=12) + """ + helper = LayerHelper('bilinear_interp', **locals()) + dtype = helper.input_dtype() + out = helper.create_tmp_variable(dtype) + helper.append_op( + type="bilinear_interp", + inputs={"X": input}, + outputs={"Out": out}, + attrs={"out_h": out_h, + "out_w": out_w}) + return out diff --git a/python/paddle/fluid/lod_tensor.py b/python/paddle/fluid/lod_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..555e371952d0f902063133c2a227eb78f082726c --- /dev/null +++ b/python/paddle/fluid/lod_tensor.py @@ -0,0 +1,178 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import core +import numpy as np + +__all__ = ['create_lod_tensor', 'create_random_int_lodtensor'] + + +def _validate_lod(lod, tensor_height=-1): + """Check whether the input length-based lod info is valid. + + There are several things to check: + 1. lod should be a list of lists. Empty list is fine. + 2. The length of each sublist (a lod level) should be at least one. + 3. Each element in each lod level should be an integer greater than 0. + 4. The sum of one lod level should be equal to the length of the next lod level. + 5. The sum of the last lod level should be equal to the tensor height. + Bypass this check if user does not provide tensor_height as input. + + Args: + lod: the length-based lod info, e.g., [[2, 3], [2, 1, 2, 3, 4]]. + tensor_height: the outermost dimension of the tensor with which the input + lod is associated with. + + Returns: + A boolean indicating whether the input lod is valid or not. + """ + assert isinstance(lod, list), "lod should be a list" + # Empty lod is fine + if len(lod) == 0: + return True + + lod_sum = [] + for level in lod: + assert isinstance(level, list), "each item in lod should be a list" + # Each level of lod should have at least one length info + if len(level) < 1: + return False + level_sum = 0 + for lod_len in level: + # Each length in a level should be > 0 + if lod_len <= 0: + return False + level_sum += lod_len + lod_sum.append(level_sum) + + for idx, val in enumerate(lod_sum[:-1]): + # Each level's sum should be equal to + # the number of items in the next level + if val != len(lod[idx + 1]): + return False + + if tensor_height == -1: + return True + else: + # Last level's sum should be equal to the tensor height + return lod_sum[-1] == tensor_height + + +def _convert_lod(lod): + """Convert a length-based lod to a offset-based lod. + + If the length-based lod is [[2, 3], [2, 1, 2, 3, 4]], + then the offset-based lod is [[0, 2, 5], [0, 2, 3, 5, 8, 12]]. + + Args: + lod: a length-based lod info. + + Returns: + A list of lists as the offset-based lod converted to from the input lod. + """ + new_lod = [] + for level in lod: + cur_len = 0 + new_level = [cur_len] + for lod_len in level: + cur_len += lod_len + new_level.append(cur_len) + new_lod.append(new_level) + return new_lod + + +def create_lod_tensor(data, lod, place): + """Create a lod tensor from a numpy array or an existing lod tensor. + + Create a lod tensor by doing the following: + 1. Check that the length-based input lod is valid. + 2. Convert the length-based lod to a offset-based LoD. + 3. Copy the data from a numpy array or a existing lod tensor to + CPU or GPU device (based on input place). + 4. Set the level of detail (LoD) using the offset-based LoD. + + Use example: + Suppose we want LoDTensor to hold data for sequences of word, where each word is + represented by an integer. If we want to create a LoDTensor to represent two + sentences, one of 2 words, and one of 3 words. + + Then 'data' can be a numpy array of integers with shape (5, 1). + 'lod' will be [[2, 3]], indicating the length(# of words) in each sentence. + This length-based input lod [[2, 3]] will be converted to offset-based lod [[0, 2, 5]] + inside the function call. + + Please refer to + github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md + for more details regarding LoD. + + Args: + data: a numpy array or a LoDTensor holding the data to be copied. + lod: a list of lists indicating the length-based LoD info specified by the user. + place: CPU or GPU place indicating where the data in the new LoDTensor will be stored. + + Returns: + A fluid LoDTensor object with tensor data and lod info. + """ + if isinstance(data, core.LoDTensor): + return create_lod_tensor(np.array(data), lod, place) + elif isinstance(data, np.ndarray): + assert _validate_lod(lod, + data.shape[0]), "the provided lod info is invalid" + tensor = core.LoDTensor() + tensor.set(data, place) + tensor.set_lod(_convert_lod(lod)) + return tensor + else: + raise Exception( + "data should be either a LoDTensor or a Numpy array, but you pass type %s instead" + % (type(data))) + + +def create_random_int_lodtensor(lod, base_shape, place, low, high): + """Create a LoDTensor containing random integers. + + This function is frequently used in the book examples. So we revised it based on + the new create_lod_tensor API and put it here in the lod_tensor module to simplify + the code. + + The function does the following: + 1. Calculate the overall shape of the LoDTensor based on the length-based 'lod' input + and the shape of the basic element in 'base_shape'. + 2. Create a numpy array of this shape. + 3. Create the LoDTensor using create_lod_tensor API. + + Suppose we want LoDTensor to hold data for sequences of word, where each word is + represented by an integer. If we want to create a LoDTensor to represent two + sentences, one of 2 words, and one of 3 words. Then 'base_shape' is [1], input + length-based 'lod' is [[2, 3]]. Then the overall shape of the LoDTensor would be + [5, 1], holding 5 words for two sentences. + + Args: + data: a numpy array or a LoDTensor holding the data to be copied. + lod: a list of lists indicating the length-based LoD info specified by the user. + base_shape: the shape of the basic element to be held by the LoDTensor. + place: CPU or GPU place indicating where the data in the new LoDTensor will be stored. + low: the lower bound of the random integers. + high: the upper bound of the random integers. + + Returns: + A fluid LoDTensor object with tensor data and lod info. + """ + assert isinstance(base_shape, list), "base_shape should be a list" + converted_lod = _convert_lod(lod) + # append the total number of basic elements to the front of its shape + overall_shape = [converted_lod[-1][-1]] + base_shape + # the range of integer data elements is [low, high] + data = np.random.random_integers(low, high, overall_shape).astype("int64") + return create_lod_tensor(data, lod, place) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 0fc48055220ed84c4ab146ad01b05f393e01078e..115362c6bf33018342699a442c688e7356f3c206 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -28,8 +28,8 @@ from contextlib import contextmanager __all__ = [ 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer', - 'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'Adadelta', 'ModelAverage', - 'Optimizer' + 'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer', + 'Adadelta', 'ModelAverage', 'Optimizer' ] @@ -213,11 +213,13 @@ class Optimizer(object): optimize_ops = [] for param_and_grad in parameters_and_grads: - if param_and_grad[0].trainable is True and param_and_grad[ - 1] is not None: - optimize_op = self._append_optimize_op(loss.block, - param_and_grad) - optimize_ops.append(optimize_op) + with param_and_grad[0].block.program.optimized_guard( + param_and_grad[0]): + if param_and_grad[0].trainable is True and param_and_grad[ + 1] is not None: + optimize_op = self._append_optimize_op(loss.block, + param_and_grad) + optimize_ops.append(optimize_op) # Get custom finish ops for subclasses # FIXME: Need to fix this once we figure out how to handle dependencies diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index c006bd9a66ddb422b7d80d2ca87aa7f56a6485db..c4d6829599616cb3ea7791a189e7070974de6ae3 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -43,31 +43,32 @@ def append_regularization_ops(parameters_and_grads, regularization=None): """ params_and_grads = [] for param, grad in parameters_and_grads: - # If no gradient then we don't need to do anything - if grad is None: + with param.block.program.optimized_guard(param): + # If no gradient then we don't need to do anything + if grad is None: + params_and_grads.append((param, grad)) + continue + + regularization_term = None + if param.regularizer is not None: + # Add variable for regularization term in grad block + regularization_term = param.regularizer(param, grad, grad.block) + elif regularization is not None: + regularization_term = regularization(param, grad, grad.block) + + # If no regularization specified, then we don't need to do anything + if regularization_term is None: + params_and_grads.append((param, grad)) + continue + + assert grad.shape == regularization_term.shape + + grad.block.append_op( + type='elementwise_add', + inputs={"X": grad, + "Y": regularization_term}, + outputs={"Out": grad}) params_and_grads.append((param, grad)) - continue - - regularization_term = None - if param.regularizer is not None: - # Add variable for regularization term in grad block - regularization_term = param.regularizer(param, grad, grad.block) - elif regularization is not None: - regularization_term = regularization(param, grad, grad.block) - - # If no regularization specified, then we don't need to do anything - if regularization_term is None: - params_and_grads.append((param, grad)) - continue - - assert grad.shape == regularization_term.shape - - grad.block.append_op( - type='elementwise_add', - inputs={"X": grad, - "Y": regularization_term}, - outputs={"Out": grad}) - params_and_grads.append((param, grad)) return params_and_grads diff --git a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt index 182e30a6a9b4249a895d15cfd65c403bb6813d0d..efa5ee2d06af3d31e7d84122dd7eea37d6dcf3a3 100644 --- a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt +++ b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt @@ -10,3 +10,7 @@ add_subdirectory(fit_a_line) add_subdirectory(recognize_digits) add_subdirectory(image_classification) add_subdirectory(understand_sentiment) +add_subdirectory(label_semantic_roles) +add_subdirectory(word2vec) +add_subdirectory(recommender_system) +add_subdirectory(machine_translation) diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py index 4c8505acf322a8ee33799c009b523cd70bd01db3..5fba561e024b0690f10939267146f2622c567fa5 100644 --- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py +++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py @@ -92,7 +92,7 @@ def infer(use_cuda, inference_program, save_dirname=None): tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32") results = inferencer.infer({'x': tensor_x}) - print("infer results: ", numpy.array(results[0])) + print("infer results: ", results[0]) def main(use_cuda): diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..673c965b662a022739f8d489c331f4de9455a926 --- /dev/null +++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/CMakeLists.txt @@ -0,0 +1,7 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +# default test +foreach(src ${TEST_OPS}) + py_test(${src} SRCS ${src}.py) +endforeach() diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/no_test_label_semantic_roles.py b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py similarity index 50% rename from python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/no_test_label_semantic_roles.py rename to python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py index fe36e55bb5380975ae322eccbcd8ad41e1e6748a..f4344988141af44af83fda24d73da25f597796ef 100755 --- a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/no_test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py @@ -16,21 +16,23 @@ from __future__ import print_function import paddle import paddle.fluid as fluid -import numpy +import numpy as np WORD_DICT, VERB_DICT, LABEL_DICT = paddle.dataset.conll05.get_dict() WORD_DICT_LEN = len(WORD_DICT) LABEL_DICT_LEN = len(LABEL_DICT) PRED_DICT_LEN = len(VERB_DICT) MARK_DICT_LEN = 2 +IS_SPARSE = True +BATCH_SIZE = 10 +EMBEDDING_NAME = 'emb' -def lstm_net(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark): +def lstm_net(): WORD_DIM = 32 MARK_DIM = 5 HIDDEN_DIM = 512 DEPTH = 8 - EMBEDDING_NAME = 'emb' # Data definitions word = fluid.layers.data( @@ -69,8 +71,9 @@ def lstm_net(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark): fluid.layers.embedding( size=[WORD_DICT_LEN, WORD_DIM], input=x, - param_attr=fluid.ParamAttr( - name=EMBEDDING_NAME, trainable=False)) for x in word_input + param_attr=fluid.ParamAttr(name=EMBEDDING_NAME)) + for x in word_input + #name=EMBEDDING_NAME, trainable=False)) for x in word_input ] emb_layers.append(predicate_embedding) emb_layers.append(mark_embedding) @@ -116,21 +119,16 @@ def lstm_net(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark): return feature_out -def inference_network(): - predict = lstm_net(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, - mark) +def inference_program(): + predict = lstm_net() - crf_decode = fluid.layers.crf_decoding( - input=feature_out, param_attr=fluid.ParamAttr(name='crfw')) + return predict - return crf_decode - -def train_network(): +def train_program(): MIX_HIDDEN_LR = 1e-3 - predict = lstm_net(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, - mark) + predict = lstm_net() target = fluid.layers.data( name='target', shape=[1], dtype='int64', lod_level=1) crf_cost = fluid.layers.linear_chain_crf( @@ -140,87 +138,122 @@ def train_network(): name='crfw', learning_rate=MIX_HIDDEN_LR)) avg_cost = fluid.layers.mean(crf_cost) - return avg_cost + return [avg_cost] -def train(use_cuda, save_path): - BATCH_SIZE = 128 - EPOCH_NUM = 1 +def train(use_cuda, train_program, save_path): + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + optimizer = fluid.optimizer.SGD(learning_rate=0.01) - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.conll05.train(), buf_size=8192), - batch_size=BATCH_SIZE) - test_reader = paddle.batch( - paddle.dataset.conll05.test(), batch_size=BATCH_SIZE) + trainer = fluid.Trainer( + train_func=train_program, place=place, optimizer=optimizer) - def event_handler(event): - if isinstance(event, fluid.EndIteration): - if (event.batch_id % 10) == 0: - avg_cost = trainer.test(reader=test_reader) + feed_order = [ + 'word_data', 'ctx_n2_data', 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', + 'ctx_p2_data', 'verb_data', 'mark_data', 'target' + ] - print('BatchID {0:04}, Loss {1:2.2}'.format(event.batch_id + 1, - avg_cost)) + #embedding_param = fluid.global_scope().find_var( + # EMBEDDING_NAME).get_tensor() + #embedding_param.set( + # load_parameter(conll05.get_embedding(), WORD_DICT_LEN, WORD_DIM), + # place) - if avg_cost > 0.01: # Low threshold for speeding up CI - trainer.save_params(save_path) - return + def event_handler(event): + if isinstance(event, fluid.EndEpochEvent): + test_reader = paddle.batch( + paddle.dataset.conll05.test(), batch_size=BATCH_SIZE) + avg_cost_set = trainer.test( + reader=test_reader, feed_order=feed_order) + + # get avg cost + avg_cost = np.array(avg_cost_set).mean() + + print("avg_cost: %s" % avg_cost) + + if float(avg_cost) < 100.0: # Large value to increase CI speed + trainer.save_params(save_path) + else: + print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1, + float(avg_cost))) + if math.isnan(float(avg_cost)): + sys.exit("got NaN loss, training failed.") + + elif isinstance(event, fluid.EndStepEvent): + print("Step {0}, Epoch {1} Metrics {2}".format( + event.step, event.epoch, map(np.array, event.metrics))) + if event.step == 1: # Run 2 iterations to speed CI + trainer.save_params(save_path) + trainer.stop() - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.layers.exponential_decay( - learning_rate=0.01, - decay_steps=100000, - decay_rate=0.5, - staircase=True)) - trainer = fluid.Trainer(train_network, optimizer=sgd_optimizer, place=place) - trainer.train(train_reader, EPOCH_NUM, event_handler=event_handler) + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.conll05.test(), buf_size=8192), + batch_size=BATCH_SIZE) + trainer.train( + num_epochs=1, + event_handler=event_handler, + reader=train_reader, + feed_order=feed_order) -def infer(use_cuda, save_path): +def infer(use_cuda, inference_program, save_path): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() inferencer = fluid.Inferencer( inference_program, param_path=save_path, place=place) - def create_random_lodtensor(lod, place, low, high): - data = np.random.random_integers(low, high, - [lod[-1], 1]).astype("int64") - res = fluid.LoDTensor() - res.set(data, place) - res.set_lod([lod]) - return res - - # Create an input example - lod = [0, 4, 10] - word = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1) - pred = create_random_lodtensor(lod, place, low=0, high=PRED_DICT_LEN - 1) - ctx_n2 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1) - ctx_n1 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1) - ctx_0 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1) - ctx_p1 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1) - ctx_p2 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1) - mark = create_random_lodtensor(lod, place, low=0, high=MARK_DICT_LEN - 1) - - results = inferencer.infer({ - 'word_data': word, - 'verb_data': pred, - 'ctx_n2_data': ctx_n2, - 'ctx_n1_data': ctx_n1, - 'ctx_0_data': ctx_0, - 'ctx_p1_data': ctx_p1, - 'ctx_p2_data': ctx_p2, - 'mark_data': mark - }) - - print("infer results: ", results) + # Setup inputs by creating LoDTensors to represent sequences of words. + # Here each word is the basic element of these LoDTensors and the shape of + # each word (base_shape) should be [1] since it is simply an index to + # look up for the corresponding word vector. + # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], + # which has only one lod level. Then the created LoDTensors will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for three sentences of + # length 3, 4 and 2, respectively. + # Note that lod info should be a list of lists. + lod = [[3, 4, 2]] + base_shape = [1] + # The range of random integers is [low, high] + word = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + pred = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=PRED_DICT_LEN - 1) + ctx_n2 = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + ctx_n1 = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + ctx_0 = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + ctx_p1 = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + ctx_p2 = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + mark = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=MARK_DICT_LEN - 1) + + results = inferencer.infer( + { + 'word_data': word, + 'verb_data': pred, + 'ctx_n2_data': ctx_n2, + 'ctx_n1_data': ctx_n1, + 'ctx_0_data': ctx_0, + 'ctx_p1_data': ctx_p1, + 'ctx_p2_data': ctx_p2, + 'mark_data': mark + }, + return_numpy=False) + + print("infer results: ", np.array(results[0])) def main(use_cuda): if use_cuda and not fluid.core.is_compiled_with_cuda(): return save_path = "label_semantic_roles.inference.model" - train(use_cuda, save_path) - infer(use_cuda, save_path) + train(use_cuda, train_program, save_path) + infer(use_cuda, inference_program, save_path) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..673c965b662a022739f8d489c331f4de9455a926 --- /dev/null +++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt @@ -0,0 +1,7 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +# default test +foreach(src ${TEST_OPS}) + py_test(${src} SRCS ${src}.py) +endforeach() diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py new file mode 100644 index 0000000000000000000000000000000000000000..7204c7b3c7648a24de89d41e205db5b18ed2a5fc --- /dev/null +++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py @@ -0,0 +1,319 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import contextlib + +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.framework as framework +import paddle.fluid.layers as pd +from paddle.fluid.executor import Executor +from functools import partial +import unittest +import os + +dict_size = 30000 +source_dict_dim = target_dict_dim = dict_size +hidden_dim = 32 +word_dim = 16 +batch_size = 2 +max_length = 8 +topk_size = 50 +trg_dic_size = 10000 +beam_size = 2 + +decoder_size = hidden_dim + + +def encoder(is_sparse): + # encoder + src_word_id = pd.data( + name="src_word_id", shape=[1], dtype='int64', lod_level=1) + src_embedding = pd.embedding( + input=src_word_id, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=is_sparse, + param_attr=fluid.ParamAttr(name='vemb')) + + fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') + lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4) + encoder_out = pd.sequence_last_step(input=lstm_hidden0) + return encoder_out + + +def decoder_train(context, is_sparse): + # decoder + trg_language_word = pd.data( + name="target_language_word", shape=[1], dtype='int64', lod_level=1) + trg_embedding = pd.embedding( + input=trg_language_word, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=is_sparse, + param_attr=fluid.ParamAttr(name='vemb')) + + rnn = pd.DynamicRNN() + with rnn.block(): + current_word = rnn.step_input(trg_embedding) + pre_state = rnn.memory(init=context) + current_state = pd.fc(input=[current_word, pre_state], + size=decoder_size, + act='tanh') + + current_score = pd.fc(input=current_state, + size=target_dict_dim, + act='softmax') + rnn.update_memory(pre_state, current_state) + rnn.output(current_score) + + return rnn() + + +def decoder_decode(context, is_sparse): + init_state = context + array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length) + counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True) + + # fill the first element with init_state + state_array = pd.create_array('float32') + pd.array_write(init_state, array=state_array, i=counter) + + # ids, scores as memory + ids_array = pd.create_array('int64') + scores_array = pd.create_array('float32') + + init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2) + init_scores = pd.data( + name="init_scores", shape=[1], dtype="float32", lod_level=2) + + pd.array_write(init_ids, array=ids_array, i=counter) + pd.array_write(init_scores, array=scores_array, i=counter) + + cond = pd.less_than(x=counter, y=array_len) + + while_op = pd.While(cond=cond) + with while_op.block(): + pre_ids = pd.array_read(array=ids_array, i=counter) + pre_state = pd.array_read(array=state_array, i=counter) + pre_score = pd.array_read(array=scores_array, i=counter) + + # expand the lod of pre_state to be the same with pre_score + pre_state_expanded = pd.sequence_expand(pre_state, pre_score) + + pre_ids_emb = pd.embedding( + input=pre_ids, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=is_sparse) + + # use rnn unit to update rnn + current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb], + size=decoder_size, + act='tanh') + current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score) + # use score to do beam search + current_score = pd.fc(input=current_state_with_lod, + size=target_dict_dim, + act='softmax') + topk_scores, topk_indices = pd.topk(current_score, k=topk_size) + selected_ids, selected_scores = pd.beam_search( + pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) + + pd.increment(x=counter, value=1, in_place=True) + + # update the memories + pd.array_write(current_state, array=state_array, i=counter) + pd.array_write(selected_ids, array=ids_array, i=counter) + pd.array_write(selected_scores, array=scores_array, i=counter) + + pd.less_than(x=counter, y=array_len, cond=cond) + + translation_ids, translation_scores = pd.beam_search_decode( + ids=ids_array, scores=scores_array) + + # return init_ids, init_scores + + return translation_ids, translation_scores + + +def set_init_lod(data, lod, place): + res = fluid.LoDTensor() + res.set(data, place) + res.set_lod(lod) + return res + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + res = fluid.LoDTensor() + res.set(flattened_data, place) + res.set_lod([lod]) + return res + + +def train_program(is_sparse): + context = encoder(is_sparse) + rnn_out = decoder_train(context, is_sparse) + label = pd.data( + name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) + cost = pd.cross_entropy(input=rnn_out, label=label) + avg_cost = pd.mean(cost) + return avg_cost + + +def train(use_cuda, is_sparse, is_local=True): + EPOCH_NUM = 1 + + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(dict_size), buf_size=1000), + batch_size=batch_size) + + feed_order = [ + 'src_word_id', 'target_language_word', 'target_language_next_word' + ] + + def event_handler(event): + if isinstance(event, fluid.EndStepEvent): + print('pass_id=' + str(event.epoch) + ' batch=' + str(event.step)) + if event.step == 10: + trainer.stop() + + trainer = fluid.Trainer( + train_func=partial(train_program, is_sparse), + optimizer=fluid.optimizer.Adagrad( + learning_rate=1e-4, + regularization=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=0.1)), + place=place) + + trainer.train( + reader=train_reader, + num_epochs=EPOCH_NUM, + event_handler=event_handler, + feed_order=feed_order) + + +def decode_main(use_cuda, is_sparse): + + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + context = encoder(is_sparse) + translation_ids, translation_scores = decoder_decode(context, is_sparse) + + exe = Executor(place) + exe.run(framework.default_startup_program()) + + init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64') + init_scores_data = np.array( + [1. for _ in range(batch_size)], dtype='float32') + init_ids_data = init_ids_data.reshape((batch_size, 1)) + init_scores_data = init_scores_data.reshape((batch_size, 1)) + init_lod = [i for i in range(batch_size)] + [batch_size] + init_lod = [init_lod, init_lod] + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(dict_size), buf_size=1000), + batch_size=batch_size) + for _, data in enumerate(train_data()): + init_ids = set_init_lod(init_ids_data, init_lod, place) + init_scores = set_init_lod(init_scores_data, init_lod, place) + + src_word_data = to_lodtensor(map(lambda x: x[0], data), place) + + result_ids, result_scores = exe.run( + framework.default_main_program(), + feed={ + 'src_word_id': src_word_data, + 'init_ids': init_ids, + 'init_scores': init_scores + }, + fetch_list=[translation_ids, translation_scores], + return_numpy=False) + print result_ids.lod() + break + + +class TestMachineTranslation(unittest.TestCase): + pass + + +@contextlib.contextmanager +def scope_prog_guard(): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield + + +def inject_test_train(use_cuda, is_sparse): + f_name = 'test_{0}_{1}_train'.format('cuda' if use_cuda else 'cpu', 'sparse' + if is_sparse else 'dense') + + def f(*args): + with scope_prog_guard(): + train(use_cuda, is_sparse) + + setattr(TestMachineTranslation, f_name, f) + + +def inject_test_decode(use_cuda, is_sparse, decorator=None): + f_name = 'test_{0}_{1}_decode'.format('cuda' + if use_cuda else 'cpu', 'sparse' + if is_sparse else 'dense') + + def f(*args): + with scope_prog_guard(): + decode_main(use_cuda, is_sparse) + + if decorator is not None: + f = decorator(f) + + setattr(TestMachineTranslation, f_name, f) + + +for _use_cuda_ in (False, True): + for _is_sparse_ in (False, True): + inject_test_train(_use_cuda_, _is_sparse_) + +for _use_cuda_ in (False, True): + for _is_sparse_ in (False, True): + + _decorator_ = None + if _use_cuda_: + _decorator_ = unittest.skip( + reason='Beam Search does not support CUDA!') + + inject_test_decode( + is_sparse=_is_sparse_, use_cuda=_use_cuda_, decorator=_decorator_) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py index 2128d4c5b87434ebe30930dc0e338b3b50d921c2..2aac70463c64019ec97b0c3893b4b52f77967797 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py @@ -112,7 +112,7 @@ def infer(use_cuda, inference_program, save_dirname=None): results = inferencer.infer({'img': tensor_img}) - print("infer results: ", numpy.array(results[0])) + print("infer results: ", results[0]) def main(use_cuda): diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py index 041c8d778e5c03aa68dad6ef450934f09c8d2a52..32653157994f81c46f420c1b55ceddbbbf06f2fe 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py @@ -93,7 +93,7 @@ def infer(use_cuda, inference_program, save_dirname=None): results = inferencer.infer({'img': tensor_img}) - print("infer results: ", numpy.array(results[0])) + print("infer results: ", results[0]) def main(use_cuda): diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/recommender_system/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..673c965b662a022739f8d489c331f4de9455a926 --- /dev/null +++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/CMakeLists.txt @@ -0,0 +1,7 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +# default test +foreach(src ${TEST_OPS}) + py_test(${src} SRCS ${src}.py) +endforeach() diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py new file mode 100644 index 0000000000000000000000000000000000000000..259680cb097a12a4fc92107f6fd8595393f88bd5 --- /dev/null +++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py @@ -0,0 +1,265 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import sys +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.fluid.nets as nets + +IS_SPARSE = True +USE_GPU = False +BATCH_SIZE = 256 + + +def get_usr_combined_features(): + # FIXME(dzh) : old API integer_value(10) may have range check. + # currently we don't have user configurated check. + + USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1 + + uid = layers.data(name='user_id', shape=[1], dtype='int64') + + usr_emb = layers.embedding( + input=uid, + dtype='float32', + size=[USR_DICT_SIZE, 32], + param_attr='user_table', + is_sparse=IS_SPARSE) + + usr_fc = layers.fc(input=usr_emb, size=32) + + USR_GENDER_DICT_SIZE = 2 + + usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64') + + usr_gender_emb = layers.embedding( + input=usr_gender_id, + size=[USR_GENDER_DICT_SIZE, 16], + param_attr='gender_table', + is_sparse=IS_SPARSE) + + usr_gender_fc = layers.fc(input=usr_gender_emb, size=16) + + USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table) + usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64") + + usr_age_emb = layers.embedding( + input=usr_age_id, + size=[USR_AGE_DICT_SIZE, 16], + is_sparse=IS_SPARSE, + param_attr='age_table') + + usr_age_fc = layers.fc(input=usr_age_emb, size=16) + + USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1 + usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64") + + usr_job_emb = layers.embedding( + input=usr_job_id, + size=[USR_JOB_DICT_SIZE, 16], + param_attr='job_table', + is_sparse=IS_SPARSE) + + usr_job_fc = layers.fc(input=usr_job_emb, size=16) + + concat_embed = layers.concat( + input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1) + + usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh") + + return usr_combined_features + + +def get_mov_combined_features(): + + MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1 + + mov_id = layers.data(name='movie_id', shape=[1], dtype='int64') + + mov_emb = layers.embedding( + input=mov_id, + dtype='float32', + size=[MOV_DICT_SIZE, 32], + param_attr='movie_table', + is_sparse=IS_SPARSE) + + mov_fc = layers.fc(input=mov_emb, size=32) + + CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories()) + + category_id = layers.data( + name='category_id', shape=[1], dtype='int64', lod_level=1) + + mov_categories_emb = layers.embedding( + input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE) + + mov_categories_hidden = layers.sequence_pool( + input=mov_categories_emb, pool_type="sum") + + MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict()) + + mov_title_id = layers.data( + name='movie_title', shape=[1], dtype='int64', lod_level=1) + + mov_title_emb = layers.embedding( + input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE) + + mov_title_conv = nets.sequence_conv_pool( + input=mov_title_emb, + num_filters=32, + filter_size=3, + act="tanh", + pool_type="sum") + + concat_embed = layers.concat( + input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1) + + # FIXME(dzh) : need tanh operator + mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh") + + return mov_combined_features + + +def inference_program(): + usr_combined_features = get_usr_combined_features() + mov_combined_features = get_mov_combined_features() + + inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features) + scale_infer = layers.scale(x=inference, scale=5.0) + + return scale_infer + + +def train_program(): + + scale_infer = inference_program() + + label = layers.data(name='score', shape=[1], dtype='float32') + square_cost = layers.square_error_cost(input=scale_infer, label=label) + avg_cost = layers.mean(square_cost) + + return [avg_cost, scale_infer] + + +def train(use_cuda, train_program, save_path): + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + optimizer = fluid.optimizer.SGD(learning_rate=0.2) + + trainer = fluid.Trainer( + train_func=train_program, place=place, optimizer=optimizer) + + feed_order = [ + 'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id', + 'movie_title', 'score' + ] + + def event_handler(event): + if isinstance(event, fluid.EndStepEvent): + test_reader = paddle.batch( + paddle.dataset.movielens.test(), batch_size=BATCH_SIZE) + avg_cost_set = trainer.test( + reader=test_reader, feed_order=feed_order) + + # get avg cost + avg_cost = np.array(avg_cost_set).mean() + + print("avg_cost: %s" % avg_cost) + + if float(avg_cost) < 4: # Smaller value to increase CI speed + trainer.save_params(save_path) + trainer.stop() + else: + print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1, + float(avg_cost))) + if math.isnan(float(avg_cost)): + sys.exit("got NaN loss, training failed.") + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.movielens.train(), buf_size=8192), + batch_size=BATCH_SIZE) + + trainer.train( + num_epochs=1, + event_handler=event_handler, + reader=train_reader, + feed_order=[ + 'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', + 'category_id', 'movie_title', 'score' + ]) + + +def infer(use_cuda, inference_program, save_path): + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + inferencer = fluid.Inferencer( + inference_program, param_path=save_path, place=place) + + def create_lod_tensor(data, lod=None): + tensor = fluid.LoDTensor() + if lod is None: + # Tensor, the shape is [batch_size, 1] + index = 0 + lod_0 = [index] + for l in range(len(data)): + index += 1 + lod_0.append(index) + lod = [lod_0] + tensor.set_lod(lod) + + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + tensor.set(flattened_data, place) + return tensor + + # Generate a random input for inference + user_id = create_lod_tensor([[1]]) + gender_id = create_lod_tensor([[1]]) + age_id = create_lod_tensor([[0]]) + job_id = create_lod_tensor([[10]]) + movie_id = create_lod_tensor([[783]]) + category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]]) + movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]], + [[0, 5]]) + + results = inferencer.infer( + { + 'user_id': user_id, + 'gender_id': gender_id, + 'age_id': age_id, + 'job_id': job_id, + 'movie_id': movie_id, + 'category_id': category_id, + 'movie_title': movie_title + }, + return_numpy=False) + + print("infer results: ", np.array(results[0])) + + +def main(use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + save_path = "recommender_system.inference.model" + train(use_cuda=use_cuda, train_program=train_program, save_path=save_path) + infer( + use_cuda=use_cuda, + inference_program=inference_program, + save_path=save_path) + + +if __name__ == '__main__': + main(USE_GPU) diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..7e32696f9909a0a440f6bdc401ac9f9594c4dec7 --- /dev/null +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py @@ -0,0 +1,153 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle +import paddle.fluid as fluid +from functools import partial +import numpy as np + +CLASS_DIM = 2 +EMB_DIM = 128 +HID_DIM = 512 +BATCH_SIZE = 128 + + +def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim): + emb = fluid.layers.embedding( + input=data, size=[input_dim, emb_dim], is_sparse=True) + conv_3 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=3, + act="tanh", + pool_type="sqrt") + conv_4 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=4, + act="tanh", + pool_type="sqrt") + prediction = fluid.layers.fc(input=[conv_3, conv_4], + size=class_dim, + act="softmax") + return prediction + + +def inference_program(word_dict): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + + dict_dim = len(word_dict) + net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM) + return net + + +def train_program(word_dict): + prediction = inference_program(word_dict) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(cost) + accuracy = fluid.layers.accuracy(input=prediction, label=label) + return [avg_cost, accuracy] + + +def train(use_cuda, train_program, save_dirname): + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + optimizer = fluid.optimizer.Adagrad(learning_rate=0.002) + + word_dict = paddle.dataset.imdb.word_dict() + trainer = fluid.Trainer( + train_func=partial(train_program, word_dict), + place=place, + optimizer=optimizer) + + def event_handler(event): + if isinstance(event, fluid.EndEpochEvent): + test_reader = paddle.batch( + paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE) + avg_cost, acc = trainer.test( + reader=test_reader, feed_order=['words', 'label']) + + print("avg_cost: %s" % avg_cost) + print("acc : %s" % acc) + + if acc > 0.2: # Smaller value to increase CI speed + trainer.save_params(save_dirname) + trainer.stop() + + else: + print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format( + event.epoch + 1, avg_cost, acc)) + if math.isnan(avg_cost): + sys.exit("got NaN loss, training failed.") + elif isinstance(event, fluid.EndStepEvent): + print("Step {0}, Epoch {1} Metrics {2}".format( + event.step, event.epoch, map(np.array, event.metrics))) + if event.step == 1: # Run 2 iterations to speed CI + trainer.save_params(save_dirname) + trainer.stop() + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.imdb.train(word_dict), buf_size=25000), + batch_size=BATCH_SIZE) + + trainer.train( + num_epochs=1, + event_handler=event_handler, + reader=train_reader, + feed_order=['words', 'label']) + + +def infer(use_cuda, inference_program, save_dirname=None): + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + word_dict = paddle.dataset.imdb.word_dict() + + inferencer = fluid.Inferencer( + infer_func=partial(inference_program, word_dict), + param_path=save_dirname, + place=place) + + # Setup input by creating LoDTensor to represent sequence of words. + # Here each word is the basic element of the LoDTensor and the shape of + # each word (base_shape) should be [1] since it is simply an index to + # look up for the corresponding word vector. + # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], + # which has only one lod level. Then the created LoDTensor will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for three sentences of + # length 3, 4 and 2, respectively. + # Note that lod info should be a list of lists. + lod = [[3, 4, 2]] + base_shape = [1] + # The range of random integers is [low, high] + tensor_words = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=len(word_dict) - 1) + results = inferencer.infer({'words': tensor_words}) + print("infer results: ", results) + + +def main(use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + save_path = "understand_sentiment_conv.inference.model" + train(use_cuda, train_program, save_path) + infer(use_cuda, inference_program, save_path) + + +if __name__ == '__main__': + for use_cuda in (False, True): + main(use_cuda=use_cuda) diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py new file mode 100644 index 0000000000000000000000000000000000000000..e50b7920b17f86eada3abc700c5403053fca8771 --- /dev/null +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py @@ -0,0 +1,168 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle +import paddle.fluid as fluid +from functools import partial +import numpy as np + +CLASS_DIM = 2 +EMB_DIM = 128 +BATCH_SIZE = 128 +LSTM_SIZE = 128 + + +def dynamic_rnn_lstm(data, input_dim, class_dim, emb_dim, lstm_size): + emb = fluid.layers.embedding( + input=data, size=[input_dim, emb_dim], is_sparse=True) + sentence = fluid.layers.fc(input=emb, size=lstm_size, act='tanh') + + rnn = fluid.layers.DynamicRNN() + with rnn.block(): + word = rnn.step_input(sentence) + prev_hidden = rnn.memory(value=0.0, shape=[lstm_size]) + prev_cell = rnn.memory(value=0.0, shape=[lstm_size]) + + def gate_common(ipt, hidden, size): + gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True) + gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False) + return gate0 + gate1 + + forget_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden, + lstm_size)) + input_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden, + lstm_size)) + output_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden, + lstm_size)) + cell_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden, + lstm_size)) + + cell = forget_gate * prev_cell + input_gate * cell_gate + hidden = output_gate * fluid.layers.tanh(x=cell) + rnn.update_memory(prev_cell, cell) + rnn.update_memory(prev_hidden, hidden) + rnn.output(hidden) + + last = fluid.layers.sequence_last_step(rnn()) + prediction = fluid.layers.fc(input=last, size=class_dim, act="softmax") + return prediction + + +def inference_program(word_dict): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + + dict_dim = len(word_dict) + pred = dynamic_rnn_lstm(data, dict_dim, CLASS_DIM, EMB_DIM, LSTM_SIZE) + return pred + + +def train_program(word_dict): + prediction = inference_program(word_dict) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(cost) + accuracy = fluid.layers.accuracy(input=prediction, label=label) + return [avg_cost, accuracy] + + +def train(use_cuda, train_program, save_dirname): + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + optimizer = fluid.optimizer.Adagrad(learning_rate=0.002) + + word_dict = paddle.dataset.imdb.word_dict() + trainer = fluid.Trainer( + train_func=partial(train_program, word_dict), + place=place, + optimizer=optimizer) + + def event_handler(event): + if isinstance(event, fluid.EndEpochEvent): + test_reader = paddle.batch( + paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE) + avg_cost, acc = trainer.test( + reader=test_reader, feed_order=['words', 'label']) + + print("avg_cost: %s" % avg_cost) + print("acc : %s" % acc) + + if acc > 0.2: # Smaller value to increase CI speed + trainer.save_params(save_dirname) + trainer.stop() + + else: + print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format( + event.epoch + 1, avg_cost, acc)) + if math.isnan(avg_cost): + sys.exit("got NaN loss, training failed.") + elif isinstance(event, fluid.EndStepEvent): + print("Step {0}, Epoch {1} Metrics {2}".format( + event.step, event.epoch, map(np.array, event.metrics))) + if event.step == 1: # Run 2 iterations to speed CI + trainer.save_params(save_dirname) + trainer.stop() + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.imdb.train(word_dict), buf_size=25000), + batch_size=BATCH_SIZE) + + trainer.train( + num_epochs=1, + event_handler=event_handler, + reader=train_reader, + feed_order=['words', 'label']) + + +def infer(use_cuda, inference_program, save_dirname=None): + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + word_dict = paddle.dataset.imdb.word_dict() + + inferencer = fluid.Inferencer( + infer_func=partial(inference_program, word_dict), + param_path=save_dirname, + place=place) + + # Setup input by creating LoDTensor to represent sequence of words. + # Here each word is the basic element of the LoDTensor and the shape of + # each word (base_shape) should be [1] since it is simply an index to + # look up for the corresponding word vector. + # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], + # which has only one lod level. Then the created LoDTensor will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for three sentences of + # length 3, 4 and 2, respectively. + # Note that lod info should be a list of lists. + lod = [[3, 4, 2]] + base_shape = [1] + # The range of random integers is [low, high] + tensor_words = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=len(word_dict) - 1) + results = inferencer.infer({'words': tensor_words}) + print("infer results: ", results) + + +def main(use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + save_path = "understand_sentiment_conv.inference.model" + train(use_cuda, train_program, save_path) + infer(use_cuda, inference_program, save_path) + + +if __name__ == '__main__': + for use_cuda in (False, True): + main(use_cuda=use_cuda) diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py index 0d7cbe3874cbc0c2def9d0032737f81e662296d6..d4fb80168814359827708ad921bd3f53b14bb2ee 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py @@ -128,17 +128,21 @@ def infer(use_cuda, inference_program, save_dirname=None): param_path=save_dirname, place=place) - def create_random_lodtensor(lod, place, low, high): - data = np.random.random_integers(low, high, - [lod[-1], 1]).astype("int64") - res = fluid.LoDTensor() - res.set(data, place) - res.set_lod([lod]) - return res - - lod = [0, 4, 10] - tensor_words = create_random_lodtensor( - lod, place, low=0, high=len(word_dict) - 1) + # Setup input by creating LoDTensor to represent sequence of words. + # Here each word is the basic element of the LoDTensor and the shape of + # each word (base_shape) should be [1] since it is simply an index to + # look up for the corresponding word vector. + # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], + # which has only one lod level. Then the created LoDTensor will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for three sentences of + # length 3, 4 and 2, respectively. + # Note that lod info should be a list of lists. + lod = [[3, 4, 2]] + base_shape = [1] + # The range of random integers is [low, high] + tensor_words = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=len(word_dict) - 1) results = inferencer.infer({'words': tensor_words}) print("infer results: ", results) diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/word2vec/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..673c965b662a022739f8d489c331f4de9455a926 --- /dev/null +++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/CMakeLists.txt @@ -0,0 +1,7 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +# default test +foreach(src ${TEST_OPS}) + py_test(${src} SRCS ${src}.py) +endforeach() diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py index bf86cd9acf8da940fcc2fb5b594e33f9b6965acb..16d73d4aff4ba31327e6d8f5ac04a36387f59daa 100644 --- a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py +++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py @@ -25,16 +25,6 @@ HIDDEN_SIZE = 256 N = 5 BATCH_SIZE = 32 - -def create_random_lodtensor(lod, place, low, high): - # The range of data elements is [low, high] - data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64") - res = fluid.LoDTensor() - res.set(data, place) - res.set_lod([lod]) - return res - - word_dict = paddle.dataset.imikolov.build_dict() dict_size = len(word_dict) @@ -130,11 +120,23 @@ def infer(use_cuda, inference_program, save_dirname=None): inferencer = fluid.Inferencer( infer_func=inference_program, param_path=save_dirname, place=place) - lod = [0, 1] - first_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1) - second_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1) - third_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1) - fourth_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1) + # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word + # is simply an index to look up for the corresponding word vector and hence + # the shape of word (base_shape) should be [1]. The length-based level of + # detail (lod) info of each LoDtensor should be [[1]] meaning there is only + # one lod_level and there is only one sequence of one word on this level. + # Note that lod info should be a list of lists. + lod = [[1]] + base_shape = [1] + # The range of random integers is [low, high] + first_word = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=dict_size - 1) + second_word = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=dict_size - 1) + third_word = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=dict_size - 1) + fourth_word = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=dict_size - 1) result = inferencer.infer( { diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py index 792ed7368d646cd9dff9255eb402b6a9b84f69a6..c6687e8ad7fcc45c82d6dcb2256e9055a81cc61c 100644 --- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py +++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py @@ -125,14 +125,6 @@ def stacked_lstm_net(data, return avg_cost, accuracy, prediction -def create_random_lodtensor(lod, place, low, high): - data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64") - res = fluid.LoDTensor() - res.set(data, place) - res.set_lod([lod]) - return res - - def train(word_dict, net_method, use_cuda, @@ -242,9 +234,21 @@ def infer(word_dict, use_cuda, save_dirname=None): word_dict_len = len(word_dict) - lod = [0, 4, 10] - tensor_words = create_random_lodtensor( - lod, place, low=0, high=word_dict_len - 1) + # Setup input by creating LoDTensor to represent sequence of words. + # Here each word is the basic element of the LoDTensor and the shape of + # each word (base_shape) should be [1] since it is simply an index to + # look up for the corresponding word vector. + # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], + # which has only one lod level. Then the created LoDTensor will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for three sentences of + # length 3, 4 and 2, respectively. + # Note that lod info should be a list of lists. + lod = [[3, 4, 2]] + base_shape = [1] + # The range of random integers is [low, high] + tensor_words = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=word_dict_len - 1) # Construct feed as a dictionary of {feed_target_name: feed_target_data} # and results will contain a list of data corresponding to fetch_targets. diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index f1ee5dfd99e1c8b26280c010c1aaca05a004a5b6..bc8a1aafc82d62501cecfa71be0cc3851c75eae2 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -116,29 +116,6 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, return feature_out -def to_lodtensor(data, place): - seq_lens = [len(seq) for seq in data] - cur_len = 0 - lod = [cur_len] - for l in seq_lens: - cur_len += l - lod.append(cur_len) - flattened_data = np.concatenate(data, axis=0).astype("int64") - flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = fluid.LoDTensor() - res.set(flattened_data, place) - res.set_lod([lod]) - return res - - -def create_random_lodtensor(lod, place, low, high): - data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64") - res = fluid.LoDTensor() - res.set(data, place) - res.set_lod([lod]) - return res - - def train(use_cuda, save_dirname=None, is_local=True): # define network topology word = fluid.layers.data( @@ -271,23 +248,35 @@ def infer(use_cuda, save_dirname=None): [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) - lod = [0, 4, 10] - word = create_random_lodtensor( - lod, place, low=0, high=word_dict_len - 1) - pred = create_random_lodtensor( - lod, place, low=0, high=pred_dict_len - 1) - ctx_n2 = create_random_lodtensor( - lod, place, low=0, high=word_dict_len - 1) - ctx_n1 = create_random_lodtensor( - lod, place, low=0, high=word_dict_len - 1) - ctx_0 = create_random_lodtensor( - lod, place, low=0, high=word_dict_len - 1) - ctx_p1 = create_random_lodtensor( - lod, place, low=0, high=word_dict_len - 1) - ctx_p2 = create_random_lodtensor( - lod, place, low=0, high=word_dict_len - 1) - mark = create_random_lodtensor( - lod, place, low=0, high=mark_dict_len - 1) + # Setup inputs by creating LoDTensors to represent sequences of words. + # Here each word is the basic element of these LoDTensors and the shape of + # each word (base_shape) should be [1] since it is simply an index to + # look up for the corresponding word vector. + # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], + # which has only one lod level. Then the created LoDTensors will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for three sentences of + # length 3, 4 and 2, respectively. + # Note that lod info should be a list of lists. + lod = [[3, 4, 2]] + base_shape = [1] + # The range of random integers is [low, high] + word = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=word_dict_len - 1) + pred = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=pred_dict_len - 1) + ctx_n2 = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=word_dict_len - 1) + ctx_n1 = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=word_dict_len - 1) + ctx_0 = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=word_dict_len - 1) + ctx_p1 = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=word_dict_len - 1) + ctx_p2 = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=word_dict_len - 1) + mark = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=mark_dict_len - 1) # Construct feed as a dictionary of {feed_target_name: feed_target_data} # and results will contain a list of data corresponding to fetch_targets. diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index 30e1a5040cc92b02bbbf90dac97001812ec90134..3118d88701e5f64ae50f7ee774ea8174aa7758eb 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -21,15 +21,6 @@ import math import sys -def create_random_lodtensor(lod, place, low, high): - # The range of data elements is [low, high] - data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64") - res = fluid.LoDTensor() - res.set(data, place) - res.set_lod([lod]) - return res - - def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): PASS_NUM = 100 EMBED_SIZE = 32 @@ -175,16 +166,23 @@ def infer(use_cuda, save_dirname=None): word_dict = paddle.dataset.imikolov.build_dict() dict_size = len(word_dict) - # Setup inputs, by creating 4 words, the lod of which should be [0, 1] - lod = [0, 1] - first_word = create_random_lodtensor( - lod, place, low=0, high=dict_size - 1) - second_word = create_random_lodtensor( - lod, place, low=0, high=dict_size - 1) - third_word = create_random_lodtensor( - lod, place, low=0, high=dict_size - 1) - fourth_word = create_random_lodtensor( - lod, place, low=0, high=dict_size - 1) + # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word + # is simply an index to look up for the corresponding word vector and hence + # the shape of word (base_shape) should be [1]. The length-based level of + # detail (lod) info of each LoDtensor should be [[1]] meaning there is only + # one lod_level and there is only one sequence of one word on this level. + # Note that lod info should be a list of lists. + lod = [[1]] + base_shape = [1] + # The range of random integers is [low, high] + first_word = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=dict_size - 1) + second_word = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=dict_size - 1) + third_word = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=dict_size - 1) + fourth_word = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=dict_size - 1) assert feed_target_names[0] == 'firstw' assert feed_target_names[1] == 'secondw' diff --git a/python/paddle/fluid/tests/test_cpp_reader.py b/python/paddle/fluid/tests/test_cpp_reader.py index e54c73b2956dd99ee57804318130c261e133d21a..6cc291dfcffdd7083f498389834e37bd06ca4572 100644 --- a/python/paddle/fluid/tests/test_cpp_reader.py +++ b/python/paddle/fluid/tests/test_cpp_reader.py @@ -44,8 +44,8 @@ create_random_data_generator_op = startup_block.append_op( attrs={ "shape_concat": [1, 2, 1, 1], "ranks": [2, 2], - "min": 0.0, - "max": 1.0, + "low": 0.0, + "high": 1.0, 'lod_levels': [0, 0] }) diff --git a/python/paddle/fluid/tests/test_lod_tensor.py b/python/paddle/fluid/tests/test_lod_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..b11131456a1f87419407c4d8626ebcde26dd7640 --- /dev/null +++ b/python/paddle/fluid/tests/test_lod_tensor.py @@ -0,0 +1,88 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid +from paddle.fluid.lod_tensor import create_lod_tensor, create_random_int_lodtensor, _validate_lod, _convert_lod +import numpy +import unittest + + +class TestLoDTensor(unittest.TestCase): + def test_validate_lod(self): + lod = (1, 2, 1) + self.assertRaises(AssertionError, _validate_lod, lod, -1) + lod = [[1, 2], (2, 3)] + self.assertRaises(AssertionError, _validate_lod, lod, -1) + lod = [1, 2, 3] + self.assertRaises(AssertionError, _validate_lod, lod, -1) + + lod = [] + self.assertTrue(_validate_lod(lod, -1)) + lod = [[], [1], [3]] + self.assertFalse(_validate_lod(lod, -1)) + lod = [[0], [-1], [3]] + self.assertFalse(_validate_lod(lod, -1)) + + # Each level's sum should be equal to the number of items in the next level + # Moreover, last level's sum should be equal to the tensor height + lod = [[2, 3], [1, 3, 1, 2, 1]] + self.assertTrue(_validate_lod(lod, tensor_height=8)) + lod = [[1, 3], [2, 1, 3]] + self.assertFalse(_validate_lod(lod, tensor_height=6)) + lod = [[1, 3], [2, 1, 3, 4]] + self.assertFalse(_validate_lod(lod, tensor_height=5)) + + def test_convert_lod(self): + lod = [[1, 2, 3]] + converted_lod = [[0, 1, 3, 6]] + self.assertEqual(_convert_lod(lod), converted_lod) + + lod = [[2, 3], [1, 3, 1, 2, 1]] + converted_lod = [[0, 2, 5], [0, 1, 4, 5, 7, 8]] + self.assertEqual(_convert_lod(lod), converted_lod) + + def test_create_lod_tensor(self): + # Only numpy array or a fluid LoDTensor is valid input to + # create_lod_tensor function, currently a list of lists is not. + data = [[1, 2], [3, 4]] + self.assertRaises(Exception, create_lod_tensor, data, [], + fluid.CPUPlace()) + + # Create LoDTensor from numpy array + data = numpy.random.random([10, 1]) + lod = [[2, 1], [3, 3, 4]] + tensor = create_lod_tensor(data, lod, fluid.CPUPlace()) + self.assertEqual(tensor.lod(), [[0, 2, 3], [0, 3, 6, 10]]) + + # Create LoDTensor from another LoDTensor, they are differnt instances + new_lod = [[2, 2, 1], [1, 2, 2, 3, 2]] + new_tensor = create_lod_tensor(tensor, new_lod, fluid.CPUPlace()) + self.assertEqual(tensor.lod(), [[0, 2, 3], [0, 3, 6, 10]]) + self.assertEqual(new_tensor.lod(), [[0, 2, 4, 5], [0, 1, 3, 5, 8, 10]]) + + def test_create_random_int_lodtensor(self): + # The shape of a word, commonly used in speech and NLP problem, is [1] + shape = [1] + lod = [[2, 3, 5]] + dict_size = 10000 + low = 0 + high = dict_size - 1 + tensor = create_random_int_lodtensor(lod, shape, + fluid.CPUPlace(), low, high) + self.assertEqual(tensor.lod(), [[0, 2, 5, 10]]) + self.assertEqual(tensor.shape(), [10, 1]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 2ae9653953c2f5f6a399243bef2c7fb756f9692f..eed1412ba4f2b8f2209c0573359bea1e4b20d8d5 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -17,7 +17,7 @@ endif(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185 -list(REMOVE_ITEM TEST_OPS test_nce) # IXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778 +list(REMOVE_ITEM TEST_OPS test_nce) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778 list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152 list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957 @@ -26,7 +26,7 @@ list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not function(py_test_modules TARGET_NAME) if(WITH_TESTING) - set(options "") + set(options SERIAL) set(oneValueArgs "") set(multiValueArgs MODULES DEPS ENVS) cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -34,76 +34,17 @@ function(py_test_modules TARGET_NAME) COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS} ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + if (py_test_modules_SERIAL) + set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1) + endif() endif() endfunction() - -list(REMOVE_ITEM TEST_OPS test_sequence_expand) - -# test time consuming OPs in a separate process for expliot parallism -list(REMOVE_ITEM TEST_OPS test_parallel_executor) list(REMOVE_ITEM TEST_OPS test_warpctc_op) -list(REMOVE_ITEM TEST_OPS test_dyn_rnn) -list(REMOVE_ITEM TEST_OPS test_mul_op) - -# tests that need to be run in separate process. -list(REMOVE_ITEM TEST_OPS test_multihead_attention) -list(REMOVE_ITEM TEST_OPS test_calc_gradient) -list(REMOVE_ITEM TEST_OPS test_while_op) -list(REMOVE_ITEM TEST_OPS test_lod_array_length_op) -list(REMOVE_ITEM TEST_OPS test_reorder_lod_tensor) -list(REMOVE_ITEM TEST_OPS test_profiler) -list(REMOVE_ITEM TEST_OPS test_nvprof) -list(REMOVE_ITEM TEST_OPS test_normalization_wrapper) -list(REMOVE_ITEM TEST_OPS test_executor_and_mul) -list(REMOVE_ITEM TEST_OPS test_assign_value_op) -list(REMOVE_ITEM TEST_OPS test_array_read_write_op) -list(REMOVE_ITEM TEST_OPS test_lod_rank_table) -list(REMOVE_ITEM TEST_OPS test_weight_normalization) -list(REMOVE_ITEM TEST_OPS test_conditional_block) -list(REMOVE_ITEM TEST_OPS test_parameter) -list(REMOVE_ITEM TEST_OPS test_registry) -list(REMOVE_ITEM TEST_OPS test_fetch_var) -list(REMOVE_ITEM TEST_OPS test_parallel_op) -list(REMOVE_ITEM TEST_OPS test_dynrnn_static_input) list(REMOVE_ITEM TEST_OPS test_dist_train) -list(REMOVE_ITEM TEST_OPS test_network_with_dtype) - -# tests that can be bundled together in one python process for speed. -if(WITH_FAST_BUNDLE_TEST) - py_test_modules("test_all_ops" MODULES ${TEST_OPS}) -else() - foreach(TEST_OP ${TEST_OPS}) - py_test_modules(${TEST_OP} MODULES ${TEST_OP}) - endforeach(TEST_OP) -endif(WITH_FAST_BUNDLE_TEST) - -# -py_test_modules(test_sequence_expand MODULES test_sequence_expand) -# tests with high overhead -py_test_modules(test_parallel_executor MODULES test_parallel_executor) -py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR}) -py_test_modules(test_train_dyn_rnn MODULES test_dyn_rnn) -py_test_modules(test_mul_op MODULES test_mul_op) -py_test_modules(test_network_with_dtype MODULES test_network_with_dtype) - -# tests that need to be run in separate process. -py_test_modules(test_multihead_attention MODULES test_multihead_attention) -py_test_modules(test_calc_gradient MODULES test_calc_gradient) -py_test_modules(test_while_op MODULES test_while_op) -py_test_modules(test_lod_array_length_op MODULES test_lod_array_length_op) -py_test_modules(test_reorder_lod_tensor MODULES test_reorder_lod_tensor) -py_test_modules(test_profiler MODULES test_profiler) -py_test_modules(test_nvprof MODULES test_nvprof) -py_test_modules(test_normalization_wrapper MODULES test_normalization_wrapper) -py_test_modules(test_executor_and_mul MODULES test_executor_and_mul) -py_test_modules(test_assign_value_op MODULES test_assign_value_op) -py_test_modules(test_array_read_write_op MODULES test_array_read_write_op) -py_test_modules(test_lod_rank_table MODULES test_lod_rank_table) -py_test_modules(test_weight_normalization MODULES test_weight_normalization) -py_test_modules(test_conditional_block MODULES test_conditional_block) -py_test_modules(test_parameter MODULES test_parameter) -py_test_modules(test_registry MODULES test_registry) -py_test_modules(test_fetch_var MODULES test_fetch_var) -py_test_modules(test_dynrnn_static_input MODULES test_dynrnn_static_input) -py_test_modules(test_parallel_op MODULES test_parallel_op) -py_test_modules(test_dist_train MODULES test_dist_train) +list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) +list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) +foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) +endforeach(TEST_OP) +py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL) +py_test_modules(test_dist_train MODULES test_dist_train SERIAL) diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 299ab8e51f017e1980a8b40e3830fc42b1ff7ccc..709b4bf2fcfb180c747ba3539711a58a57e3b77f 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -36,6 +36,12 @@ def randomize_probability(batch_size, class_num, dtype='float32'): def create_op(scope, op_type, inputs, outputs, attrs): kwargs = dict() + op_maker = core.op_proto_and_checker_maker + op_role_attr_name = op_maker.kOpRoleAttrName() + + if op_role_attr_name not in attrs: + attrs[op_role_attr_name] = int(op_maker.OpRole.Forward) + def __create_var__(name, var_name): scope.var(var_name).get_tensor() kwargs[name].append(var_name) diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py new file mode 100644 index 0000000000000000000000000000000000000000..c9c3c648717814c28c39a401487925824e885946 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -0,0 +1,96 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle.fluid as fluid +import time +import numpy as np + +__all__ = ['TestParallelExecutorBase'] + + +class TestParallelExecutorBase(unittest.TestCase): + def check_network_convergence(self, + method, + memory_opt=True, + iter=50, + batch_size=None, + allow_op_delay=False, + feed_dict=None, + seed=None, + use_parallel_executor=True, + balance_parameter_opt_between_cards=False): + def run_executor(exe, feed, fetch_list, program=None): + if isinstance(exe, fluid.ParallelExecutor): + res = exe.run(fetch_list=fetch_list, feed=feed) + elif isinstance(exe, fluid.Executor): + if program is None: + program = fluid.default_main_program() + res = exe.run(program=program, feed=feed, fetch_list=fetch_list) + else: + raise ValueError('Unkown type exe') + return res + + main = fluid.Program() + startup = fluid.Program() + startup.random_seed = 1 # Fix random seed + with fluid.program_guard(main, startup): + if seed is not None: + startup.random_seed = seed + loss = method(use_feed=feed_dict is not None) + adam = fluid.optimizer.Adam() + adam.minimize(loss) + if memory_opt: + fluid.memory_optimize(main) + place = fluid.CUDAPlace(0) + startup_exe = fluid.Executor(place) + startup_exe.run(startup) + exec_strategy = fluid.ExecutionStrategy() + exec_strategy.allow_op_delay = allow_op_delay + + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if balance_parameter_opt_between_cards else fluid.BuildStrategy.ReduceStrategy.AllReduce + + if use_parallel_executor: + exe = fluid.ParallelExecutor( + True, + loss_name=loss.name, + exec_strategy=exec_strategy, + build_strategy=build_strategy) + else: + exe = fluid.Executor(place=place) + + if batch_size is not None: + batch_size *= fluid.core.get_cuda_device_count() + begin = time.time() + first_loss, = run_executor( + exe=exe, feed=feed_dict, fetch_list=[loss.name]) + first_loss = np.array(first_loss) + + for i in xrange(iter): + run_executor(exe=exe, feed=feed_dict, fetch_list=[]) + + last_loss, = run_executor( + exe=exe, feed=feed_dict, fetch_list=[loss.name]) + end = time.time() + + if batch_size is not None: + print "%.4f Instance per second" % ( + (batch_size * iter + 2) / (end - begin)) + + last_loss = np.array(last_loss) + + print first_loss, last_loss + # self.assertGreater(first_loss[0], last_loss[0]) + return first_loss, last_loss diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py index d864b9b348e961c585749d47d449d775b2dfebc9..ded2f130288a4a959a1c859b2cc8ccf0912efb12 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py @@ -21,8 +21,11 @@ from op_test import OpTest def conv2dtranspose_forward_naive(input_, filter_, attrs): in_n, in_c, in_h, in_w = input_.shape - f_c, out_c, f_h, f_w = filter_.shape + f_c, f_out_c, f_h, f_w = filter_.shape + groups = attrs['groups'] assert in_c == f_c + out_c = f_out_c * groups + sub_in_c = in_c / groups stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[ 'dilations'] @@ -36,15 +39,21 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs): for n in range(in_n): for i in range(in_h): for j in range(in_w): - input_masked = input_[n, :, i, j] # (c) - input_masked = np.reshape(input_masked, (in_c, 1, 1)) - input_masked = np.tile(input_masked, (1, f_h, f_w)) - - for k in range(out_c): - tmp_out = np.sum(input_masked * filter_[:, k, :, :], axis=0) - i1, i2 = i * stride[0], i * stride[0] + d_bolck_h - j1, j2 = j * stride[0], j * stride[0] + d_bolck_h - out[n, k, i1:i2:dilations[0], j1:j2:dilations[1]] += tmp_out + for g in range(groups): + input_masked = input_[n, g * sub_in_c:(g + 1) * sub_in_c, i, + j] # (c) + input_masked = np.reshape(input_masked, (sub_in_c, 1, 1)) + input_masked = np.tile(input_masked, (1, f_h, f_w)) + + for k in range(f_out_c): + tmp_out = np.sum( + input_masked * + filter_[g * sub_in_c:(g + 1) * sub_in_c, k, :, :], + axis=0) + i1, i2 = i * stride[0], i * stride[0] + d_bolck_h + j1, j2 = j * stride[0], j * stride[0] + d_bolck_h + out[n, g * f_out_c + k, i1:i2:dilations[0], j1:j2: + dilations[1]] += tmp_out out = out[:, :, pad[0]:out_h - pad[0], pad[1]:out_w - pad[1]] return out @@ -64,6 +73,7 @@ class TestConv2dTransposeOp(OpTest): self.attrs = { 'strides': self.stride, 'paddings': self.pad, + 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, 'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter @@ -127,6 +137,7 @@ class TestConv2dTransposeOp(OpTest): self.pad = [0, 0] self.stride = [1, 1] self.dilations = [1, 1] + self.groups = 1 self.input_size = [2, 3, 5, 5] # NCHW f_c = self.input_size[1] self.filter_size = [f_c, 6, 3, 3] @@ -140,16 +151,29 @@ class TestWithPad(TestConv2dTransposeOp): self.pad = [1, 1] self.stride = [1, 1] self.dilations = [1, 1] + self.groups = 1 self.input_size = [2, 3, 5, 5] # NCHW f_c = self.input_size[1] self.filter_size = [f_c, 6, 3, 3] +class TestWithGroups(TestConv2dTransposeOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 2 + self.input_size = [2, 4, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 3, 3, 3] + + class TestWithStride(TestConv2dTransposeOp): def init_test_case(self): self.pad = [1, 1] self.stride = [2, 2] self.dilations = [1, 1] + self.groups = 1 self.input_size = [2, 3, 5, 5] # NCHW f_c = self.input_size[1] self.filter_size = [f_c, 6, 3, 3] @@ -159,6 +183,7 @@ class TestWithDilation(TestConv2dTransposeOp): def init_test_case(self): self.pad = [1, 1] self.stride = [1, 1] + self.groups = 1 self.dilations = [2, 2] self.input_size = [2, 3, 5, 5] # NCHW f_c = self.input_size[1] @@ -176,6 +201,7 @@ class TestCUDNNWithPad(TestWithPad): def init_test_case(self): self.pad = [1, 1] self.stride = [1, 1] + self.groups = 1 self.dilations = [1, 1] self.input_size = [2, 3, 5, 5] # NCHW f_c = self.input_size[1] @@ -190,6 +216,7 @@ class TestCUDNNWithStride(TestWithStride): def init_test_case(self): self.pad = [1, 1] self.stride = [2, 2] + self.groups = 1 self.dilations = [1, 1] self.input_size = [2, 3, 5, 5] # NCHW f_c = self.input_size[1] @@ -200,6 +227,21 @@ class TestCUDNNWithStride(TestWithStride): self.op_type = "conv2d_transpose" +class TestCUDNNWithGroups(TestWithGroups): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 2 + self.input_size = [2, 4, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 3, 3, 3] + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv2d_transpose" + + # Please Don't remove the following code. # Currently, CI use cudnn V5.0 which not support dilation conv. # class TestCUDNNWithDilation(TestWithDilation): diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py index 55ba238710c56dd0daea388cd2dcdb79243bb71e..c9f26d10df8ff39d6bd77b1597336600f676d362 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py @@ -21,8 +21,11 @@ from op_test import OpTest def conv3dtranspose_forward_naive(input_, filter_, attrs): in_n, in_c, in_d, in_h, in_w = input_.shape - f_c, out_c, f_d, f_h, f_w = filter_.shape + f_c, f_out_c, f_d, f_h, f_w = filter_.shape + groups = attrs['groups'] assert in_c == f_c + out_c = f_out_c * groups + sub_in_c = in_c / groups stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[ 'dilations'] @@ -39,18 +42,23 @@ def conv3dtranspose_forward_naive(input_, filter_, attrs): for d in range(in_d): for i in range(in_h): for j in range(in_w): - input_masked = input_[n, :, d, i, j] # (c) - input_masked = np.reshape(input_masked, (in_c, 1, 1, 1)) - input_masked = np.tile(input_masked, (1, f_d, f_h, f_w)) - - for k in range(out_c): - tmp_out = np.sum(input_masked * filter_[:, k, :, :, :], - axis=0) - d1, d2 = d * stride[0], d * stride[0] + d_bolck_d - i1, i2 = i * stride[1], i * stride[1] + d_bolck_h - j1, j2 = j * stride[2], j * stride[2] + d_bolck_w - out[n, k, d1:d2:dilations[0], i1:i2:dilations[1], j1:j2: - dilations[2]] += tmp_out + for g in range(groups): + input_masked = input_[n, g * sub_in_c:(g + 1 + ) * sub_in_c, d, + i, j] # (c) + input_masked = np.reshape(input_masked, + (sub_in_c, 1, 1, 1)) + input_masked = np.tile(input_masked, (1, f_d, f_h, f_w)) + + for k in range(f_out_c): + tmp_out = np.sum(input_masked * filter_[ + g * sub_in_c:(g + 1) * sub_in_c, k, :, :, :], + axis=0) + d1, d2 = d * stride[0], d * stride[0] + d_bolck_d + i1, i2 = i * stride[1], i * stride[1] + d_bolck_h + j1, j2 = j * stride[2], j * stride[2] + d_bolck_w + out[n, g * f_out_c + k, d1:d2:dilations[0], i1:i2: + dilations[1], j1:j2:dilations[2]] += tmp_out out = out[:, :, pad[0]:out_d - pad[0], pad[1]:out_h - pad[1], pad[2]:out_w - pad[2]] @@ -72,6 +80,7 @@ class TestConv3dTransposeOp(OpTest): 'strides': self.stride, 'paddings': self.pad, 'dilations': self.dilations, + 'groups': self.groups, 'use_cudnn': self.use_cudnn, 'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter } @@ -134,6 +143,7 @@ class TestConv3dTransposeOp(OpTest): self.pad = [0, 0, 0] self.stride = [1, 1, 1] self.dilations = [1, 1, 1] + self.groups = 1 self.input_size = [2, 3, 5, 5, 5] # NCDHW f_c = self.input_size[1] self.filter_size = [f_c, 6, 3, 3, 3] @@ -147,16 +157,29 @@ class TestWithPad(TestConv3dTransposeOp): self.pad = [1, 1, 1] self.stride = [1, 1, 1] self.dilations = [1, 1, 1] + self.groups = 1 self.input_size = [2, 3, 5, 5, 5] # NCDHW f_c = self.input_size[1] self.filter_size = [f_c, 6, 3, 3, 3] +class TestWithGroups(TestConv3dTransposeOp): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 2 + self.input_size = [2, 4, 5, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 3, 3, 3, 3] + + class TestWithStride(TestConv3dTransposeOp): def init_test_case(self): self.pad = [1, 1, 1] self.stride = [2, 2, 2] self.dilations = [1, 1, 1] + self.groups = 1 self.input_size = [2, 3, 5, 5, 5] # NCDHW f_c = self.input_size[1] self.filter_size = [f_c, 6, 3, 3, 3] @@ -167,6 +190,7 @@ class TestWithDilation(TestConv3dTransposeOp): self.pad = [1, 1, 1] self.stride = [1, 1, 1] self.dilations = [2, 2, 2] + self.groups = 1 self.input_size = [2, 3, 5, 5, 5] # NCDHW f_c = self.input_size[1] self.filter_size = [f_c, 6, 3, 3, 3] @@ -184,6 +208,7 @@ class TestCUDNNWithPad(TestWithPad): self.pad = [1, 1, 1] self.stride = [1, 1, 1] self.dilations = [1, 1, 1] + self.groups = 1 self.input_size = [2, 3, 5, 5, 5] # NCDHW f_c = self.input_size[1] self.filter_size = [f_c, 6, 3, 3, 3] @@ -198,6 +223,7 @@ class TestCUDNNWithStride(TestWithStride): self.pad = [1, 1, 1] self.stride = [2, 2, 2] self.dilations = [1, 1, 1] + self.groups = 1 self.input_size = [2, 3, 5, 5, 5] # NCDHW f_c = self.input_size[1] self.filter_size = [f_c, 6, 3, 3, 3] @@ -207,6 +233,21 @@ class TestCUDNNWithStride(TestWithStride): self.op_type = "conv3d_transpose" +class TestCUDNNWithGroups(TestWithGroups): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 2 + self.input_size = [2, 4, 5, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 3, 3, 3, 3] + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + + # Please Don't remove the following code. # Currently, CI use cudnn V5.0 which not support dilation conv. # class TestCUDNNWithDilation(TestWithDilation): diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py index c2393a288c6ebb5dd4a12f7b591d12cc94f4ea55..2314bb2ed8a4eeb34752fd5d040f8a8476798aa6 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_train.py +++ b/python/paddle/fluid/tests/unittests/test_dist_train.py @@ -12,19 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os +import time import unittest +from multiprocessing import Process + +import numpy import paddle.fluid as fluid -import paddle.fluid.core as core import paddle.fluid.layers as layers -import numpy -from multiprocessing import Process -from threading import Thread -import os, sys -import time class TestSendOp(unittest.TestCase): + @unittest.skip( + "This test is buggy. We cannot use time.sleep to sync processes, the connection may fail in unittest." + ) def test_send(self): # Run init_serv in a thread place = fluid.CPUPlace() @@ -34,7 +36,7 @@ class TestSendOp(unittest.TestCase): p.start() time.sleep(10) - with open("/tmp/paddle.%d.selected_port" % p.pid, "r") as fn: + with open("/tmp/paddle.%d.port" % p.pid, "r") as fn: selected_port = int(fn.readlines()[0]) self.init_client(place, selected_port) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index c5414abf0fee6b686dccf7c97e9c6d5408ecf62a..c44ac59ccdb7fa212ab2a8ab83ee0c70fc498f9f 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -369,6 +369,14 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(output) print(str(program)) + def test_bilinear_interp(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[3, 9, 6], dtype="float32") + output = layers.bilinear_interp(x, 12, 12) + self.assertIsNotNone(output) + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py index 779ae388f04496a7be9a6d5aa4e39b8245022925..8b15aa6822aee7bb4d53dcf1d87565fae5504821 100644 --- a/python/paddle/fluid/tests/unittests/test_operator_desc.py +++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py @@ -63,7 +63,10 @@ class TestOperator(unittest.TestCase): self.assertEqual(mul_op.output("Out"), ["mul.out"]) self.assertEqual( set(mul_op.attr_names), - set(["x_num_col_dims", "y_num_col_dims", "use_mkldnn"])) + set([ + "x_num_col_dims", "y_num_col_dims", "use_mkldnn", "op_role", + "op_role_var" + ])) self.assertEqual(mul_op.has_attr("x_num_col_dims"), True) self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT) self.assertEqual(mul_op.attr("x_num_col_dims"), 1) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py deleted file mode 100644 index 056f9e1781997aa1586d972874b652d5b725fe3f..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ /dev/null @@ -1,902 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import unittest - -import paddle.fluid as fluid -import paddle -import paddle.dataset.mnist as mnist -import paddle.dataset.wmt16 as wmt16 - - -def simple_fc_net(use_feed): - if use_feed: - img = fluid.layers.data(name='image', shape=[784], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - else: - reader = fluid.layers.open_files( - filenames=['./mnist.recordio'], - shapes=[[-1, 784], [-1, 1]], - lod_levels=[0, 0], - dtypes=['float32', 'int64'], - thread_num=1, - for_parallel=True) - reader = fluid.layers.io.double_buffer(reader) - img, label = fluid.layers.read_file(reader) - hidden = img - for _ in xrange(4): - hidden = fluid.layers.fc( - hidden, - size=200, - act='tanh', - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0))) - prediction = fluid.layers.fc(hidden, size=10, act='softmax') - loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) - return loss - - -def fc_with_batchnorm(use_feed): - if use_feed: - img = fluid.layers.data(name='image', shape=[784], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - else: - reader = fluid.layers.open_files( - filenames=['mnist.recordio'], - shapes=[[-1, 784], [-1, 1]], - lod_levels=[0, 0], - dtypes=['float32', 'int64'], - thread_num=1, - for_parallel=True) - reader = fluid.layers.io.double_buffer(reader) - img, label = fluid.layers.read_file(reader) - - hidden = img - for _ in xrange(1): - hidden = fluid.layers.fc( - hidden, - size=200, - act='tanh', - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0))) - - hidden = fluid.layers.batch_norm(input=hidden) - - prediction = fluid.layers.fc(hidden, size=10, act='softmax') - loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) - return loss - - -def squeeze_excitation(input, num_channels, reduction_ratio): - # pool = fluid.layers.pool2d( - # input=input, pool_size=0, pool_type='avg', global_pooling=True) - conv = input - shape = conv.shape - reshape = fluid.layers.reshape( - x=conv, shape=[-1, shape[1], shape[2] * shape[3]]) - pool = fluid.layers.reduce_mean(input=reshape, dim=2) - - squeeze = fluid.layers.fc(input=pool, - size=num_channels / reduction_ratio, - act='relu') - excitation = fluid.layers.fc(input=squeeze, - size=num_channels, - act='sigmoid') - scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0) - return scale - - -def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1, - act=None): - conv = fluid.layers.conv2d( - input=input, - num_filters=num_filters, - filter_size=filter_size, - stride=stride, - padding=(filter_size - 1) / 2, - groups=groups, - act=None, - bias_attr=False) - return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1) - - -def shortcut(input, ch_out, stride): - ch_in = input.shape[1] - if ch_in != ch_out: - if stride == 1: - filter_size = 1 - else: - filter_size = 3 - return conv_bn_layer(input, ch_out, filter_size, stride) - else: - return input - - -def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio): - # The number of first 1x1 convolutional channels for each bottleneck build block - # was halved to reduce the compution cost. - conv0 = conv_bn_layer( - input=input, num_filters=num_filters, filter_size=1, act='relu') - conv1 = conv_bn_layer( - input=conv0, - num_filters=num_filters * 2, - filter_size=3, - stride=stride, - groups=cardinality, - act='relu') - conv2 = conv_bn_layer( - input=conv1, num_filters=num_filters * 2, filter_size=1, act=None) - scale = squeeze_excitation( - input=conv2, - num_channels=num_filters * 2, - reduction_ratio=reduction_ratio) - - short = shortcut(input, num_filters * 2, stride) - - return fluid.layers.elementwise_add(x=short, y=scale, act='relu') - - -def SE_ResNeXt50Small(batch_size=2, use_feed=False): - assert not use_feed, "SE_ResNeXt doesn't support feed yet" - - img = fluid.layers.fill_constant( - shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0) - label = fluid.layers.fill_constant( - shape=[batch_size, 1], dtype='int64', value=0.0) - - conv = conv_bn_layer( - input=img, num_filters=16, filter_size=3, stride=2, act='relu') - conv = conv_bn_layer( - input=conv, num_filters=16, filter_size=3, stride=1, act='relu') - conv = conv_bn_layer( - input=conv, num_filters=16, filter_size=3, stride=1, act='relu') - conv = fluid.layers.pool2d( - input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') - - cardinality = 32 - reduction_ratio = 16 - depth = [3, 4, 6, 3] - num_filters = [128, 256, 512, 1024] - - for block in range(len(depth)): - for i in range(depth[block]): - conv = bottleneck_block( - input=conv, - num_filters=num_filters[block], - stride=2 if i == 0 and block != 0 else 1, - cardinality=cardinality, - reduction_ratio=reduction_ratio) - - shape = conv.shape - reshape = fluid.layers.reshape( - x=conv, shape=[-1, shape[1], shape[2] * shape[3]]) - pool = fluid.layers.reduce_mean(input=reshape, dim=2) - dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2) - # Classifier layer: - prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax') - loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) - return loss - - -import time - - -class TestParallelExecutorBase(unittest.TestCase): - def check_network_convergence(self, - method, - memory_opt=True, - iter=50, - batch_size=None, - allow_op_delay=False, - feed_dict=None, - seed=None, - use_parallel_executor=True, - balance_parameter_opt_between_cards=False): - def run_executor(exe, feed, fetch_list, program=None): - if isinstance(exe, fluid.ParallelExecutor): - res = exe.run(fetch_list=fetch_list, feed=feed) - elif isinstance(exe, fluid.Executor): - if program is None: - program = fluid.default_main_program() - res = exe.run(program=program, feed=feed, fetch_list=fetch_list) - else: - raise ValueError('Unkown type exe') - return res - - main = fluid.Program() - startup = fluid.Program() - startup.random_seed = 1 # Fix random seed - with fluid.program_guard(main, startup): - if seed is not None: - startup.random_seed = seed - loss = method(use_feed=feed_dict is not None) - adam = fluid.optimizer.Adam() - adam.minimize(loss) - if memory_opt: - fluid.memory_optimize(main) - place = fluid.CUDAPlace(0) - startup_exe = fluid.Executor(place) - startup_exe.run(startup) - exec_strategy = fluid.ExecutionStrategy() - exec_strategy.allow_op_delay = allow_op_delay - - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if balance_parameter_opt_between_cards else fluid.BuildStrategy.ReduceStrategy.AllReduce - - if use_parallel_executor: - exe = fluid.ParallelExecutor( - True, - loss_name=loss.name, - exec_strategy=exec_strategy, - build_strategy=build_strategy) - else: - exe = fluid.Executor(place=place) - - if batch_size is not None: - batch_size *= fluid.core.get_cuda_device_count() - begin = time.time() - first_loss, = run_executor( - exe=exe, feed=feed_dict, fetch_list=[loss.name]) - first_loss = np.array(first_loss) - - for i in xrange(iter): - run_executor(exe=exe, feed=feed_dict, fetch_list=[]) - - last_loss, = run_executor( - exe=exe, feed=feed_dict, fetch_list=[loss.name]) - end = time.time() - - if batch_size is not None: - print "%.4f Instance per second" % ( - (batch_size * iter + 2) / (end - begin)) - - last_loss = np.array(last_loss) - - print first_loss, last_loss - # self.assertGreater(first_loss[0], last_loss[0]) - return first_loss, last_loss - - -class TestMNIST(TestParallelExecutorBase): - @classmethod - def setUpClass(cls): - # Convert mnist to recordio file - with fluid.program_guard(fluid.Program(), fluid.Program()): - reader = paddle.batch(mnist.train(), batch_size=4) - feeder = fluid.DataFeeder( - feed_list=[ # order is image and label - fluid.layers.data( - name='image', shape=[784]), - fluid.layers.data( - name='label', shape=[1], dtype='int64'), - ], - place=fluid.CPUPlace()) - fluid.recordio_writer.convert_reader_to_recordio_file( - './mnist.recordio', reader, feeder) - - def check_simple_fc_convergence(self, balance_parameter_opt_between_cards): - self.check_network_convergence(simple_fc_net) - self.check_network_convergence(simple_fc_net, allow_op_delay=True) - - img = np.zeros(shape=[32, 784], dtype='float32') - label = np.ones(shape=[32, 1], dtype='int64') - self.check_network_convergence( - simple_fc_net, - feed_dict={"image": img, - "label": label}, - balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) - - def test_simple_fc(self): - self.check_simple_fc_convergence(False) - - def test_simple_fc_with_new_strategy(self): - self.check_simple_fc_convergence(True) - - def check_simple_fc_parallel_accuracy(self, - balance_parameter_opt_between_cards): - img = np.zeros(shape=[32, 784], dtype='float32') - label = np.ones(shape=[32, 1], dtype='int64') - single_first_loss, single_last_loss = self.check_network_convergence( - method=simple_fc_net, - seed=1000, - feed_dict={"image": img, - "label": label}, - use_parallel_executor=False) - parallel_first_loss, parallel_last_loss = self.check_network_convergence( - method=simple_fc_net, - seed=1000, - feed_dict={"image": img, - "label": label}, - use_parallel_executor=True, - balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) - - for p_f in parallel_first_loss: - self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6) - for p_l in parallel_last_loss: - self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6) - - def test_simple_fc_parallel_accuracy(self): - self.check_simple_fc_parallel_accuracy(False) - - def test_simple_fc_parallel_accuracy_with_new_strategy(self): - self.check_simple_fc_parallel_accuracy(True) - - def check_batchnorm_fc_convergence(self, - balance_parameter_opt_between_cards): - self.check_network_convergence(fc_with_batchnorm) - img = np.zeros(shape=[32, 784], dtype='float32') - label = np.ones(shape=[32, 1], dtype='int64') - self.check_network_convergence( - fc_with_batchnorm, - feed_dict={"image": img, - "label": label}, - balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) - - def test_batchnorm_fc(self): - self.check_batchnorm_fc_convergence(False) - - def test_batchnorm_fc_with_new_strategy(self): - self.check_batchnorm_fc_convergence(True) - - -class TestResnet(TestParallelExecutorBase): - # @classmethod - # def setUpClass(cls): - # # import os - # # if os.path.exists('./flowers.recordio'): - # # return - # with fluid.program_guard(fluid.Program(), fluid.Program()): - # reader = paddle.batch(flowers.train(), batch_size=4) - # feeder = fluid.DataFeeder( - # feed_list=[ - # fluid.layers.data( - # name='image', shape=[3, 224, 224]), - # fluid.layers.data( - # name='label', shape=[1], dtype='int64'), - # ], - # place=fluid.CPUPlace()) - # fluid.recordio_writer.convert_reader_to_recordio_file( - # "./flowers.recordio", reader, feeder, compressor=fluid.core.RecordIOWriter.Compressor.NoCompress) - - def check_resnet_convergence(self, balance_parameter_opt_between_cards): - import functools - batch_size = 2 - self.check_network_convergence( - functools.partial( - SE_ResNeXt50Small, batch_size=batch_size), - iter=20, - batch_size=batch_size, - balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) - - def test_resnet(self): - self.check_resnet_convergence(False) - - def test_resnet_with_new_strategy(self): - self.check_resnet_convergence(True) - - -class ModelHyperParams(object): - # Dictionary size for source and target language. This model directly uses - # paddle.dataset.wmt16 in which , and token has - # alreay been added, but the token is not added. Transformer requires - # sequences in a mini-batch are padded to have the same length. A token is - # added into the original dictionary in paddle.dateset.wmt16. - - # size of source word dictionary. - src_vocab_size = 10000 - # index for token in source language. - src_pad_idx = src_vocab_size - - # size of target word dictionay - trg_vocab_size = 10000 - # index for token in target language. - trg_pad_idx = trg_vocab_size - - # position value corresponding to the token. - pos_pad_idx = 0 - - # max length of sequences. It should plus 1 to include position - # padding token for position encoding. - max_length = 50 - - # the dimension for word embeddings, which is also the last dimension of - # the input and output of multi-head attention, position-wise feed-forward - # networks, encoder and decoder. - - d_model = 512 - # size of the hidden layer in position-wise feed-forward networks. - d_inner_hid = 1024 - # the dimension that keys are projected to for dot-product attention. - d_key = 64 - # the dimension that values are projected to for dot-product attention. - d_value = 64 - # number of head used in multi-head attention. - n_head = 8 - # number of sub-layers to be stacked in the encoder and decoder. - n_layer = 6 - # dropout rate used by all dropout layers. - dropout = 0.1 - - -def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head): - """ - Pad the instances to the max sequence length in batch, and generate the - corresponding position data and attention bias. Then, convert the numpy - data to tensors and return a dict mapping names to tensors. - """ - - def __pad_batch_data(insts, - pad_idx, - is_target=False, - return_pos=True, - return_attn_bias=True, - return_max_len=True): - """ - Pad the instances to the max sequence length in batch, and generate the - corresponding position data and attention bias. - """ - return_list = [] - max_len = max(len(inst) for inst in insts) - inst_data = np.array( - [inst + [pad_idx] * (max_len - len(inst)) for inst in insts]) - return_list += [inst_data.astype("int64").reshape([-1, 1])] - if return_pos: - inst_pos = np.array([[ - pos_i + 1 if w_i != pad_idx else 0 - for pos_i, w_i in enumerate(inst) - ] for inst in inst_data]) - - return_list += [inst_pos.astype("int64").reshape([-1, 1])] - if return_attn_bias: - if is_target: - # This is used to avoid attention on paddings and subsequent - # words. - slf_attn_bias_data = np.ones((inst_data.shape[0], max_len, - max_len)) - slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape( - [-1, 1, max_len, max_len]) - slf_attn_bias_data = np.tile(slf_attn_bias_data, - [1, n_head, 1, 1]) * [-1e9] - else: - # This is used to avoid attention on paddings. - slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] * - (max_len - len(inst)) - for inst in insts]) - slf_attn_bias_data = np.tile( - slf_attn_bias_data.reshape([-1, 1, 1, max_len]), - [1, n_head, max_len, 1]) - return_list += [slf_attn_bias_data.astype("float32")] - if return_max_len: - return_list += [max_len] - return return_list if len(return_list) > 1 else return_list[0] - - def data_to_tensor(data_list, name_list, input_dict, place): - assert len(data_list) == len(name_list) - for i in range(len(name_list)): - tensor = fluid.LoDTensor() - tensor.set(data_list[i], place) - input_dict[name_list[i]] = tensor - - src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data( - [inst[0] for inst in insts], src_pad_idx, is_target=False) - trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data( - [inst[1] for inst in insts], trg_pad_idx, is_target=True) - trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], - [1, 1, trg_max_len, 1]).astype("float32") - lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False, - False, False, False) - lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1]) - - return [ - src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, - trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight - ] - - -import transformer_model - - -def transformer(use_feed): - assert not use_feed, "transfomer doesn't support feed yet" - return transformer_model.transformer( - ModelHyperParams.src_vocab_size + 1, - ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1, - ModelHyperParams.n_layer, ModelHyperParams.n_head, - ModelHyperParams.d_key, ModelHyperParams.d_value, - ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, - ModelHyperParams.dropout, ModelHyperParams.src_pad_idx, - ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx) - - -class TestTransformer(TestParallelExecutorBase): - @classmethod - def setUpClass(cls): - reader = paddle.batch( - wmt16.train(ModelHyperParams.src_vocab_size, - ModelHyperParams.trg_vocab_size), - batch_size=transformer_model.batch_size) - - with fluid.recordio_writer.create_recordio_writer( - "./wmt16.recordio") as writer: - for batch in reader(): - for tensor in prepare_batch_input( - batch, ModelHyperParams.src_pad_idx, - ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head): - t = fluid.LoDTensor() - t.set(tensor, fluid.CPUPlace()) - writer.append_tensor(t) - writer.complete_append_tensor() - - @unittest.skip("transformer is buggy in multi gpu") - def test_main(self): - self.check_network_convergence(transformer) - - -class ParallelExecutorTestingDuringTraining(unittest.TestCase): - def check_network_convergence(self, build_strategy=None): - main = fluid.Program() - startup = fluid.Program() - with fluid.program_guard(main, startup): - loss = simple_fc_net(True) - test_program = main.clone(for_test=True) - - opt = fluid.optimizer.SGD(learning_rate=0.001) - opt.minimize(loss) - - batch_size = 32 - image = np.random.normal(size=(batch_size, 784)).astype('float32') - label = np.random.randint(0, 10, (batch_size, 1), dtype="int64") - - place = fluid.CUDAPlace(0) - exe = fluid.Executor(place) - exe.run(startup) - feed_dict = {'image': image, 'label': label} - - train_exe = fluid.ParallelExecutor( - use_cuda=True, - loss_name=loss.name, - main_program=main, - build_strategy=build_strategy) - - test_exe = fluid.ParallelExecutor( - use_cuda=True, - main_program=test_program, - share_vars_from=train_exe, - build_strategy=build_strategy) - - for i in xrange(5): - test_loss, = test_exe.run([loss.name], feed=feed_dict) - test_loss = np.array(test_loss) - - train_loss, = train_exe.run([loss.name], feed=feed_dict) - train_loss = np.array(train_loss) - self.assertTrue( - np.allclose( - train_loss, test_loss, atol=1e-8), - "Train loss: " + str(train_loss) + "\n Test loss:" + - str(test_loss)) - - def test_parallel_testing(self): - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce - self.check_network_convergence(build_strategy) - - def test_parallel_testing_with_new_strategy(self): - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce - self.check_network_convergence(build_strategy) - - -import paddle.dataset.conll05 as conll05 -import paddle.fluid as fluid - -word_dict, verb_dict, label_dict = conll05.get_dict() -word_dict_len = len(word_dict) -label_dict_len = len(label_dict) -pred_dict_len = len(verb_dict) -mark_dict_len = 2 -word_dim = 32 -mark_dim = 5 -hidden_dim = 512 -depth = 8 -mix_hidden_lr = 1e-3 -embedding_name = 'emb' - - -def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, - is_sparse, **ignored): - # 8 features - predicate_embedding = fluid.layers.embedding( - input=predicate, - is_sparse=is_sparse, - size=[pred_dict_len, word_dim], - dtype='float32', - param_attr='vemb') - - mark_embedding = fluid.layers.embedding( - input=mark, - is_sparse=is_sparse, - size=[mark_dict_len, mark_dim], - dtype='float32') - - word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] - emb_layers = [ - fluid.layers.embedding( - size=[word_dict_len, word_dim], - is_sparse=is_sparse, - input=x, - param_attr=fluid.ParamAttr( - name=embedding_name, trainable=False)) for x in word_input - ] - emb_layers.append(predicate_embedding) - emb_layers.append(mark_embedding) - - hidden_0_layers = [ - fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') - for emb in emb_layers - ] - - hidden_0 = fluid.layers.sums(input=hidden_0_layers) - - lstm_0 = fluid.layers.dynamic_lstm( - input=hidden_0, - size=hidden_dim, - candidate_activation='relu', - gate_activation='sigmoid', - cell_activation='sigmoid') - - # stack L-LSTM and R-LSTM with direct edges - input_tmp = [hidden_0, lstm_0] - - for i in range(1, depth): - mix_hidden = fluid.layers.sums(input=[ - fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'), - fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh') - ]) - - lstm = fluid.layers.dynamic_lstm( - input=mix_hidden, - size=hidden_dim, - candidate_activation='relu', - gate_activation='sigmoid', - cell_activation='sigmoid', - is_reverse=((i % 2) == 1)) - - input_tmp = [mix_hidden, lstm] - - feature_out = fluid.layers.sums(input=[ - fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'), - fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh') - ]) - - return feature_out - - -class TestCRFModel(unittest.TestCase): - def check_network_convergence(self, is_sparse, build_strategy=None): - main = fluid.Program() - startup = fluid.Program() - with fluid.program_guard(main, startup): - word = fluid.layers.data( - name='word_data', shape=[1], dtype='int64', lod_level=1) - predicate = fluid.layers.data( - name='verb_data', shape=[1], dtype='int64', lod_level=1) - ctx_n2 = fluid.layers.data( - name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) - ctx_n1 = fluid.layers.data( - name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) - ctx_0 = fluid.layers.data( - name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) - ctx_p1 = fluid.layers.data( - name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) - ctx_p2 = fluid.layers.data( - name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) - mark = fluid.layers.data( - name='mark_data', shape=[1], dtype='int64', lod_level=1) - - feature_out = db_lstm(**locals()) - target = fluid.layers.data( - name='target', shape=[1], dtype='int64', lod_level=1) - crf_cost = fluid.layers.linear_chain_crf( - input=feature_out, - label=target, - param_attr=fluid.ParamAttr( - name='crfw', learning_rate=1e-1)) - avg_cost = fluid.layers.mean(crf_cost) - - sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.layers.exponential_decay( - learning_rate=0.01, - decay_steps=100000, - decay_rate=0.5, - staircase=True)) - sgd_optimizer.minimize(avg_cost) - - train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.conll05.test(), buf_size=8192), - batch_size=16) - - place = fluid.CUDAPlace(0) - exe = fluid.Executor(place) - exe.run(startup) - - pe = fluid.ParallelExecutor( - use_cuda=True, - loss_name=avg_cost.name, - build_strategy=build_strategy) - - feeder = fluid.DataFeeder( - feed_list=[ - word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, - mark, target - ], - place=fluid.CPUPlace()) - - data = train_data() - for i in xrange(10): - cur_batch = next(data) - print map(np.array, - pe.run(feed=feeder.feed(cur_batch), - fetch_list=[avg_cost.name]))[0] - - def test_update_sparse_parameter_all_reduce(self): - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce - self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy) - - def test_update_dense_parameter_all_reduce(self): - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce - self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy) - - def test_update_sparse_parameter_reduce(self): - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce - self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy) - - def test_update_dense_parameter_reduce(self): - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce - self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy) - - -# test fetch all the variables of global_block - -import paddle.dataset.flowers as flowers -import math - - -def Lenet(data, class_dim): - conv1 = fluid.layers.conv2d(data, 32, 5, 1, act=None) - bn1 = fluid.layers.batch_norm(conv1, act='relu') - pool1 = fluid.layers.pool2d(bn1, 2, 'max', 2) - conv2 = fluid.layers.conv2d(pool1, 50, 5, 1, act=None) - bn2 = fluid.layers.batch_norm(conv2, act='relu') - pool2 = fluid.layers.pool2d(bn2, 2, 'max', 2) - - fc1 = fluid.layers.fc(pool2, size=500, act='relu') - fc2 = fluid.layers.fc(fc1, size=class_dim, act='softmax') - - return fc2 - - -class TestFetchOp(unittest.TestCase): - def parallel_exe(self, train_inputs, seed): - main = fluid.Program() - startup = fluid.Program() - startup.random_seed = seed - with fluid.program_guard(main, startup): - data = fluid.layers.data( - name='image', shape=[3, 224, 224], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - out = Lenet(data, class_dim=102) - loss = fluid.layers.cross_entropy(input=out, label=label) - loss = fluid.layers.mean(loss) - - opt = fluid.optimizer.Momentum( - learning_rate=0.1, - momentum=0.9, - regularization=fluid.regularizer.L2Decay(1e-4)) - - opt.minimize(loss) - - # TODO(zcd): I found that onece the memory optimizer is open, - # parallel_exe doesn't fetch some variable, such as conv2d_0.b_0@GRAD, - # conv2d_1.b_0@GRAD. Those variables should not be pruned. - # fluid.memory_optimize(main) - - place = fluid.CUDAPlace(0) - exe = fluid.Executor(place) - exe.run(startup) - - feeder = fluid.DataFeeder(place=place, feed_list=[data, label]) - pe = fluid.ParallelExecutor( - use_cuda=True, loss_name=loss.name, main_program=main) - - fetch_list = [] - all_vars = main.global_block().vars - for k, v in all_vars.iteritems(): - if 'tmp' not in k and k[0] is not '_' or v.persistable: - fetch_list.append(k) - - for data in train_inputs: - ret = pe.run(fetch_list, feed=feeder.feed(data)) - for i in range(len(fetch_list)): - assert not math.isnan(np.sum(ret[i])) and \ - not math.isinf(np.sum(ret[i])) - - def test_fetch_op(self): - tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16) - tst_reader_iter = tst_reader() - - iters = 3 - train_inputs = [] - for i in range(iters): - train_inputs.append(tst_reader_iter.next()) - - self.parallel_exe(train_inputs, seed=1) - - -class TestFeedParallel(unittest.TestCase): - def test_main(self): - main = fluid.Program() - startup = fluid.Program() - startup.random_seed = 1 - with fluid.scope_guard(fluid.core.Scope()): - with fluid.program_guard(main, startup): - data = fluid.layers.data( - name='image', shape=[3, 224, 224], dtype='float32') - label = fluid.layers.data( - name='label', shape=[1], dtype='int64') - out = Lenet(data, class_dim=102) - loss = fluid.layers.cross_entropy(input=out, label=label) - loss = fluid.layers.mean(loss) - opt = fluid.optimizer.Momentum( - learning_rate=0.1, - momentum=0.9, - regularization=fluid.regularizer.L2Decay(1e-4)) - - opt.minimize(loss) - place = fluid.CUDAPlace(0) - feeder = fluid.DataFeeder(place=place, feed_list=[data, label]) - reader = feeder.decorate_reader( - paddle.batch( - flowers.train(), batch_size=16), multi_devices=True) - exe = fluid.Executor(place) - exe.run(startup) - pe = fluid.ParallelExecutor( - use_cuda=True, loss_name=loss.name, main_program=main) - - for batch_id, data in enumerate(reader()): - loss_np = np.array(pe.run(feed=data, fetch_list=[loss.name])[0]) - print batch_id, loss_np - if batch_id == 2: - break - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py new file mode 100644 index 0000000000000000000000000000000000000000..66e138b03f3b170aca4fb2207438eb9af1783c33 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -0,0 +1,197 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.dataset.conll05 as conll05 +import paddle.fluid as fluid +import unittest +import paddle +import numpy as np + +word_dict, verb_dict, label_dict = conll05.get_dict() +word_dict_len = len(word_dict) +label_dict_len = len(label_dict) +pred_dict_len = len(verb_dict) +mark_dict_len = 2 +word_dim = 32 +mark_dim = 5 +hidden_dim = 512 +depth = 8 +mix_hidden_lr = 1e-3 +embedding_name = 'emb' + + +def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, + is_sparse, **ignored): + # 8 features + predicate_embedding = fluid.layers.embedding( + input=predicate, + is_sparse=is_sparse, + size=[pred_dict_len, word_dim], + dtype='float32', + param_attr='vemb') + + mark_embedding = fluid.layers.embedding( + input=mark, + is_sparse=is_sparse, + size=[mark_dict_len, mark_dim], + dtype='float32') + + word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] + emb_layers = [ + fluid.layers.embedding( + size=[word_dict_len, word_dim], + is_sparse=is_sparse, + input=x, + param_attr=fluid.ParamAttr( + name=embedding_name, trainable=False)) for x in word_input + ] + emb_layers.append(predicate_embedding) + emb_layers.append(mark_embedding) + + hidden_0_layers = [ + fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') + for emb in emb_layers + ] + + hidden_0 = fluid.layers.sums(input=hidden_0_layers) + + lstm_0 = fluid.layers.dynamic_lstm( + input=hidden_0, + size=hidden_dim, + candidate_activation='relu', + gate_activation='sigmoid', + cell_activation='sigmoid') + + # stack L-LSTM and R-LSTM with direct edges + input_tmp = [hidden_0, lstm_0] + + for i in range(1, depth): + mix_hidden = fluid.layers.sums(input=[ + fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'), + fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh') + ]) + + lstm = fluid.layers.dynamic_lstm( + input=mix_hidden, + size=hidden_dim, + candidate_activation='relu', + gate_activation='sigmoid', + cell_activation='sigmoid', + is_reverse=((i % 2) == 1)) + + input_tmp = [mix_hidden, lstm] + + feature_out = fluid.layers.sums(input=[ + fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'), + fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh') + ]) + + return feature_out + + +class TestCRFModel(unittest.TestCase): + def check_network_convergence(self, is_sparse, build_strategy=None): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + word = fluid.layers.data( + name='word_data', shape=[1], dtype='int64', lod_level=1) + predicate = fluid.layers.data( + name='verb_data', shape=[1], dtype='int64', lod_level=1) + ctx_n2 = fluid.layers.data( + name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) + ctx_n1 = fluid.layers.data( + name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) + ctx_0 = fluid.layers.data( + name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) + ctx_p1 = fluid.layers.data( + name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) + ctx_p2 = fluid.layers.data( + name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) + mark = fluid.layers.data( + name='mark_data', shape=[1], dtype='int64', lod_level=1) + + feature_out = db_lstm(**locals()) + target = fluid.layers.data( + name='target', shape=[1], dtype='int64', lod_level=1) + crf_cost = fluid.layers.linear_chain_crf( + input=feature_out, + label=target, + param_attr=fluid.ParamAttr( + name='crfw', learning_rate=1e-1)) + avg_cost = fluid.layers.mean(crf_cost) + + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=fluid.layers.exponential_decay( + learning_rate=0.01, + decay_steps=100000, + decay_rate=0.5, + staircase=True)) + sgd_optimizer.minimize(avg_cost) + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.conll05.test(), buf_size=8192), + batch_size=16) + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(startup) + + pe = fluid.ParallelExecutor( + use_cuda=True, + loss_name=avg_cost.name, + build_strategy=build_strategy) + + feeder = fluid.DataFeeder( + feed_list=[ + word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, + mark, target + ], + place=fluid.CPUPlace()) + + data = train_data() + for i in xrange(10): + cur_batch = next(data) + print map(np.array, + pe.run(feed=feeder.feed(cur_batch), + fetch_list=[avg_cost.name]))[0] + + def test_update_sparse_parameter_all_reduce(self): + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce + self.check_network_convergence( + is_sparse=True, build_strategy=build_strategy) + + def test_update_dense_parameter_all_reduce(self): + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce + self.check_network_convergence( + is_sparse=False, build_strategy=build_strategy) + + def test_update_sparse_parameter_reduce(self): + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce + self.check_network_convergence( + is_sparse=True, build_strategy=build_strategy) + + def test_update_dense_parameter_reduce(self): + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce + self.check_network_convergence( + is_sparse=False, build_strategy=build_strategy) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py new file mode 100644 index 0000000000000000000000000000000000000000..24f8d28c0304a77a99213374b25d0db728eca265 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py @@ -0,0 +1,132 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.dataset.flowers as flowers +import math +import paddle.fluid as fluid +import unittest +import numpy as np +import paddle + + +def Lenet(data, class_dim): + conv1 = fluid.layers.conv2d(data, 32, 5, 1, act=None) + bn1 = fluid.layers.batch_norm(conv1, act='relu') + pool1 = fluid.layers.pool2d(bn1, 2, 'max', 2) + conv2 = fluid.layers.conv2d(pool1, 50, 5, 1, act=None) + bn2 = fluid.layers.batch_norm(conv2, act='relu') + pool2 = fluid.layers.pool2d(bn2, 2, 'max', 2) + + fc1 = fluid.layers.fc(pool2, size=500, act='relu') + fc2 = fluid.layers.fc(fc1, size=class_dim, act='softmax') + + return fc2 + + +class TestFetchOp(unittest.TestCase): + def parallel_exe(self, train_inputs, seed): + main = fluid.Program() + startup = fluid.Program() + startup.random_seed = seed + with fluid.program_guard(main, startup): + data = fluid.layers.data( + name='image', shape=[3, 224, 224], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + out = Lenet(data, class_dim=102) + loss = fluid.layers.cross_entropy(input=out, label=label) + loss = fluid.layers.mean(loss) + + opt = fluid.optimizer.Momentum( + learning_rate=0.1, + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + + opt.minimize(loss) + + # TODO(zcd): I found that onece the memory optimizer is open, + # parallel_exe doesn't fetch some variable, such as conv2d_0.b_0@GRAD, + # conv2d_1.b_0@GRAD. Those variables should not be pruned. + # fluid.memory_optimize(main) + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(startup) + + feeder = fluid.DataFeeder(place=place, feed_list=[data, label]) + pe = fluid.ParallelExecutor( + use_cuda=True, loss_name=loss.name, main_program=main) + + fetch_list = [] + all_vars = main.global_block().vars + for k, v in all_vars.iteritems(): + if 'tmp' not in k and k[0] is not '_' or v.persistable: + fetch_list.append(k) + + for data in train_inputs: + ret = pe.run(fetch_list, feed=feeder.feed(data)) + for i in range(len(fetch_list)): + assert not math.isnan(np.sum(ret[i])) and \ + not math.isinf(np.sum(ret[i])) + + def test_fetch_op(self): + tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16) + tst_reader_iter = tst_reader() + + iters = 3 + train_inputs = [] + for i in range(iters): + train_inputs.append(tst_reader_iter.next()) + + self.parallel_exe(train_inputs, seed=1) + + +class TestFeedParallel(unittest.TestCase): + def test_main(self): + main = fluid.Program() + startup = fluid.Program() + startup.random_seed = 1 + with fluid.scope_guard(fluid.core.Scope()): + with fluid.program_guard(main, startup): + data = fluid.layers.data( + name='image', shape=[3, 224, 224], dtype='float32') + label = fluid.layers.data( + name='label', shape=[1], dtype='int64') + out = Lenet(data, class_dim=102) + loss = fluid.layers.cross_entropy(input=out, label=label) + loss = fluid.layers.mean(loss) + opt = fluid.optimizer.Momentum( + learning_rate=0.1, + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + + opt.minimize(loss) + place = fluid.CUDAPlace(0) + feeder = fluid.DataFeeder(place=place, feed_list=[data, label]) + reader = feeder.decorate_reader( + paddle.batch( + flowers.train(), batch_size=16), multi_devices=True) + exe = fluid.Executor(place) + exe.run(startup) + pe = fluid.ParallelExecutor( + use_cuda=True, loss_name=loss.name, main_program=main) + + for batch_id, data in enumerate(reader()): + loss_np = np.array(pe.run(feed=data, fetch_list=[loss.name])[0]) + print batch_id, loss_np + if batch_id == 2: + break + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..015703c3e25f4e11e64ab6a7de99da12bee608f6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -0,0 +1,171 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from parallel_executor_test_base import TestParallelExecutorBase +import paddle.fluid as fluid +import numpy as np +import paddle +import paddle.dataset.mnist as mnist +import unittest + +MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio" + + +def simple_fc_net(use_feed): + if use_feed: + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + else: + reader = fluid.layers.open_files( + filenames=[MNIST_RECORDIO_FILE], + shapes=[[-1, 784], [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64'], + thread_num=1, + for_parallel=True) + reader = fluid.layers.io.double_buffer(reader) + img, label = fluid.layers.read_file(reader) + hidden = img + for _ in xrange(4): + hidden = fluid.layers.fc( + hidden, + size=200, + act='tanh', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +def fc_with_batchnorm(use_feed): + if use_feed: + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + else: + reader = fluid.layers.open_files( + filenames=[MNIST_RECORDIO_FILE], + shapes=[[-1, 784], [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64'], + thread_num=1, + for_parallel=True) + reader = fluid.layers.io.double_buffer(reader) + img, label = fluid.layers.read_file(reader) + + hidden = img + for _ in xrange(1): + hidden = fluid.layers.fc( + hidden, + size=200, + act='tanh', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + + hidden = fluid.layers.batch_norm(input=hidden) + + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +class TestMNIST(TestParallelExecutorBase): + @classmethod + def setUpClass(cls): + # Convert mnist to recordio file + with fluid.program_guard(fluid.Program(), fluid.Program()): + reader = paddle.batch(mnist.train(), batch_size=4) + feeder = fluid.DataFeeder( + feed_list=[ # order is image and label + fluid.layers.data( + name='image', shape=[784]), + fluid.layers.data( + name='label', shape=[1], dtype='int64'), + ], + place=fluid.CPUPlace()) + fluid.recordio_writer.convert_reader_to_recordio_file( + MNIST_RECORDIO_FILE, reader, feeder) + + def check_simple_fc_convergence(self, balance_parameter_opt_between_cards): + self.check_network_convergence(simple_fc_net) + self.check_network_convergence(simple_fc_net, allow_op_delay=True) + + img = np.zeros(shape=[32, 784], dtype='float32') + label = np.ones(shape=[32, 1], dtype='int64') + self.check_network_convergence( + simple_fc_net, + feed_dict={"image": img, + "label": label}, + balance_parameter_opt_between_cards=balance_parameter_opt_between_cards + ) + + def test_simple_fc(self): + self.check_simple_fc_convergence(False) + + def test_simple_fc_with_new_strategy(self): + self.check_simple_fc_convergence(True) + + def check_simple_fc_parallel_accuracy(self, + balance_parameter_opt_between_cards): + img = np.zeros(shape=[32, 784], dtype='float32') + label = np.ones(shape=[32, 1], dtype='int64') + single_first_loss, single_last_loss = self.check_network_convergence( + method=simple_fc_net, + seed=1000, + feed_dict={"image": img, + "label": label}, + use_parallel_executor=False) + parallel_first_loss, parallel_last_loss = self.check_network_convergence( + method=simple_fc_net, + seed=1000, + feed_dict={"image": img, + "label": label}, + use_parallel_executor=True, + balance_parameter_opt_between_cards=balance_parameter_opt_between_cards + ) + + for p_f in parallel_first_loss: + self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6) + for p_l in parallel_last_loss: + self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6) + + def test_simple_fc_parallel_accuracy(self): + self.check_simple_fc_parallel_accuracy(False) + + def test_simple_fc_parallel_accuracy_with_new_strategy(self): + self.check_simple_fc_parallel_accuracy(True) + + def check_batchnorm_fc_convergence(self, + balance_parameter_opt_between_cards): + self.check_network_convergence(fc_with_batchnorm) + img = np.zeros(shape=[32, 784], dtype='float32') + label = np.ones(shape=[32, 1], dtype='int64') + self.check_network_convergence( + fc_with_batchnorm, + feed_dict={"image": img, + "label": label}, + balance_parameter_opt_between_cards=balance_parameter_opt_between_cards + ) + + def test_batchnorm_fc(self): + self.check_batchnorm_fc_convergence(False) + + def test_batchnorm_fc_with_new_strategy(self): + self.check_batchnorm_fc_convergence(True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py new file mode 100644 index 0000000000000000000000000000000000000000..a3fa140cbb7994a36d2cbee26d598165f1f771d2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -0,0 +1,152 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid +from parallel_executor_test_base import TestParallelExecutorBase +import unittest + + +def squeeze_excitation(input, num_channels, reduction_ratio): + # pool = fluid.layers.pool2d( + # input=input, pool_size=0, pool_type='avg', global_pooling=True) + conv = input + shape = conv.shape + reshape = fluid.layers.reshape( + x=conv, shape=[-1, shape[1], shape[2] * shape[3]]) + pool = fluid.layers.reduce_mean(input=reshape, dim=2) + + squeeze = fluid.layers.fc(input=pool, + size=num_channels / reduction_ratio, + act='relu') + excitation = fluid.layers.fc(input=squeeze, + size=num_channels, + act='sigmoid') + scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0) + return scale + + +def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1, + act=None): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) / 2, + groups=groups, + act=None, + bias_attr=False) + return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1) + + +def shortcut(input, ch_out, stride): + ch_in = input.shape[1] + if ch_in != ch_out: + if stride == 1: + filter_size = 1 + else: + filter_size = 3 + return conv_bn_layer(input, ch_out, filter_size, stride) + else: + return input + + +def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio): + # The number of first 1x1 convolutional channels for each bottleneck build block + # was halved to reduce the compution cost. + conv0 = conv_bn_layer( + input=input, num_filters=num_filters, filter_size=1, act='relu') + conv1 = conv_bn_layer( + input=conv0, + num_filters=num_filters * 2, + filter_size=3, + stride=stride, + groups=cardinality, + act='relu') + conv2 = conv_bn_layer( + input=conv1, num_filters=num_filters * 2, filter_size=1, act=None) + scale = squeeze_excitation( + input=conv2, + num_channels=num_filters * 2, + reduction_ratio=reduction_ratio) + + short = shortcut(input, num_filters * 2, stride) + + return fluid.layers.elementwise_add(x=short, y=scale, act='relu') + + +def SE_ResNeXt50Small(batch_size=2, use_feed=False): + assert not use_feed, "SE_ResNeXt doesn't support feed yet" + + img = fluid.layers.fill_constant( + shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0) + label = fluid.layers.fill_constant( + shape=[batch_size, 1], dtype='int64', value=0.0) + + conv = conv_bn_layer( + input=img, num_filters=16, filter_size=3, stride=2, act='relu') + conv = conv_bn_layer( + input=conv, num_filters=16, filter_size=3, stride=1, act='relu') + conv = conv_bn_layer( + input=conv, num_filters=16, filter_size=3, stride=1, act='relu') + conv = fluid.layers.pool2d( + input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') + + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 6, 3] + num_filters = [128, 256, 512, 1024] + + for block in range(len(depth)): + for i in range(depth[block]): + conv = bottleneck_block( + input=conv, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=cardinality, + reduction_ratio=reduction_ratio) + + shape = conv.shape + reshape = fluid.layers.reshape( + x=conv, shape=[-1, shape[1], shape[2] * shape[3]]) + pool = fluid.layers.reduce_mean(input=reshape, dim=2) + dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2) + # Classifier layer: + prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +class TestResnet(TestParallelExecutorBase): + def check_resnet_convergence(self, balance_parameter_opt_between_cards): + import functools + batch_size = 2 + self.check_network_convergence( + functools.partial( + SE_ResNeXt50Small, batch_size=batch_size), + iter=20, + batch_size=batch_size, + balance_parameter_opt_between_cards=balance_parameter_opt_between_cards + ) + + def test_resnet(self): + self.check_resnet_convergence(False) + + def test_resnet_with_new_strategy(self): + self.check_resnet_convergence(True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py new file mode 100644 index 0000000000000000000000000000000000000000..93a5f767867d68110cf7b8f441cc740ecd843cf9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py @@ -0,0 +1,93 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid +import numpy as np +import unittest + + +def simple_fc_net(): + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = img + for _ in xrange(4): + hidden = fluid.layers.fc( + hidden, + size=200, + act='tanh', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +class ParallelExecutorTestingDuringTraining(unittest.TestCase): + def check_network_convergence(self, build_strategy=None): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = simple_fc_net() + test_program = main.clone(for_test=True) + + opt = fluid.optimizer.SGD(learning_rate=0.001) + opt.minimize(loss) + + batch_size = 32 + image = np.random.normal(size=(batch_size, 784)).astype('float32') + label = np.random.randint(0, 10, (batch_size, 1), dtype="int64") + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(startup) + feed_dict = {'image': image, 'label': label} + + train_exe = fluid.ParallelExecutor( + use_cuda=True, + loss_name=loss.name, + main_program=main, + build_strategy=build_strategy) + + test_exe = fluid.ParallelExecutor( + use_cuda=True, + main_program=test_program, + share_vars_from=train_exe, + build_strategy=build_strategy) + + for i in xrange(5): + test_loss, = test_exe.run([loss.name], feed=feed_dict) + test_loss = np.array(test_loss) + + train_loss, = train_exe.run([loss.name], feed=feed_dict) + train_loss = np.array(train_loss) + self.assertTrue( + np.allclose( + train_loss, test_loss, atol=1e-8), + "Train loss: " + str(train_loss) + "\n Test loss:" + + str(test_loss)) + + def test_parallel_testing(self): + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce + self.check_network_convergence(build_strategy) + + def test_parallel_testing_with_new_strategy(self): + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce + self.check_network_convergence(build_strategy) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..c81df66d987f3d3856af0e19fc935df7de2edacc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -0,0 +1,174 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid +import transformer_model +import numpy as np +from parallel_executor_test_base import TestParallelExecutorBase +import unittest +import paddle +import paddle.dataset.wmt16 as wmt16 + +WMT16_RECORDIO_FILE = "./wmt16_test_pe.recordio" + + +class ModelHyperParams(object): + # Dictionary size for source and target language. This model directly uses + # paddle.dataset.wmt16 in which , and token has + # alreay been added, but the token is not added. Transformer requires + # sequences in a mini-batch are padded to have the same length. A token is + # added into the original dictionary in paddle.dateset.wmt16. + + # size of source word dictionary. + src_vocab_size = 10000 + # index for token in source language. + src_pad_idx = src_vocab_size + + # size of target word dictionay + trg_vocab_size = 10000 + # index for token in target language. + trg_pad_idx = trg_vocab_size + + # position value corresponding to the token. + pos_pad_idx = 0 + + # max length of sequences. It should plus 1 to include position + # padding token for position encoding. + max_length = 50 + + # the dimension for word embeddings, which is also the last dimension of + # the input and output of multi-head attention, position-wise feed-forward + # networks, encoder and decoder. + + d_model = 512 + # size of the hidden layer in position-wise feed-forward networks. + d_inner_hid = 1024 + # the dimension that keys are projected to for dot-product attention. + d_key = 64 + # the dimension that values are projected to for dot-product attention. + d_value = 64 + # number of head used in multi-head attention. + n_head = 8 + # number of sub-layers to be stacked in the encoder and decoder. + n_layer = 6 + # dropout rate used by all dropout layers. + dropout = 0.1 + + +def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head): + """ + Pad the instances to the max sequence length in batch, and generate the + corresponding position data and attention bias. Then, convert the numpy + data to tensors and return a dict mapping names to tensors. + """ + + def __pad_batch_data(insts, + pad_idx, + is_target=False, + return_pos=True, + return_attn_bias=True, + return_max_len=True): + """ + Pad the instances to the max sequence length in batch, and generate the + corresponding position data and attention bias. + """ + return_list = [] + max_len = max(len(inst) for inst in insts) + inst_data = np.array( + [inst + [pad_idx] * (max_len - len(inst)) for inst in insts]) + return_list += [inst_data.astype("int64").reshape([-1, 1])] + if return_pos: + inst_pos = np.array([[ + pos_i + 1 if w_i != pad_idx else 0 + for pos_i, w_i in enumerate(inst) + ] for inst in inst_data]) + + return_list += [inst_pos.astype("int64").reshape([-1, 1])] + if return_attn_bias: + if is_target: + # This is used to avoid attention on paddings and subsequent + # words. + slf_attn_bias_data = np.ones((inst_data.shape[0], max_len, + max_len)) + slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape( + [-1, 1, max_len, max_len]) + slf_attn_bias_data = np.tile(slf_attn_bias_data, + [1, n_head, 1, 1]) * [-1e9] + else: + # This is used to avoid attention on paddings. + slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] * + (max_len - len(inst)) + for inst in insts]) + slf_attn_bias_data = np.tile( + slf_attn_bias_data.reshape([-1, 1, 1, max_len]), + [1, n_head, max_len, 1]) + return_list += [slf_attn_bias_data.astype("float32")] + if return_max_len: + return_list += [max_len] + return return_list if len(return_list) > 1 else return_list[0] + + src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data( + [inst[0] for inst in insts], src_pad_idx, is_target=False) + trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data( + [inst[1] for inst in insts], trg_pad_idx, is_target=True) + trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], + [1, 1, trg_max_len, 1]).astype("float32") + lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False, + False, False, False) + lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1]) + + return [ + src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight + ] + + +def transformer(use_feed): + assert not use_feed, "transfomer doesn't support feed yet" + return transformer_model.transformer( + ModelHyperParams.src_vocab_size + 1, + ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1, + ModelHyperParams.n_layer, ModelHyperParams.n_head, + ModelHyperParams.d_key, ModelHyperParams.d_value, + ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, + ModelHyperParams.dropout, ModelHyperParams.src_pad_idx, + ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx) + + +class TestTransformer(TestParallelExecutorBase): + @classmethod + def setUpClass(cls): + reader = paddle.batch( + wmt16.train(ModelHyperParams.src_vocab_size, + ModelHyperParams.trg_vocab_size), + batch_size=transformer_model.batch_size) + + with fluid.recordio_writer.create_recordio_writer( + WMT16_RECORDIO_FILE) as writer: + for batch in reader(): + for tensor in prepare_batch_input( + batch, ModelHyperParams.src_pad_idx, + ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head): + t = fluid.LoDTensor() + t.set(tensor, fluid.CPUPlace()) + writer.append_tensor(t) + writer.complete_append_tensor() + + @unittest.skip("transformer is buggy in multi gpu") + def test_main(self): + self.check_network_convergence(transformer) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index 9b0cc3534dc551e7fdf7ef8111cad1c172f8bfa4..865c2b7df085aa6a6cb0d6eb461c342ce08695cd 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -34,8 +34,10 @@ class TestMeanOp(OpTest): def setUp(self): self.op_type = "reduce_mean" self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")} - self.attrs = {'dim': 1} - self.outputs = {'Out': self.inputs['X'].mean(axis=self.attrs['dim'])} + self.attrs = {'dim': [1]} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } def test_check_output(self): self.check_output() @@ -50,8 +52,10 @@ class TestMaxOp(OpTest): def setUp(self): self.op_type = "reduce_max" self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} - self.attrs = {'dim': -1} - self.outputs = {'Out': self.inputs['X'].max(axis=self.attrs['dim'])} + self.attrs = {'dim': [-1]} + self.outputs = { + 'Out': self.inputs['X'].max(axis=tuple(self.attrs['dim'])) + } def test_check_output(self): self.check_output() @@ -63,8 +67,10 @@ class TestMinOp(OpTest): def setUp(self): self.op_type = "reduce_min" self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} - self.attrs = {'dim': 2} - self.outputs = {'Out': self.inputs['X'].min(axis=self.attrs['dim'])} + self.attrs = {'dim': [2]} + self.outputs = { + 'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim'])) + } def test_check_output(self): self.check_output() @@ -87,9 +93,10 @@ class TestKeepDimReduce(OpTest): def setUp(self): self.op_type = "reduce_sum" self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} - self.attrs = {'dim': -2, 'keep_dim': True} + self.attrs = {'dim': [-2], 'keep_dim': True} self.outputs = { - 'Out': self.inputs['X'].sum(axis=self.attrs['dim'], keepdims=True) + 'Out': + self.inputs['X'].sum(axis=tuple(self.attrs['dim']), keepdims=True) } def test_check_output(self): @@ -126,5 +133,67 @@ class TestReduceAll(OpTest): self.check_grad(['X'], 'Out') +## reduction in multi dims +class TestReduceMeanOpMultiAxises(OpTest): + def setUp(self): + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")} + self.attrs = {'dim': [1, 2]} + self.outputs = {'Out': self.inputs['X'].mean(axis=(1, 2))} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestReduceMaxOpMultiAxises(OpTest): + """Remove Max with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_max" + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} + self.attrs = {'dim': [-2, -1]} + self.outputs = { + 'Out': self.inputs['X'].max(axis=tuple(self.attrs['dim'])) + } + + def test_check_output(self): + self.check_output() + + +class TestReduceMinOpMultiAxises(OpTest): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_min" + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} + self.attrs = {'dim': [1, 2]} + self.outputs = { + 'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim'])) + } + + def test_check_output(self): + self.check_output() + + +class TestKeepDimReduceSumMultiAxises(OpTest): + def setUp(self): + self.op_type = "reduce_sum" + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} + self.attrs = {'dim': [-2, -1], 'keep_dim': True} + self.outputs = { + 'Out': + self.inputs['X'].sum(axis=tuple(self.attrs['dim']), keepdims=True) + } + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index 80a8f7c09cfe521f8f94a27e85fc8d86c02b3e97..9ff0ae6fca27d4681891b2033e2f8f95bd825942 100644 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -107,7 +107,7 @@ class ControlFlowGraph(object): # Repeatedly apply liveness updates until the algorithm stablize # on a complete set live input vars and live output vars. while True: - for i in range(self.op_size, 0, -1): + for i in reversed(range(self.op_size)): live_in[i] = set(self._live_in[i]) live_out[i] = set(self._live_out[i]) for s in self._successors[i]: